Call LZ4_memcpy() instead of memcpy()
`LZ4_memcpy()` uses `__builtin_memcpy()` to ensure that clang/gcc can inline `memcpy()` calls in freestanding mode. This is necessary for decompressing the Linux Kernel with LZ4: without an analogous patch, decompression ran at 77 MB/s; with the patch, it ran at 884 MB/s.
commit fe2a1b3707
parent f4054274fa

lib/lz4.c (87 lines changed)
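For readers skimming the diff below, the whole change hinges on one macro. Here is a minimal, self-contained sketch of the pattern (not part of the commit; `read32`, `buf`, and the `main` driver are hypothetical illustrations, while the macro body mirrors the one added in the first hunk):

```c
#include <stdio.h>
#include <string.h>  /* memcpy() fallback for non-GNU compilers */

/* In freestanding mode (e.g. gcc/clang with -ffreestanding, as when
 * building kernel decompression code), the compiler may not assume
 * memcpy() is the standard function, so a constant-size copy can become
 * a call into a slow byte-at-a-time implementation. __builtin_memcpy()
 * tells gcc/clang to apply their usual constant-size inlining anyway. */
#if defined(__GNUC__) && (__GNUC__ >= 4)
#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
#else
#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
#endif

/* Hypothetical stand-in for LZ4_read32(): an unaligned 32-bit load that
 * the compiler can lower to a single move once the copy is inlined. */
static unsigned read32(const void* p)
{
    unsigned v;
    LZ4_memcpy(&v, p, sizeof(v));
    return v;
}

int main(void)
{
    unsigned char buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    printf("0x%08x\n", read32(buf + 1));  /* deliberately misaligned read */
    return 0;
}
```

Comparing the generated assembly with and without `-ffreestanding` (e.g. `gcc -O2 -S`) is one way to observe the inlining difference the builtin makes.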
```diff
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -277,6 +277,21 @@ typedef enum {
 /*-************************************
 *  Reading and writing into memory
 **************************************/
+
+/**
+ * LZ4 relies on memcpy with a constant size being inlined. In freestanding
+ * environments, the compiler can't assume the implementation of memcpy() is
+ * standard compliant, so it can't apply its specialized memcpy() inlining
+ * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze
+ * memcpy() as if it were standard compliant, so it can inline it in freestanding
+ * environments. This is needed when decompressing the Linux Kernel, for example.
+ */
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
+#else
+#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
+#endif
 
 static unsigned LZ4_isLittleEndian(void)
 {
     const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental */
@@ -311,27 +326,27 @@ static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = val
 
 static U16 LZ4_read16(const void* memPtr)
 {
-    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+    U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
 }
 
 static U32 LZ4_read32(const void* memPtr)
 {
-    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+    U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
 }
 
 static reg_t LZ4_read_ARCH(const void* memPtr)
 {
-    reg_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+    reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
 }
 
 static void LZ4_write16(void* memPtr, U16 value)
 {
-    memcpy(memPtr, &value, sizeof(value));
+    LZ4_memcpy(memPtr, &value, sizeof(value));
 }
 
 static void LZ4_write32(void* memPtr, U32 value)
 {
-    memcpy(memPtr, &value, sizeof(value));
+    LZ4_memcpy(memPtr, &value, sizeof(value));
 }
 
 #endif  /* LZ4_FORCE_MEMORY_ACCESS */
@@ -366,7 +381,7 @@ void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd)
     const BYTE* s = (const BYTE*)srcPtr;
     BYTE* const e = (BYTE*)dstEnd;
 
-    do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
+    do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e);
 }
 
 static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
@@ -397,11 +412,11 @@ LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, con
         dstPtr[2] = srcPtr[2];
         dstPtr[3] = srcPtr[3];
         srcPtr += inc32table[offset];
-        memcpy(dstPtr+4, srcPtr, 4);
+        LZ4_memcpy(dstPtr+4, srcPtr, 4);
         srcPtr -= dec64table[offset];
         dstPtr += 8;
     } else {
-        memcpy(dstPtr, srcPtr, 8);
+        LZ4_memcpy(dstPtr, srcPtr, 8);
         dstPtr += 8;
         srcPtr += 8;
     }
@@ -419,7 +434,7 @@ LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd)
     const BYTE* s = (const BYTE*)srcPtr;
     BYTE* const e = (BYTE*)dstEnd;
 
-    do { memcpy(d,s,16); memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
+    do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
 }
 
 /* LZ4_memcpy_using_offset() presumes :
@@ -438,23 +453,23 @@ LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const si
         memset(v, *srcPtr, 8);
         break;
     case 2:
-        memcpy(v, srcPtr, 2);
-        memcpy(&v[2], srcPtr, 2);
-        memcpy(&v[4], &v[0], 4);
+        LZ4_memcpy(v, srcPtr, 2);
+        LZ4_memcpy(&v[2], srcPtr, 2);
+        LZ4_memcpy(&v[4], &v[0], 4);
         break;
     case 4:
-        memcpy(v, srcPtr, 4);
-        memcpy(&v[4], srcPtr, 4);
+        LZ4_memcpy(v, srcPtr, 4);
+        LZ4_memcpy(&v[4], srcPtr, 4);
         break;
     default:
         LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
         return;
     }
 
-    memcpy(dstPtr, v, 8);
+    LZ4_memcpy(dstPtr, v, 8);
     dstPtr += 8;
     while (dstPtr < dstEnd) {
-        memcpy(dstPtr, v, 8);
+        LZ4_memcpy(dstPtr, v, 8);
         dstPtr += 8;
     }
 }
@@ -1166,7 +1181,7 @@ _last_literals:
         } else {
             *op++ = (BYTE)(lastRun<<ML_BITS);
         }
-        memcpy(op, anchor, lastRun);
+        LZ4_memcpy(op, anchor, lastRun);
         ip = anchor + lastRun;
         op += lastRun;
     }
@@ -1542,7 +1557,7 @@ int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream,
              * cost to copy the dictionary's tables into the active context,
              * so that the compression loop is only looking into one table.
              */
-            memcpy(streamPtr, streamPtr->dictCtx, sizeof(LZ4_stream_t));
+            LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(LZ4_stream_t));
             result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration);
         } else {
             result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration);
@@ -1750,12 +1765,12 @@ LZ4_decompress_generic(
                 /* We don't need to check oend, since we check it once for each loop below */
                 if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; }
                 /* Literals can only be <= 14, but hope compilers optimize if we copy by a register size */
-                memcpy(op, ip, 16);
+                LZ4_memcpy(op, ip, 16);
             } else {  /* LZ4_decompress_fast() */
                 /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
                  * it doesn't know input length, and relies on end-of-block properties */
-                memcpy(op, ip, 8);
-                if (length > 8) { memcpy(op+8, ip+8, 8); }
+                LZ4_memcpy(op, ip, 8);
+                if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); }
             }
             ip += length; op = cpy;
         }
@@ -1791,9 +1806,9 @@ LZ4_decompress_generic(
                     assert(match <= op);
                     assert(op + 18 <= oend);
 
-                    memcpy(op, match, 8);
-                    memcpy(op+8, match+8, 8);
-                    memcpy(op+16, match+16, 2);
+                    LZ4_memcpy(op, match, 8);
+                    LZ4_memcpy(op+8, match+8, 8);
+                    LZ4_memcpy(op+16, match+16, 2);
                     op += length;
                     continue;
             }   }   }
@@ -1816,14 +1831,14 @@ LZ4_decompress_generic(
                 /* match stretches into both external dictionary and current block */
                 size_t const copySize = (size_t)(lowPrefix - match);
                 size_t const restSize = length - copySize;
-                memcpy(op, dictEnd - copySize, copySize);
+                LZ4_memcpy(op, dictEnd - copySize, copySize);
                 op += copySize;
                 if (restSize > (size_t)(op - lowPrefix)) {  /* overlap copy */
                     BYTE* const endOfMatch = op + restSize;
                     const BYTE* copyFrom = lowPrefix;
                     while (op < endOfMatch) { *op++ = *copyFrom++; }
                 } else {
-                    memcpy(op, lowPrefix, restSize);
+                    LZ4_memcpy(op, lowPrefix, restSize);
                     op += restSize;
             }   }
             continue;
@@ -1864,7 +1879,7 @@ LZ4_decompress_generic(
                 /* strictly "less than" on input, to re-enter the loop with at least one byte */
                 && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) {
                 /* Copy the literals */
-                memcpy(op, ip, endOnInput ? 16 : 8);
+                LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
                 op += length; ip += length;
 
                 /* The second stage: prepare for match copying, decode full info.
@@ -1879,9 +1894,9 @@ LZ4_decompress_generic(
                   && (offset >= 8)
                   && (dict==withPrefix64k || match >= lowPrefix) ) {
                     /* Copy the match. */
-                    memcpy(op + 0, match + 0, 8);
-                    memcpy(op + 8, match + 8, 8);
-                    memcpy(op +16, match +16, 2);
+                    LZ4_memcpy(op + 0, match + 0, 8);
+                    LZ4_memcpy(op + 8, match + 8, 8);
+                    LZ4_memcpy(op +16, match +16, 2);
                     op += length + MINMATCH;
                     /* Both stages worked, load the next token. */
                     continue;
@@ -1994,14 +2009,14 @@ LZ4_decompress_generic(
                 /* match stretches into both external dictionary and current block */
                 size_t const copySize = (size_t)(lowPrefix - match);
                 size_t const restSize = length - copySize;
-                memcpy(op, dictEnd - copySize, copySize);
+                LZ4_memcpy(op, dictEnd - copySize, copySize);
                 op += copySize;
                 if (restSize > (size_t)(op - lowPrefix)) {  /* overlap copy */
                     BYTE* const endOfMatch = op + restSize;
                     const BYTE* copyFrom = lowPrefix;
                     while (op < endOfMatch) *op++ = *copyFrom++;
                 } else {
-                    memcpy(op, lowPrefix, restSize);
+                    LZ4_memcpy(op, lowPrefix, restSize);
                     op += restSize;
             }   }
             continue;
@@ -2020,7 +2035,7 @@ LZ4_decompress_generic(
             if (matchEnd > op) {   /* overlap copy */
                 while (op < copyEnd) { *op++ = *match++; }
             } else {
-                memcpy(op, match, mlen);
+                LZ4_memcpy(op, match, mlen);
             }
             op = copyEnd;
             if (op == oend) { break; }
@@ -2034,10 +2049,10 @@ LZ4_decompress_generic(
             op[2] = match[2];
             op[3] = match[3];
             match += inc32table[offset];
-            memcpy(op+4, match, 4);
+            LZ4_memcpy(op+4, match, 4);
             match -= dec64table[offset];
         } else {
-            memcpy(op, match, 8);
+            LZ4_memcpy(op, match, 8);
             match += 8;
         }
         op += 8;
@@ -2052,7 +2067,7 @@ LZ4_decompress_generic(
             }
             while (op < cpy) { *op++ = *match++; }
         } else {
-            memcpy(op, match, 8);
+            LZ4_memcpy(op, match, 8);
             if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); }
         }
         op = cpy;   /* wildcopy correction */
```