Merge pull request #890 from terrelln/lz4-memcpy

Call LZ4_memcpy() instead of memcpy()
Yann Collet 2020-08-03 18:34:16 -07:00 committed by GitHub
commit cd5d78ebd0

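For context, a minimal standalone sketch of the technique this commit adopts (not part of the diff below; the read_u32() helper name is illustrative): when the size argument is a compile-time constant, GCC and Clang can lower __builtin_memcpy() to a plain load/store even in freestanding builds, whereas a call to the libc memcpy() cannot be assumed standard compliant there and so is not inlined.

#include <stdint.h>
#include <string.h>   /* only needed for the non-GNUC fallback */

#if defined(__GNUC__) && (__GNUC__ >= 4)
#  define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
#else
#  define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
#endif

/* Illustrative helper: read 4 bytes from a possibly unaligned address.
 * With a constant size, the compiler typically emits a single 32-bit load
 * instead of a call to memcpy(). */
static uint32_t read_u32(const void* ptr)
{
    uint32_t val;
    LZ4_memcpy(&val, ptr, sizeof(val));
    return val;
}

Compiled at -O2, this kind of helper typically reduces to a single move instruction, which is the behavior the decompression hot loops in the diff rely on.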

@@ -277,6 +277,21 @@ typedef enum {
/*-************************************
* Reading and writing into memory
**************************************/
+/**
+* LZ4 relies on memcpy with a constant size being inlined. In freestanding
+* environments, the compiler can't assume the implementation of memcpy() is
+* standard compliant, so it can't apply its specialized memcpy() inlining
+* logic. When possible, use __builtin_memcpy() to tell the compiler to analyze
+* memcpy() as if it were standard compliant, so it can inline it in freestanding
+* environments. This is needed when decompressing the Linux Kernel, for example.
+*/
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
+#else
+#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
+#endif
static unsigned LZ4_isLittleEndian(void)
{
const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */
@@ -311,27 +326,27 @@ static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = val
static U16 LZ4_read16(const void* memPtr)
{
-U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
}
static U32 LZ4_read32(const void* memPtr)
{
-U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
}
static reg_t LZ4_read_ARCH(const void* memPtr)
{
-reg_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
}
static void LZ4_write16(void* memPtr, U16 value)
{
-memcpy(memPtr, &value, sizeof(value));
+LZ4_memcpy(memPtr, &value, sizeof(value));
}
static void LZ4_write32(void* memPtr, U32 value)
{
-memcpy(memPtr, &value, sizeof(value));
+LZ4_memcpy(memPtr, &value, sizeof(value));
}
#endif /* LZ4_FORCE_MEMORY_ACCESS */
@@ -366,7 +381,7 @@ void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd)
const BYTE* s = (const BYTE*)srcPtr;
BYTE* const e = (BYTE*)dstEnd;
-do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
+do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}
static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
@@ -397,11 +412,11 @@ LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, con
dstPtr[2] = srcPtr[2];
dstPtr[3] = srcPtr[3];
srcPtr += inc32table[offset];
-memcpy(dstPtr+4, srcPtr, 4);
+LZ4_memcpy(dstPtr+4, srcPtr, 4);
srcPtr -= dec64table[offset];
dstPtr += 8;
} else {
-memcpy(dstPtr, srcPtr, 8);
+LZ4_memcpy(dstPtr, srcPtr, 8);
dstPtr += 8;
srcPtr += 8;
}
@@ -419,7 +434,7 @@ LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd)
const BYTE* s = (const BYTE*)srcPtr;
BYTE* const e = (BYTE*)dstEnd;
-do { memcpy(d,s,16); memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
+do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
}
/* LZ4_memcpy_using_offset() presumes :
@@ -438,23 +453,23 @@ LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const si
memset(v, *srcPtr, 8);
break;
case 2:
-memcpy(v, srcPtr, 2);
-memcpy(&v[2], srcPtr, 2);
-memcpy(&v[4], &v[0], 4);
+LZ4_memcpy(v, srcPtr, 2);
+LZ4_memcpy(&v[2], srcPtr, 2);
+LZ4_memcpy(&v[4], &v[0], 4);
break;
case 4:
-memcpy(v, srcPtr, 4);
-memcpy(&v[4], srcPtr, 4);
+LZ4_memcpy(v, srcPtr, 4);
+LZ4_memcpy(&v[4], srcPtr, 4);
break;
default:
LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
return;
}
-memcpy(dstPtr, v, 8);
+LZ4_memcpy(dstPtr, v, 8);
dstPtr += 8;
while (dstPtr < dstEnd) {
-memcpy(dstPtr, v, 8);
+LZ4_memcpy(dstPtr, v, 8);
dstPtr += 8;
}
}
@@ -1166,7 +1181,7 @@ _last_literals:
} else {
*op++ = (BYTE)(lastRun<<ML_BITS);
}
-memcpy(op, anchor, lastRun);
+LZ4_memcpy(op, anchor, lastRun);
ip = anchor + lastRun;
op += lastRun;
}
@@ -1542,7 +1557,7 @@ int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream,
* cost to copy the dictionary's tables into the active context,
* so that the compression loop is only looking into one table.
*/
-memcpy(streamPtr, streamPtr->dictCtx, sizeof(LZ4_stream_t));
+LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(LZ4_stream_t));
result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration);
} else {
result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration);
@@ -1750,12 +1765,12 @@ LZ4_decompress_generic(
/* We don't need to check oend, since we check it once for each loop below */
if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; }
/* Literals can only be 14, but hope compilers optimize if we copy by a register size */
-memcpy(op, ip, 16);
+LZ4_memcpy(op, ip, 16);
} else { /* LZ4_decompress_fast() */
/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
* it doesn't know input length, and relies on end-of-block properties */
-memcpy(op, ip, 8);
-if (length > 8) { memcpy(op+8, ip+8, 8); }
+LZ4_memcpy(op, ip, 8);
+if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); }
}
ip += length; op = cpy;
}
@@ -1791,9 +1806,9 @@ LZ4_decompress_generic(
assert(match <= op);
assert(op + 18 <= oend);
-memcpy(op, match, 8);
-memcpy(op+8, match+8, 8);
-memcpy(op+16, match+16, 2);
+LZ4_memcpy(op, match, 8);
+LZ4_memcpy(op+8, match+8, 8);
+LZ4_memcpy(op+16, match+16, 2);
op += length;
continue;
} } }
@@ -1816,14 +1831,14 @@ LZ4_decompress_generic(
/* match stretches into both external dictionary and current block */
size_t const copySize = (size_t)(lowPrefix - match);
size_t const restSize = length - copySize;
-memcpy(op, dictEnd - copySize, copySize);
+LZ4_memcpy(op, dictEnd - copySize, copySize);
op += copySize;
if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */
BYTE* const endOfMatch = op + restSize;
const BYTE* copyFrom = lowPrefix;
while (op < endOfMatch) { *op++ = *copyFrom++; }
} else {
-memcpy(op, lowPrefix, restSize);
+LZ4_memcpy(op, lowPrefix, restSize);
op += restSize;
} }
continue;
@@ -1864,7 +1879,7 @@ LZ4_decompress_generic(
/* strictly "less than" on input, to re-enter the loop with at least one byte */
&& likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) {
/* Copy the literals */
-memcpy(op, ip, endOnInput ? 16 : 8);
+LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
op += length; ip += length;
/* The second stage: prepare for match copying, decode full info.
@@ -1879,9 +1894,9 @@ LZ4_decompress_generic(
&& (offset >= 8)
&& (dict==withPrefix64k || match >= lowPrefix) ) {
/* Copy the match. */
-memcpy(op + 0, match + 0, 8);
-memcpy(op + 8, match + 8, 8);
-memcpy(op +16, match +16, 2);
+LZ4_memcpy(op + 0, match + 0, 8);
+LZ4_memcpy(op + 8, match + 8, 8);
+LZ4_memcpy(op +16, match +16, 2);
op += length + MINMATCH;
/* Both stages worked, load the next token. */
continue;
@@ -1994,14 +2009,14 @@ LZ4_decompress_generic(
/* match stretches into both external dictionary and current block */
size_t const copySize = (size_t)(lowPrefix - match);
size_t const restSize = length - copySize;
-memcpy(op, dictEnd - copySize, copySize);
+LZ4_memcpy(op, dictEnd - copySize, copySize);
op += copySize;
if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */
BYTE* const endOfMatch = op + restSize;
const BYTE* copyFrom = lowPrefix;
while (op < endOfMatch) *op++ = *copyFrom++;
} else {
-memcpy(op, lowPrefix, restSize);
+LZ4_memcpy(op, lowPrefix, restSize);
op += restSize;
} }
continue;
@@ -2020,7 +2035,7 @@ LZ4_decompress_generic(
if (matchEnd > op) { /* overlap copy */
while (op < copyEnd) { *op++ = *match++; }
} else {
-memcpy(op, match, mlen);
+LZ4_memcpy(op, match, mlen);
}
op = copyEnd;
if (op == oend) { break; }
@@ -2034,10 +2049,10 @@ LZ4_decompress_generic(
op[2] = match[2];
op[3] = match[3];
match += inc32table[offset];
-memcpy(op+4, match, 4);
+LZ4_memcpy(op+4, match, 4);
match -= dec64table[offset];
} else {
-memcpy(op, match, 8);
+LZ4_memcpy(op, match, 8);
match += 8;
}
op += 8;
@@ -2052,7 +2067,7 @@ LZ4_decompress_generic(
}
while (op < cpy) { *op++ = *match++; }
} else {
-memcpy(op, match, 8);
+LZ4_memcpy(op, match, 8);
if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); }
}
op = cpy; /* wildcopy correction */