/* dictBuilder - dictionary builder for zstd Copyright (C) Yann Collet 2016 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - Zstd homepage : https://www.zstd.net */ /*-************************************** * Compiler Options ****************************************/ /* Disable some Visual warning messages */ #ifdef _MSC_VER # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #endif /* Unix Large Files support (>4GB) */ #define _FILE_OFFSET_BITS 64 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ # define _LARGEFILE_SOURCE #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ # define _LARGEFILE64_SOURCE #endif /*-************************************* * Dependencies ***************************************/ #include /* malloc, free */ #include /* memset */ #include /* fprintf, fopen, ftello64 */ #include /* stat64 */ #include /* stat64 */ #include /* clock */ #include "mem.h" /* read */ #include "error_private.h" #include "fse.h" #include "huff0_static.h" #include "zstd_internal.h" #include "divsufsort.h" #include "zdict_static.h" /*-************************************* * Compiler specifics ***************************************/ #if !defined(S_ISREG) # define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) #endif /*-************************************* * Constants ***************************************/ #define KB *(1 <<10) #define MB *(1 <<20) #define GB *(1U<<30) #define DICTLISTSIZE 10000 #define NOISELENGTH 32 #define PRIME1 2654435761U #define PRIME2 2246822519U #define MINRATIO 4 static const U32 g_compressionLevel_default = 5; static const U32 g_selectivity_default = 9; static const size_t g_provision_entropySize = 200; static const size_t g_min_fast_dictContent = 192; /*-************************************* * Console display ***************************************/ #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */ #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \ if (ZDICT_GetMilliSpan(g_time) > refreshRate) \ { g_time = clock(); DISPLAY(__VA_ARGS__); \ if (g_displayLevel>=4) fflush(stdout); } } static const unsigned refreshRate = 300; static clock_t g_time = 0; static void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length) { const BYTE* const b = (const BYTE*)ptr; size_t u; for (u=0; u126) c = '.'; /* non-printable char */ DISPLAYLEVEL(dlevel, "%c", c); } } /*-******************************************************** * Helper functions **********************************************************/ static unsigned ZDICT_GetMilliSpan(clock_t nPrevious) { clock_t nCurrent = clock(); unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC); return nSpan; } unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); } const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } /*-******************************************************** * Dictionary training functions **********************************************************/ static unsigned ZDICT_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { if (MEM_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) unsigned long r = 0; _BitScanForward64( &r, (U64)val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif } else { /* 32 bits */ # if defined(_MSC_VER) unsigned long r=0; _BitScanForward( &r, (U32)val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif } } else { /* Big Endian CPU */ if (MEM_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) unsigned long r = 0; _BitScanReverse64( &r, val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_clzll(val) >> 3); # else unsigned r; const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } r += (!val); return r; # endif } else { /* 32 bits */ # if defined(_MSC_VER) unsigned long r = 0; _BitScanReverse( &r, (unsigned long)val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_clz((U32)val) >> 3); # else unsigned r; if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } r += (!val); return r; # endif } } } /*! ZDICT_count() : Count the nb of common bytes between 2 pointers. Note : this function presumes end of buffer followed by noisy guard band. */ static size_t ZDICT_count(const void* pIn, const void* pMatch) { const char* const pStart = (const char*)pIn; for (;;) { size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn); if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; } pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff); return (size_t)((const char*)pIn - pStart); } } typedef struct { U32 pos; U32 length; U32 savings; } dictItem; static void ZDICT_initDictItem(dictItem* d) { d->pos = 1; d->length = 0; d->savings = (U32)(-1); } #define LLIMIT 64 /* heuristic determined experimentally */ #define MINMATCHLENGTH 7 /* heuristic determined experimentally */ static dictItem ZDICT_analyzePos( BYTE* doneMarks, const int* suffix, U32 start, const void* buffer, U32 minRatio) { U32 lengthList[LLIMIT] = {0}; U32 cumulLength[LLIMIT] = {0}; U32 savings[LLIMIT] = {0}; const BYTE* b = (const BYTE*)buffer; size_t length; size_t maxLength = LLIMIT; size_t pos = suffix[start]; U32 end = start; dictItem solution; /* init */ memset(&solution, 0, sizeof(solution)); doneMarks[pos] = 1; /* trivial repetition cases */ if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { /* skip and mark segment */ U16 u16 = MEM_read16(b+pos+4); U32 u, e = 6; while (MEM_read16(b+pos+e) == u16) e+=2 ; if (b[pos+e] == b[pos+e-1]) e++; for (u=1; u=MINMATCHLENGTH); /* look backward */ do { length = ZDICT_count(b + pos, b + *(suffix+start-1)); if (length >=MINMATCHLENGTH) start--; } while(length >= MINMATCHLENGTH); /* exit if not found a minimum nb of repetitions */ if (end-start < minRatio) { U32 idx; for(idx=start; idx= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos); DISPLAYLEVEL(4, "\n"); for (searchLength = MINMATCHLENGTH ; ; searchLength++) { BYTE currentChar = 0; U32 currentCount = 0; U32 currentID = refinedStart; U32 id; U32 selectedCount = 0; U32 selectedID = currentID; for (id =refinedStart; id < refinedEnd; id++) { if (b[ suffix[id] + searchLength] != currentChar) { if (currentCount > selectedCount) { selectedCount = currentCount; selectedID = currentID; } currentID = id; currentChar = b[ suffix[id] + searchLength]; currentCount = 0; } currentCount ++; } if (currentCount > selectedCount) { /* for last */ selectedCount = currentCount; selectedID = currentID; } if (selectedCount < minRatio) break; refinedStart = selectedID; refinedEnd = refinedStart + selectedCount; } /* evaluate gain based on new ref */ start = refinedStart; pos = suffix[refinedStart]; end = start; memset(lengthList, 0, sizeof(lengthList)); /* look forward */ do { end++; length = ZDICT_count(b + pos, b + suffix[end]); if (length >= LLIMIT) length = LLIMIT-1; lengthList[length]++; } while (length >=MINMATCHLENGTH); /* look backward */ do { length = ZDICT_count(b + pos, b + suffix[start-1]); if (length >= LLIMIT) length = LLIMIT-1; lengthList[length]++; if (length >=MINMATCHLENGTH) start--; } while(length >= MINMATCHLENGTH); /* largest useful length */ memset(cumulLength, 0, sizeof(cumulLength)); cumulLength[maxLength-1] = lengthList[maxLength-1]; for (i=(int)(maxLength-2); i>=0; i--) cumulLength[i] = cumulLength[i+1] + lengthList[i]; for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break; maxLength = i; /* reduce maxLength in case of final into repetitive data */ { U32 l = (U32)maxLength; BYTE c = b[pos + maxLength-1]; while (b[pos+l-2]==c) l--; maxLength = l; } if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ /* calculate savings */ savings[5] = 0; for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) savings[i] = savings[i-1] + (lengthList[i] * (i-3)); DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); solution.pos = (U32)pos; solution.length = (U32)maxLength; solution.savings = savings[maxLength]; /* mark positions done */ { U32 id; U32 testedPos; for (id=start; id solution.length) length = solution.length; } pEnd = (U32)(testedPos + length); for (p=testedPos; ppos; const U32 max = elt.pos + (elt.length-1); /* tail overlap */ U32 u; for (u=1; u elt.pos) && (table[u].pos < max)) { /* overlap */ /* append */ U32 addedLength = table[u].pos - elt.pos; table[u].length += addedLength; table[u].pos = elt.pos; table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ table[u].savings += elt.length / 8; /* rough approx */ elt = table[u]; while ((u>1) && (table[u-1].savings < elt.savings)) table[u] = table[u-1], u--; table[u] = elt; return u; } } /* front overlap */ for (u=1; u elt.pos) && (table[u].pos < elt.pos)) { /* overlap */ /* append */ int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length); table[u].savings += elt.length / 8; /* rough approx */ if (addedLength > 0) { /* otherwise, already included */ table[u].length += addedLength; table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ } elt = table[u]; while ((u>1) && (table[u-1].savings < elt.savings)) table[u] = table[u-1], u--; table[u] = elt; return u; } } return 0; } static void ZDICT_removeDictItem(dictItem* table, U32 id) { /* convention : first element is nb of elts */ U32 max = table->pos; U32 u; if (!id) return; /* protection, should never happen */ for (u=id; upos--; } static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt) { /* merge if possible */ U32 mergeId = ZDICT_checkMerge(table, elt, 0); if (mergeId) { U32 newMerge = 1; while (newMerge) { newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId); if (newMerge) ZDICT_removeDictItem(table, mergeId); mergeId = newMerge; } return; } /* insert */ { U32 current; U32 nextElt = table->pos; if (nextElt >= maxSize) nextElt = maxSize-1; current = nextElt-1; while (table[current].savings < elt.savings) { table[current+1] = table[current]; current--; } table[current+1] = elt; table->pos = nextElt+1; } } static U32 ZDICT_dictSize(const dictItem* dictList) { U32 u, dictSize = 0; for (u=1; u> shiftRatio; int divSuftSortResult; size_t result = 0; /* init */ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) { result = ERROR(memory_allocation); goto _cleanup; } if (minRatio < MINRATIO) minRatio = MINRATIO; memset(doneMarks, 0, bufferSize+16); /* sort */ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20)); divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0); if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; } suffix[bufferSize] = (int)bufferSize; /* leads into noise */ suffix0[0] = (int)bufferSize; /* leads into noise */ { /* build reverse suffix sort */ size_t pos; for (pos=0; pos < bufferSize; pos++) reverseSuffix[suffix[pos]] = (U32)pos; /* build file pos */ filePos[0] = 0; for (pos=1; pospos; /* convention : nb of useful elts within dictList */ U32 currentSize = 0; U32 n; for (n=1; n maxDictSize) break; } dictList->pos = n; } _cleanup: free(suffix0); free(reverseSuffix); free(doneMarks); free(filePos); return result; } static void ZDICT_fillNoise(void* buffer, size_t length) { unsigned acc = PRIME1; size_t p=0;; for (p=0; p> 21); } } typedef struct { ZSTD_CCtx* ref; ZSTD_CCtx* zc; void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ } EStats_ress_t; static void ZDICT_countEStats(EStats_ress_t esr, U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, const void* src, size_t srcSize) { const seqStore_t* seqStorePtr; if (srcSize > ZSTD_BLOCKSIZE_MAX) srcSize = ZSTD_BLOCKSIZE_MAX; /* protection vs large samples */ ZSTD_copyCCtx(esr.zc, esr.ref); ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); seqStorePtr = ZSTD_getSeqStore(esr.zc); /* literals stats */ { const BYTE* bytePtr; for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) countLit[*bytePtr]++; } /* seqStats */ { size_t const nbSeq = (size_t)(seqStorePtr->offset - seqStorePtr->offsetStart); ZSTD_seqToCodes(seqStorePtr, nbSeq); { const BYTE* codePtr = seqStorePtr->offCodeStart; size_t u; for (u=0; umlCodeStart; size_t u; for (u=0; ullCodeStart; size_t u; for (u=0; u1) { /* selectivity == 1 => fast mode */ ZDICT_trainBuffer(dictList, dictListSize, samplesBuffer, sBuffSize, sampleSizes, nbSamples, selectivity, (U32)targetDictSize); /* display best matches */ if (g_displayLevel>= 3) { U32 const nb = 25; U32 const dictContentSize = ZDICT_dictSize(dictList); U32 u; DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); DISPLAYLEVEL(3, "list %u best segments \n", nb); for (u=1; u<=nb; u++) { U32 p = dictList[u].pos; U32 l = dictList[u].length; U32 d = MIN(40, l); DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", u, l, p, dictList[u].savings); ZDICT_printHex(3, (const char*)samplesBuffer+p, d); DISPLAYLEVEL(3, "| \n"); } } } /* create dictionary */ { U32 dictContentSize = ZDICT_dictSize(dictList); size_t hSize; BYTE* ptr; U32 u; /* build dict content */ ptr = (BYTE*)dictBuffer + maxDictSize; for (u=1; upos; u++) { U32 l = dictList[u].length; ptr -= l; if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */ memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); } /* fast mode dict content */ if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */ DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */ DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10)); dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize, samplesBuffer, sBuffSize); } /* dictionary header */ MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); hSize = 4; /* entropic tables */ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ DISPLAYLEVEL(2, "statistics ... \n"); hSize += ZDICT_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4, compressionLevel, samplesBuffer, sampleSizes, nbSamples, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize); if (hSize + dictContentSize < maxDictSize) memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize); dictSize = MIN(maxDictSize, hSize+dictContentSize); } /* clean up */ free(dictList); return dictSize; } /* issue : samplesBuffer need to be followed by a noisy guard band. * work around : duplicate the buffer, and add the noise */ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t params) { void* newBuff; size_t sBuffSize; { unsigned u; for (u=0, sBuffSize=0; u no dictionary */ newBuff = malloc(sBuffSize + NOISELENGTH); if (!newBuff) return ERROR(memory_allocation); memcpy(newBuff, samplesBuffer, sBuffSize); ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ { size_t const result = ZDICT_trainFromBuffer_unsafe( dictBuffer, dictBufferCapacity, newBuff, samplesSizes, nbSamples, params); free(newBuff); return result; } } size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) { ZDICT_params_t params; memset(¶ms, 0, sizeof(params)); return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity, samplesBuffer, samplesSizes, nbSamples, params); }