From 7682e49d0a4c5d5b725962c82d0762da9e32eba8 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sun, 31 Jan 2016 23:45:35 +0100 Subject: [PATCH] added buffer interface to dictBuilder --- dictBuilder/dibcli.c | 13 ++- dictBuilder/dictBuilder.c | 234 ++++++++++++++++++++++---------------- dictBuilder/dictBuilder.h | 55 +++++++-- 3 files changed, 193 insertions(+), 109 deletions(-) diff --git a/dictBuilder/dibcli.c b/dictBuilder/dibcli.c index f93e9707..8566c624 100644 --- a/dictBuilder/dibcli.c +++ b/dictBuilder/dibcli.c @@ -184,7 +184,7 @@ int main(int argCount, const char** argv) if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; } if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; } if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; } - if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; } + if (!strcmp(argument, "--fast")) { selectionLevel=1; cLevel=1; continue; } /* Decode commands (note : aggregated commands are allowed) */ if (argument[0]=='-') { @@ -247,8 +247,15 @@ int main(int argCount, const char** argv) } /* building ... */ - DiB_setNotificationLevel(g_displayLevel); - operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx); + { + DiB_params_t param; + param.selectivityLevel = selectionLevel; + param.compressionLevel = cLevel; + DiB_setNotificationLevel(g_displayLevel); + operationResult = DiB_trainFromFiles(dictFileName, maxDictSize, + filenameTable, filenameIdx, + param); + } if (main_pause) waitEnter(); free((void*)filenameTable); diff --git a/dictBuilder/dictBuilder.c b/dictBuilder/dictBuilder.c index fed9b18d..944853fb 100644 --- a/dictBuilder/dictBuilder.c +++ b/dictBuilder/dictBuilder.c @@ -43,14 +43,14 @@ /*-************************************* * Includes ***************************************/ -#include /* malloc, free */ -#include /* memset */ -#include /* fprintf, fopen, ftello64 */ -#include /* stat64 */ -#include /* stat64 */ -#include /* clock */ +#include /* malloc, free */ +#include /* memset */ +#include /* fprintf, fopen, ftello64 */ +#include /* stat64 */ +#include /* stat64 */ +#include /* clock */ -#include "mem.h" /* read */ +#include "mem.h" /* read */ #include "error_private.h" #include "divsufsort.h" #include "dictBuilder.h" @@ -58,17 +58,13 @@ #include "huff0_static.h" -/* ************************************* +/*-************************************* * Compiler specifics ***************************************/ #if !defined(S_ISREG) # define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) #endif -#ifdef _MSC_VER -#define snprintf sprintf_s -#endif - /*-************************************* * Constants @@ -87,6 +83,9 @@ static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_ #define MINRATIO 4 static const U32 g_compressionLevel_default = 5; +static const U32 g_selectivity_default = 9; +static const size_t g_provision_entropySize = 200; +static const size_t g_min_fast_dictContent = 192; /*-************************************* @@ -146,6 +145,10 @@ static unsigned DiB_GetMilliSpan(clock_t nPrevious) return nSpan; } +unsigned DiB_isError(size_t errorCode) { return ERR_isError(errorCode); } + +const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } + /* ******************************************************** * File related operations @@ -563,9 +566,8 @@ static U32 DiB_dictSize(const dictItem* dictList) static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize, const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */ - const char* displayName, - const size_t* fileSizes, unsigned nbFiles, unsigned maxDictSize, - U32 shiftRatio) + const size_t* fileSizes, unsigned nbFiles, + U32 shiftRatio, unsigned maxDictSize) { saidx_t* const suffix0 = (saidx_t*)malloc((bufferSize+2)*sizeof(*suffix0)); saidx_t* const suffix = suffix0+1; @@ -583,7 +585,7 @@ static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize, memset(doneMarks, 0, bufferSize+16); /* sort */ - DISPLAYLEVEL(2, "sorting %s ...\n", displayName); + DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20)); errorCode = divsufsort((const sauchar_t*)buffer, suffix, (saidx_t)bufferSize); if (errorCode != 0) EXM_THROW(2, "sort failed"); suffix[bufferSize] = (saidx_t)bufferSize; /* leads into noise */ @@ -699,7 +701,7 @@ static void DiB_countEStats(EStats_ress_t esr, #define OFFCODE_MAX 18 static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize, unsigned compressionLevel, - const void* srcBuffer, size_t* fileSizes, unsigned nbFiles, + const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, const void* dictBuffer, size_t dictBufferSize) { U32 countLit[256]; @@ -793,8 +795,7 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize, static void DiB_saveDict(const char* dictFileName, - const void* buff1, size_t buff1Size, - const void* buff2, size_t buff2Size) + const void* buff, size_t buffSize) { FILE* f; size_t n; @@ -802,11 +803,8 @@ static void DiB_saveDict(const char* dictFileName, f = fopen(dictFileName, "wb"); if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); - n = fwrite(buff1, 1, buff1Size, f); - if (n!=buff1Size) EXM_THROW(4, "%s : write error", dictFileName) - - n = fwrite(buff2, 1, buff2Size, f); - if (n!=buff2Size) EXM_THROW(4, "%s : write error", dictFileName) + n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) n = (size_t)fclose(f); if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) @@ -853,46 +851,35 @@ static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize, } -int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, - unsigned shiftRatio, unsigned compressionLevel, - const char** fileNamesTable, unsigned nbFiles) +static size_t DiB_trainFromBuffer_internal( + void* dictBuffer, size_t maxDictSize, + const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples, + DiB_params_t params) { - void* srcBuffer; - size_t benchedSize; - size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t)); - unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles); - const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbFiles), maxDictSize/16); + const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), maxDictSize/16); dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); - char mfName[20] = {0}; - const char* displayName = NULL; + unsigned selectivity = params.selectivityLevel; + unsigned compressionLevel = params.compressionLevel; + size_t targetDictSize = maxDictSize - g_provision_entropySize; + size_t sBuffSize; + size_t dictSize = 0; + + /* checks */ + if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) return ERROR(dstSize_tooSmall); /* init */ - benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT; - if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad; - if (benchedSize < totalSizeToLoad) - DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20)); - - /* Memory allocation & restrictions */ - srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */ - if ((!fileSizes) || (!srcBuffer) || (!dictList)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + { unsigned u; for (u=0, sBuffSize=0; u0) - { - snprintf (mfName, sizeof(mfName), " %u files", nbFiles); - if (nbFiles > 1) displayName = mfName; - else displayName = fileNamesTable[0]; - + /* select stripes */ + if (selectivity>1) { DiB_trainBuffer(dictList, dictListSize, - srcBuffer, benchedSize, - displayName, - fileSizes, nbFiles, maxDictSize, - shiftRatio); + samplesBuffer, sBuffSize, + sampleSizes, nbSamples, + selectivity, targetDictSize); /* display best matches */ if (g_displayLevel>= 3) { @@ -907,72 +894,127 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, U32 d = MIN(40, l); DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", u, l, p, dictList[u].savings); - DiB_printHex(3, (char*)srcBuffer+p, d); + DiB_printHex(3, (const char*)samplesBuffer+p, d); DISPLAYLEVEL(3, "| \n"); } } } /* create dictionary */ { - void* dictContent; U32 dictContentSize = DiB_dictSize(dictList); - void* dictHeader; - size_t dictHeaderSize, hSize, addedContentLength; + size_t hSize; BYTE* ptr; U32 u; - /* build dict */ - #define EBSIZE (2 KB) - dictHeaderSize = EBSIZE; - dictHeader = malloc(dictHeaderSize); - dictContent = malloc(maxDictSize); - if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory"); - /* build dict content */ - ptr = (BYTE*)dictContent + maxDictSize; + ptr = (BYTE*)dictBuffer + maxDictSize; for (u=1; upos; u++) { U32 l = dictList[u].length; ptr -= l; - memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l); + if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */ + memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); } /* fast mode dict content */ - if (shiftRatio==0) { /* note could also be used to complete a dictionary, but not necessarily better */ - addedContentLength = ptr-(BYTE*)dictContent; - DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ - DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10)); - addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize); - if (!ERR_isError(addedContentLength)) - ptr -= addedContentLength, dictContentSize += addedContentLength; + if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */ + DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */ + DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10)); + dictContentSize = DiB_fastSampling((char*)dictBuffer + g_provision_entropySize, + targetDictSize, samplesBuffer, sBuffSize); } - /* dictionary header */ - MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC); + /* dictionary header */ + MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); hSize = 4; - dictHeaderSize -= 4; /* entropic tables */ + DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ DISPLAYLEVEL(2, "statistics ... \n"); - hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize, + hSize += DiB_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4, compressionLevel, - srcBuffer, fileSizes, nbFiles, - ptr, dictContentSize); + samplesBuffer, sampleSizes, nbSamples, + (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize); - /* save dict */ - { - size_t dictSize = hSize + dictContentSize; - DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); - DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize); - //DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only - } - /* clean */ - free(dictHeader); - free(dictContent); + if (hSize + dictContentSize < maxDictSize) + memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize); + dictSize = MIN(maxDictSize, hSize+dictContentSize); } /* clean up */ - free(srcBuffer); - free(fileSizes); free(dictList); - return 0; + return dictSize; } + +/* issue : samplesBuffer need to be followed by a noisy guard band. +* work around : duplicate the buffer, and add the noise ? */ +size_t DiB_trainFromBuffer(void* dictBuffer, size_t maxDictSize, + const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples, + DiB_params_t params) +{ + size_t sBuffSize; + void* newBuff; + size_t result; + + { unsigned u; for (u=0, sBuffSize=0; u totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad; + if (benchedSize < totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20)); + + /* Memory allocation & restrictions */ + srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */ + if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + + /* Load input buffer */ + DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles); + DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */ + + /* call buffer version */ + dictSize = DiB_trainFromBuffer_internal(dictBuffer, maxDictSize, + srcBuffer, fileSizes, nbFiles, + params); + if (DiB_isError(dictSize)) + { + DISPLAYLEVEL(1, "dictionary training failed : %s", DiB_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _cleanup; + } + + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + DiB_saveDict(dictFileName, dictBuffer, dictSize); + + /* clean up */ +_cleanup: + free(srcBuffer); + free(dictBuffer); + free(fileSizes); + return result; +} diff --git a/dictBuilder/dictBuilder.h b/dictBuilder/dictBuilder.h index 3c52c7da..a022583e 100644 --- a/dictBuilder/dictBuilder.h +++ b/dictBuilder/dictBuilder.h @@ -26,6 +26,9 @@ /* This library is designed for a single-threaded console application. * It exit() and printf() into stderr when it encounters an error condition. */ +#ifndef DICTBUILDER_H_001 +#define DICTBUILDER_H_001 + /*-************************************* * Version ***************************************/ @@ -36,24 +39,56 @@ unsigned DiB_versionNumber (void); +/*-************************************* +* Public type +***************************************/ +typedef struct { + unsigned selectivityLevel; /* 0 means default; larger => bigger selection => larger dictionary */ + unsigned compressionLevel; /* 0 means default; target a specific zstd compression level */ +} DiB_params_t; + + /*-************************************* * Public functions ***************************************/ -/*! DiB_trainDictionary - Train a dictionary from a set of files provided by @fileNamesTable - Resulting dictionary is written in file @dictFileName. - @selectivityLevel change criteria for insertion into the dictionary (more => bigger selection => larger dictionary) - @compressionLevel can be used to target a specific compression level of zstd. 0 means "default". - @result : 0 == ok +/*! DiB_trainFromBuffer + Train a dictionary from a memory buffer @samplesBuffer + where @nbSamples samples have been stored concatenated. + Each sample size is provided into an orderly table @sampleSizes. + Resulting dictionary will be saved into @dictBuffer. + @parameters is optional and can be provided with 0 values to mean "default". + @result : size of dictionary stored into @dictBuffer (<= @dictBufferSize) + or an error code, which can be tested by DiB_isError(). + note : DiB_trainFromBuffer() will send notifications into stderr if instructed to, using DiB_setNotificationLevel() */ -int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, - unsigned selectivityLevel, unsigned compressionLevel, - const char** fileNamesTable, unsigned nbFiles); +size_t DiB_trainFromBuffer(void* dictBuffer, size_t dictBufferSize, + const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples, + DiB_params_t parameters); +/*! DiB_trainFromFiles + Train a dictionary from a set of files provided by @fileNamesTable + Resulting dictionary is written into file @dictFileName. + @parameters is optional and can be provided with 0 values. + @result : 0 == ok. Any other : error. +*/ +int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + DiB_params_t parameters); + + +/*-************************************* +* Helper functions +***************************************/ +unsigned DiB_isError(size_t errorCode); +const char* DiB_getErrorName(size_t errorCode); + /*! DiB_setNotificationLevel Set amount of notification to be displayed on the console. - 0 = no console notification (default). + default initial value : 0 = no console notification. Note : not thread-safe (use a global constant) */ void DiB_setNotificationLevel(unsigned l); + + +#endif