added fast sampling mode
This commit is contained in:
parent
863ec40f1e
commit
f5229e0cd8
@ -20,43 +20,19 @@
|
|||||||
|
|
||||||
You can contact the author at :
|
You can contact the author at :
|
||||||
- zstd source repository : https://github.com/Cyan4973/zstd
|
- zstd source repository : https://github.com/Cyan4973/zstd
|
||||||
- ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*-************************************
|
||||||
/**************************************
|
|
||||||
* Compiler Options
|
|
||||||
**************************************/
|
|
||||||
#define _CRT_SECURE_NO_WARNINGS /* Visual : removes warning from strcpy */
|
|
||||||
#define _POSIX_SOURCE 1 /* triggers fileno() within <stdio.h> on unix */
|
|
||||||
|
|
||||||
|
|
||||||
/**************************************
|
|
||||||
* Includes
|
* Includes
|
||||||
**************************************/
|
**************************************/
|
||||||
#include <stdio.h> /* fprintf, getchar */
|
|
||||||
#include <stdlib.h> /* exit, calloc, free */
|
#include <stdlib.h> /* exit, calloc, free */
|
||||||
#include <string.h> /* strcmp, strlen */
|
#include <string.h> /* strcmp, strlen */
|
||||||
|
#include <stdio.h> /* fprintf, getchar */
|
||||||
|
|
||||||
#include "dictBuilder.h"
|
#include "dictBuilder.h"
|
||||||
|
|
||||||
|
|
||||||
/**************************************
|
/*-************************************
|
||||||
* OS-specific Includes
|
|
||||||
**************************************/
|
|
||||||
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
|
|
||||||
# include <fcntl.h> /* _O_BINARY */
|
|
||||||
# include <io.h> /* _setmode, _isatty */
|
|
||||||
# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
|
|
||||||
# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
|
|
||||||
#else
|
|
||||||
# include <unistd.h> /* isatty */
|
|
||||||
# define SET_BINARY_MODE(file)
|
|
||||||
# define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/**************************************
|
|
||||||
* Constants
|
* Constants
|
||||||
**************************************/
|
**************************************/
|
||||||
#define PROGRAM_DESCRIPTION "Dictionary builder"
|
#define PROGRAM_DESCRIPTION "Dictionary builder"
|
||||||
@ -72,21 +48,22 @@
|
|||||||
#define MB *(1 <<20)
|
#define MB *(1 <<20)
|
||||||
#define GB *(1U<<30)
|
#define GB *(1U<<30)
|
||||||
|
|
||||||
|
static const unsigned compressionLevelDefault = 5;
|
||||||
static const unsigned selectionLevelDefault = 9; /* determined experimentally */
|
static const unsigned selectionLevelDefault = 9; /* determined experimentally */
|
||||||
static const unsigned maxDictSizeDefault = 110 KB;
|
static const unsigned maxDictSizeDefault = 110 KB;
|
||||||
static const char* dictFileNameDefault = "dictionary";
|
static const char* dictFileNameDefault = "dictionary";
|
||||||
|
|
||||||
|
|
||||||
/**************************************
|
/*-************************************
|
||||||
* Display Macros
|
* Display Macros
|
||||||
**************************************/
|
**************************************/
|
||||||
#define DISPLAY(...) fprintf(displayOut, __VA_ARGS__)
|
#define DISPLAY(...) fprintf(g_displayOut, __VA_ARGS__)
|
||||||
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
|
#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
|
||||||
static FILE* displayOut;
|
static FILE* g_displayOut;
|
||||||
static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
|
static unsigned g_displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
|
||||||
|
|
||||||
|
|
||||||
/**************************************
|
/*-************************************
|
||||||
* Exceptions
|
* Exceptions
|
||||||
**************************************/
|
**************************************/
|
||||||
#define DEBUG 0
|
#define DEBUG 0
|
||||||
@ -101,7 +78,7 @@ static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + re
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**************************************
|
/*-************************************
|
||||||
* Command Line
|
* Command Line
|
||||||
**************************************/
|
**************************************/
|
||||||
static int usage(const char* programName)
|
static int usage(const char* programName)
|
||||||
@ -110,8 +87,8 @@ static int usage(const char* programName)
|
|||||||
DISPLAY( " %s [arg] [filenames]\n", programName);
|
DISPLAY( " %s [arg] [filenames]\n", programName);
|
||||||
DISPLAY( "\n");
|
DISPLAY( "\n");
|
||||||
DISPLAY( "Arguments :\n");
|
DISPLAY( "Arguments :\n");
|
||||||
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
|
|
||||||
DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault);
|
DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault);
|
||||||
|
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
|
||||||
DISPLAY( " -h/-H : display help/long help and exit\n");
|
DISPLAY( " -h/-H : display help/long help and exit\n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -122,8 +99,10 @@ static int usage_advanced(const char* programName)
|
|||||||
usage(programName);
|
usage(programName);
|
||||||
DISPLAY( "\n");
|
DISPLAY( "\n");
|
||||||
DISPLAY( "Advanced arguments :\n");
|
DISPLAY( "Advanced arguments :\n");
|
||||||
DISPLAY( " -# : selection level # (default :%u)\n", selectionLevelDefault);
|
|
||||||
DISPLAY( " -V : display Version number and exit\n");
|
DISPLAY( " -V : display Version number and exit\n");
|
||||||
|
DISPLAY( "--fast : fast sampling mode\n");
|
||||||
|
DISPLAY( " -L# : target compression level (default: %u)\n", compressionLevelDefault);
|
||||||
|
DISPLAY( " -S# : dictionary selectivity level # (default: %u)\n", selectionLevelDefault);
|
||||||
DISPLAY( " -v : verbose mode\n");
|
DISPLAY( " -v : verbose mode\n");
|
||||||
DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n");
|
DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n");
|
||||||
return 0;
|
return 0;
|
||||||
@ -132,7 +111,7 @@ static int usage_advanced(const char* programName)
|
|||||||
static int badusage(const char* programName)
|
static int badusage(const char* programName)
|
||||||
{
|
{
|
||||||
DISPLAYLEVEL(1, "Incorrect parameters\n");
|
DISPLAYLEVEL(1, "Incorrect parameters\n");
|
||||||
if (displayLevel >= 1) usage(programName);
|
if (g_displayLevel >= 1) usage(programName);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -153,6 +132,7 @@ int main(int argCount, const char** argv)
|
|||||||
operationResult=0,
|
operationResult=0,
|
||||||
nextArgumentIsMaxDict=0,
|
nextArgumentIsMaxDict=0,
|
||||||
nextArgumentIsDictFileName=0;
|
nextArgumentIsDictFileName=0;
|
||||||
|
unsigned cLevel = compressionLevelDefault;
|
||||||
unsigned maxDictSize = maxDictSizeDefault;
|
unsigned maxDictSize = maxDictSizeDefault;
|
||||||
unsigned selectionLevel = selectionLevelDefault;
|
unsigned selectionLevel = selectionLevelDefault;
|
||||||
const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */
|
const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */
|
||||||
@ -161,7 +141,7 @@ int main(int argCount, const char** argv)
|
|||||||
const char* dictFileName = dictFileNameDefault;
|
const char* dictFileName = dictFileNameDefault;
|
||||||
|
|
||||||
/* init */
|
/* init */
|
||||||
displayOut = stderr; /* unfortunately, cannot be set at declaration */
|
g_displayOut = stderr; /* unfortunately, cannot be set at declaration */
|
||||||
if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n");
|
if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n");
|
||||||
/* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
|
/* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
|
||||||
for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } }
|
for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } }
|
||||||
@ -190,40 +170,44 @@ int main(int argCount, const char** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* long commands (--long-word) */
|
/* long commands (--long-word) */
|
||||||
if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
|
if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
|
||||||
if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); }
|
if (!strcmp(argument, "--help")) { g_displayOut=stdout; return usage_advanced(programName); }
|
||||||
if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; }
|
if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; }
|
||||||
if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
|
if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
|
||||||
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
|
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
|
||||||
|
if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; }
|
||||||
|
|
||||||
/* Decode commands (note : aggregated commands are allowed) */
|
/* Decode commands (note : aggregated commands are allowed) */
|
||||||
if (argument[0]=='-') {
|
if (argument[0]=='-') {
|
||||||
argument++;
|
argument++;
|
||||||
|
|
||||||
while (argument[0]!=0) {
|
while (argument[0]!=0) {
|
||||||
/* selection Level */
|
|
||||||
if ((*argument>='0') && (*argument<='9')) {
|
|
||||||
selectionLevel = 0;
|
|
||||||
while ((*argument >= '0') && (*argument <= '9')) {
|
|
||||||
selectionLevel *= 10;
|
|
||||||
selectionLevel += *argument - '0';
|
|
||||||
argument++;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
switch(argument[0])
|
switch(argument[0])
|
||||||
{
|
{
|
||||||
/* Display help */
|
/* Display help */
|
||||||
case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
|
case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
|
||||||
case 'H':
|
case 'H':
|
||||||
case 'h': displayOut=stdout; return usage_advanced(programName);
|
case 'h': g_displayOut=stdout; return usage_advanced(programName);
|
||||||
|
|
||||||
|
/* Selection level */
|
||||||
|
case 'S': argument++;
|
||||||
|
selectionLevel = 0;
|
||||||
|
while ((*argument >= '0') && (*argument <= '9'))
|
||||||
|
selectionLevel *= 10, selectionLevel += *argument++ - '0';
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* Compression level */
|
||||||
|
case 'L': argument++;
|
||||||
|
cLevel = 0;
|
||||||
|
while ((*argument >= '0') && (*argument <= '9'))
|
||||||
|
cLevel *= 10, cLevel += *argument++ - '0';
|
||||||
|
break;
|
||||||
|
|
||||||
/* Verbose mode */
|
/* Verbose mode */
|
||||||
case 'v': displayLevel++; if (displayLevel<3) displayLevel=3; argument++; break;
|
case 'v': g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; argument++; break;
|
||||||
|
|
||||||
/* Quiet mode */
|
/* Quiet mode */
|
||||||
case 'q': displayLevel--; argument++; break;
|
case 'q': g_displayLevel--; argument++; break;
|
||||||
|
|
||||||
/* dictionary name */
|
/* dictionary name */
|
||||||
case 'o': nextArgumentIsDictFileName=1; argument++; break;
|
case 'o': nextArgumentIsDictFileName=1; argument++; break;
|
||||||
@ -247,8 +231,8 @@ int main(int argCount, const char** argv)
|
|||||||
if (filenameIdx==0) return badusage(programName);
|
if (filenameIdx==0) return badusage(programName);
|
||||||
|
|
||||||
/* building ... */
|
/* building ... */
|
||||||
DiB_setNotificationLevel(displayLevel);
|
DiB_setNotificationLevel(g_displayLevel);
|
||||||
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, filenameTable, filenameIdx);
|
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx);
|
||||||
|
|
||||||
if (main_pause) waitEnter();
|
if (main_pause) waitEnter();
|
||||||
free((void*)filenameTable);
|
free((void*)filenameTable);
|
||||||
|
@ -51,6 +51,7 @@
|
|||||||
#include <time.h> /* clock */
|
#include <time.h> /* clock */
|
||||||
|
|
||||||
#include "mem.h" /* read */
|
#include "mem.h" /* read */
|
||||||
|
#include "error_private.h"
|
||||||
#include "divsufsort.h"
|
#include "divsufsort.h"
|
||||||
#include "dictBuilder.h"
|
#include "dictBuilder.h"
|
||||||
#include "zstd_compress.c"
|
#include "zstd_compress.c"
|
||||||
@ -85,6 +86,7 @@ static const size_t maxMemory = (sizeof(size_t)==4) ? (2 GB - 64 MB) : (size_t
|
|||||||
#define PRIME2 2246822519U
|
#define PRIME2 2246822519U
|
||||||
|
|
||||||
#define MINRATIO 4
|
#define MINRATIO 4
|
||||||
|
static const U32 g_compressionLevel_default = 5;
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
@ -714,6 +716,7 @@ static void DiB_countEStats(EStats_ress_t esr,
|
|||||||
|
|
||||||
#define OFFCODE_MAX 18
|
#define OFFCODE_MAX 18
|
||||||
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
||||||
|
unsigned compressionLevel,
|
||||||
const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
|
const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
|
||||||
const void* dictBuffer, size_t dictBufferSize)
|
const void* dictBuffer, size_t dictBufferSize)
|
||||||
{
|
{
|
||||||
@ -740,7 +743,8 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|||||||
esr.zc = ZSTD_createCCtx();
|
esr.zc = ZSTD_createCCtx();
|
||||||
esr.workPlace = malloc(BLOCKSIZE);
|
esr.workPlace = malloc(BLOCKSIZE);
|
||||||
if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
|
if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
|
||||||
params = ZSTD_getParams(5, dictBufferSize + 15 KB);
|
if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
|
||||||
|
params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
|
||||||
params.strategy = ZSTD_greedy;
|
params.strategy = ZSTD_greedy;
|
||||||
ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
|
ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
|
||||||
|
|
||||||
@ -827,8 +831,49 @@ static void DiB_saveDict(const char* dictFileName,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned shiftRatio,
|
#define DIB_FASTSEGMENTSIZE 64
|
||||||
const char** fileNamesTable, unsigned nbFiles)
|
/*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano)
|
||||||
|
Fill @dictBuffer with stripes of size DIB_FASTSEGMENTSIZE from @samplesBuffer
|
||||||
|
up to @dictSize.
|
||||||
|
Filling starts from the end of @dictBuffer and proceeds backward, as far as possible.
|
||||||
|
If @dictSize is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at the beginning of @dictBuffer won't be used.
|
||||||
|
@return : amount of data written into @dictBuffer
|
||||||
|
or an error code (if @dictSize or @samplesSize is too small)
|
||||||
|
*/
|
||||||
|
static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
|
||||||
|
const void* samplesBuffer, size_t samplesSize)
|
||||||
|
{
|
||||||
|
char* dstPtr = (char*)dictBuffer + dictSize;
|
||||||
|
const char* srcPtr = (const char*)samplesBuffer;
|
||||||
|
size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
|
||||||
|
size_t segNb, interSize;
|
||||||
|
|
||||||
|
if (nbSegments <= 2) return ERROR(srcSize_wrong);
|
||||||
|
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
|
||||||
|
|
||||||
|
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
|
||||||
|
dstPtr -= DIB_FASTSEGMENTSIZE;
|
||||||
|
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
||||||
|
dstPtr -= DIB_FASTSEGMENTSIZE;
|
||||||
|
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
|
||||||
|
|
||||||
|
/* regularly copy a segment */
|
||||||
|
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
|
||||||
|
srcPtr += DIB_FASTSEGMENTSIZE;
|
||||||
|
for (segNb=2; segNb < nbSegments; segNb++) {
|
||||||
|
srcPtr += interSize;
|
||||||
|
dstPtr -= DIB_FASTSEGMENTSIZE;
|
||||||
|
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
||||||
|
srcPtr += DIB_FASTSEGMENTSIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return nbSegments * DIB_FASTSEGMENTSIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
|
||||||
|
unsigned shiftRatio, unsigned compressionLevel,
|
||||||
|
const char** fileNamesTable, unsigned nbFiles)
|
||||||
{
|
{
|
||||||
void* srcBuffer;
|
void* srcBuffer;
|
||||||
size_t benchedSize;
|
size_t benchedSize;
|
||||||
@ -852,42 +897,44 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
|
|||||||
|
|
||||||
/* Load input buffer */
|
/* Load input buffer */
|
||||||
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */
|
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||||
|
|
||||||
/* Train */
|
if (shiftRatio>0)
|
||||||
snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
|
{
|
||||||
if (nbFiles > 1) displayName = mfName;
|
/* analyze samples */
|
||||||
else displayName = fileNamesTable[0];
|
snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
|
||||||
|
if (nbFiles > 1) displayName = mfName;
|
||||||
|
else displayName = fileNamesTable[0];
|
||||||
|
|
||||||
DiB_trainBuffer(dictList, dictListSize,
|
DiB_trainBuffer(dictList, dictListSize,
|
||||||
srcBuffer, benchedSize,
|
srcBuffer, benchedSize,
|
||||||
displayName,
|
displayName,
|
||||||
fileSizes, nbFiles, maxDictSize,
|
fileSizes, nbFiles, maxDictSize,
|
||||||
shiftRatio);
|
shiftRatio);
|
||||||
|
|
||||||
/* display best matches */
|
/* display best matches */
|
||||||
if (g_displayLevel>= 3) {
|
if (g_displayLevel>= 3) {
|
||||||
const U32 nb = 25;
|
const U32 nb = 25;
|
||||||
U32 u;
|
U32 u;
|
||||||
U32 dictContentSize = DiB_dictSize(dictList);
|
U32 dictContentSize = DiB_dictSize(dictList);
|
||||||
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
||||||
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
||||||
for (u=1; u<=nb; u++) {
|
for (u=1; u<=nb; u++) {
|
||||||
U32 p = dictList[u].pos;
|
U32 p = dictList[u].pos;
|
||||||
U32 l = dictList[u].length;
|
U32 l = dictList[u].length;
|
||||||
U32 d = MIN(40, l);
|
U32 d = MIN(40, l);
|
||||||
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
||||||
u, l, p, dictList[u].savings);
|
u, l, p, dictList[u].savings);
|
||||||
DiB_printHex(3, (char*)srcBuffer+p, d);
|
DiB_printHex(3, (char*)srcBuffer+p, d);
|
||||||
DISPLAYLEVEL(3, "| \n");
|
DISPLAYLEVEL(3, "| \n");
|
||||||
} }
|
} } }
|
||||||
|
|
||||||
/* create dictionary */
|
/* create dictionary */
|
||||||
{
|
{
|
||||||
void* dictContent;
|
void* dictContent;
|
||||||
U32 dictContentSize = DiB_dictSize(dictList);
|
U32 dictContentSize = DiB_dictSize(dictList);
|
||||||
void* dictHeader;
|
void* dictHeader;
|
||||||
size_t dictHeaderSize, hSize;
|
size_t dictHeaderSize, hSize, addedContentLength;
|
||||||
BYTE* ptr;
|
BYTE* ptr;
|
||||||
U32 u;
|
U32 u;
|
||||||
|
|
||||||
@ -895,18 +942,27 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
|
|||||||
#define EBSIZE (2 KB)
|
#define EBSIZE (2 KB)
|
||||||
dictHeaderSize = EBSIZE;
|
dictHeaderSize = EBSIZE;
|
||||||
dictHeader = malloc(dictHeaderSize);
|
dictHeader = malloc(dictHeaderSize);
|
||||||
dictContent = malloc(dictContentSize);
|
dictContent = malloc(maxDictSize);
|
||||||
if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
|
if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
|
||||||
|
|
||||||
/* build dict content */
|
/* build dict content */
|
||||||
ptr = (BYTE*)dictContent + dictContentSize;
|
ptr = (BYTE*)dictContent + maxDictSize;
|
||||||
|
|
||||||
for (u=1; u<dictList->pos; u++) {
|
for (u=1; u<dictList->pos; u++) {
|
||||||
U32 l = dictList[u].length;
|
U32 l = dictList[u].length;
|
||||||
ptr -= l;
|
ptr -= l;
|
||||||
memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
|
memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* fast dict content mode */
|
||||||
|
if (shiftRatio==0) {
|
||||||
|
addedContentLength = ptr-(BYTE*)dictContent;
|
||||||
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
||||||
|
DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10));
|
||||||
|
addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize);
|
||||||
|
if (!ERR_isError(addedContentLength))
|
||||||
|
ptr -= addedContentLength, dictContentSize += addedContentLength;
|
||||||
|
}
|
||||||
|
|
||||||
/* dictionary header */
|
/* dictionary header */
|
||||||
MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
|
MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
|
||||||
hSize = 4;
|
hSize = 4;
|
||||||
@ -915,14 +971,15 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
|
|||||||
/* entropic tables */
|
/* entropic tables */
|
||||||
DISPLAYLEVEL(2, "statistics ... \n");
|
DISPLAYLEVEL(2, "statistics ... \n");
|
||||||
hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
|
hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
|
||||||
srcBuffer, fileSizes, nbFiles,
|
compressionLevel,
|
||||||
dictContent, dictContentSize);
|
srcBuffer, fileSizes, nbFiles,
|
||||||
|
ptr, dictContentSize);
|
||||||
|
|
||||||
/* save dict */
|
/* save dict */
|
||||||
{
|
{
|
||||||
size_t dictSize = hSize + dictContentSize;
|
size_t dictSize = hSize + dictContentSize;
|
||||||
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
||||||
DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize);
|
DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize);
|
||||||
//DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
|
//DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
|
||||||
}
|
}
|
||||||
/* clean */
|
/* clean */
|
||||||
|
@ -24,7 +24,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* This library is designed for a single-threaded console application.
|
/* This library is designed for a single-threaded console application.
|
||||||
* It abruptly exits (exit() function) when it encounters an error condition. */
|
* It calls exit() and prints to stderr when it encounters an error condition. */
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
* Version
|
* Version
|
||||||
@ -37,14 +37,17 @@ unsigned DiB_versionNumber (void);
|
|||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
* Main functions
|
* Public functions
|
||||||
***************************************/
|
***************************************/
|
||||||
/*! DiB_trainDictionary
|
/*! DiB_trainDictionary
|
||||||
Train a dictionary from a set of files provided by @fileNamesTable
|
Train a dictionary from a set of files provided by @fileNamesTable
|
||||||
Resulting dictionary is written in file @dictFileName
|
Resulting dictionary is written in file @dictFileName.
|
||||||
@result : 0 if fine
|
@selectivityLevel changes the criteria for insertion into the dictionary (more => bigger selection => larger dictionary)
|
||||||
|
@compressionLevel can be used to target a specific compression level of zstd. 0 means "default".
|
||||||
|
@result : 0 == ok
|
||||||
*/
|
*/
|
||||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned selectivityLevel,
|
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
|
||||||
|
unsigned selectivityLevel, unsigned compressionLevel,
|
||||||
const char** fileNamesTable, unsigned nbFiles);
|
const char** fileNamesTable, unsigned nbFiles);
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user