added fast sampling mode
This commit is contained in:
parent
863ec40f1e
commit
f5229e0cd8
@ -20,43 +20,19 @@
|
||||
|
||||
You can contact the author at :
|
||||
- zstd source repository : https://github.com/Cyan4973/zstd
|
||||
- zstd public forum : https://groups.google.com/forum/#!forum/lz4c
|
||||
*/
|
||||
|
||||
|
||||
/**************************************
|
||||
* Compiler Options
|
||||
**************************************/
|
||||
#define _CRT_SECURE_NO_WARNINGS /* Visual : removes warning from strcpy */
|
||||
#define _POSIX_SOURCE 1 /* triggers fileno() within <stdio.h> on unix */
|
||||
|
||||
|
||||
/**************************************
|
||||
/*-************************************
|
||||
* Includes
|
||||
**************************************/
|
||||
#include <stdio.h> /* fprintf, getchar */
|
||||
#include <stdlib.h> /* exit, calloc, free */
|
||||
#include <string.h> /* strcmp, strlen */
|
||||
#include <stdio.h> /* fprintf, getchar */
|
||||
|
||||
#include "dictBuilder.h"
|
||||
|
||||
|
||||
/**************************************
|
||||
* OS-specific Includes
|
||||
**************************************/
|
||||
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
|
||||
# include <fcntl.h> /* _O_BINARY */
|
||||
# include <io.h> /* _setmode, _isatty */
|
||||
# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
|
||||
# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
|
||||
#else
|
||||
# include <unistd.h> /* isatty */
|
||||
# define SET_BINARY_MODE(file)
|
||||
# define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
|
||||
#endif
|
||||
|
||||
|
||||
/**************************************
|
||||
/*-************************************
|
||||
* Constants
|
||||
**************************************/
|
||||
#define PROGRAM_DESCRIPTION "Dictionary builder"
|
||||
@ -72,21 +48,22 @@
|
||||
#define MB *(1 <<20)
|
||||
#define GB *(1U<<30)
|
||||
|
||||
static const unsigned compressionLevelDefault = 5;
|
||||
static const unsigned selectionLevelDefault = 9; /* determined experimentally */
|
||||
static const unsigned maxDictSizeDefault = 110 KB;
|
||||
static const char* dictFileNameDefault = "dictionary";
|
||||
|
||||
|
||||
/**************************************
|
||||
/*-************************************
|
||||
* Display Macros
|
||||
**************************************/
|
||||
#define DISPLAY(...) fprintf(displayOut, __VA_ARGS__)
|
||||
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
|
||||
static FILE* displayOut;
|
||||
static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
|
||||
#define DISPLAY(...) fprintf(g_displayOut, __VA_ARGS__)
|
||||
#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
|
||||
static FILE* g_displayOut;
|
||||
static unsigned g_displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
|
||||
|
||||
|
||||
/**************************************
|
||||
/*-************************************
|
||||
* Exceptions
|
||||
**************************************/
|
||||
#define DEBUG 0
|
||||
@ -101,7 +78,7 @@ static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + re
|
||||
}
|
||||
|
||||
|
||||
/**************************************
|
||||
/*-************************************
|
||||
* Command Line
|
||||
**************************************/
|
||||
static int usage(const char* programName)
|
||||
@ -110,8 +87,8 @@ static int usage(const char* programName)
|
||||
DISPLAY( " %s [arg] [filenames]\n", programName);
|
||||
DISPLAY( "\n");
|
||||
DISPLAY( "Arguments :\n");
|
||||
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
|
||||
DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault);
|
||||
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
|
||||
DISPLAY( " -h/-H : display help/long help and exit\n");
|
||||
return 0;
|
||||
}
|
||||
@ -122,8 +99,10 @@ static int usage_advanced(const char* programName)
|
||||
usage(programName);
|
||||
DISPLAY( "\n");
|
||||
DISPLAY( "Advanced arguments :\n");
|
||||
DISPLAY( " -# : selection level # (default :%u)\n", selectionLevelDefault);
|
||||
DISPLAY( " -V : display Version number and exit\n");
|
||||
DISPLAY( "--fast : fast sampling mode\n");
|
||||
DISPLAY( " -L# : target compression level (default: %u)\n", compressionLevelDefault);
|
||||
DISPLAY( " -S# : dictionary selectivity level # (default: %u)\n", selectionLevelDefault);
|
||||
DISPLAY( " -v : verbose mode\n");
|
||||
DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n");
|
||||
return 0;
|
||||
@ -132,7 +111,7 @@ static int usage_advanced(const char* programName)
|
||||
static int badusage(const char* programName)
|
||||
{
|
||||
DISPLAYLEVEL(1, "Incorrect parameters\n");
|
||||
if (displayLevel >= 1) usage(programName);
|
||||
if (g_displayLevel >= 1) usage(programName);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -153,6 +132,7 @@ int main(int argCount, const char** argv)
|
||||
operationResult=0,
|
||||
nextArgumentIsMaxDict=0,
|
||||
nextArgumentIsDictFileName=0;
|
||||
unsigned cLevel = compressionLevelDefault;
|
||||
unsigned maxDictSize = maxDictSizeDefault;
|
||||
unsigned selectionLevel = selectionLevelDefault;
|
||||
const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */
|
||||
@ -161,7 +141,7 @@ int main(int argCount, const char** argv)
|
||||
const char* dictFileName = dictFileNameDefault;
|
||||
|
||||
/* init */
|
||||
displayOut = stderr; /* unfortunately, cannot be set at declaration */
|
||||
g_displayOut = stderr; /* unfortunately, cannot be set at declaration */
|
||||
if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n");
|
||||
/* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
|
||||
for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } }
|
||||
@ -190,40 +170,44 @@ int main(int argCount, const char** argv)
|
||||
}
|
||||
|
||||
/* long commands (--long-word) */
|
||||
if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
|
||||
if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); }
|
||||
if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; }
|
||||
if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
|
||||
if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
|
||||
if (!strcmp(argument, "--help")) { g_displayOut=stdout; return usage_advanced(programName); }
|
||||
if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; }
|
||||
if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
|
||||
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
|
||||
if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; }
|
||||
|
||||
/* Decode commands (note : aggregated commands are allowed) */
|
||||
if (argument[0]=='-') {
|
||||
argument++;
|
||||
|
||||
while (argument[0]!=0) {
|
||||
/* selection Level */
|
||||
if ((*argument>='0') && (*argument<='9')) {
|
||||
selectionLevel = 0;
|
||||
while ((*argument >= '0') && (*argument <= '9')) {
|
||||
selectionLevel *= 10;
|
||||
selectionLevel += *argument - '0';
|
||||
argument++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
switch(argument[0])
|
||||
{
|
||||
/* Display help */
|
||||
case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
|
||||
case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
|
||||
case 'H':
|
||||
case 'h': displayOut=stdout; return usage_advanced(programName);
|
||||
case 'h': g_displayOut=stdout; return usage_advanced(programName);
|
||||
|
||||
/* Selection level */
|
||||
case 'S': argument++;
|
||||
selectionLevel = 0;
|
||||
while ((*argument >= '0') && (*argument <= '9'))
|
||||
selectionLevel *= 10, selectionLevel += *argument++ - '0';
|
||||
break;
|
||||
|
||||
/* Compression level */
|
||||
case 'L': argument++;
|
||||
cLevel = 0;
|
||||
while ((*argument >= '0') && (*argument <= '9'))
|
||||
cLevel *= 10, cLevel += *argument++ - '0';
|
||||
break;
|
||||
|
||||
/* Verbose mode */
|
||||
case 'v': displayLevel++; if (displayLevel<3) displayLevel=3; argument++; break;
|
||||
case 'v': g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; argument++; break;
|
||||
|
||||
/* Quiet mode */
|
||||
case 'q': displayLevel--; argument++; break;
|
||||
case 'q': g_displayLevel--; argument++; break;
|
||||
|
||||
/* dictionary name */
|
||||
case 'o': nextArgumentIsDictFileName=1; argument++; break;
|
||||
@ -247,8 +231,8 @@ int main(int argCount, const char** argv)
|
||||
if (filenameIdx==0) return badusage(programName);
|
||||
|
||||
/* building ... */
|
||||
DiB_setNotificationLevel(displayLevel);
|
||||
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, filenameTable, filenameIdx);
|
||||
DiB_setNotificationLevel(g_displayLevel);
|
||||
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx);
|
||||
|
||||
if (main_pause) waitEnter();
|
||||
free((void*)filenameTable);
|
||||
|
@ -51,6 +51,7 @@
|
||||
#include <time.h> /* clock */
|
||||
|
||||
#include "mem.h" /* read */
|
||||
#include "error_private.h"
|
||||
#include "divsufsort.h"
|
||||
#include "dictBuilder.h"
|
||||
#include "zstd_compress.c"
|
||||
@ -85,6 +86,7 @@ static const size_t maxMemory = (sizeof(size_t)==4) ? (2 GB - 64 MB) : (size_t
|
||||
#define PRIME2 2246822519U
|
||||
|
||||
#define MINRATIO 4
|
||||
static const U32 g_compressionLevel_default = 5;
|
||||
|
||||
|
||||
/*-*************************************
|
||||
@ -714,6 +716,7 @@ static void DiB_countEStats(EStats_ress_t esr,
|
||||
|
||||
#define OFFCODE_MAX 18
|
||||
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
||||
unsigned compressionLevel,
|
||||
const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
|
||||
const void* dictBuffer, size_t dictBufferSize)
|
||||
{
|
||||
@ -740,7 +743,8 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
||||
esr.zc = ZSTD_createCCtx();
|
||||
esr.workPlace = malloc(BLOCKSIZE);
|
||||
if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
|
||||
params = ZSTD_getParams(5, dictBufferSize + 15 KB);
|
||||
if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
|
||||
params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
|
||||
params.strategy = ZSTD_greedy;
|
||||
ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
|
||||
|
||||
@ -827,7 +831,48 @@ static void DiB_saveDict(const char* dictFileName,
|
||||
}
|
||||
|
||||
|
||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned shiftRatio,
|
||||
#define DIB_FASTSEGMENTSIZE 64
|
||||
/*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano)
|
||||
Fill @dictBuffer with segments of DIB_FASTSEGMENTSIZE bytes, taken at regular intervals from @samplesBuffer,
|
||||
up to @dictSize bytes. The first and the last segment of @samplesBuffer are always copied.
|
||||
Filling starts from the end of @dictBuffer and proceeds toward its beginning.
|
||||
If @dictSize is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at the beginning of @dictBuffer won't be used.
|
||||
@return : amount of data written into @dictBuffer (always a multiple of DIB_FASTSEGMENTSIZE),
|
||||
or an error code (if @dictSize holds 2 segments or fewer, or if @samplesSize < @dictSize)
|
||||
*/
|
||||
static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
|
||||
const void* samplesBuffer, size_t samplesSize)
|
||||
{
|
||||
char* dstPtr = (char*)dictBuffer + dictSize;   /* write cursor : starts one past the end, moves backward */
|
||||
const char* srcPtr = (const char*)samplesBuffer;   /* read cursor : moves forward through the samples */
|
||||
size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;   /* number of whole segments that fit into dictSize */
|
||||
size_t segNb, interSize;
|
||||
|
||||
if (nbSegments <= 2) return ERROR(srcSize_wrong);   /* need more than the two mandatory (first + last) segments */
|
||||
if (samplesSize < dictSize) return ERROR(srcSize_wrong);   /* samples must be at least as large as the area to fill */
|
||||
|
||||
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
|
||||
dstPtr -= DIB_FASTSEGMENTSIZE;
|
||||
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);   /* first segment of samples => last slot of dictBuffer */
|
||||
dstPtr -= DIB_FASTSEGMENTSIZE;
|
||||
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);   /* last segment of samples => slot just before */
|
||||
|
||||
/* regularly copy a segment */
|
||||
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);   /* nb of sample bytes skipped before each remaining segment */
|
||||
srcPtr += DIB_FASTSEGMENTSIZE;   /* step over the first segment, already copied */
|
||||
for (segNb=2; segNb < nbSegments; segNb++) {   /* 2 segments already written : copy the nbSegments-2 remaining ones */
|
||||
srcPtr += interSize;
|
||||
dstPtr -= DIB_FASTSEGMENTSIZE;
|
||||
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
||||
srcPtr += DIB_FASTSEGMENTSIZE;
|
||||
}
|
||||
|
||||
return nbSegments * DIB_FASTSEGMENTSIZE;
|
||||
}
|
||||
|
||||
|
||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
|
||||
unsigned shiftRatio, unsigned compressionLevel,
|
||||
const char** fileNamesTable, unsigned nbFiles)
|
||||
{
|
||||
void* srcBuffer;
|
||||
@ -852,9 +897,11 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
|
||||
|
||||
/* Load input buffer */
|
||||
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */
|
||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||
|
||||
/* Train */
|
||||
if (shiftRatio>0)
|
||||
{
|
||||
/* analyze samples */
|
||||
snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
|
||||
if (nbFiles > 1) displayName = mfName;
|
||||
else displayName = fileNamesTable[0];
|
||||
@ -880,14 +927,14 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
|
||||
u, l, p, dictList[u].savings);
|
||||
DiB_printHex(3, (char*)srcBuffer+p, d);
|
||||
DISPLAYLEVEL(3, "| \n");
|
||||
} }
|
||||
} } }
|
||||
|
||||
/* create dictionary */
|
||||
{
|
||||
void* dictContent;
|
||||
U32 dictContentSize = DiB_dictSize(dictList);
|
||||
void* dictHeader;
|
||||
size_t dictHeaderSize, hSize;
|
||||
size_t dictHeaderSize, hSize, addedContentLength;
|
||||
BYTE* ptr;
|
||||
U32 u;
|
||||
|
||||
@ -895,18 +942,27 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
|
||||
#define EBSIZE (2 KB)
|
||||
dictHeaderSize = EBSIZE;
|
||||
dictHeader = malloc(dictHeaderSize);
|
||||
dictContent = malloc(dictContentSize);
|
||||
dictContent = malloc(maxDictSize);
|
||||
if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
|
||||
|
||||
/* build dict content */
|
||||
ptr = (BYTE*)dictContent + dictContentSize;
|
||||
|
||||
ptr = (BYTE*)dictContent + maxDictSize;
|
||||
for (u=1; u<dictList->pos; u++) {
|
||||
U32 l = dictList[u].length;
|
||||
ptr -= l;
|
||||
memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
|
||||
}
|
||||
|
||||
/* fast dict content mode */
|
||||
if (shiftRatio==0) {
|
||||
addedContentLength = ptr-(BYTE*)dictContent;
|
||||
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
||||
DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10));
|
||||
addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize);
|
||||
if (!ERR_isError(addedContentLength))
|
||||
ptr -= addedContentLength, dictContentSize += addedContentLength;
|
||||
}
|
||||
|
||||
/* dictionary header */
|
||||
MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
|
||||
hSize = 4;
|
||||
@ -915,14 +971,15 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
|
||||
/* entropic tables */
|
||||
DISPLAYLEVEL(2, "statistics ... \n");
|
||||
hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
|
||||
compressionLevel,
|
||||
srcBuffer, fileSizes, nbFiles,
|
||||
dictContent, dictContentSize);
|
||||
ptr, dictContentSize);
|
||||
|
||||
/* save dict */
|
||||
{
|
||||
size_t dictSize = hSize + dictContentSize;
|
||||
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
||||
DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize);
|
||||
DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize);
|
||||
//DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
|
||||
}
|
||||
/* clean */
|
||||
|
@ -24,7 +24,7 @@
|
||||
*/
|
||||
|
||||
/* This library is designed for a single-threaded console application.
|
||||
* It abruptly exits (exit() function) when it encounters an error condition. */
|
||||
* It prints a message to stderr and calls exit() when it encounters an error condition. */
|
||||
|
||||
/*-*************************************
|
||||
* Version
|
||||
@ -37,14 +37,17 @@ unsigned DiB_versionNumber (void);
|
||||
|
||||
|
||||
/*-*************************************
|
||||
* Main functions
|
||||
* Public functions
|
||||
***************************************/
|
||||
/*! DiB_trainDictionary
|
||||
Train a dictionary from a set of files provided by @fileNamesTable
|
||||
Resulting dictionary is written in file @dictFileName
|
||||
@result : 0 if fine
|
||||
Resulting dictionary is written in file @dictFileName.
|
||||
@selectivityLevel changes the criteria for insertion into the dictionary (more => bigger selection => larger dictionary)
|
||||
@compressionLevel can be used to target a specific compression level of zstd. 0 means "default".
|
||||
@result : 0 == ok
|
||||
*/
|
||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned selectivityLevel,
|
||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
|
||||
unsigned selectivityLevel, unsigned compressionLevel,
|
||||
const char** fileNamesTable, unsigned nbFiles);
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user