added fast sampling mode

This commit is contained in:
Yann Collet 2016-01-29 02:45:26 +01:00
parent 863ec40f1e
commit f5229e0cd8
3 changed files with 144 additions and 100 deletions

View File

@ -20,43 +20,19 @@
You can contact the author at :
- zstd source repository : https://github.com/Cyan4973/zstd
- zstd public forum : https://groups.google.com/forum/#!forum/lz4c
*/
/**************************************
* Compiler Options
**************************************/
#define _CRT_SECURE_NO_WARNINGS /* Visual : removes warning from strcpy */
#define _POSIX_SOURCE 1 /* triggers fileno() within <stdio.h> on unix */
/**************************************
/*-************************************
* Includes
**************************************/
#include <stdio.h> /* fprintf, getchar */
#include <stdlib.h> /* exit, calloc, free */
#include <string.h> /* strcmp, strlen */
#include <stdio.h> /* fprintf, getchar */
#include "dictBuilder.h"
/**************************************
* OS-specific Includes
**************************************/
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
# include <fcntl.h> /* _O_BINARY */
# include <io.h> /* _setmode, _isatty */
# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
#else
# include <unistd.h> /* isatty */
# define SET_BINARY_MODE(file)
# define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
#endif
/**************************************
/*-************************************
* Constants
**************************************/
#define PROGRAM_DESCRIPTION "Dictionary builder"
@ -72,21 +48,22 @@
#define MB *(1 <<20)
#define GB *(1U<<30)
static const unsigned compressionLevelDefault = 5;
static const unsigned selectionLevelDefault = 9; /* determined experimentally */
static const unsigned maxDictSizeDefault = 110 KB;
static const char* dictFileNameDefault = "dictionary";
/**************************************
/*-************************************
* Display Macros
**************************************/
#define DISPLAY(...) fprintf(displayOut, __VA_ARGS__)
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
static FILE* displayOut;
static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
#define DISPLAY(...) fprintf(g_displayOut, __VA_ARGS__)
#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
static FILE* g_displayOut;
static unsigned g_displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
/**************************************
/*-************************************
* Exceptions
**************************************/
#define DEBUG 0
@ -101,7 +78,7 @@ static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + re
}
/**************************************
/*-************************************
* Command Line
**************************************/
static int usage(const char* programName)
@ -110,8 +87,8 @@ static int usage(const char* programName)
DISPLAY( " %s [arg] [filenames]\n", programName);
DISPLAY( "\n");
DISPLAY( "Arguments :\n");
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault);
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
DISPLAY( " -h/-H : display help/long help and exit\n");
return 0;
}
@ -122,8 +99,10 @@ static int usage_advanced(const char* programName)
usage(programName);
DISPLAY( "\n");
DISPLAY( "Advanced arguments :\n");
DISPLAY( " -# : selection level # (default :%u)\n", selectionLevelDefault);
DISPLAY( " -V : display Version number and exit\n");
DISPLAY( "--fast : fast sampling mode\n");
DISPLAY( " -L# : target compression level (default: %u)\n", compressionLevelDefault);
DISPLAY( " -S# : dictionary selectivity level # (default: %u)\n", selectionLevelDefault);
DISPLAY( " -v : verbose mode\n");
DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n");
return 0;
@ -132,7 +111,7 @@ static int usage_advanced(const char* programName)
static int badusage(const char* programName)
{
DISPLAYLEVEL(1, "Incorrect parameters\n");
if (displayLevel >= 1) usage(programName);
if (g_displayLevel >= 1) usage(programName);
return 1;
}
@ -153,6 +132,7 @@ int main(int argCount, const char** argv)
operationResult=0,
nextArgumentIsMaxDict=0,
nextArgumentIsDictFileName=0;
unsigned cLevel = compressionLevelDefault;
unsigned maxDictSize = maxDictSizeDefault;
unsigned selectionLevel = selectionLevelDefault;
const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */
@ -161,7 +141,7 @@ int main(int argCount, const char** argv)
const char* dictFileName = dictFileNameDefault;
/* init */
displayOut = stderr; /* unfortunately, cannot be set at declaration */
g_displayOut = stderr; /* unfortunately, cannot be set at declaration */
if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n");
/* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } }
@ -190,40 +170,44 @@ int main(int argCount, const char** argv)
}
/* long commands (--long-word) */
if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); }
if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; }
if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
if (!strcmp(argument, "--help")) { g_displayOut=stdout; return usage_advanced(programName); }
if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; }
if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; }
/* Decode commands (note : aggregated commands are allowed) */
if (argument[0]=='-') {
argument++;
while (argument[0]!=0) {
/* selection Level */
if ((*argument>='0') && (*argument<='9')) {
selectionLevel = 0;
while ((*argument >= '0') && (*argument <= '9')) {
selectionLevel *= 10;
selectionLevel += *argument - '0';
argument++;
}
continue;
}
switch(argument[0])
{
/* Display help */
case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
case 'H':
case 'h': displayOut=stdout; return usage_advanced(programName);
case 'h': g_displayOut=stdout; return usage_advanced(programName);
/* Selection level */
case 'S': argument++;
selectionLevel = 0;
while ((*argument >= '0') && (*argument <= '9'))
selectionLevel *= 10, selectionLevel += *argument++ - '0';
break;
/* Compression level */
case 'L': argument++;
cLevel = 0;
while ((*argument >= '0') && (*argument <= '9'))
cLevel *= 10, cLevel += *argument++ - '0';
break;
/* Verbose mode */
case 'v': displayLevel++; if (displayLevel<3) displayLevel=3; argument++; break;
case 'v': g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; argument++; break;
/* Quiet mode */
case 'q': displayLevel--; argument++; break;
case 'q': g_displayLevel--; argument++; break;
/* dictionary name */
case 'o': nextArgumentIsDictFileName=1; argument++; break;
@ -247,8 +231,8 @@ int main(int argCount, const char** argv)
if (filenameIdx==0) return badusage(programName);
/* building ... */
DiB_setNotificationLevel(displayLevel);
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, filenameTable, filenameIdx);
DiB_setNotificationLevel(g_displayLevel);
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx);
if (main_pause) waitEnter();
free((void*)filenameTable);

View File

@ -51,6 +51,7 @@
#include <time.h> /* clock */
#include "mem.h" /* read */
#include "error_private.h"
#include "divsufsort.h"
#include "dictBuilder.h"
#include "zstd_compress.c"
@ -85,6 +86,7 @@ static const size_t maxMemory = (sizeof(size_t)==4) ? (2 GB - 64 MB) : (size_t
#define PRIME2 2246822519U
#define MINRATIO 4
static const U32 g_compressionLevel_default = 5;
/*-*************************************
@ -714,6 +716,7 @@ static void DiB_countEStats(EStats_ress_t esr,
#define OFFCODE_MAX 18
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
unsigned compressionLevel,
const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
const void* dictBuffer, size_t dictBufferSize)
{
@ -740,7 +743,8 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
esr.zc = ZSTD_createCCtx();
esr.workPlace = malloc(BLOCKSIZE);
if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
params = ZSTD_getParams(5, dictBufferSize + 15 KB);
if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
params.strategy = ZSTD_greedy;
ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
@ -827,8 +831,49 @@ static void DiB_saveDict(const char* dictFileName,
}
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned shiftRatio,
const char** fileNamesTable, unsigned nbFiles)
#define DIB_FASTSEGMENTSIZE 64
/*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano)
Fill @dictBuffer with stripes of size DIB_FASTSEGMENTSIZE from @samplesBuffer
up to @dictSize.
Filling starts from the end of @dictBuffer, down to maximum possible.
if @dictSize is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of @dictBuffer won't be used.
@return : amount of data written into @dictBuffer
or an error Code (if @dictSize or @samplesSize too small)
*/
static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
const void* samplesBuffer, size_t samplesSize)
{
char* dstPtr = (char*)dictBuffer + dictSize;
const char* srcPtr = (const char*)samplesBuffer;
size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
size_t segNb, interSize;
if (nbSegments <= 2) return ERROR(srcSize_wrong);
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
dstPtr -= DIB_FASTSEGMENTSIZE;
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
dstPtr -= DIB_FASTSEGMENTSIZE;
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
/* regularly copy a segment */
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
srcPtr += DIB_FASTSEGMENTSIZE;
for (segNb=2; segNb < nbSegments; segNb++) {
srcPtr += interSize;
dstPtr -= DIB_FASTSEGMENTSIZE;
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
srcPtr += DIB_FASTSEGMENTSIZE;
}
return nbSegments * DIB_FASTSEGMENTSIZE;
}
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
unsigned shiftRatio, unsigned compressionLevel,
const char** fileNamesTable, unsigned nbFiles)
{
void* srcBuffer;
size_t benchedSize;
@ -852,42 +897,44 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
/* Load input buffer */
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
/* Train */
snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
if (nbFiles > 1) displayName = mfName;
else displayName = fileNamesTable[0];
if (shiftRatio>0)
{
/* analyze samples */
snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
if (nbFiles > 1) displayName = mfName;
else displayName = fileNamesTable[0];
DiB_trainBuffer(dictList, dictListSize,
srcBuffer, benchedSize,
displayName,
fileSizes, nbFiles, maxDictSize,
shiftRatio);
DiB_trainBuffer(dictList, dictListSize,
srcBuffer, benchedSize,
displayName,
fileSizes, nbFiles, maxDictSize,
shiftRatio);
/* display best matches */
if (g_displayLevel>= 3) {
const U32 nb = 25;
U32 u;
U32 dictContentSize = DiB_dictSize(dictList);
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
DISPLAYLEVEL(3, "list %u best segments \n", nb);
for (u=1; u<=nb; u++) {
U32 p = dictList[u].pos;
U32 l = dictList[u].length;
U32 d = MIN(40, l);
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
u, l, p, dictList[u].savings);
DiB_printHex(3, (char*)srcBuffer+p, d);
DISPLAYLEVEL(3, "| \n");
} }
/* display best matches */
if (g_displayLevel>= 3) {
const U32 nb = 25;
U32 u;
U32 dictContentSize = DiB_dictSize(dictList);
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
DISPLAYLEVEL(3, "list %u best segments \n", nb);
for (u=1; u<=nb; u++) {
U32 p = dictList[u].pos;
U32 l = dictList[u].length;
U32 d = MIN(40, l);
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
u, l, p, dictList[u].savings);
DiB_printHex(3, (char*)srcBuffer+p, d);
DISPLAYLEVEL(3, "| \n");
} } }
/* create dictionary */
{
void* dictContent;
U32 dictContentSize = DiB_dictSize(dictList);
void* dictHeader;
size_t dictHeaderSize, hSize;
size_t dictHeaderSize, hSize, addedContentLength;
BYTE* ptr;
U32 u;
@ -895,18 +942,27 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
#define EBSIZE (2 KB)
dictHeaderSize = EBSIZE;
dictHeader = malloc(dictHeaderSize);
dictContent = malloc(dictContentSize);
dictContent = malloc(maxDictSize);
if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
/* build dict content */
ptr = (BYTE*)dictContent + dictContentSize;
ptr = (BYTE*)dictContent + maxDictSize;
for (u=1; u<dictList->pos; u++) {
U32 l = dictList[u].length;
ptr -= l;
memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
}
/* fast dict content mode */
if (shiftRatio==0) {
addedContentLength = ptr-(BYTE*)dictContent;
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10));
addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize);
if (!ERR_isError(addedContentLength))
ptr -= addedContentLength, dictContentSize += addedContentLength;
}
/* dictionary header */
MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
hSize = 4;
@ -915,14 +971,15 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
/* entropic tables */
DISPLAYLEVEL(2, "statistics ... \n");
hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
srcBuffer, fileSizes, nbFiles,
dictContent, dictContentSize);
compressionLevel,
srcBuffer, fileSizes, nbFiles,
ptr, dictContentSize);
/* save dict */
{
size_t dictSize = hSize + dictContentSize;
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize);
DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize);
//DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
}
/* clean */

View File

@ -24,7 +24,7 @@
*/
/* This library is designed for a single-threaded console application.
* It abruptly exits (exit() function) when it encounters an error condition. */
* It calls exit() and prints to stderr when it encounters an error condition. */
/*-*************************************
* Version
@ -37,14 +37,17 @@ unsigned DiB_versionNumber (void);
/*-*************************************
* Main functions
* Public functions
***************************************/
/*! DiB_trainDictionary
Train a dictionary from a set of files provided by @fileNamesTable
Resulting dictionary is written in file @dictFileName
@result : 0 if fine
Resulting dictionary is written in file @dictFileName.
@selectivityLevel changes the criteria for insertion into the dictionary (more => bigger selection => larger dictionary)
@compressionLevel can be used to target a specific compression level of zstd. 0 means "default".
@result : 0 == ok
*/
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned selectivityLevel,
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
unsigned selectivityLevel, unsigned compressionLevel,
const char** fileNamesTable, unsigned nbFiles);