added fast sampling mode

This commit is contained in:
Yann Collet 2016-01-29 02:45:26 +01:00
parent 863ec40f1e
commit f5229e0cd8
3 changed files with 144 additions and 100 deletions

View File

@ -20,43 +20,19 @@
You can contact the author at : You can contact the author at :
- zstd source repository : https://github.com/Cyan4973/zstd - zstd source repository : https://github.com/Cyan4973/zstd
- zstd public forum : https://groups.google.com/forum/#!forum/lz4c
*/ */
/*-************************************
/**************************************
* Compiler Options
**************************************/
#define _CRT_SECURE_NO_WARNINGS /* Visual : removes warning from strcpy */
#define _POSIX_SOURCE 1 /* triggers fileno() within <stdio.h> on unix */
/**************************************
* Includes * Includes
**************************************/ **************************************/
#include <stdio.h> /* fprintf, getchar */
#include <stdlib.h> /* exit, calloc, free */ #include <stdlib.h> /* exit, calloc, free */
#include <string.h> /* strcmp, strlen */ #include <string.h> /* strcmp, strlen */
#include <stdio.h> /* fprintf, getchar */
#include "dictBuilder.h" #include "dictBuilder.h"
/************************************** /*-************************************
* OS-specific Includes
**************************************/
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
# include <fcntl.h> /* _O_BINARY */
# include <io.h> /* _setmode, _isatty */
# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
#else
# include <unistd.h> /* isatty */
# define SET_BINARY_MODE(file)
# define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
#endif
/**************************************
* Constants * Constants
**************************************/ **************************************/
#define PROGRAM_DESCRIPTION "Dictionary builder" #define PROGRAM_DESCRIPTION "Dictionary builder"
@ -72,21 +48,22 @@
#define MB *(1 <<20) #define MB *(1 <<20)
#define GB *(1U<<30) #define GB *(1U<<30)
static const unsigned compressionLevelDefault = 5;
static const unsigned selectionLevelDefault = 9; /* determined experimentally */ static const unsigned selectionLevelDefault = 9; /* determined experimentally */
static const unsigned maxDictSizeDefault = 110 KB; static const unsigned maxDictSizeDefault = 110 KB;
static const char* dictFileNameDefault = "dictionary"; static const char* dictFileNameDefault = "dictionary";
/************************************** /*-************************************
* Display Macros * Display Macros
**************************************/ **************************************/
#define DISPLAY(...) fprintf(displayOut, __VA_ARGS__) #define DISPLAY(...) fprintf(g_displayOut, __VA_ARGS__)
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
static FILE* displayOut; static FILE* g_displayOut;
static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information static unsigned g_displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
/************************************** /*-************************************
* Exceptions * Exceptions
**************************************/ **************************************/
#define DEBUG 0 #define DEBUG 0
@ -101,7 +78,7 @@ static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + re
} }
/************************************** /*-************************************
* Command Line * Command Line
**************************************/ **************************************/
static int usage(const char* programName) static int usage(const char* programName)
@ -110,8 +87,8 @@ static int usage(const char* programName)
DISPLAY( " %s [arg] [filenames]\n", programName); DISPLAY( " %s [arg] [filenames]\n", programName);
DISPLAY( "\n"); DISPLAY( "\n");
DISPLAY( "Arguments :\n"); DISPLAY( "Arguments :\n");
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault); DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault);
DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
DISPLAY( " -h/-H : display help/long help and exit\n"); DISPLAY( " -h/-H : display help/long help and exit\n");
return 0; return 0;
} }
@ -122,8 +99,10 @@ static int usage_advanced(const char* programName)
usage(programName); usage(programName);
DISPLAY( "\n"); DISPLAY( "\n");
DISPLAY( "Advanced arguments :\n"); DISPLAY( "Advanced arguments :\n");
DISPLAY( " -# : selection level # (default :%u)\n", selectionLevelDefault);
DISPLAY( " -V : display Version number and exit\n"); DISPLAY( " -V : display Version number and exit\n");
DISPLAY( "--fast : fast sampling mode\n");
DISPLAY( " -L# : target compression level (default: %u)\n", compressionLevelDefault);
DISPLAY( " -S# : dictionary selectivity level # (default: %u)\n", selectionLevelDefault);
DISPLAY( " -v : verbose mode\n"); DISPLAY( " -v : verbose mode\n");
DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n"); DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n");
return 0; return 0;
@ -132,7 +111,7 @@ static int usage_advanced(const char* programName)
static int badusage(const char* programName) static int badusage(const char* programName)
{ {
DISPLAYLEVEL(1, "Incorrect parameters\n"); DISPLAYLEVEL(1, "Incorrect parameters\n");
if (displayLevel >= 1) usage(programName); if (g_displayLevel >= 1) usage(programName);
return 1; return 1;
} }
@ -153,6 +132,7 @@ int main(int argCount, const char** argv)
operationResult=0, operationResult=0,
nextArgumentIsMaxDict=0, nextArgumentIsMaxDict=0,
nextArgumentIsDictFileName=0; nextArgumentIsDictFileName=0;
unsigned cLevel = compressionLevelDefault;
unsigned maxDictSize = maxDictSizeDefault; unsigned maxDictSize = maxDictSizeDefault;
unsigned selectionLevel = selectionLevelDefault; unsigned selectionLevel = selectionLevelDefault;
const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */ const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */
@ -161,7 +141,7 @@ int main(int argCount, const char** argv)
const char* dictFileName = dictFileNameDefault; const char* dictFileName = dictFileNameDefault;
/* init */ /* init */
displayOut = stderr; /* unfortunately, cannot be set at declaration */ g_displayOut = stderr; /* unfortunately, cannot be set at declaration */
if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n"); if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n");
/* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */ /* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } } for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } }
@ -190,40 +170,44 @@ int main(int argCount, const char** argv)
} }
/* long commands (--long-word) */ /* long commands (--long-word) */
if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; } if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); } if (!strcmp(argument, "--help")) { g_displayOut=stdout; return usage_advanced(programName); }
if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; } if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; }
if (!strcmp(argument, "--quiet")) { displayLevel--; continue; } if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; } if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; }
/* Decode commands (note : aggregated commands are allowed) */ /* Decode commands (note : aggregated commands are allowed) */
if (argument[0]=='-') { if (argument[0]=='-') {
argument++; argument++;
while (argument[0]!=0) { while (argument[0]!=0) {
/* selection Level */
if ((*argument>='0') && (*argument<='9')) {
selectionLevel = 0;
while ((*argument >= '0') && (*argument <= '9')) {
selectionLevel *= 10;
selectionLevel += *argument - '0';
argument++;
}
continue;
}
switch(argument[0]) switch(argument[0])
{ {
/* Display help */ /* Display help */
case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */ case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
case 'H': case 'H':
case 'h': displayOut=stdout; return usage_advanced(programName); case 'h': g_displayOut=stdout; return usage_advanced(programName);
/* Selection level */
case 'S': argument++;
selectionLevel = 0;
while ((*argument >= '0') && (*argument <= '9'))
selectionLevel *= 10, selectionLevel += *argument++ - '0';
break;
/* Compression level */
case 'L': argument++;
cLevel = 0;
while ((*argument >= '0') && (*argument <= '9'))
cLevel *= 10, cLevel += *argument++ - '0';
break;
/* Verbose mode */ /* Verbose mode */
case 'v': displayLevel++; if (displayLevel<3) displayLevel=3; argument++; break; case 'v': g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; argument++; break;
/* Quiet mode */ /* Quiet mode */
case 'q': displayLevel--; argument++; break; case 'q': g_displayLevel--; argument++; break;
/* dictionary name */ /* dictionary name */
case 'o': nextArgumentIsDictFileName=1; argument++; break; case 'o': nextArgumentIsDictFileName=1; argument++; break;
@ -247,8 +231,8 @@ int main(int argCount, const char** argv)
if (filenameIdx==0) return badusage(programName); if (filenameIdx==0) return badusage(programName);
/* building ... */ /* building ... */
DiB_setNotificationLevel(displayLevel); DiB_setNotificationLevel(g_displayLevel);
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, filenameTable, filenameIdx); operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx);
if (main_pause) waitEnter(); if (main_pause) waitEnter();
free((void*)filenameTable); free((void*)filenameTable);

View File

@ -51,6 +51,7 @@
#include <time.h> /* clock */ #include <time.h> /* clock */
#include "mem.h" /* read */ #include "mem.h" /* read */
#include "error_private.h"
#include "divsufsort.h" #include "divsufsort.h"
#include "dictBuilder.h" #include "dictBuilder.h"
#include "zstd_compress.c" #include "zstd_compress.c"
@ -85,6 +86,7 @@ static const size_t maxMemory = (sizeof(size_t)==4) ? (2 GB - 64 MB) : (size_t
#define PRIME2 2246822519U #define PRIME2 2246822519U
#define MINRATIO 4 #define MINRATIO 4
static const U32 g_compressionLevel_default = 5;
/*-************************************* /*-*************************************
@ -714,6 +716,7 @@ static void DiB_countEStats(EStats_ress_t esr,
#define OFFCODE_MAX 18 #define OFFCODE_MAX 18
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize, static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
unsigned compressionLevel,
const void* srcBuffer, size_t* fileSizes, unsigned nbFiles, const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
const void* dictBuffer, size_t dictBufferSize) const void* dictBuffer, size_t dictBufferSize)
{ {
@ -740,7 +743,8 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
esr.zc = ZSTD_createCCtx(); esr.zc = ZSTD_createCCtx();
esr.workPlace = malloc(BLOCKSIZE); esr.workPlace = malloc(BLOCKSIZE);
if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory"); if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
params = ZSTD_getParams(5, dictBufferSize + 15 KB); if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
params.strategy = ZSTD_greedy; params.strategy = ZSTD_greedy;
ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params); ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
@ -827,8 +831,49 @@ static void DiB_saveDict(const char* dictFileName,
} }
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned shiftRatio, #define DIB_FASTSEGMENTSIZE 64
const char** fileNamesTable, unsigned nbFiles) /*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano)
Fill @dictBuffer with stripes of size DIB_FASTSEGMENTSIZE from @samplesBuffer
up to @dictSize.
Filling starts from the end of @dictBuffer, down to maximum possible.
if @dictSize is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at beginning of @dictBuffer won't be used.
@return : amount of data written into @dictBuffer
or an error Code (if @dictSize or @samplesSize too small)
*/
static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
const void* samplesBuffer, size_t samplesSize)
{
char* dstPtr = (char*)dictBuffer + dictSize;
const char* srcPtr = (const char*)samplesBuffer;
size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
size_t segNb, interSize;
if (nbSegments <= 2) return ERROR(srcSize_wrong);
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
dstPtr -= DIB_FASTSEGMENTSIZE;
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
dstPtr -= DIB_FASTSEGMENTSIZE;
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
/* regularly copy a segment */
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
srcPtr += DIB_FASTSEGMENTSIZE;
for (segNb=2; segNb < nbSegments; segNb++) {
srcPtr += interSize;
dstPtr -= DIB_FASTSEGMENTSIZE;
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
srcPtr += DIB_FASTSEGMENTSIZE;
}
return nbSegments * DIB_FASTSEGMENTSIZE;
}
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
unsigned shiftRatio, unsigned compressionLevel,
const char** fileNamesTable, unsigned nbFiles)
{ {
void* srcBuffer; void* srcBuffer;
size_t benchedSize; size_t benchedSize;
@ -852,42 +897,44 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
/* Load input buffer */ /* Load input buffer */
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles); DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */ DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
/* Train */ if (shiftRatio>0)
snprintf (mfName, sizeof(mfName), " %u files", nbFiles); {
if (nbFiles > 1) displayName = mfName; /* analyze samples */
else displayName = fileNamesTable[0]; snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
if (nbFiles > 1) displayName = mfName;
else displayName = fileNamesTable[0];
DiB_trainBuffer(dictList, dictListSize, DiB_trainBuffer(dictList, dictListSize,
srcBuffer, benchedSize, srcBuffer, benchedSize,
displayName, displayName,
fileSizes, nbFiles, maxDictSize, fileSizes, nbFiles, maxDictSize,
shiftRatio); shiftRatio);
/* display best matches */ /* display best matches */
if (g_displayLevel>= 3) { if (g_displayLevel>= 3) {
const U32 nb = 25; const U32 nb = 25;
U32 u; U32 u;
U32 dictContentSize = DiB_dictSize(dictList); U32 dictContentSize = DiB_dictSize(dictList);
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
DISPLAYLEVEL(3, "list %u best segments \n", nb); DISPLAYLEVEL(3, "list %u best segments \n", nb);
for (u=1; u<=nb; u++) { for (u=1; u<=nb; u++) {
U32 p = dictList[u].pos; U32 p = dictList[u].pos;
U32 l = dictList[u].length; U32 l = dictList[u].length;
U32 d = MIN(40, l); U32 d = MIN(40, l);
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
u, l, p, dictList[u].savings); u, l, p, dictList[u].savings);
DiB_printHex(3, (char*)srcBuffer+p, d); DiB_printHex(3, (char*)srcBuffer+p, d);
DISPLAYLEVEL(3, "| \n"); DISPLAYLEVEL(3, "| \n");
} } } } }
/* create dictionary */ /* create dictionary */
{ {
void* dictContent; void* dictContent;
U32 dictContentSize = DiB_dictSize(dictList); U32 dictContentSize = DiB_dictSize(dictList);
void* dictHeader; void* dictHeader;
size_t dictHeaderSize, hSize; size_t dictHeaderSize, hSize, addedContentLength;
BYTE* ptr; BYTE* ptr;
U32 u; U32 u;
@ -895,18 +942,27 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
#define EBSIZE (2 KB) #define EBSIZE (2 KB)
dictHeaderSize = EBSIZE; dictHeaderSize = EBSIZE;
dictHeader = malloc(dictHeaderSize); dictHeader = malloc(dictHeaderSize);
dictContent = malloc(dictContentSize); dictContent = malloc(maxDictSize);
if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory"); if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
/* build dict content */ /* build dict content */
ptr = (BYTE*)dictContent + dictContentSize; ptr = (BYTE*)dictContent + maxDictSize;
for (u=1; u<dictList->pos; u++) { for (u=1; u<dictList->pos; u++) {
U32 l = dictList[u].length; U32 l = dictList[u].length;
ptr -= l; ptr -= l;
memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l); memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
} }
/* fast dict content mode */
if (shiftRatio==0) {
addedContentLength = ptr-(BYTE*)dictContent;
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10));
addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize);
if (!ERR_isError(addedContentLength))
ptr -= addedContentLength, dictContentSize += addedContentLength;
}
/* dictionary header */ /* dictionary header */
MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC); MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
hSize = 4; hSize = 4;
@ -915,14 +971,15 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned
/* entropic tables */ /* entropic tables */
DISPLAYLEVEL(2, "statistics ... \n"); DISPLAYLEVEL(2, "statistics ... \n");
hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize, hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
srcBuffer, fileSizes, nbFiles, compressionLevel,
dictContent, dictContentSize); srcBuffer, fileSizes, nbFiles,
ptr, dictContentSize);
/* save dict */ /* save dict */
{ {
size_t dictSize = hSize + dictContentSize; size_t dictSize = hSize + dictContentSize;
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize); DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize);
//DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only //DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
} }
/* clean */ /* clean */

View File

@ -24,7 +24,7 @@
*/ */
/* This library is designed for a single-threaded console application. /* This library is designed for a single-threaded console application.
* It abruptly exits (exit() function) when it encounters an error condition. */ * It exit() and printf() into stderr when it encounters an error condition. */
/*-************************************* /*-*************************************
* Version * Version
@ -37,14 +37,17 @@ unsigned DiB_versionNumber (void);
/*-************************************* /*-*************************************
* Main functions * Public functions
***************************************/ ***************************************/
/*! DiB_trainDictionary /*! DiB_trainDictionary
Train a dictionary from a set of files provided by @fileNamesTable Train a dictionary from a set of files provided by @fileNamesTable
Resulting dictionary is written in file @dictFileName Resulting dictionary is written in file @dictFileName.
@result : 0 if fine @selectivityLevel change criteria for insertion into the dictionary (more => bigger selection => larger dictionary)
@compressionLevel can be used to target a specific compression level of zstd. 0 means "default".
@result : 0 == ok
*/ */
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned selectivityLevel, int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
unsigned selectivityLevel, unsigned compressionLevel,
const char** fileNamesTable, unsigned nbFiles); const char** fileNamesTable, unsigned nbFiles);