changed dictionary from using fixed amount of bytes for the header / entropy tables

This commit is contained in:
Paul Cruz 2017-06-14 17:23:56 -07:00
parent 664ed05ff6
commit d93207a79f

View File

@ -1316,25 +1316,30 @@ static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const
const char* const origPath, const size_t dictSize) const char* const origPath, const size_t dictSize)
{ {
char outPath[MAX_PATH]; char outPath[MAX_PATH];
BYTE* dictContent;
BYTE* fullDict; BYTE* fullDict;
U32 dictID; U32 dictID;
unsigned fnum; unsigned fnum;
BYTE* decompressedPtr; BYTE* decompressedPtr;
BYTE* dictContent;
const size_t headerSize = dictSize/4;
const size_t dictContentSize = dictSize - dictSize/4;
ZSTD_DCtx* dctx = ZSTD_createDCtx(); ZSTD_DCtx* dctx = ZSTD_createDCtx();
if(snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) { if(snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
DISPLAY("Error: path too long\n"); DISPLAY("Error: path too long\n");
return 1; return 1;
} }
if(dictSize < 400){ {
DISPLAY("Error: either no size given or given dictionary size is too small\n"); /* use 3/4 of dictionary for content, save rest for header/entropy tables */
return 1; if(dictContentSize < 128 || dictSize < 256){
DISPLAY("Error: dictionary size is too small\n");
return 1;
}
} }
/* Generate the dictionary randomly first */ /* Generate the dictionary randomly first */
dictContent = malloc(dictSize-400);
dictID = RAND(&seed); dictID = RAND(&seed);
fullDict = malloc(dictSize); fullDict = malloc(dictSize);
RAND_buffer(&seed, dictContent, dictSize-400); dictContent = fullDict + headerSize;
RAND_buffer(&seed, (void*)dictContent, dictContentSize);
{ {
size_t dictWriteSize = 0; size_t dictWriteSize = 0;
@ -1347,7 +1352,7 @@ static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const
size_t* curr = sampleSizes; size_t* curr = sampleSizes;
size_t totalSize = 0; size_t totalSize = 0;
while(i++ < numSamples){ while(i++ < numSamples){
*curr = RAND(&seed) % (4 << 15); *curr = RAND(&seed) % dictContentSize;
totalSize += *curr; totalSize += *curr;
curr++; curr++;
} }
@ -1360,7 +1365,7 @@ static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const
{ {
/* take substring from dictionary content */ /* take substring from dictionary content */
size_t pos = 0; size_t pos = 0;
const BYTE* endDict = dictContent + dictSize - 400; BYTE* endDict = dictContent + dictContentSize;
while(i++ < numSamples){ while(i++ < numSamples){
size_t currSize = *(curr++); size_t currSize = *(curr++);
BYTE* startSubstring = endDict - currSize; BYTE* startSubstring = endDict - currSize;
@ -1379,7 +1384,7 @@ static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const
/* finalize dictionary with random samples */ /* finalize dictionary with random samples */
dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize, dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize,
dictContent, dictSize-400, dictContent, dictContentSize,
samples, sampleSizes, numSamples, samples, sampleSizes, numSamples,
zdictParams); zdictParams);
} }