added code for generating dictionary/test files randomly. Still need to make sure dictionary ID matches

This commit is contained in:
Paul Cruz 2017-06-08 17:06:30 -07:00
parent 2a39ac5486
commit c2d909e396

View File

@ -1121,13 +1121,12 @@ static void initFrame(frame_t* fr)
}
/* Return the final seed */
static U32 generateFrame(U32 seed, frame_t* fr)
static U32 generateFrame(U32 seed, frame_t* fr, int genDict, size_t dictSize)
{
/* generate a complete frame */
DISPLAYLEVEL(1, "frame seed: %u\n", seed);
initFrame(fr);
writeFrameHeader(&seed, fr);
writeBlocks(&seed, fr);
writeChecksum(fr);
@ -1135,6 +1134,7 @@ static U32 generateFrame(U32 seed, frame_t* fr)
return seed;
}
/*-*******************************************************
* Test Mode
*********************************************************/
@ -1215,7 +1215,7 @@ static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS
else
DISPLAYUPDATE("\r%u ", fnum);
seed = generateFrame(seed, &fr);
seed = generateFrame(seed, &fr, 0, 0);
{ size_t const r = testDecodeSimple(&fr);
if (ZSTD_isError(r)) {
@ -1250,7 +1250,7 @@ static int generateFile(U32 seed, const char* const path,
DISPLAY("seed: %u\n", seed);
generateFrame(seed, &fr);
generateFrame(seed, &fr, 0, 0);
outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
if (origPath) {
@ -1272,7 +1272,7 @@ static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
seed = generateFrame(seed, &fr);
seed = generateFrame(seed, &fr, 0, 0);
if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
DISPLAY("Error: path too long\n");
@ -1294,6 +1294,81 @@ static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
return 0;
}
static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path,
const char* const origPath, const size_t dictSize)
{
const size_t minDictSize = 8;
char outPath[MAX_PATH];
U32 dictID;
BYTE* dictStart;
unsigned fnum;
ZSTD_DCtx* dctx = ZSTD_createDCtx();
if(snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
DISPLAY("Error: path too long\n");
return 1;
}
/* Generate the dictionary randomly first */
if(dictSize < minDictSize){
DISPLAY("Error: dictionary size (%zu) is too small\n", dictSize);
}
else{
/* variable declaration */
dictStart = malloc(dictSize);
size_t pos = 0;
dictID = RAND(&seed) + 1;
/* write dictionary magic number */
MEM_writeLE32(dictStart + pos, ZSTD_DICT_MAGIC);
pos += 4;
/* write random dictionary ID */
MEM_writeLE32(dictStart + pos, dictID);
pos += 4;
/* randomly generate the rest of the dictionary */
RAND_buffer(&seed, dictStart + pos, dictSize-8);
outputBuffer(dictStart, dictSize, outPath);
}
/* generate random compressed/decompressed files */
for (fnum = 0; fnum < numFiles; fnum++) {
frame_t fr;
size_t returnValue;
BYTE* decompressedPtr = malloc(MAX_DECOMPRESSED_SIZE);
DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
seed = generateFrame(seed, &fr, 1, dictSize);
if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
DISPLAY("Error: path too long\n");
return 1;
}
outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
if (origPath) {
if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
DISPLAY("Error: path too long\n");
return 1;
}
outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
}
/* if asked, supply the decompressed version */
returnValue = ZSTD_decompress_usingDict(dctx, decompressedPtr, MAX_DECOMPRESSED_SIZE,
fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart,
dictStart,dictSize);
}
/* write uncompressed versions of files */
DISPLAY("This is origPath: %s\nAnd this is numFiles: %d\n", origPath, numFiles);
return 0;
}
/*_*******************************************************
* Command line
@ -1350,6 +1425,8 @@ int main(int argc, char** argv)
int testMode = 0;
const char* path = NULL;
const char* origPath = NULL;
int genDict = 0;
unsigned dictSize = (10 << 10); /* 10 kB default */
int argNb;
@ -1410,6 +1487,10 @@ int main(int argc, char** argv)
argument++;
if (strcmp(argument, "content-size") == 0) {
opts.contentSize = 1;
} else if(strcmp(argument, "train-dict") == 0){
argument += 11;
dictSize = readInt(&argument);
genDict = 1;
} else {
advancedUsage(argv[0]);
return 1;
@ -1441,9 +1522,16 @@ int main(int argc, char** argv)
return 1;
}
if (numFiles == 0) {
if (numFiles == 0 && genDict == 0) {
return generateFile(seed, path, origPath);
} else {
} else if (genDict == 0){
return generateCorpus(seed, numFiles, path, origPath);
} else if (numFiles == 0){
/* should generate a single file with a dictionary */
return generateCorpusWithDict(seed, 1, path, origPath, dictSize);
} else{
/* should generate multiple files with a dictionary */
return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize);
}
}