Merge pull request #840 from stellamplau/decodeCorpus-blocks

Make decodecorpus generate raw compressed blocks
This commit is contained in:
Yann Collet 2017-09-13 09:34:04 -07:00 committed by GitHub
commit a1bc08834f

View File

@ -238,6 +238,11 @@ typedef struct {
size_t dictContentSize;
BYTE* dictContent;
} dictInfo;
/* Kind of output produced by the generator functions. */
typedef enum {
    gt_frame = 0,   /* generate frames */
    gt_block = 1    /* generate compressed blocks without block/frame headers */
} genType_e;
/*-*******************************************************
* Generator Functions
*********************************************************/
@ -453,7 +458,7 @@ static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t ds
return op - ostart;
}
/* Write a Huffman coded literals block and return the litearls size */
/* Write a Huffman coded literals block and return the literals size */
static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize)
{
BYTE* origop = (BYTE*)frame->data;
@ -1165,6 +1170,61 @@ static void initFrame(frame_t* fr)
fr->stats.rep[2] = 8;
}
/**
 * Generates a single zstd compressed block with no block/frame header.
 *
 * Each attempt draws a new window size and a new block content size, then
 * calls writeCompressedBlock(), always writing from the position frame->data
 * had right after initFrame(). An attempt whose reported size exceeds the
 * content size is discarded (frame->stats is rolled back) and retried.
 * On success frame->src is advanced by the block content size.
 *
 * Returns the final seed.
 */
static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info)
{
    BYTE* dstStart;

    DISPLAYLEVEL(1, "block seed: %u\n", seed);
    initFrame(frame);
    dstStart = (BYTE*)frame->data;   /* every attempt restarts from here */

    for (;;) {
        size_t srcSize;
        size_t compressedSize;

        /* window size : 2^(10+windowExp), plus `eighths` eighths of that */
        {   int const windowExp = RAND(&seed) % (MAX_WINDOW_LOG - 10);
            int const eighths = RAND(&seed) % 8;
            frame->header.windowSize = (1U << (windowExp + 10));
            frame->header.windowSize += (frame->header.windowSize / 8) * eighths;
        }

        /* content size : never larger than the block limit for this window */
        {   size_t const blockLimit = MIN(MAX_BLOCK_SIZE, frame->header.windowSize);
            if (RAND(&seed) & 15) {
                /* full size block : taken unless the low 4 bits of the draw are all zero */
                srcSize = blockLimit;
            } else if (RAND(&seed) & 7) {
                /* small block, strictly below 128 bytes */
                srcSize = RAND(&seed) % (1U << 7);
            } else {
                /* variable size block, strictly below the limit */
                srcSize = RAND(&seed) % blockLimit;
            }
        }

        /* try generating a compressed block */
        frame->oldStats = frame->stats;   /* snapshot, restored if the attempt is rejected */
        frame->data = dstStart;
        compressedSize = writeCompressedBlock(&seed, frame, srcSize, info);

        if (compressedSize <= srcSize) {
            DISPLAYLEVEL(3, " block size: %u\n", (U32)compressedSize);
            frame->src = (BYTE*)frame->src + srcSize;
            break;
        }

        /* data doesn't compress -- roll back the stats and try again */
        frame->stats = frame->oldStats;
        DISPLAYLEVEL(3, " can't compress block\n");
    }

    return seed;
}
/* Return the final seed */
static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info)
{
@ -1323,7 +1383,7 @@ cleanup:
return ret;
}
static size_t testDecodeWithDict(U32 seed)
static size_t testDecodeWithDict(U32 seed, genType_e genType)
{
/* create variables */
size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN;
@ -1346,34 +1406,47 @@ static size_t testDecodeWithDict(U32 seed)
{
frame_t fr;
dictInfo info;
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
size_t ret;
/* generate frame */
/* get dict info */
{
size_t const headerSize = MAX(dictSize/4, 256);
size_t const dictContentSize = dictSize-headerSize;
BYTE* const dictContent = fullDict+headerSize;
dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
seed = generateFrame(seed, &fr, info);
info = initDictInfo(1, dictContentSize, dictContent, dictID);
}
/* manually decompress and check difference */
{
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
{
size_t const returnValue = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart,
fullDict, dictSize);
if (ZSTD_isError(returnValue)) {
errorDetected = returnValue;
goto dictTestCleanup;
}
}
if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) {
errorDetected = ERROR(corruption_detected);
if (genType == gt_frame) {
/* Test frame */
seed = generateFrame(seed, &fr, info);
ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart,
fullDict, dictSize);
} else {
/* Test block */
seed = generateCompressedBlock(seed, &fr, info);
ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize);
if (ZSTD_isError(ret)) {
errorDetected = ret;
ZSTD_freeDCtx(dctx);
goto dictTestCleanup;
}
ZSTD_freeDCtx(dctx);
ret = ZSTD_decompressBlock(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart);
}
ZSTD_freeDCtx(dctx);
if (ZSTD_isError(ret)) {
errorDetected = ret;
goto dictTestCleanup;
}
if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) {
errorDetected = ERROR(corruption_detected);
goto dictTestCleanup;
}
}
@ -1382,7 +1455,91 @@ dictTestCleanup:
return errorDetected;
}
static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS)
/**
 * Decompresses the raw block stored in `fr` (bytes from fr->dataStart up to
 * fr->data) into DECOMPRESSED_BUFFER with ZSTD_decompressBlock(), then
 * compares the output with the original content (fr->srcStart up to fr->src).
 *
 * Returns the value of ZSTD_decompressBlock() when decoding succeeds and the
 * content matches. Otherwise returns a zstd error code (check with
 * ZSTD_isError()):
 *  - memory_allocation    : the decompression context could not be created
 *  - corruption_detected  : decoded bytes differ from the original content
 *  - any error reported by ZSTD_decompressBegin() / ZSTD_decompressBlock()
 *
 * The decompression context is released on every path.
 */
static size_t testDecodeRawBlock(frame_t* fr)
{
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    size_t ret;

    /* do not hand a NULL context to the decoder */
    if (dctx == NULL) return ERROR(memory_allocation);

    ret = ZSTD_decompressBegin(dctx);
    if (!ZSTD_isError(ret)) {
        ret = ZSTD_decompressBlock(
                dctx,
                DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
                fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
    }

    /* single release point : also covers a failed ZSTD_decompressBegin(),
     * which previously returned without freeing the context */
    ZSTD_freeDCtx(dctx);

    if (ZSTD_isError(ret)) return ret;

    if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
               (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
        return ERROR(corruption_detected);
    }

    return ret;
}
/**
 * Runs one block-mode test iteration:
 *  1. generates a compressed block without a dictionary and stores the seed
 *     returned by generateCompressedBlock() back into *seed,
 *  2. checks it with testDecodeRawBlock(),
 *  3. runs testDecodeWithDict() in block mode with the updated seed.
 *
 * Error messages report the seed value *seed held on entry.
 * Returns 0 when both checks pass, 1 as soon as one of them fails.
 */
static int runBlockTest(U32* seed)
{
    U32 const initialSeed = *seed;
    dictInfo const noDict = initDictInfo(0, 0, NULL, 0);
    frame_t fr;
    size_t result;

    *seed = generateCompressedBlock(*seed, &fr, noDict);

    /* plain raw-block decoding */
    result = testDecodeRawBlock(&fr);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in block mode on test seed %u: %s\n", initialSeed,
                ZSTD_getErrorName(result));
        return 1;
    }

    /* raw-block decoding with a dictionary */
    result = testDecodeWithDict(*seed, gt_block);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in block mode with dictionary on test seed %u: %s\n",
                initialSeed, ZSTD_getErrorName(result));
        return 1;
    }

    return 0;
}
/**
 * Runs one frame-mode test iteration:
 *  1. generates a frame without a dictionary and stores the seed returned by
 *     generateFrame() back into *seed,
 *  2. checks it with testDecodeSimple(), then testDecodeStreaming(),
 *  3. runs testDecodeWithDict() in frame mode with the updated seed.
 *
 * Error messages report the seed value *seed held on entry.
 * Returns 0 when every check passes, 1 as soon as one of them fails.
 */
static int runFrameTest(U32* seed)
{
    U32 const initialSeed = *seed;
    dictInfo const noDict = initDictInfo(0, 0, NULL, 0);
    frame_t fr;
    size_t result;

    *seed = generateFrame(*seed, &fr, noDict);

    /* one-shot decoding */
    result = testDecodeSimple(&fr);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in simple mode on test seed %u: %s\n", initialSeed,
                ZSTD_getErrorName(result));
        return 1;
    }

    /* streaming decoding */
    result = testDecodeStreaming(&fr);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in streaming mode on test seed %u: %s\n", initialSeed,
                ZSTD_getErrorName(result));
        return 1;
    }

    /* decoding with a dictionary */
    result = testDecodeWithDict(*seed, gt_frame);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in dictionary mode on test seed %u: %s\n", initialSeed, ZSTD_getErrorName(result));
        return 1;
    }

    return 0;
}
static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS,
genType_e genType)
{
unsigned fnum;
@ -1394,39 +1551,21 @@ static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS
DISPLAY("seed: %u\n", seed);
for (fnum = 0; fnum < numFiles || clockSpan(startClock) < maxClockSpan; fnum++) {
frame_t fr;
U32 const seedCopy = seed;
if (fnum < numFiles)
DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
else
DISPLAYUPDATE("\r%u ", fnum);
{
dictInfo const info = initDictInfo(0, 0, NULL, 0);
seed = generateFrame(seed, &fr, info);
}
int ret;
if (genType == gt_frame) {
ret = runFrameTest(&seed);
} else {
ret = runBlockTest(&seed);
}
{ size_t const r = testDecodeSimple(&fr);
if (ZSTD_isError(r)) {
DISPLAY("Error in simple mode on test seed %u: %s\n", seedCopy,
ZSTD_getErrorName(r));
return 1;
}
}
{ size_t const r = testDecodeStreaming(&fr);
if (ZSTD_isError(r)) {
DISPLAY("Error in streaming mode on test seed %u: %s\n", seedCopy,
ZSTD_getErrorName(r));
return 1;
}
}
{
/* don't create a dictionary that is too big */
size_t const r = testDecodeWithDict(seed);
if (ZSTD_isError(r)) {
DISPLAY("Error in dictionary mode on test seed %u: %s\n", seedCopy, ZSTD_getErrorName(r));
return 1;
}
if (ret) {
return ret;
}
}
}
@ -1441,7 +1580,7 @@ static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS
*********************************************************/
static int generateFile(U32 seed, const char* const path,
const char* const origPath)
const char* const origPath, genType_e genType)
{
frame_t fr;
@ -1449,9 +1588,12 @@ static int generateFile(U32 seed, const char* const path,
{
dictInfo const info = initDictInfo(0, 0, NULL, 0);
generateFrame(seed, &fr, info);
if (genType == gt_frame) {
generateFrame(seed, &fr, info);
} else {
generateCompressedBlock(seed, &fr, info);
}
}
outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
if (origPath) {
outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
@ -1460,7 +1602,7 @@ static int generateFile(U32 seed, const char* const path,
}
static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
const char* const origPath)
const char* const origPath, genType_e genType)
{
char outPath[MAX_PATH];
unsigned fnum;
@ -1474,7 +1616,11 @@ static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
{
dictInfo const info = initDictInfo(0, 0, NULL, 0);
seed = generateFrame(seed, &fr, info);
if (genType == gt_frame) {
seed = generateFrame(seed, &fr, info);
} else {
seed = generateCompressedBlock(seed, &fr, info);
}
}
if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
@ -1498,7 +1644,8 @@ static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
}
static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path,
const char* const origPath, const size_t dictSize)
const char* const origPath, const size_t dictSize,
genType_e genType)
{
char outPath[MAX_PATH];
BYTE* fullDict;
@ -1550,7 +1697,11 @@ static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const
size_t const dictContentSize = dictSize-headerSize;
BYTE* const dictContent = fullDict+headerSize;
dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
seed = generateFrame(seed, &fr, info);
if (genType == gt_frame) {
seed = generateFrame(seed, &fr, info);
} else {
seed = generateCompressedBlock(seed, &fr, info);
}
}
if (numFiles != 0) {
@ -1630,6 +1781,7 @@ static void advancedUsage(const char* programName)
DISPLAY( "Advanced arguments :\n");
DISPLAY( " --content-size : always include the content size in the frame header\n");
DISPLAY( " --use-dict=# : include a dictionary used to decompress the corpus\n");
DISPLAY( " --gen-blocks : generate raw compressed blocks without block/frame headers\n");
}
/*! readU32FromChar() :
@ -1676,6 +1828,7 @@ int main(int argc, char** argv)
const char* origPath = NULL;
int useDict = 0;
unsigned dictSize = (10 << 10); /* 10 kB default */
genType_e genType = gt_frame;
int argNb;
@ -1739,6 +1892,8 @@ int main(int argc, char** argv)
} else if (longCommandWArg(&argument, "use-dict=")) {
dictSize = readU32FromChar(&argument);
useDict = 1;
} else if (strcmp(argument, "gen-blocks") == 0) {
genType = gt_block;
} else {
advancedUsage(argv[0]);
return 1;
@ -1755,7 +1910,7 @@ int main(int argc, char** argv)
}
if (testMode) {
return runTestMode(seed, numFiles, testDuration);
return runTestMode(seed, numFiles, testDuration, genType);
} else {
if (testDuration) {
DISPLAY("Error: -T requires test mode (-t)\n\n");
@ -1771,12 +1926,12 @@ int main(int argc, char** argv)
}
if (numFiles == 0 && useDict == 0) {
return generateFile(seed, path, origPath);
return generateFile(seed, path, origPath, genType);
} else if (useDict == 0){
return generateCorpus(seed, numFiles, path, origPath);
return generateCorpus(seed, numFiles, path, origPath, genType);
} else {
/* should generate files with a dictionary */
return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize);
return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize, genType);
}
}