diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index b4ae4e87..8308bf5d 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -392,6 +392,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX; return bounds; + case ZSTD_c_srcSizeHint: + bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN; + bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; + return bounds; + default: { ZSTD_bounds const boundError = { ERROR(parameter_unsupported), 0, 0 }; return boundError; @@ -448,6 +453,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_forceAttachDict: case ZSTD_c_literalCompressionMode: case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: default: return 0; } @@ -494,6 +500,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_ldmMinMatch: case ZSTD_c_ldmBucketSizeLog: case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: break; default: RETURN_ERROR(parameter_unsupported); @@ -674,6 +681,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->targetCBlockSize = value; return CCtxParams->targetCBlockSize; + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; + return CCtxParams->srcSizeHint; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -779,6 +792,9 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_targetCBlockSize : *value = (int)CCtxParams->targetCBlockSize; break; + case ZSTD_c_srcSizeHint : + *value = (int)CCtxParams->srcSizeHint; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -1029,7 +1045,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) { - ZSTD_compressionParameters cParams = 
ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { + srcSizeHint = CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 6d623cc6..3e590ec3 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -203,6 +203,9 @@ struct ZSTD_CCtx_params_s { size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize. * No target when targetCBlockSize == 0. * There is no guarantee on compressed block size */ + int srcSizeHint; /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. 
+ * There is no guarantee that hint is close to actual source size */ ZSTD_dictAttachPref_e attachDictPref; ZSTD_literalCompressionMode_e literalCompressionMode; diff --git a/lib/legacy/zstd_v02.c b/lib/legacy/zstd_v02.c index 793df602..de0a4bd6 100644 --- a/lib/legacy/zstd_v02.c +++ b/lib/legacy/zstd_v02.c @@ -2889,6 +2889,7 @@ static size_t ZSTD_decodeLiteralsBlock(void* ctx, const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2; /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */ if (litSize > srcSize-11) /* risk of reading too far with wildcopy */ { + if (litSize > BLOCKSIZE) return ERROR(corruption_detected); if (litSize > srcSize-3) return ERROR(corruption_detected); memcpy(dctx->litBuffer, istart, litSize); dctx->litPtr = dctx->litBuffer; diff --git a/lib/legacy/zstd_v04.c b/lib/legacy/zstd_v04.c index 645a6e31..201ce2b6 100644 --- a/lib/legacy/zstd_v04.c +++ b/lib/legacy/zstd_v04.c @@ -2655,6 +2655,7 @@ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2; /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */ if (litSize > srcSize-11) /* risk of reading too far with wildcopy */ { + if (litSize > BLOCKSIZE) return ERROR(corruption_detected); if (litSize > srcSize-3) return ERROR(corruption_detected); memcpy(dctx->litBuffer, istart, litSize); dctx->litPtr = dctx->litBuffer; @@ -3034,9 +3035,12 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; + size_t litCSize; + + if (srcSize > BLOCKSIZE) return ERROR(corruption_detected); /* Decode literals sub-block */ - size_t litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; diff --git a/lib/zstd.h b/lib/zstd.h index f8e95f22..38c99e01 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -15,6 +15,7 @@ extern "C" { #define 
ZSTD_H_235446 /* ====== Dependency ======*/ +#include <limits.h> /* INT_MAX */ #include <stddef.h> /* size_t */ @@ -386,6 +387,7 @@ typedef enum { * ZSTD_c_forceAttachDict * ZSTD_c_literalCompressionMode * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. @@ -396,6 +398,7 @@ typedef enum { ZSTD_c_experimentalParam4=1001, ZSTD_c_experimentalParam5=1002, ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004, } ZSTD_cParameter; typedef struct { @@ -1063,6 +1066,8 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); /* Advanced parameter bounds */ #define ZSTD_TARGETCBLOCKSIZE_MIN 64 #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX +#define ZSTD_SRCSIZEHINT_MIN 0 +#define ZSTD_SRCSIZEHINT_MAX INT_MAX /* internal */ #define ZSTD_HASHLOG3_MAX 17 @@ -1441,6 +1446,12 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre * There is no guarantee on compressed block size (default:0) */ #define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +/* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, + * but compression ratio may regress significantly if guess considerably underestimates */ +#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. 
diff --git a/programs/fileio.c b/programs/fileio.c index 569a410c..20e2ee2a 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -30,6 +30,7 @@ #include /* strcmp, strlen */ #include #include /* errno */ +#include /* INT_MAX */ #include #include "timefn.h" /* UTIL_getTime, UTIL_clockSpanMicro */ @@ -304,7 +305,9 @@ struct FIO_prefs_s { int ldmMinMatch; int ldmBucketSizeLog; int ldmHashRateLog; + size_t streamSrcSize; size_t targetCBlockSize; + int srcSizeHint; ZSTD_literalCompressionMode_e literalCompressionMode; /* IO preferences */ @@ -349,7 +352,9 @@ FIO_prefs_t* FIO_createPreferences(void) ret->ldmMinMatch = 0; ret->ldmBucketSizeLog = FIO_LDM_PARAM_NOTSET; ret->ldmHashRateLog = FIO_LDM_PARAM_NOTSET; + ret->streamSrcSize = 0; ret->targetCBlockSize = 0; + ret->srcSizeHint = 0; ret->literalCompressionMode = ZSTD_lcm_auto; return ret; } @@ -418,10 +423,18 @@ void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable) { prefs->rsyncable = rsyncable; } +void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize) { + prefs->streamSrcSize = streamSrcSize; +} + void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize) { prefs->targetCBlockSize = targetCBlockSize; } +void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint) { + prefs->srcSizeHint = (int)MIN((size_t)INT_MAX, srcSizeHint); +} + void FIO_setLiteralCompressionMode( FIO_prefs_t* const prefs, ZSTD_literalCompressionMode_e mode) { @@ -633,7 +646,6 @@ typedef struct { static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, const char* dictFileName, int cLevel, - U64 srcSize, ZSTD_compressionParameters comprParams) { cRess_t ress; memset(&ress, 0, sizeof(ress)); @@ -667,6 +679,8 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_compressionLevel, cLevel) ); /* max compressed block size */ CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetCBlockSize, (int)prefs->targetCBlockSize) ); + /* 
source size hint */ + CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_srcSizeHint, (int)prefs->srcSizeHint) ); /* long distance matching */ CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableLongDistanceMatching, prefs->ldmFlag) ); CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashLog, prefs->ldmHashLog) ); @@ -698,10 +712,7 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_rsyncable, prefs->rsyncable) ); #endif /* dictionary */ - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, srcSize) ); /* set the value temporarily for dictionary loading, to adapt compression parameters */ CHECK( ZSTD_CCtx_loadDictionary(ress.cctx, dictBuffer, dictBuffSize) ); - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, ZSTD_CONTENTSIZE_UNKNOWN) ); /* reset */ - free(dictBuffer); } @@ -1003,6 +1014,9 @@ FIO_compressZstdFrame(FIO_prefs_t* const prefs, /* init */ if (fileSize != UTIL_FILESIZE_UNKNOWN) { CHECK(ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize)); + } else if (prefs->streamSrcSize > 0) { + /* unknown source size; use the declared stream size */ + CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, prefs->streamSrcSize) ); } (void)srcFileName; @@ -1361,10 +1375,7 @@ int FIO_compressFilename(FIO_prefs_t* const prefs, const char* dictFileName, int compressionLevel, ZSTD_compressionParameters comprParams) { - U64 const fileSize = UTIL_getFileSize(srcFileName); - U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 
ZSTD_CONTENTSIZE_UNKNOWN : fileSize; - - cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, srcSize, comprParams); + cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); int const result = FIO_compressFilename_srcFile(prefs, ress, dstFileName, srcFileName, compressionLevel); @@ -1415,10 +1426,7 @@ int FIO_compressMultipleFilenames(FIO_prefs_t* const prefs, ZSTD_compressionParameters comprParams) { int error = 0; - U64 const firstFileSize = UTIL_getFileSize(inFileNamesTable[0]); - U64 const firstSrcSize = (firstFileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : firstFileSize; - U64 const srcSize = (nbFiles != 1) ? ZSTD_CONTENTSIZE_UNKNOWN : firstSrcSize ; - cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, srcSize, comprParams); + cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); /* init */ assert(outFileName != NULL || suffix != NULL); diff --git a/programs/fileio.h b/programs/fileio.h index 311f8c0e..096d90b5 100644 --- a/programs/fileio.h +++ b/programs/fileio.h @@ -71,7 +71,9 @@ void FIO_setOverlapLog(FIO_prefs_t* const prefs, int overlapLog); void FIO_setRemoveSrcFile(FIO_prefs_t* const prefs, unsigned flag); void FIO_setSparseWrite(FIO_prefs_t* const prefs, unsigned sparse); /**< 0: no sparse; 1: disable on stdout; 2: always enabled */ void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable); +void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize); void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize); +void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint); void FIO_setLiteralCompressionMode( FIO_prefs_t* const prefs, ZSTD_literalCompressionMode_e mode); diff --git a/programs/zstd.1.md b/programs/zstd.1.md index 3ab2667a..dff4d9ea 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -144,6 +144,18 @@ the last one takes effect. 
Due to the chaotic nature of dynamic adaptation, compressed result is not reproducible. _note_ : at the time of this writing, `--adapt` can remain stuck at low speed when combined with multiple worker threads (>=2). +* `--stream-size=#` : + Sets the pledged source size of input coming from a stream. This value must be exact, as it + will be included in the produced frame header. Incorrect stream sizes will cause an error. + This information will be used to better optimize compression parameters, resulting in + better and potentially faster compression, especially for smaller source sizes. +* `--size-hint=#` : + When handling input from a stream, `zstd` must guess how large the source size + will be when optimizing compression parameters. If the stream size is relatively + small, this guess may be a poor one, resulting in a lower compression ratio than + expected. This feature allows for controlling the guess when needed. + Exact guesses result in better compression ratios. Overestimates result in slightly + degraded compression ratios, while underestimates may result in significant degradation. * `--rsyncable` : `zstd` will periodically synchronize the compression state to make the compressed file more rsync-friendly. 
There is a negligible impact to diff --git a/programs/zstdcli.c b/programs/zstdcli.c index de286cdf..98df728a 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -141,6 +141,8 @@ static int usage_advanced(const char* programName) DISPLAY( "--long[=#]: enable long distance matching with given window log (default: %u)\n", g_defaultMaxWindowLog); DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1); DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n"); + DISPLAY( "--stream-size=# : optimize compression parameters for streaming input of given number of bytes \n"); + DISPLAY( "--size-hint=# optimize compression parameters for streaming input of approximately this size\n"); DISPLAY( "--target-compressed-block-size=# : make compressed block near targeted size \n"); #ifdef ZSTD_MULTITHREAD DISPLAY( " -T# : spawns # compression threads (default: 1, 0==# cores) \n"); @@ -588,7 +590,9 @@ int main(int argCount, const char* argv[]) const char* suffix = ZSTD_EXTENSION; unsigned maxDictSize = g_defaultMaxDictSize; unsigned dictID = 0; + size_t streamSrcSize = 0; size_t targetCBlockSize = 0; + size_t srcSizeHint = 0; int dictCLevel = g_defaultDictCLevel; unsigned dictSelect = g_defaultSelectivityLevel; #ifdef UTIL_HAS_CREATEFILELIST @@ -745,7 +749,9 @@ int main(int argCount, const char* argv[]) if (longCommandWArg(&argument, "--maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; } + if (longCommandWArg(&argument, "--stream-size=")) { streamSrcSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, 
"--size-hint=")) { srcSizeHint = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--long")) { unsigned ldmWindowLog = 0; ldmFlag = 1; @@ -1150,7 +1156,9 @@ int main(int argCount, const char* argv[]) FIO_setAdaptMin(prefs, adaptMin); FIO_setAdaptMax(prefs, adaptMax); FIO_setRsyncable(prefs, rsyncable); + FIO_setStreamSrcSize(prefs, streamSrcSize); FIO_setTargetCBlockSize(prefs, targetCBlockSize); + FIO_setSrcSizeHint(prefs, srcSizeHint); FIO_setLiteralCompressionMode(prefs, literalCompressionMode); if (adaptMin > cLevel) cLevel = adaptMin; if (adaptMax < cLevel) cLevel = adaptMax; @@ -1160,7 +1168,7 @@ int main(int argCount, const char* argv[]) else operationResult = FIO_compressMultipleFilenames(prefs, filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams); #else - (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; /* not used when ZSTD_NOCOMPRESS set */ + (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; /* not used when ZSTD_NOCOMPRESS set */ DISPLAY("Compression not supported \n"); #endif } else { /* decompression or test */ diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile index 8bf16b1f..08dedd66 100644 --- a/tests/fuzz/Makefile +++ b/tests/fuzz/Makefile @@ -113,15 +113,6 @@ zstd_frame_info: $(FUZZ_HEADERS) $(FUZZ_OBJ) zstd_frame_info.o libregression.a: $(FUZZ_HEADERS) $(PRGDIR)/util.h $(PRGDIR)/util.c regression_driver.o $(AR) $(FUZZ_ARFLAGS) $@ regression_driver.o -# Install libfuzzer (not usable for MSAN testing) -# Provided for convenience. To use this library run make libFuzzer and -# set LDFLAGS=-L. 
-.PHONY: libFuzzer -libFuzzer: - @$(RM) -rf Fuzzer - @git clone https://chromium.googlesource.com/chromium/llvm-project/compiler-rt/lib/fuzzer Fuzzer - @cd Fuzzer && ./build.sh - corpora/%_seed_corpus.zip: @mkdir -p corpora $(DOWNLOAD) $@ $(CORPORA_URL_PREFIX)$*_seed_corpus.zip diff --git a/tests/fuzz/README.md b/tests/fuzz/README.md index 9e0bb259..856a57f8 100644 --- a/tests/fuzz/README.md +++ b/tests/fuzz/README.md @@ -35,6 +35,8 @@ The environment variables can be overridden with the corresponding flags `--cc`, `--cflags`, etc. The specific fuzzing engine is selected with `LIB_FUZZING_ENGINE` or `--lib-fuzzing-engine`, the default is `libregression.a`. +Alternatively, you can use Clang's built in fuzzing engine with +`--enable-fuzzer`. It has flags that can easily set up sanitizers `--enable-{a,ub,m}san`, and coverage instrumentation `--enable-coverage`. It sets sane defaults which can be overridden with flags `--debug`, @@ -51,22 +53,25 @@ The command used to run the fuzzer is printed for debugging. ## LibFuzzer ``` -# Build libfuzzer if necessary -make libFuzzer # Build the fuzz targets -./fuzz.py build all --enable-coverage --enable-asan --enable-ubsan --lib-fuzzing-engine Fuzzer/libFuzzer.a --cc clang --cxx clang++ +./fuzz.py build all --enable-fuzzer --enable-asan --enable-ubsan --cc clang --cxx clang++ # OR equivalently -CC=clang CXX=clang++ LIB_FUZZING_ENGINE=Fuzzer/libFuzzer.a ./fuzz.py build all --enable-coverage --enable-asan --enable-ubsan +CC=clang CXX=clang++ ./fuzz.py build all --enable-fuzzer --enable-asan --enable-ubsan # Run the fuzzer -./fuzz.py libfuzzer TARGET -max_len=8192 -jobs=4 +./fuzz.py libfuzzer TARGET ``` where `TARGET` could be `simple_decompress`, `stream_round_trip`, etc. ### MSAN -Fuzzing with `libFuzzer` and `MSAN` will require building a C++ standard library -and libFuzzer with MSAN. 
+Fuzzing with `libFuzzer` and `MSAN` is as easy as: + +``` +CC=clang CXX=clang++ ./fuzz.py build all --enable-fuzzer --enable-msan +./fuzz.py libfuzzer TARGET +``` + `fuzz.py` respects the environment variables / flags `MSAN_EXTRA_CPPFLAGS`, `MSAN_EXTRA_CFLAGS`, `MSAN_EXTRA_CXXFLAGS`, `MSAN_EXTRA_LDFLAGS` to easily pass the extra parameters only for MSAN. diff --git a/tests/fuzz/fuzz.py b/tests/fuzz/fuzz.py index d993209a..faf8ce8a 100755 --- a/tests/fuzz/fuzz.py +++ b/tests/fuzz/fuzz.py @@ -24,21 +24,38 @@ def abs_join(a, *p): return os.path.abspath(os.path.join(a, *p)) +class InputType(object): + RAW_DATA = 1 + COMPRESSED_DATA = 2 + + +class FrameType(object): + ZSTD = 1 + BLOCK = 2 + + +class TargetInfo(object): + def __init__(self, input_type, frame_type=FrameType.ZSTD): + self.input_type = input_type + self.frame_type = frame_type + + # Constants FUZZ_DIR = os.path.abspath(os.path.dirname(__file__)) CORPORA_DIR = abs_join(FUZZ_DIR, 'corpora') -TARGETS = [ - 'simple_round_trip', - 'stream_round_trip', - 'block_round_trip', - 'simple_decompress', - 'stream_decompress', - 'block_decompress', - 'dictionary_round_trip', - 'dictionary_decompress', - 'zstd_frame_info', - 'simple_compress', -] +TARGET_INFO = { + 'simple_round_trip': TargetInfo(InputType.RAW_DATA), + 'stream_round_trip': TargetInfo(InputType.RAW_DATA), + 'block_round_trip': TargetInfo(InputType.RAW_DATA, FrameType.BLOCK), + 'simple_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'stream_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'block_decompress': TargetInfo(InputType.COMPRESSED_DATA, FrameType.BLOCK), + 'dictionary_round_trip': TargetInfo(InputType.RAW_DATA), + 'dictionary_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'zstd_frame_info': TargetInfo(InputType.COMPRESSED_DATA), + 'simple_compress': TargetInfo(InputType.RAW_DATA), +} +TARGETS = list(TARGET_INFO.keys()) ALL_TARGETS = TARGETS + ['all'] FUZZ_RNG_SEED_SIZE = 4 @@ -67,7 +84,7 @@ MSAN_EXTRA_LDFLAGS = 
os.environ.get('MSAN_EXTRA_LDFLAGS', '') def create(r): d = os.path.abspath(r) if not os.path.isdir(d): - os.mkdir(d) + os.makedirs(d) return d @@ -158,7 +175,7 @@ def compiler_version(cc, cxx): assert(b'clang' in cxx_version_bytes) compiler = 'clang' elif b'gcc' in cc_version_bytes: - assert(b'gcc' in cxx_version_bytes) + assert(b'gcc' in cxx_version_bytes or b'g++' in cxx_version_bytes) compiler = 'gcc' if compiler is not None: version_regex = b'([0-9])+\.([0-9])+\.([0-9])+' @@ -699,7 +716,8 @@ def gen(args): '-o{}'.format(decompressed), ] - if 'block_' in args.TARGET: + info = TARGET_INFO[args.TARGET] + if info.frame_type == FrameType.BLOCK: cmd += [ '--gen-blocks', '--max-block-size-log={}'.format(args.max_size_log) @@ -710,10 +728,11 @@ def gen(args): print(' '.join(cmd)) subprocess.check_call(cmd) - if '_round_trip' in args.TARGET: + if info.input_type == InputType.RAW_DATA: print('using decompressed data in {}'.format(decompressed)) samples = decompressed - elif '_decompress' in args.TARGET: + else: + assert info.input_type == InputType.COMPRESSED_DATA print('using compressed data in {}'.format(compressed)) samples = compressed diff --git a/tests/fuzz/fuzz_helpers.h b/tests/fuzz/fuzz_helpers.h index 0cf79d0d..0ee85fc7 100644 --- a/tests/fuzz/fuzz_helpers.h +++ b/tests/fuzz/fuzz_helpers.h @@ -14,6 +14,7 @@ #ifndef FUZZ_HELPERS_H #define FUZZ_HELPERS_H +#include "debug.h" #include "fuzz.h" #include "xxhash.h" #include "zstd.h" diff --git a/tests/fuzz/regression_driver.c b/tests/fuzz/regression_driver.c index 658c685f..e3ebcd5c 100644 --- a/tests/fuzz/regression_driver.c +++ b/tests/fuzz/regression_driver.c @@ -36,6 +36,7 @@ int main(int argc, char const **argv) { fprintf(stderr, "WARNING: No files passed to %s\n", argv[0]); for (i = 0; i < numFiles; ++i) { char const *fileName = files[i]; + DEBUGLOG(3, "Running %s", fileName); size_t const fileSize = UTIL_getFileSize(fileName); size_t readSize; FILE *file; diff --git a/tests/fuzz/zstd_helpers.c 
b/tests/fuzz/zstd_helpers.c index 9dff2895..5ff057b8 100644 --- a/tests/fuzz/zstd_helpers.c +++ b/tests/fuzz/zstd_helpers.c @@ -90,6 +90,9 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, uint32_t *state) setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, state); setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, state); setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, state); + if (FUZZ_rand32(state, 0, 1) == 0) { + setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, state); + } } FUZZ_dict_t FUZZ_train(void const* src, size_t srcSize, uint32_t *state) diff --git a/tests/playTests.sh b/tests/playTests.sh index 69387321..ad096fdd 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -108,7 +108,6 @@ else fi - println "\n===> simple tests " ./datagen > tmp @@ -409,6 +408,53 @@ println "compress multiple files including a missing one (notHere) : " $ZSTD -f tmp1 notHere tmp2 && die "missing file not detected!" +println "\n===> stream-size mode" + +./datagen -g11000 > tmp +println "test : basic file compression vs sized streaming compression" +file_size=$($ZSTD -14 -f tmp -o tmp.zst && wc -c < tmp.zst) +stream_size=$(cat tmp | $ZSTD -14 --stream-size=11000 | wc -c) +if [ "$stream_size" -gt "$file_size" ]; then + die "hinted compression larger than expected" +fi +println "test : sized streaming compression and decompression" +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11000 +$ZSTD -df tmp.zst -o tmp_decompress +cmp tmp tmp_decompress || die "difference between original and decompressed file" +println "test : incorrect stream size" +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size" + + +println "\n===> size-hint mode" + +./datagen -g11000 > tmp +./datagen -g11000 > tmp2 +./datagen > tmpDict +println "test : basic file compression vs hinted streaming compression" +file_size=$($ZSTD -14 -f tmp -o tmp.zst && wc -c < tmp.zst) +stream_size=$(cat tmp | $ZSTD -14 --size-hint=11000 | wc -c) 
+if [ "$stream_size" -ge "$file_size" ]; then + die "hinted compression larger than expected" +fi +println "test : hinted streaming compression and decompression" +cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=11000 +$ZSTD -df tmp.zst -o tmp_decompress +cmp tmp tmp_decompress || die "difference between original and decompressed file" +println "test : hinted streaming compression with dictionary" +cat tmp | $ZSTD -14 -f -D tmpDict --size-hint=11000 | $ZSTD -t -D tmpDict +println "test : multiple file compression with hints and dictionary" +$ZSTD -14 -f -D tmpDict --size-hint=11000 tmp tmp2 +$ZSTD -14 -f -o tmp1_.zst -D tmpDict --size-hint=11000 tmp +$ZSTD -14 -f -o tmp2_.zst -D tmpDict --size-hint=11000 tmp2 +cmp tmp.zst tmp1_.zst || die "first file's output differs" +cmp tmp2.zst tmp2_.zst || die "second file's output differs" +println "test : incorrect hinted stream sizes" +cat tmp | $ZSTD -14 -f --size-hint=11050 | $ZSTD -t # slightly too high +cat tmp | $ZSTD -14 -f --size-hint=10950 | $ZSTD -t # slightly too low +cat tmp | $ZSTD -14 -f --size-hint=22000 | $ZSTD -t # considerably too high +cat tmp | $ZSTD -14 -f --size-hint=5500 | $ZSTD -t # considerably too low + + println "\n===> dictionary tests " println "- test with raw dict (content only) " diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c index 56f16766..9af08ebe 100644 --- a/tests/zstreamtest.c +++ b/tests/zstreamtest.c @@ -2106,6 +2106,7 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest, if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmMinMatch, FUZ_randomClampedLength(&lseed, ZSTD_LDM_MINMATCH_MIN, ZSTD_LDM_MINMATCH_MAX), opaqueAPI) ); if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmBucketSizeLog, FUZ_randomClampedLength(&lseed, ZSTD_LDM_BUCKETSIZELOG_MIN, ZSTD_LDM_BUCKETSIZELOG_MAX), opaqueAPI) ); if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmHashRateLog, FUZ_randomClampedLength(&lseed, 
ZSTD_LDM_HASHRATELOG_MIN, ZSTD_LDM_HASHRATELOG_MAX), opaqueAPI) ); + if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_srcSizeHint, FUZ_randomClampedLength(&lseed, ZSTD_SRCSIZEHINT_MIN, ZSTD_SRCSIZEHINT_MAX), opaqueAPI) ); } /* mess with frame parameters */