Merge pull request #1733 from nmagerko/size-hint

Add --size-hint=# option
This commit is contained in:
Nick Terrell 2019-08-23 10:16:10 -07:00 committed by GitHub
commit d0750a1c9c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 91 additions and 2 deletions

View File

@ -392,6 +392,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX;
return bounds;
case ZSTD_c_srcSizeHint:
bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN;
bounds.upperBound = ZSTD_SRCSIZEHINT_MAX;
return bounds;
default:
{ ZSTD_bounds const boundError = { ERROR(parameter_unsupported), 0, 0 };
return boundError;
@ -448,6 +453,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_forceAttachDict:
case ZSTD_c_literalCompressionMode:
case ZSTD_c_targetCBlockSize:
case ZSTD_c_srcSizeHint:
default:
return 0;
}
@ -494,6 +500,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
case ZSTD_c_ldmMinMatch:
case ZSTD_c_ldmBucketSizeLog:
case ZSTD_c_targetCBlockSize:
case ZSTD_c_srcSizeHint:
break;
default: RETURN_ERROR(parameter_unsupported);
@ -674,6 +681,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
CCtxParams->targetCBlockSize = value;
return CCtxParams->targetCBlockSize;
case ZSTD_c_srcSizeHint :
if (value!=0) /* 0 ==> default */
BOUNDCHECK(ZSTD_c_srcSizeHint, value);
CCtxParams->srcSizeHint = value;
return CCtxParams->srcSizeHint;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
}
@ -779,6 +792,9 @@ size_t ZSTD_CCtxParams_getParameter(
case ZSTD_c_targetCBlockSize :
*value = (int)CCtxParams->targetCBlockSize;
break;
case ZSTD_c_srcSizeHint :
*value = (int)CCtxParams->srcSizeHint;
break;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
return 0;
@ -1029,7 +1045,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
{
ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
ZSTD_compressionParameters cParams;
if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
srcSizeHint = CCtxParams->srcSizeHint;
}
cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;

View File

@ -203,6 +203,9 @@ struct ZSTD_CCtx_params_s {
size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize.
* No target when targetCBlockSize == 0.
* There is no guarantee on compressed block size */
int srcSizeHint; /* User's best guess of source size.
* Hint is not valid when srcSizeHint == 0.
* There is no guarantee that hint is close to actual source size */
ZSTD_dictAttachPref_e attachDictPref;
ZSTD_literalCompressionMode_e literalCompressionMode;

View File

@ -15,6 +15,7 @@ extern "C" {
#define ZSTD_H_235446
/* ====== Dependency ======*/
#include <limits.h> /* INT_MAX */
#include <stddef.h> /* size_t */
@ -386,6 +387,7 @@ typedef enum {
* ZSTD_c_forceAttachDict
* ZSTD_c_literalCompressionMode
* ZSTD_c_targetCBlockSize
* ZSTD_c_srcSizeHint
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
* note : never ever use experimentalParam? names directly;
* also, the enums values themselves are unstable and can still change.
@ -396,6 +398,7 @@ typedef enum {
ZSTD_c_experimentalParam4=1001,
ZSTD_c_experimentalParam5=1002,
ZSTD_c_experimentalParam6=1003,
ZSTD_c_experimentalParam7=1004,
} ZSTD_cParameter;
typedef struct {
@ -1063,6 +1066,8 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
/* Advanced parameter bounds */
#define ZSTD_TARGETCBLOCKSIZE_MIN 64
#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
#define ZSTD_SRCSIZEHINT_MIN 0
#define ZSTD_SRCSIZEHINT_MAX INT_MAX
/* internal */
#define ZSTD_HASHLOG3_MAX 17
@ -1441,6 +1446,12 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
* There is no guarantee on compressed block size (default:0) */
#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
/* User's best guess of source size.
* Hint is not valid when srcSizeHint == 0.
* There is no guarantee that hint is close to actual source size,
* but compression ratio may regress significantly if guess considerably underestimates */
#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
/*! ZSTD_CCtx_getParameter() :
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
* and store it into int* value.

View File

@ -30,6 +30,7 @@
#include <string.h> /* strcmp, strlen */
#include <assert.h>
#include <errno.h> /* errno */
#include <limits.h> /* INT_MAX */
#include <signal.h>
#include "timefn.h" /* UTIL_getTime, UTIL_clockSpanMicro */
@ -306,6 +307,7 @@ struct FIO_prefs_s {
int ldmHashRateLog;
size_t streamSrcSize;
size_t targetCBlockSize;
int srcSizeHint;
ZSTD_literalCompressionMode_e literalCompressionMode;
/* IO preferences */
@ -352,6 +354,7 @@ FIO_prefs_t* FIO_createPreferences(void)
ret->ldmHashRateLog = FIO_LDM_PARAM_NOTSET;
ret->streamSrcSize = 0;
ret->targetCBlockSize = 0;
ret->srcSizeHint = 0;
ret->literalCompressionMode = ZSTD_lcm_auto;
return ret;
}
@ -428,6 +431,10 @@ void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize)
prefs->targetCBlockSize = targetCBlockSize;
}
/* FIO_setSrcSizeHint() :
 * Records the caller's estimate of the source size in `prefs`.
 * The preference field is an `int`, so any estimate larger than INT_MAX
 * is stored as INT_MAX; smaller values are stored unchanged. */
void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint) {
    size_t const hintCeiling = (size_t)INT_MAX;
    if (srcSizeHint > hintCeiling) {
        prefs->srcSizeHint = INT_MAX;
    } else {
        prefs->srcSizeHint = (int)srcSizeHint;
    }
}
void FIO_setLiteralCompressionMode(
FIO_prefs_t* const prefs,
ZSTD_literalCompressionMode_e mode) {
@ -672,6 +679,8 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_compressionLevel, cLevel) );
/* max compressed block size */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetCBlockSize, (int)prefs->targetCBlockSize) );
/* source size hint */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_srcSizeHint, (int)prefs->srcSizeHint) );
/* long distance matching */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableLongDistanceMatching, prefs->ldmFlag) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashLog, prefs->ldmHashLog) );

View File

@ -73,6 +73,7 @@ void FIO_setSparseWrite(FIO_prefs_t* const prefs, unsigned sparse); /**< 0: no
void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable);
void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize);
void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize);
void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint);
void FIO_setLiteralCompressionMode(
FIO_prefs_t* const prefs,
ZSTD_literalCompressionMode_e mode);

View File

@ -149,6 +149,13 @@ the last one takes effect.
will be included in the produced frame header. Incorrect stream sizes will cause an error.
This information will be used to better optimize compression parameters, resulting in
better and potentially faster compression, especially for smaller source sizes.
* `--size-hint=#`:
When handling input from a stream, `zstd` must guess how large the source size
will be when optimizing compression parameters. If the stream size is relatively
small, this guess may be a poor one, resulting in a worse compression ratio than
expected. This feature allows for controlling the guess when needed.
Exact guesses result in better compression ratios. Overestimates result in slightly
degraded compression ratios, while underestimates may result in significant degradation.
* `--rsyncable` :
`zstd` will periodically synchronize the compression state to make the
compressed file more rsync-friendly. There is a negligible impact to

View File

@ -142,6 +142,7 @@ static int usage_advanced(const char* programName)
DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1);
DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n");
DISPLAY( "--stream-size=# : optimize compression parameters for streaming input of given number of bytes \n");
DISPLAY( "--size-hint=# optimize compression parameters for streaming input of approximately this size\n");
DISPLAY( "--target-compressed-block-size=# : make compressed block near targeted size \n");
#ifdef ZSTD_MULTITHREAD
DISPLAY( " -T# : spawns # compression threads (default: 1, 0==# cores) \n");
@ -591,6 +592,7 @@ int main(int argCount, const char* argv[])
unsigned dictID = 0;
size_t streamSrcSize = 0;
size_t targetCBlockSize = 0;
size_t srcSizeHint = 0;
int dictCLevel = g_defaultDictCLevel;
unsigned dictSelect = g_defaultSelectivityLevel;
#ifdef UTIL_HAS_CREATEFILELIST
@ -749,6 +751,7 @@ int main(int argCount, const char* argv[])
if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; }
if (longCommandWArg(&argument, "--stream-size=")) { streamSrcSize = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--size-hint=")) { srcSizeHint = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--long")) {
unsigned ldmWindowLog = 0;
ldmFlag = 1;
@ -1155,6 +1158,7 @@ int main(int argCount, const char* argv[])
FIO_setRsyncable(prefs, rsyncable);
FIO_setStreamSrcSize(prefs, streamSrcSize);
FIO_setTargetCBlockSize(prefs, targetCBlockSize);
FIO_setSrcSizeHint(prefs, srcSizeHint);
FIO_setLiteralCompressionMode(prefs, literalCompressionMode);
if (adaptMin > cLevel) cLevel = adaptMin;
if (adaptMax < cLevel) cLevel = adaptMax;
@ -1164,7 +1168,7 @@ int main(int argCount, const char* argv[])
else
operationResult = FIO_compressMultipleFilenames(prefs, filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams);
#else
(void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)streamSrcSize; (void)targetCBlockSize; /* not used when ZSTD_NOCOMPRESS set */
(void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; /* not used when ZSTD_NOCOMPRESS set */
DISPLAY("Compression not supported \n");
#endif
} else { /* decompression or test */

View File

@ -90,6 +90,9 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, uint32_t *state)
setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, state);
setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, state);
setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, state);
if (FUZZ_rand32(state, 0, 1) == 0) {
setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, state);
}
}
FUZZ_dict_t FUZZ_train(void const* src, size_t srcSize, uint32_t *state)

View File

@ -425,6 +425,36 @@ println "test : incorrect stream size"
cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size"
# --size-hint tests : exercise the CLI's --size-hint=# option on piped (stdin) and file input.
println "\n===> size-hint mode"
# Fixtures : tmp and tmp2 come from identical datagen invocations (-g11000, presumably
# 11000 bytes each -- confirm against datagen); tmpDict is datagen's default output,
# passed below via -D.
./datagen -g11000 > tmp
./datagen -g11000 > tmp2
./datagen > tmpDict
# Compare compressed sizes : file input (tmp -> tmp.zst) vs. the same data piped on
# stdin with --size-hint=11000.
println "test : basic file compression vs hinted streaming compression"
file_size=$($ZSTD -14 -f tmp -o tmp.zst && wc -c < tmp.zst)
stream_size=$(cat tmp | $ZSTD -14 --size-hint=11000 | wc -c)
# -ge : the test dies when the hinted stream output is larger than OR EQUAL TO
# the file-mode output.
if [ "$stream_size" -ge "$file_size" ]; then
die "hinted compression larger than expected"
fi
# Round trip : compress from stdin with a hint, decompress, and require a byte-identical result.
println "test : hinted streaming compression and decompression"
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=11000
$ZSTD -df tmp.zst -o tmp_decompress
cmp tmp tmp_decompress || die "difference between original and decompressed file"
# Hint combined with a dictionary : the compressed stream is piped straight into `-t`
# using the same dictionary.
println "test : hinted streaming compression with dictionary"
cat tmp | $ZSTD -14 -f -D tmpDict --size-hint=11000 | $ZSTD -t -D tmpDict
# Multi-file invocation vs. one invocation per file, all with the same hint and dictionary.
# The multi-file run presumably writes tmp.zst and tmp2.zst (the names the cmp lines read);
# each must be byte-identical to its single-file counterpart.
println "test : multiple file compression with hints and dictionary"
$ZSTD -14 -f -D tmpDict --size-hint=11000 tmp tmp2
$ZSTD -14 -f -o tmp1_.zst -D tmpDict --size-hint=11000 tmp
$ZSTD -14 -f -o tmp2_.zst -D tmpDict --size-hint=11000 tmp2
cmp tmp.zst tmp1_.zst || die "first file's output differs"
cmp tmp2.zst tmp2_.zst || die "second file's output differs"
# Hints that differ from the 11000 used above : each output is piped into `-t`.
# NOTE(review): these pipelines have no `|| die`, so whether a failure aborts the
# script depends on shell options set outside this section -- confirm.
println "test : incorrect hinted stream sizes"
cat tmp | $ZSTD -14 -f --size-hint=11050 | $ZSTD -t # slightly too high
cat tmp | $ZSTD -14 -f --size-hint=10950 | $ZSTD -t # slightly too low
cat tmp | $ZSTD -14 -f --size-hint=22000 | $ZSTD -t # considerably too high
cat tmp | $ZSTD -14 -f --size-hint=5500 | $ZSTD -t # considerably too low
println "\n===> dictionary tests "
println "- test with raw dict (content only) "

View File

@ -2106,6 +2106,7 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest,
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmMinMatch, FUZ_randomClampedLength(&lseed, ZSTD_LDM_MINMATCH_MIN, ZSTD_LDM_MINMATCH_MAX), opaqueAPI) );
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmBucketSizeLog, FUZ_randomClampedLength(&lseed, ZSTD_LDM_BUCKETSIZELOG_MIN, ZSTD_LDM_BUCKETSIZELOG_MAX), opaqueAPI) );
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmHashRateLog, FUZ_randomClampedLength(&lseed, ZSTD_LDM_HASHRATELOG_MIN, ZSTD_LDM_HASHRATELOG_MAX), opaqueAPI) );
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_srcSizeHint, FUZ_randomClampedLength(&lseed, ZSTD_SRCSIZEHINT_MIN, ZSTD_SRCSIZEHINT_MAX), opaqueAPI) );
}
/* mess with frame parameters */