Merge pull request #1733 from nmagerko/size-hint
Add --size-hint=# option
This commit is contained in:
commit
d0750a1c9c
@ -392,6 +392,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
|
||||
bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX;
|
||||
return bounds;
|
||||
|
||||
case ZSTD_c_srcSizeHint:
|
||||
bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN;
|
||||
bounds.upperBound = ZSTD_SRCSIZEHINT_MAX;
|
||||
return bounds;
|
||||
|
||||
default:
|
||||
{ ZSTD_bounds const boundError = { ERROR(parameter_unsupported), 0, 0 };
|
||||
return boundError;
|
||||
@ -448,6 +453,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
|
||||
case ZSTD_c_forceAttachDict:
|
||||
case ZSTD_c_literalCompressionMode:
|
||||
case ZSTD_c_targetCBlockSize:
|
||||
case ZSTD_c_srcSizeHint:
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
@ -494,6 +500,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
|
||||
case ZSTD_c_ldmMinMatch:
|
||||
case ZSTD_c_ldmBucketSizeLog:
|
||||
case ZSTD_c_targetCBlockSize:
|
||||
case ZSTD_c_srcSizeHint:
|
||||
break;
|
||||
|
||||
default: RETURN_ERROR(parameter_unsupported);
|
||||
@ -674,6 +681,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
|
||||
CCtxParams->targetCBlockSize = value;
|
||||
return CCtxParams->targetCBlockSize;
|
||||
|
||||
case ZSTD_c_srcSizeHint :
|
||||
if (value!=0) /* 0 ==> default */
|
||||
BOUNDCHECK(ZSTD_c_srcSizeHint, value);
|
||||
CCtxParams->srcSizeHint = value;
|
||||
return CCtxParams->srcSizeHint;
|
||||
|
||||
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
|
||||
}
|
||||
}
|
||||
@ -779,6 +792,9 @@ size_t ZSTD_CCtxParams_getParameter(
|
||||
case ZSTD_c_targetCBlockSize :
|
||||
*value = (int)CCtxParams->targetCBlockSize;
|
||||
break;
|
||||
case ZSTD_c_srcSizeHint :
|
||||
*value = (int)CCtxParams->srcSizeHint;
|
||||
break;
|
||||
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
|
||||
}
|
||||
return 0;
|
||||
@ -1029,7 +1045,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
|
||||
ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
|
||||
const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
|
||||
{
|
||||
ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
|
||||
ZSTD_compressionParameters cParams;
|
||||
if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
|
||||
srcSizeHint = CCtxParams->srcSizeHint;
|
||||
}
|
||||
cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
|
||||
if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
|
||||
if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
|
||||
if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;
|
||||
|
@ -203,6 +203,9 @@ struct ZSTD_CCtx_params_s {
|
||||
size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize.
|
||||
* No target when targetCBlockSize == 0.
|
||||
* There is no guarantee on compressed block size */
|
||||
int srcSizeHint; /* User's best guess of source size.
|
||||
* Hint is not valid when srcSizeHint == 0.
|
||||
* There is no guarantee that hint is close to actual source size */
|
||||
|
||||
ZSTD_dictAttachPref_e attachDictPref;
|
||||
ZSTD_literalCompressionMode_e literalCompressionMode;
|
||||
|
11
lib/zstd.h
11
lib/zstd.h
@ -15,6 +15,7 @@ extern "C" {
|
||||
#define ZSTD_H_235446
|
||||
|
||||
/* ====== Dependency ======*/
|
||||
#include <limits.h> /* INT_MAX */
|
||||
#include <stddef.h> /* size_t */
|
||||
|
||||
|
||||
@ -386,6 +387,7 @@ typedef enum {
|
||||
* ZSTD_c_forceAttachDict
|
||||
* ZSTD_c_literalCompressionMode
|
||||
* ZSTD_c_targetCBlockSize
|
||||
* ZSTD_c_srcSizeHint
|
||||
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
|
||||
* note : never ever use experimentalParam? names directly;
|
||||
* also, the enums values themselves are unstable and can still change.
|
||||
@ -396,6 +398,7 @@ typedef enum {
|
||||
ZSTD_c_experimentalParam4=1001,
|
||||
ZSTD_c_experimentalParam5=1002,
|
||||
ZSTD_c_experimentalParam6=1003,
|
||||
ZSTD_c_experimentalParam7=1004,
|
||||
} ZSTD_cParameter;
|
||||
|
||||
typedef struct {
|
||||
@ -1063,6 +1066,8 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
|
||||
/* Advanced parameter bounds */
|
||||
#define ZSTD_TARGETCBLOCKSIZE_MIN 64
|
||||
#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
|
||||
#define ZSTD_SRCSIZEHINT_MIN 0
|
||||
#define ZSTD_SRCSIZEHINT_MAX INT_MAX
|
||||
|
||||
/* internal */
|
||||
#define ZSTD_HASHLOG3_MAX 17
|
||||
@ -1441,6 +1446,12 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
|
||||
* There is no guarantee on compressed block size (default:0) */
|
||||
#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
|
||||
|
||||
/* User's best guess of source size.
|
||||
* Hint is not valid when srcSizeHint == 0.
|
||||
* There is no guarantee that hint is close to actual source size,
|
||||
* but compression ratio may regress significantly if guess considerably underestimates */
|
||||
#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
|
||||
|
||||
/*! ZSTD_CCtx_getParameter() :
|
||||
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
|
||||
* and store it into int* value.
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include <string.h> /* strcmp, strlen */
|
||||
#include <assert.h>
|
||||
#include <errno.h> /* errno */
|
||||
#include <limits.h> /* INT_MAX */
|
||||
#include <signal.h>
|
||||
#include "timefn.h" /* UTIL_getTime, UTIL_clockSpanMicro */
|
||||
|
||||
@ -306,6 +307,7 @@ struct FIO_prefs_s {
|
||||
int ldmHashRateLog;
|
||||
size_t streamSrcSize;
|
||||
size_t targetCBlockSize;
|
||||
int srcSizeHint;
|
||||
ZSTD_literalCompressionMode_e literalCompressionMode;
|
||||
|
||||
/* IO preferences */
|
||||
@ -352,6 +354,7 @@ FIO_prefs_t* FIO_createPreferences(void)
|
||||
ret->ldmHashRateLog = FIO_LDM_PARAM_NOTSET;
|
||||
ret->streamSrcSize = 0;
|
||||
ret->targetCBlockSize = 0;
|
||||
ret->srcSizeHint = 0;
|
||||
ret->literalCompressionMode = ZSTD_lcm_auto;
|
||||
return ret;
|
||||
}
|
||||
@ -428,6 +431,10 @@ void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize)
|
||||
prefs->targetCBlockSize = targetCBlockSize;
|
||||
}
|
||||
|
||||
/* FIO_setSrcSizeHint() :
 * Records the caller's source-size hint in `prefs->srcSizeHint`.
 * The preference field is an `int`, while the hint arrives as a `size_t`,
 * so any value above INT_MAX is clamped to INT_MAX before the narrowing cast
 * (this avoids an implementation-defined / negative result on conversion). */
void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint) {
|
||||
prefs->srcSizeHint = (int)MIN((size_t)INT_MAX, srcSizeHint);
|
||||
}
|
||||
|
||||
void FIO_setLiteralCompressionMode(
|
||||
FIO_prefs_t* const prefs,
|
||||
ZSTD_literalCompressionMode_e mode) {
|
||||
@ -672,6 +679,8 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
|
||||
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_compressionLevel, cLevel) );
|
||||
/* max compressed block size */
|
||||
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetCBlockSize, (int)prefs->targetCBlockSize) );
|
||||
/* source size hint */
|
||||
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_srcSizeHint, (int)prefs->srcSizeHint) );
|
||||
/* long distance matching */
|
||||
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableLongDistanceMatching, prefs->ldmFlag) );
|
||||
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashLog, prefs->ldmHashLog) );
|
||||
|
@ -73,6 +73,7 @@ void FIO_setSparseWrite(FIO_prefs_t* const prefs, unsigned sparse); /**< 0: no
|
||||
void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable);
|
||||
void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize);
|
||||
void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize);
|
||||
void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint);
|
||||
void FIO_setLiteralCompressionMode(
|
||||
FIO_prefs_t* const prefs,
|
||||
ZSTD_literalCompressionMode_e mode);
|
||||
|
@ -149,6 +149,13 @@ the last one takes effect.
|
||||
will be included in the produced frame header. Incorrect stream sizes will cause an error.
|
||||
This information will be used to better optimize compression parameters, resulting in
|
||||
better and potentially faster compression, especially for smaller source sizes.
|
||||
* `--size-hint=#`:
|
||||
When handling input from a stream, `zstd` must guess how large the source size
|
||||
will be when optimizing compression parameters. If the stream size is relatively
|
||||
small, this guess may be a poor one, resulting in a lower compression ratio than
|
||||
expected. This feature allows for controlling the guess when needed.
|
||||
Exact guesses result in better compression ratios. Overestimates result in slightly
|
||||
degraded compression ratios, while underestimates may result in significant degradation.
|
||||
* `--rsyncable` :
|
||||
`zstd` will periodically synchronize the compression state to make the
|
||||
compressed file more rsync-friendly. There is a negligible impact to
|
||||
|
@ -142,6 +142,7 @@ static int usage_advanced(const char* programName)
|
||||
DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1);
|
||||
DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n");
|
||||
DISPLAY( "--stream-size=# : optimize compression parameters for streaming input of given number of bytes \n");
|
||||
DISPLAY( "--size-hint=# optimize compression parameters for streaming input of approximately this size\n");
|
||||
DISPLAY( "--target-compressed-block-size=# : make compressed block near targeted size \n");
|
||||
#ifdef ZSTD_MULTITHREAD
|
||||
DISPLAY( " -T# : spawns # compression threads (default: 1, 0==# cores) \n");
|
||||
@ -591,6 +592,7 @@ int main(int argCount, const char* argv[])
|
||||
unsigned dictID = 0;
|
||||
size_t streamSrcSize = 0;
|
||||
size_t targetCBlockSize = 0;
|
||||
size_t srcSizeHint = 0;
|
||||
int dictCLevel = g_defaultDictCLevel;
|
||||
unsigned dictSelect = g_defaultSelectivityLevel;
|
||||
#ifdef UTIL_HAS_CREATEFILELIST
|
||||
@ -749,6 +751,7 @@ int main(int argCount, const char* argv[])
|
||||
if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; }
|
||||
if (longCommandWArg(&argument, "--stream-size=")) { streamSrcSize = readU32FromChar(&argument); continue; }
|
||||
if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; }
|
||||
if (longCommandWArg(&argument, "--size-hint=")) { srcSizeHint = readU32FromChar(&argument); continue; }
|
||||
if (longCommandWArg(&argument, "--long")) {
|
||||
unsigned ldmWindowLog = 0;
|
||||
ldmFlag = 1;
|
||||
@ -1155,6 +1158,7 @@ int main(int argCount, const char* argv[])
|
||||
FIO_setRsyncable(prefs, rsyncable);
|
||||
FIO_setStreamSrcSize(prefs, streamSrcSize);
|
||||
FIO_setTargetCBlockSize(prefs, targetCBlockSize);
|
||||
FIO_setSrcSizeHint(prefs, srcSizeHint);
|
||||
FIO_setLiteralCompressionMode(prefs, literalCompressionMode);
|
||||
if (adaptMin > cLevel) cLevel = adaptMin;
|
||||
if (adaptMax < cLevel) cLevel = adaptMax;
|
||||
@ -1164,7 +1168,7 @@ int main(int argCount, const char* argv[])
|
||||
else
|
||||
operationResult = FIO_compressMultipleFilenames(prefs, filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams);
|
||||
#else
|
||||
(void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)streamSrcSize; (void)targetCBlockSize; /* not used when ZSTD_NOCOMPRESS set */
|
||||
(void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; /* not used when ZSTD_NOCOMPRESS set */
|
||||
DISPLAY("Compression not supported \n");
|
||||
#endif
|
||||
} else { /* decompression or test */
|
||||
|
@ -90,6 +90,9 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, uint32_t *state)
|
||||
setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, state);
|
||||
setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, state);
|
||||
setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, state);
|
||||
if (FUZZ_rand32(state, 0, 1) == 0) {
|
||||
setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, state);
|
||||
}
|
||||
}
|
||||
|
||||
FUZZ_dict_t FUZZ_train(void const* src, size_t srcSize, uint32_t *state)
|
||||
|
@ -425,6 +425,36 @@ println "test : incorrect stream size"
|
||||
cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size"
|
||||
|
||||
|
||||
println "\n===> size-hint mode"
|
||||
|
||||
./datagen -g11000 > tmp
|
||||
./datagen -g11000 > tmp2
|
||||
./datagen > tmpDict
|
||||
println "test : basic file compression vs hinted streaming compression"
|
||||
file_size=$($ZSTD -14 -f tmp -o tmp.zst && wc -c < tmp.zst)
|
||||
stream_size=$(cat tmp | $ZSTD -14 --size-hint=11000 | wc -c)
|
||||
if [ "$stream_size" -ge "$file_size" ]; then
|
||||
die "hinted compression larger than expected"
|
||||
fi
|
||||
println "test : hinted streaming compression and decompression"
|
||||
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=11000
|
||||
$ZSTD -df tmp.zst -o tmp_decompress
|
||||
cmp tmp tmp_decompress || die "difference between original and decompressed file"
|
||||
println "test : hinted streaming compression with dictionary"
|
||||
cat tmp | $ZSTD -14 -f -D tmpDict --size-hint=11000 | $ZSTD -t -D tmpDict
|
||||
println "test : multiple file compression with hints and dictionary"
|
||||
$ZSTD -14 -f -D tmpDict --size-hint=11000 tmp tmp2
|
||||
$ZSTD -14 -f -o tmp1_.zst -D tmpDict --size-hint=11000 tmp
|
||||
$ZSTD -14 -f -o tmp2_.zst -D tmpDict --size-hint=11000 tmp2
|
||||
cmp tmp.zst tmp1_.zst || die "first file's output differs"
|
||||
cmp tmp2.zst tmp2_.zst || die "second file's output differs"
|
||||
println "test : incorrect hinted stream sizes"
|
||||
cat tmp | $ZSTD -14 -f --size-hint=11050 | $ZSTD -t # slightly too high
|
||||
cat tmp | $ZSTD -14 -f --size-hint=10950 | $ZSTD -t # slightly too low
|
||||
cat tmp | $ZSTD -14 -f --size-hint=22000 | $ZSTD -t # considerably too high
|
||||
cat tmp | $ZSTD -14 -f --size-hint=5500 | $ZSTD -t # considerably too low
|
||||
|
||||
|
||||
println "\n===> dictionary tests "
|
||||
|
||||
println "- test with raw dict (content only) "
|
||||
|
@ -2106,6 +2106,7 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest,
|
||||
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmMinMatch, FUZ_randomClampedLength(&lseed, ZSTD_LDM_MINMATCH_MIN, ZSTD_LDM_MINMATCH_MAX), opaqueAPI) );
|
||||
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmBucketSizeLog, FUZ_randomClampedLength(&lseed, ZSTD_LDM_BUCKETSIZELOG_MIN, ZSTD_LDM_BUCKETSIZELOG_MAX), opaqueAPI) );
|
||||
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmHashRateLog, FUZ_randomClampedLength(&lseed, ZSTD_LDM_HASHRATELOG_MIN, ZSTD_LDM_HASHRATELOG_MAX), opaqueAPI) );
|
||||
if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_srcSizeHint, FUZ_randomClampedLength(&lseed, ZSTD_SRCSIZEHINT_MIN, ZSTD_SRCSIZEHINT_MAX), opaqueAPI) );
|
||||
}
|
||||
|
||||
/* mess with frame parameters */
|
||||
|
Loading…
Reference in New Issue
Block a user