Merge pull request #626 from facebook/stricterDictBuilder
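dictBuilder now reports an error instead of silently succeeding when it cannot create a dictionary from the provided input (not enough sample data, or samples that are pure noise)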
This commit is contained in:
commit
ebe9963cf6
@ -37,6 +37,7 @@ const char* ERR_getErrorString(ERR_enum code)
|
||||
case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
|
||||
case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
|
||||
case PREFIX(dictionary_wrong): return "Dictionary mismatch";
|
||||
case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
|
||||
case PREFIX(maxCode):
|
||||
default: return notErrorCode;
|
||||
}
|
||||
|
@ -57,6 +57,7 @@ typedef enum {
|
||||
ZSTD_error_maxSymbolValue_tooSmall,
|
||||
ZSTD_error_dictionary_corrupted,
|
||||
ZSTD_error_dictionary_wrong,
|
||||
ZSTD_error_dictionaryCreation_failed,
|
||||
ZSTD_error_maxCode
|
||||
} ZSTD_ErrorCode;
|
||||
|
||||
|
@ -62,8 +62,9 @@
|
||||
#define MINRATIO 4
|
||||
static const int g_compressionLevel_default = 6;
|
||||
static const U32 g_selectivity_default = 9;
|
||||
static const size_t g_provision_entropySize = 200;
|
||||
static const size_t g_provision_entropySize = 192;
|
||||
static const size_t g_min_fast_dictContent = 192;
|
||||
static const size_t g_dictContentSize_min = 32;
|
||||
|
||||
|
||||
/*-*************************************
|
||||
@ -929,8 +930,8 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
||||
|
||||
/* checks */
|
||||
if (!dictList) return ERROR(memory_allocation);
|
||||
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
|
||||
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
||||
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
|
||||
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
|
||||
|
||||
/* init */
|
||||
ZDICT_initDictItem(dictList);
|
||||
@ -963,6 +964,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
||||
|
||||
/* create dictionary */
|
||||
{ U32 dictContentSize = ZDICT_dictSize(dictList);
|
||||
if (dictContentSize < g_dictContentSize_min) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
|
||||
if (dictContentSize < targetDictSize/3) {
|
||||
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
|
||||
if (minRep > MINRATIO) {
|
||||
|
@ -629,7 +629,7 @@ int main(int argCount, const char* argv[])
|
||||
coverParams.compressionLevel = dictCLevel;
|
||||
coverParams.notificationLevel = g_displayLevel;
|
||||
coverParams.dictID = dictID;
|
||||
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
|
||||
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
|
||||
} else {
|
||||
ZDICT_params_t dictParams;
|
||||
memset(&dictParams, 0, sizeof(dictParams));
|
||||
@ -637,7 +637,7 @@ int main(int argCount, const char* argv[])
|
||||
dictParams.selectivityLevel = dictSelect;
|
||||
dictParams.notificationLevel = g_displayLevel;
|
||||
dictParams.dictID = dictID;
|
||||
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
|
||||
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
|
||||
}
|
||||
#endif
|
||||
goto _end;
|
||||
|
@ -281,6 +281,11 @@ case "$UNAME" in
|
||||
*) $MD5SUM -c tmph1 ;;
|
||||
esac
|
||||
rm -rf dirTestDict
|
||||
$ECHO "- dictionary builder on bogus input"
|
||||
$ECHO "Hello World" > tmp
|
||||
$ZSTD --train -q tmp && die "Dictionary training should fail : not enough input source"
|
||||
./datagen -P0 -g10M > tmp
|
||||
$ZSTD --train -q tmp && die "Dictionary training should fail : source is pure noise"
|
||||
rm tmp*
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user