Merge pull request #626 from facebook/stricterDictBuilder

dictBuilder fails to create dictionary on certain input
This commit is contained in:
Yann Collet 2017-03-24 14:27:28 -07:00 committed by GitHub
commit ebe9963cf6
5 changed files with 14 additions and 5 deletions

View File

@ -37,6 +37,7 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
case PREFIX(dictionary_wrong): return "Dictionary mismatch";
case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
case PREFIX(maxCode):
default: return notErrorCode;
}

View File

@ -57,6 +57,7 @@ typedef enum {
ZSTD_error_maxSymbolValue_tooSmall,
ZSTD_error_dictionary_corrupted,
ZSTD_error_dictionary_wrong,
ZSTD_error_dictionaryCreation_failed,
ZSTD_error_maxCode
} ZSTD_ErrorCode;

View File

@ -62,8 +62,9 @@
#define MINRATIO 4
static const int g_compressionLevel_default = 6;
static const U32 g_selectivity_default = 9;
static const size_t g_provision_entropySize = 200;
static const size_t g_provision_entropySize = 192;
static const size_t g_min_fast_dictContent = 192;
static const size_t g_dictContentSize_min = 32;
/*-*************************************
@ -929,8 +930,8 @@ size_t ZDICT_trainFromBuffer_unsafe(
/* checks */
if (!dictList) return ERROR(memory_allocation);
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
/* init */
ZDICT_initDictItem(dictList);
@ -963,6 +964,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
/* create dictionary */
{ U32 dictContentSize = ZDICT_dictSize(dictList);
if (dictContentSize < g_dictContentSize_min) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
if (dictContentSize < targetDictSize/3) {
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
if (minRep > MINRATIO) {

View File

@ -629,7 +629,7 @@ int main(int argCount, const char* argv[])
coverParams.compressionLevel = dictCLevel;
coverParams.notificationLevel = g_displayLevel;
coverParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
} else {
ZDICT_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
@ -637,7 +637,7 @@ int main(int argCount, const char* argv[])
dictParams.selectivityLevel = dictSelect;
dictParams.notificationLevel = g_displayLevel;
dictParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
}
#endif
goto _end;

View File

@ -281,6 +281,11 @@ case "$UNAME" in
*) $MD5SUM -c tmph1 ;;
esac
rm -rf dirTestDict
$ECHO "- dictionary builder on bogus input"
$ECHO "Hello World" > tmp
$ZSTD --train -q tmp && die "Dictionary training should fail : not enough input source"
./datagen -P0 -g10M > tmp
$ZSTD --train -q tmp && die "Dictionary training should fail : source is pure noise"
rm tmp*