Merge pull request #509 from terrelln/dict-builder-32

Handle cover dictionary builder maximum input size for 32-bit mode
This commit is contained in:
Yann Collet 2017-01-10 06:30:28 +01:00 committed by GitHub
commit b8cdc16969
3 changed files with 9 additions and 4 deletions

View File

@ -28,7 +28,7 @@
/*-*************************************
* Constants
***************************************/
#define COVER_MAX_SAMPLES_SIZE ((U32)-1)
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
/*-*************************************
* Console display
@ -500,7 +500,9 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
/* Checks */
if (totalSamplesSize < d ||
totalSamplesSize > (size_t)COVER_MAX_SAMPLES_SIZE) {
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
(COVER_MAX_SAMPLES_SIZE >> 20));
return 0;
}
/* Zero the context */
@ -518,6 +520,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
/* The offsets of each file */
ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
COVER_ctx_destroy(ctx);
return 0;
}
@ -651,7 +654,6 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
/* Initialize context and activeDmers */
if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
parameters.d)) {
DISPLAYLEVEL(1, "Failed to initialize context\n");
return ERROR(GENERIC);
}
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {

View File

@ -108,6 +108,7 @@ typedef struct {
The resulting dictionary will be saved into `dictBuffer`.
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code, which can be tested with ZDICT_isError().
Note : COVER_trainFromBuffer() requires about 9 bytes of memory for each input byte.
Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
It's obviously possible to target smaller or larger ones, just by specifying a different `dictBufferCapacity`.
In general, it's recommended to provide a few thousand samples, but this can vary a lot.
@ -131,6 +132,7 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCap
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code, which can be tested with ZDICT_isError().
On success `*parameters` contains the parameters selected.
Note : COVER_optimizeTrainFromBuffer() requires about 9 bytes of memory for each input byte.
*/
ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,

View File

@ -235,7 +235,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
void* const dictBuffer = malloc(maxDictSize);
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
unsigned long long const totalSizeToLoad = DiB_getTotalCappedFileSize(fileNamesTable, nbFiles);
size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * memMult) / memMult;
size_t benchedSize = (size_t) MIN ((unsigned long long)maxMem, totalSizeToLoad);
void* const srcBuffer = malloc(benchedSize+NOISELENGTH);
int result = 0;