Merge pull request #1302 from jennifermliu/splitpoint

Always use splitPoint=1.0 for non-optimize cover and fastcover and some minor changes on comments
This commit is contained in:
Nick Terrell 2018-09-04 16:59:41 -07:00 committed by GitHub
commit 7a02df8dbe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 10 additions and 9 deletions

View File

@ -684,7 +684,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
BYTE* const dict = (BYTE*)dictBuffer;
COVER_ctx_t ctx;
COVER_map_t activeDmers;
parameters.splitPoint = parameters.splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters.splitPoint;
parameters.splitPoint = 1.0;
/* Initialize global data */
g_displayLevel = parameters.zParams.notificationLevel;
/* Checks */

View File

@ -510,7 +510,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
/* Initialize global data */
g_displayLevel = parameters.zParams.notificationLevel;
/* Assign splitPoint and f if not provided */
parameters.splitPoint = parameters.splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters.splitPoint;
parameters.splitPoint = 1.0;
parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
/* Convert to cover parameter */

View File

@ -85,9 +85,9 @@ typedef struct {
typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
ZDICT_params_t zParams;
} ZDICT_cover_params_t;
@ -95,9 +95,9 @@ typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(18)*/
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
ZDICT_params_t zParams;
} ZDICT_fastCover_params_t;

View File

@ -194,7 +194,7 @@ All arguments after \fB\-\-\fR are treated as files
Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
.
.IP
Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\ or \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-fastCover=d=8,steps=4\fR\.
Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. The cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Equivalent to \fB\-\-train\-fastCover=d=8,steps=4\fR\.
.
.TP
\fB\-o file\fR

View File

@ -200,9 +200,10 @@ Compression of small files similar to the sample set will be greatly improved.
(for example, 10 MB for a 100 KB dictionary).
Supports multithreading if `zstd` is compiled with threading support.
Additional parameters can be specified with `--train-cover`.
Additional parameters can be specified with `--train-fastcover`.
The legacy dictionary builder can be accessed with `--train-legacy`.
Equivalent to `--train-cover=d=8,steps=4`.
The cover dictionary builder can be accessed with `--train-cover`.
Equivalent to `--train-fastcover=d=8,steps=4`.
* `-o file`:
Dictionary saved into `file` (default name: dictionary).
* `--maxdict=#`: