Merge fastCover into DictBuilder (#1274)

* Minor fix

* Run non-optimize FASTCOVER 5 times in benchmark

* Merge fastCover into dictBuilder

* Fix mixed declaration issue

* Add fastcover to symbol.c

* Add fastCover.c and cover.h to build

* Change fastCover.c to fastcover.c

* Update benchmark to run FASTCOVER in dictBuilder

* Undo spliting fastcover_param into cover_param and f

* Remove convert param functions

* Assign f to parameter

* Add zdict.h to Makefile in lib

* Add cover.h to BUCK

* Cast 1 to U64 before shifting

* Remove trimming of zero freq head and tail in selectSegment and rebenchmark

* Remove f as a separate parameter of tryParam

* Read 8 bytes when d is 6

* Add trimming off zero frequency head and tail

* Use best functions from COVER and remove trimming part(which leads to worse compression ratio after previous bugs were fixed)

* Add finalize= argument to FASTCOVER to specify percentage of training samples passed to ZDICT_finalizeDictionary

* Change nbDmer to always read 8 bytes even when d=6

* Add skip=# argument to allow skipping dmers in computeFrequency in FASTCOVER

* Update comments and benchmarking result

* Change default method of ZDICT_trainFromBuffer to ZDICT_optimizeTrainFromBuffer_fastCover

* Add dictType enum and fix bug about passing zParam when converting to coverParam

* Combine finalize and skip into a single parameter

* Update acceleration parameters and benchmark on 3 sample sets

* Change default splitPoint of FASTCOVER to 0.75 and benchmark first 3 sample sets

* Initialize variables outside of for loop in benchmark.c

* Update benchmark result for hg-manifest

* Remove cover.h from install-includes

* Add explanation of f

* Set default compression level for trainFromBuffer to 3

* Add assertion of fastCoverParams in DiB_trainFromFiles

* Add checkTotalCompressedSize function + some minor fixes

* Add test for multithreading fastCovr

* Initialize segmentFreqs in every FASTCOVER_selectSegment and move mutex_unnlock to end of COVER_best_finish

* Free segmentFreqs

* Initialize segmentFreqs before calling FASTCOVER_buildDictionary instead of in FASTCOVER_selectSegment

* Add FASTCOVER_MEMMULT

* Minor fix

* Update benchmarking result
This commit is contained in:
Jennifer Liu 2018-08-23 12:06:20 -07:00 committed by Nick Terrell
parent 1af27a7ed7
commit 9d6ed9def3
26 changed files with 1999 additions and 246 deletions

View File

@ -336,6 +336,10 @@
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>
@ -482,6 +486,10 @@
RelativePath="..\..\..\lib\dictBuilder\zdict.h"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.h"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
>

View File

@ -348,6 +348,10 @@
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>
@ -522,6 +526,10 @@
RelativePath="..\..\..\lib\dictBuilder\zdict.h"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.h"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
>

View File

@ -332,6 +332,10 @@
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
>
@ -502,6 +506,10 @@
RelativePath="..\..\..\lib\dictBuilder\zdict.h"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.h"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
>

View File

@ -176,6 +176,7 @@
<ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
<ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\programs\datagen.c" />
@ -199,6 +200,7 @@
<ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
<ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
<ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
<ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
<ClInclude Include="..\..\..\lib\legacy\zstd_legacy.h" />
<ClInclude Include="..\..\..\programs\datagen.h" />
<ClInclude Include="..\..\..\programs\util.h" />

View File

@ -43,6 +43,7 @@
<ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
<ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />

View File

@ -43,6 +43,7 @@
<ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
<ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />

View File

@ -40,6 +40,7 @@
<ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
<ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
<ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
@ -61,6 +62,7 @@
<ClInclude Include="..\..\..\lib\common\xxhash.h" />
<ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
<ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
<ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
<ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
<ClInclude Include="..\..\..\lib\common\fse.h" />
<ClInclude Include="..\..\..\lib\common\huf.h" />

View File

@ -47,6 +47,7 @@ SET(Sources
${LIBRARY_DIR}/decompress/huf_decompress.c
${LIBRARY_DIR}/decompress/zstd_decompress.c
${LIBRARY_DIR}/dictBuilder/cover.c
${LIBRARY_DIR}/dictBuilder/fastcover.c
${LIBRARY_DIR}/dictBuilder/divsufsort.c
${LIBRARY_DIR}/dictBuilder/zdict.c
${LIBRARY_DIR}/deprecated/zbuff_common.c
@ -74,6 +75,7 @@ SET(Headers
${LIBRARY_DIR}/compress/zstd_ldm.h
${LIBRARY_DIR}/compress/zstdmt_compress.h
${LIBRARY_DIR}/dictBuilder/zdict.h
${LIBRARY_DIR}/dictBuilder/cover.h
${LIBRARY_DIR}/deprecated/zbuff.h)
IF (ZSTD_LEGACY_SUPPORT)
@ -178,6 +180,7 @@ INSTALL(FILES
${LIBRARY_DIR}/zstd.h
${LIBRARY_DIR}/deprecated/zbuff.h
${LIBRARY_DIR}/dictBuilder/zdict.h
${LIBRARY_DIR}/dictBuilder/cover.h
${LIBRARY_DIR}/common/zstd_errors.h
DESTINATION "include")

View File

@ -2,10 +2,9 @@ ARG :=
CC ?= gcc
CFLAGS ?= -O3
INCLUDES := -I ../randomDictBuilder -I ../fastCover -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
RANDOM_FILE := ../randomDictBuilder/random.c
FAST_FILE := ../fastCover/fastCover.c
IO_FILE := ../randomDictBuilder/io.c
all: run clean
@ -22,8 +21,8 @@ test: benchmarkTest clean
benchmarkTest: benchmark test.sh
sh test.sh
benchmark: benchmark.o io.o random.o fastCover.o libzstd.a
$(CC) $(CFLAGS) benchmark.o io.o random.o fastCover.o libzstd.a -o benchmark
benchmark: benchmark.o io.o random.o libzstd.a
$(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark
benchmark.o: benchmark.c
$(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c
@ -31,9 +30,6 @@ benchmark.o: benchmark.c
random.o: $(RANDOM_FILE)
$(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE)
fastCover.o: $(FAST_FILE)
$(CC) $(CFLAGS) $(INCLUDES) -c $(FAST_FILE)
io.o: $(IO_FILE)
$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)

View File

@ -14,113 +14,836 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
###Benchmarking Result:
- First Cover is optimize cover, second Cover uses optimized d and k from first one.
- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. This is run for accel values from 1 to 10.
- Fourth column is chosen d and fifth column is chosen k
github:
NODICT 0.000025 2.999642
RANDOM 0.030101 8.791189
LEGACY 0.913108 8.173529
COVER 59.234160 10.652243 8 1298
COVER 6.258459 10.652243 8 1298
FAST15 9.959246 10.555630 8 1874
FAST15 0.077719 10.555630 8 1874
FAST16 10.028343 10.701698 8 1106
FAST16 0.078117 10.701698 8 1106
FAST17 10.567355 10.650652 8 1106
FAST17 0.124833 10.650652 8 1106
FAST18 11.795287 10.499142 8 1826
FAST18 0.086992 10.499142 8 1826
FAST19 13.132451 10.527140 8 1826
FAST19 0.134716 10.527140 8 1826
FAST20 14.366314 10.494710 8 1826
FAST20 0.128844 10.494710 8 1826
FAST21 14.941238 10.503488 8 1778
FAST21 0.134975 10.503488 8 1778
FAST22 15.146226 10.509284 8 1826
FAST22 0.146918 10.509284 8 1826
FAST23 16.260552 10.509284 8 1826
FAST23 0.158494 10.509284 8 1826
FAST24 16.806037 10.512369 8 1826
FAST24 0.190464 10.512369 8 1826
NODICT 0.000004 2.999642
RANDOM 0.024560 8.791189
LEGACY 0.727109 8.173529
COVER 40.565676 10.652243 8 1298
COVER 3.608284 10.652243 8 1298
FAST f=15 a=1 4.181024 10.570882 8 1154
FAST f=15 a=1 0.040788 10.570882 8 1154
FAST f=15 a=2 3.548352 10.574287 6 1970
FAST f=15 a=2 0.035535 10.574287 6 1970
FAST f=15 a=3 3.287364 10.613950 6 1010
FAST f=15 a=3 0.032182 10.613950 6 1010
FAST f=15 a=4 3.184976 10.573883 6 1058
FAST f=15 a=4 0.029878 10.573883 6 1058
FAST f=15 a=5 3.045513 10.580640 8 1154
FAST f=15 a=5 0.022162 10.580640 8 1154
FAST f=15 a=6 3.003296 10.583677 6 1010
FAST f=15 a=6 0.028091 10.583677 6 1010
FAST f=15 a=7 2.952655 10.622551 6 1106
FAST f=15 a=7 0.02724 10.622551 6 1106
FAST f=15 a=8 2.945674 10.614657 6 1010
FAST f=15 a=8 0.027264 10.614657 6 1010
FAST f=15 a=9 3.153439 10.564018 8 1154
FAST f=15 a=9 0.020635 10.564018 8 1154
FAST f=15 a=10 2.950416 10.511454 6 1010
FAST f=15 a=10 0.026606 10.511454 6 1010
FAST f=16 a=1 3.970029 10.681035 8 1154
FAST f=16 a=1 0.038188 10.681035 8 1154
FAST f=16 a=2 3.422892 10.484978 6 1874
FAST f=16 a=2 0.034702 10.484978 6 1874
FAST f=16 a=3 3.215836 10.632631 8 1154
FAST f=16 a=3 0.026084 10.632631 8 1154
FAST f=16 a=4 3.081353 10.626533 6 1106
FAST f=16 a=4 0.030032 10.626533 6 1106
FAST f=16 a=5 3.041241 10.545027 8 1922
FAST f=16 a=5 0.022882 10.545027 8 1922
FAST f=16 a=6 2.989390 10.638284 6 1874
FAST f=16 a=6 0.028308 10.638284 6 1874
FAST f=16 a=7 3.001581 10.797136 6 1106
FAST f=16 a=7 0.027479 10.797136 6 1106
FAST f=16 a=8 2.984107 10.658356 8 1058
FAST f=16 a=8 0.021099 10.658356 8 1058
FAST f=16 a=9 2.925788 10.523869 6 1010
FAST f=16 a=9 0.026905 10.523869 6 1010
FAST f=16 a=10 2.889605 10.745841 6 1874
FAST f=16 a=10 0.026846 10.745841 6 1874
FAST f=17 a=1 4.031953 10.672080 8 1202
FAST f=17 a=1 0.040658 10.672080 8 1202
FAST f=17 a=2 3.458107 10.589352 8 1106
FAST f=17 a=2 0.02926 10.589352 8 1106
FAST f=17 a=3 3.291189 10.662714 8 1154
FAST f=17 a=3 0.026531 10.662714 8 1154
FAST f=17 a=4 3.154950 10.549456 8 1346
FAST f=17 a=4 0.024991 10.549456 8 1346
FAST f=17 a=5 3.092271 10.541670 6 1202
FAST f=17 a=5 0.038285 10.541670 6 1202
FAST f=17 a=6 3.166146 10.729112 6 1874
FAST f=17 a=6 0.038217 10.729112 6 1874
FAST f=17 a=7 3.035467 10.810485 6 1106
FAST f=17 a=7 0.036655 10.810485 6 1106
FAST f=17 a=8 3.035668 10.530532 6 1058
FAST f=17 a=8 0.037715 10.530532 6 1058
FAST f=17 a=9 2.987917 10.589802 8 1922
FAST f=17 a=9 0.02217 10.589802 8 1922
FAST f=17 a=10 2.981647 10.722579 8 1106
FAST f=17 a=10 0.021948 10.722579 8 1106
FAST f=18 a=1 4.067144 10.634943 8 1154
FAST f=18 a=1 0.041386 10.634943 8 1154
FAST f=18 a=2 3.507377 10.546230 6 1970
FAST f=18 a=2 0.037572 10.546230 6 1970
FAST f=18 a=3 3.323015 10.648061 8 1154
FAST f=18 a=3 0.028306 10.648061 8 1154
FAST f=18 a=4 3.216735 10.705402 6 1010
FAST f=18 a=4 0.030755 10.705402 6 1010
FAST f=18 a=5 3.175794 10.588154 8 1874
FAST f=18 a=5 0.025315 10.588154 8 1874
FAST f=18 a=6 3.127459 10.751104 8 1106
FAST f=18 a=6 0.023897 10.751104 8 1106
FAST f=18 a=7 3.083017 10.780402 6 1106
FAST f=18 a=7 0.029158 10.780402 6 1106
FAST f=18 a=8 3.069700 10.547226 8 1346
FAST f=18 a=8 0.024046 10.547226 8 1346
FAST f=18 a=9 3.056591 10.674759 6 1010
FAST f=18 a=9 0.028496 10.674759 6 1010
FAST f=18 a=10 3.063588 10.737578 8 1106
FAST f=18 a=10 0.023033 10.737578 8 1106
FAST f=19 a=1 4.164041 10.650333 8 1154
FAST f=19 a=1 0.042906 10.650333 8 1154
FAST f=19 a=2 3.585409 10.577066 6 1058
FAST f=19 a=2 0.038994 10.577066 6 1058
FAST f=19 a=3 3.439643 10.639403 8 1154
FAST f=19 a=3 0.028427 10.639403 8 1154
FAST f=19 a=4 3.268869 10.554410 8 1298
FAST f=19 a=4 0.026866 10.554410 8 1298
FAST f=19 a=5 3.238225 10.615109 6 1010
FAST f=19 a=5 0.03078 10.615109 6 1010
FAST f=19 a=6 3.199558 10.609782 6 1874
FAST f=19 a=6 0.030099 10.609782 6 1874
FAST f=19 a=7 3.132395 10.794753 6 1106
FAST f=19 a=7 0.028964 10.794753 6 1106
FAST f=19 a=8 3.148446 10.554842 8 1298
FAST f=19 a=8 0.024277 10.554842 8 1298
FAST f=19 a=9 3.108324 10.668763 6 1010
FAST f=19 a=9 0.02896 10.668763 6 1010
FAST f=19 a=10 3.159863 10.757347 8 1106
FAST f=19 a=10 0.023351 10.757347 8 1106
FAST f=20 a=1 4.462698 10.661788 8 1154
FAST f=20 a=1 0.047174 10.661788 8 1154
FAST f=20 a=2 3.820269 10.678612 6 1106
FAST f=20 a=2 0.040807 10.678612 6 1106
FAST f=20 a=3 3.644955 10.648424 8 1154
FAST f=20 a=3 0.031398 10.648424 8 1154
FAST f=20 a=4 3.546257 10.559756 8 1298
FAST f=20 a=4 0.029856 10.559756 8 1298
FAST f=20 a=5 3.485248 10.646637 6 1010
FAST f=20 a=5 0.033756 10.646637 6 1010
FAST f=20 a=6 3.490438 10.775824 8 1106
FAST f=20 a=6 0.028338 10.775824 8 1106
FAST f=20 a=7 3.631289 10.801795 6 1106
FAST f=20 a=7 0.035228 10.801795 6 1106
FAST f=20 a=8 3.758936 10.545116 8 1346
FAST f=20 a=8 0.027495 10.545116 8 1346
FAST f=20 a=9 3.707024 10.677454 6 1010
FAST f=20 a=9 0.031326 10.677454 6 1010
FAST f=20 a=10 3.586593 10.756017 8 1106
FAST f=20 a=10 0.027122 10.756017 8 1106
FAST f=21 a=1 5.701396 10.655398 8 1154
FAST f=21 a=1 0.067744 10.655398 8 1154
FAST f=21 a=2 5.270542 10.650743 6 1106
FAST f=21 a=2 0.052999 10.650743 6 1106
FAST f=21 a=3 4.945294 10.652380 8 1154
FAST f=21 a=3 0.052678 10.652380 8 1154
FAST f=21 a=4 4.894079 10.543185 8 1298
FAST f=21 a=4 0.04997 10.543185 8 1298
FAST f=21 a=5 4.785417 10.630321 6 1010
FAST f=21 a=5 0.045294 10.630321 6 1010
FAST f=21 a=6 4.789381 10.664477 6 1874
FAST f=21 a=6 0.046578 10.664477 6 1874
FAST f=21 a=7 4.302955 10.805179 6 1106
FAST f=21 a=7 0.041205 10.805179 6 1106
FAST f=21 a=8 4.034630 10.551211 8 1298
FAST f=21 a=8 0.040121 10.551211 8 1298
FAST f=21 a=9 4.523868 10.799114 6 1010
FAST f=21 a=9 0.043592 10.799114 6 1010
FAST f=21 a=10 4.760736 10.750255 8 1106
FAST f=21 a=10 0.043483 10.750255 8 1106
FAST f=22 a=1 6.743064 10.640537 8 1154
FAST f=22 a=1 0.086967 10.640537 8 1154
FAST f=22 a=2 6.121739 10.626638 6 1970
FAST f=22 a=2 0.066337 10.626638 6 1970
FAST f=22 a=3 5.248851 10.640688 8 1154
FAST f=22 a=3 0.054935 10.640688 8 1154
FAST f=22 a=4 5.436579 10.588333 8 1298
FAST f=22 a=4 0.064113 10.588333 8 1298
FAST f=22 a=5 5.812815 10.652653 6 1010
FAST f=22 a=5 0.058189 10.652653 6 1010
FAST f=22 a=6 5.745472 10.666437 6 1874
FAST f=22 a=6 0.057188 10.666437 6 1874
FAST f=22 a=7 5.716393 10.806911 6 1106
FAST f=22 a=7 0.056 10.806911 6 1106
FAST f=22 a=8 5.698799 10.530784 8 1298
FAST f=22 a=8 0.0583 10.530784 8 1298
FAST f=22 a=9 5.710533 10.777391 6 1010
FAST f=22 a=9 0.054945 10.777391 6 1010
FAST f=22 a=10 5.685395 10.745023 8 1106
FAST f=22 a=10 0.056526 10.745023 8 1106
FAST f=23 a=1 7.836923 10.638828 8 1154
FAST f=23 a=1 0.099522 10.638828 8 1154
FAST f=23 a=2 6.627834 10.631061 6 1970
FAST f=23 a=2 0.066769 10.631061 6 1970
FAST f=23 a=3 5.602533 10.647288 8 1154
FAST f=23 a=3 0.064513 10.647288 8 1154
FAST f=23 a=4 6.005580 10.568747 8 1298
FAST f=23 a=4 0.062022 10.568747 8 1298
FAST f=23 a=5 5.481816 10.676921 6 1010
FAST f=23 a=5 0.058959 10.676921 6 1010
FAST f=23 a=6 5.460444 10.666194 6 1874
FAST f=23 a=6 0.057687 10.666194 6 1874
FAST f=23 a=7 5.659822 10.800377 6 1106
FAST f=23 a=7 0.06783 10.800377 6 1106
FAST f=23 a=8 6.826940 10.522167 8 1298
FAST f=23 a=8 0.070533 10.522167 8 1298
FAST f=23 a=9 6.804757 10.577799 8 1682
FAST f=23 a=9 0.069949 10.577799 8 1682
FAST f=23 a=10 6.774933 10.742093 8 1106
FAST f=23 a=10 0.068395 10.742093 8 1106
FAST f=24 a=1 8.444110 10.632783 8 1154
FAST f=24 a=1 0.094357 10.632783 8 1154
FAST f=24 a=2 7.289578 10.631061 6 1970
FAST f=24 a=2 0.098515 10.631061 6 1970
FAST f=24 a=3 8.619780 10.646289 8 1154
FAST f=24 a=3 0.098041 10.646289 8 1154
FAST f=24 a=4 8.508455 10.555199 8 1298
FAST f=24 a=4 0.093885 10.555199 8 1298
FAST f=24 a=5 8.471145 10.674363 6 1010
FAST f=24 a=5 0.088676 10.674363 6 1010
FAST f=24 a=6 8.426727 10.667228 6 1874
FAST f=24 a=6 0.087247 10.667228 6 1874
FAST f=24 a=7 8.356826 10.803027 6 1106
FAST f=24 a=7 0.085835 10.803027 6 1106
FAST f=24 a=8 6.756811 10.522049 8 1298
FAST f=24 a=8 0.07107 10.522049 8 1298
FAST f=24 a=9 6.548169 10.571882 8 1682
FAST f=24 a=9 0.0713 10.571882 8 1682
FAST f=24 a=10 8.238079 10.736453 8 1106
FAST f=24 a=10 0.07004 10.736453 8 1106
hg-commands:
NODICT 0.000026 2.425291
RANDOM 0.046270 3.490331
LEGACY 0.847904 3.911682
COVER 71.691804 4.132653 8 386
COVER 3.187085 4.132653 8 386
FAST15 11.593687 3.920720 6 1106
FAST15 0.082431 3.920720 6 1106
FAST16 11.775958 4.033306 8 674
FAST16 0.092587 4.033306 8 674
FAST17 11.965064 4.064132 8 1490
FAST17 0.106382 4.064132 8 1490
FAST18 11.438197 4.086714 8 290
FAST18 0.097293 4.086714 8 290
FAST19 12.292512 4.097947 8 578
FAST19 0.104406 4.097947 8 578
FAST20 13.857857 4.102851 8 434
FAST20 0.139467 4.102851 8 434
FAST21 14.599613 4.105350 8 530
FAST21 0.189416 4.105350 8 530
FAST22 15.966109 4.104100 8 530
FAST22 0.183817 4.104100 8 530
FAST23 18.033645 4.098110 8 914
FAST23 0.246641 4.098110 8 914
FAST24 22.992891 4.117367 8 722
FAST24 0.285994 4.117367 8 722
NODICT 0.000005 2.425276
RANDOM 0.046332 3.490331
LEGACY 0.720351 3.911682
COVER 45.507731 4.132653 8 386
COVER 1.868810 4.132653 8 386
FAST f=15 a=1 4.561427 3.866894 8 1202
FAST f=15 a=1 0.048946 3.866894 8 1202
FAST f=15 a=2 3.574462 3.892119 8 1538
FAST f=15 a=2 0.033677 3.892119 8 1538
FAST f=15 a=3 3.230227 3.888791 6 1346
FAST f=15 a=3 0.034312 3.888791 6 1346
FAST f=15 a=4 3.042388 3.899739 8 1010
FAST f=15 a=4 0.024307 3.899739 8 1010
FAST f=15 a=5 2.800148 3.896220 8 818
FAST f=15 a=5 0.022331 3.896220 8 818
FAST f=15 a=6 2.706518 3.882039 8 578
FAST f=15 a=6 0.020955 3.882039 8 578
FAST f=15 a=7 2.701820 3.885430 6 866
FAST f=15 a=7 0.026074 3.885430 6 866
FAST f=15 a=8 2.604445 3.906932 8 1826
FAST f=15 a=8 0.021789 3.906932 8 1826
FAST f=15 a=9 2.598568 3.870324 6 1682
FAST f=15 a=9 0.026004 3.870324 6 1682
FAST f=15 a=10 2.575920 3.920783 8 1442
FAST f=15 a=10 0.020228 3.920783 8 1442
FAST f=16 a=1 4.630623 4.001430 8 770
FAST f=16 a=1 0.047497 4.001430 8 770
FAST f=16 a=2 3.674721 3.974431 8 1874
FAST f=16 a=2 0.035761 3.974431 8 1874
FAST f=16 a=3 3.338384 3.978703 8 1010
FAST f=16 a=3 0.029436 3.978703 8 1010
FAST f=16 a=4 3.004412 3.983035 8 1010
FAST f=16 a=4 0.025744 3.983035 8 1010
FAST f=16 a=5 2.881892 3.987710 8 770
FAST f=16 a=5 0.023211 3.987710 8 770
FAST f=16 a=6 2.807410 3.952717 8 1298
FAST f=16 a=6 0.023199 3.952717 8 1298
FAST f=16 a=7 2.819623 3.994627 8 770
FAST f=16 a=7 0.021806 3.994627 8 770
FAST f=16 a=8 2.740092 3.954032 8 1826
FAST f=16 a=8 0.0226 3.954032 8 1826
FAST f=16 a=9 2.682564 3.969879 6 1442
FAST f=16 a=9 0.026324 3.969879 6 1442
FAST f=16 a=10 2.657959 3.969755 8 674
FAST f=16 a=10 0.020413 3.969755 8 674
FAST f=17 a=1 4.729228 4.046000 8 530
FAST f=17 a=1 0.049703 4.046000 8 530
FAST f=17 a=2 3.764510 3.991519 8 1970
FAST f=17 a=2 0.038195 3.991519 8 1970
FAST f=17 a=3 3.416992 4.006296 6 914
FAST f=17 a=3 0.036244 4.006296 6 914
FAST f=17 a=4 3.145626 3.979182 8 1970
FAST f=17 a=4 0.028676 3.979182 8 1970
FAST f=17 a=5 2.995070 4.050070 8 770
FAST f=17 a=5 0.025707 4.050070 8 770
FAST f=17 a=6 2.911833 4.040024 8 770
FAST f=17 a=6 0.02453 4.040024 8 770
FAST f=17 a=7 2.894796 4.015884 8 818
FAST f=17 a=7 0.023956 4.015884 8 818
FAST f=17 a=8 2.789962 4.039303 8 530
FAST f=17 a=8 0.023219 4.039303 8 530
FAST f=17 a=9 2.787625 3.996762 8 1634
FAST f=17 a=9 0.023651 3.996762 8 1634
FAST f=17 a=10 2.754796 4.005059 8 1058
FAST f=17 a=10 0.022537 4.005059 8 1058
FAST f=18 a=1 4.779117 4.038214 8 242
FAST f=18 a=1 0.048814 4.038214 8 242
FAST f=18 a=2 3.829753 4.045768 8 722
FAST f=18 a=2 0.036541 4.045768 8 722
FAST f=18 a=3 3.495053 4.021497 8 770
FAST f=18 a=3 0.032648 4.021497 8 770
FAST f=18 a=4 3.221395 4.039623 8 770
FAST f=18 a=4 0.027818 4.039623 8 770
FAST f=18 a=5 3.059369 4.050414 8 530
FAST f=18 a=5 0.026296 4.050414 8 530
FAST f=18 a=6 3.019292 4.010714 6 962
FAST f=18 a=6 0.031104 4.010714 6 962
FAST f=18 a=7 2.949322 4.031439 6 770
FAST f=18 a=7 0.030745 4.031439 6 770
FAST f=18 a=8 2.876425 4.032088 6 386
FAST f=18 a=8 0.027407 4.032088 6 386
FAST f=18 a=9 2.850958 4.053372 8 674
FAST f=18 a=9 0.023799 4.053372 8 674
FAST f=18 a=10 2.884352 4.020148 8 1730
FAST f=18 a=10 0.024401 4.020148 8 1730
FAST f=19 a=1 4.815669 4.061203 8 674
FAST f=19 a=1 0.051425 4.061203 8 674
FAST f=19 a=2 3.951356 4.013822 8 1442
FAST f=19 a=2 0.039968 4.013822 8 1442
FAST f=19 a=3 3.554682 4.050425 8 722
FAST f=19 a=3 0.032725 4.050425 8 722
FAST f=19 a=4 3.242585 4.054677 8 722
FAST f=19 a=4 0.028194 4.054677 8 722
FAST f=19 a=5 3.105909 4.064524 8 818
FAST f=19 a=5 0.02675 4.064524 8 818
FAST f=19 a=6 3.059901 4.036857 8 1250
FAST f=19 a=6 0.026396 4.036857 8 1250
FAST f=19 a=7 3.016151 4.068234 6 770
FAST f=19 a=7 0.031501 4.068234 6 770
FAST f=19 a=8 2.962902 4.077509 8 530
FAST f=19 a=8 0.023333 4.077509 8 530
FAST f=19 a=9 2.899607 4.067328 8 530
FAST f=19 a=9 0.024553 4.067328 8 530
FAST f=19 a=10 2.950978 4.059901 8 434
FAST f=19 a=10 0.023852 4.059901 8 434
FAST f=20 a=1 5.259834 4.027579 8 1634
FAST f=20 a=1 0.061123 4.027579 8 1634
FAST f=20 a=2 4.382150 4.025093 8 1634
FAST f=20 a=2 0.048009 4.025093 8 1634
FAST f=20 a=3 4.104323 4.060842 8 530
FAST f=20 a=3 0.040965 4.060842 8 530
FAST f=20 a=4 3.853340 4.023504 6 914
FAST f=20 a=4 0.041072 4.023504 6 914
FAST f=20 a=5 3.728841 4.018089 6 1634
FAST f=20 a=5 0.037469 4.018089 6 1634
FAST f=20 a=6 3.683045 4.069138 8 578
FAST f=20 a=6 0.028011 4.069138 8 578
FAST f=20 a=7 3.726973 4.063160 8 722
FAST f=20 a=7 0.028437 4.063160 8 722
FAST f=20 a=8 3.555073 4.057690 8 386
FAST f=20 a=8 0.027588 4.057690 8 386
FAST f=20 a=9 3.551095 4.067253 8 482
FAST f=20 a=9 0.025976 4.067253 8 482
FAST f=20 a=10 3.490127 4.068518 8 530
FAST f=20 a=10 0.025971 4.068518 8 530
FAST f=21 a=1 7.343816 4.064945 8 770
FAST f=21 a=1 0.085035 4.064945 8 770
FAST f=21 a=2 5.930894 4.048206 8 386
FAST f=21 a=2 0.067349 4.048206 8 386
FAST f=21 a=3 6.770775 4.063417 8 578
FAST f=21 a=3 0.077104 4.063417 8 578
FAST f=21 a=4 6.889409 4.066761 8 626
FAST f=21 a=4 0.0717 4.066761 8 626
FAST f=21 a=5 6.714896 4.051813 8 914
FAST f=21 a=5 0.071026 4.051813 8 914
FAST f=21 a=6 6.539890 4.047263 8 1922
FAST f=21 a=6 0.07127 4.047263 8 1922
FAST f=21 a=7 6.511052 4.068373 8 482
FAST f=21 a=7 0.065467 4.068373 8 482
FAST f=21 a=8 6.458788 4.071597 8 482
FAST f=21 a=8 0.063817 4.071597 8 482
FAST f=21 a=9 6.377591 4.052905 8 434
FAST f=21 a=9 0.063112 4.052905 8 434
FAST f=21 a=10 6.360752 4.047773 8 530
FAST f=21 a=10 0.063606 4.047773 8 530
FAST f=22 a=1 10.523471 4.040812 8 962
FAST f=22 a=1 0.14214 4.040812 8 962
FAST f=22 a=2 9.454758 4.059396 8 914
FAST f=22 a=2 0.118343 4.059396 8 914
FAST f=22 a=3 9.043197 4.043019 8 1922
FAST f=22 a=3 0.109798 4.043019 8 1922
FAST f=22 a=4 8.716261 4.044819 8 770
FAST f=22 a=4 0.099687 4.044819 8 770
FAST f=22 a=5 8.529472 4.070576 8 530
FAST f=22 a=5 0.093127 4.070576 8 530
FAST f=22 a=6 8.424241 4.070565 8 722
FAST f=22 a=6 0.093703 4.070565 8 722
FAST f=22 a=7 8.403391 4.070591 8 578
FAST f=22 a=7 0.089763 4.070591 8 578
FAST f=22 a=8 8.285221 4.089171 8 530
FAST f=22 a=8 0.087716 4.089171 8 530
FAST f=22 a=9 8.282506 4.047470 8 722
FAST f=22 a=9 0.089773 4.047470 8 722
FAST f=22 a=10 8.241809 4.064151 8 818
FAST f=22 a=10 0.090413 4.064151 8 818
FAST f=23 a=1 12.389208 4.051635 6 530
FAST f=23 a=1 0.147796 4.051635 6 530
FAST f=23 a=2 11.300910 4.042835 6 914
FAST f=23 a=2 0.133178 4.042835 6 914
FAST f=23 a=3 10.879455 4.047415 8 626
FAST f=23 a=3 0.129571 4.047415 8 626
FAST f=23 a=4 10.522718 4.038269 6 914
FAST f=23 a=4 0.118121 4.038269 6 914
FAST f=23 a=5 10.348043 4.066884 8 434
FAST f=23 a=5 0.112098 4.066884 8 434
FAST f=23 a=6 10.238630 4.048635 8 1010
FAST f=23 a=6 0.120281 4.048635 8 1010
FAST f=23 a=7 10.213255 4.061809 8 530
FAST f=23 a=7 0.1121 4.061809 8 530
FAST f=23 a=8 10.107879 4.074104 8 818
FAST f=23 a=8 0.116544 4.074104 8 818
FAST f=23 a=9 10.063424 4.064811 8 674
FAST f=23 a=9 0.109045 4.064811 8 674
FAST f=23 a=10 10.035801 4.054918 8 530
FAST f=23 a=10 0.108735 4.054918 8 530
FAST f=24 a=1 14.963878 4.073490 8 722
FAST f=24 a=1 0.206344 4.073490 8 722
FAST f=24 a=2 13.833472 4.036100 8 962
FAST f=24 a=2 0.17486 4.036100 8 962
FAST f=24 a=3 13.404631 4.026281 6 1106
FAST f=24 a=3 0.153961 4.026281 6 1106
FAST f=24 a=4 13.041164 4.065448 8 674
FAST f=24 a=4 0.155509 4.065448 8 674
FAST f=24 a=5 12.879412 4.054636 8 674
FAST f=24 a=5 0.148282 4.054636 8 674
FAST f=24 a=6 12.773736 4.081376 8 530
FAST f=24 a=6 0.142563 4.081376 8 530
FAST f=24 a=7 12.711310 4.059834 8 770
FAST f=24 a=7 0.149321 4.059834 8 770
FAST f=24 a=8 12.635459 4.052050 8 1298
FAST f=24 a=8 0.15095 4.052050 8 1298
FAST f=24 a=9 12.558104 4.076516 8 722
FAST f=24 a=9 0.144361 4.076516 8 722
FAST f=24 a=10 10.661348 4.062137 8 818
FAST f=24 a=10 0.108232 4.062137 8 818
hg-changelog:
NODICT 0.000007 1.377613
RANDOM 0.297345 2.097487
LEGACY 2.633992 2.058907
COVER 219.179786 2.189685 8 98
COVER 6.620852 2.189685 8 98
FAST15 47.635082 2.130794 6 386
FAST15 0.321297 2.130794 6 386
FAST16 43.837676 2.144845 8 194
FAST16 0.312640 2.144845 8 194
FAST17 49.349017 2.156099 8 242
FAST17 0.348459 2.156099 8 242
FAST18 51.153784 2.172439 6 98
FAST18 0.353106 2.172439 6 98
FAST19 52.627045 2.180321 6 98
FAST19 0.390612 2.180321 6 98
FAST20 63.748782 2.187431 6 98
FAST20 0.489544 2.187431 6 98
FAST21 68.709198 2.184185 6 146
FAST21 0.530852 2.184185 6 146
FAST22 68.491639 2.182830 6 98
FAST22 0.645699 2.182830 6 98
FAST23 72.558688 2.186399 8 98
FAST23 0.593539 2.186399 8 98
FAST24 76.137195 2.185608 6 98
FAST24 0.680132 2.185608 6 98
NODICT 0.000017 1.377590
RANDOM 0.186171 2.097487
LEGACY 1.670867 2.058907
COVER 173.561948 2.189685 8 98
COVER 4.811180 2.189685 8 98
FAST f=15 a=1 18.685906 2.129682 8 434
FAST f=15 a=1 0.173376 2.129682 8 434
FAST f=15 a=2 12.928259 2.131890 8 482
FAST f=15 a=2 0.102582 2.131890 8 482
FAST f=15 a=3 11.132343 2.128027 8 386
FAST f=15 a=3 0.077122 2.128027 8 386
FAST f=15 a=4 10.120683 2.125797 8 434
FAST f=15 a=4 0.065175 2.125797 8 434
FAST f=15 a=5 9.479092 2.127697 8 386
FAST f=15 a=5 0.057905 2.127697 8 386
FAST f=15 a=6 9.159523 2.127132 8 1682
FAST f=15 a=6 0.058604 2.127132 8 1682
FAST f=15 a=7 8.724003 2.129914 8 434
FAST f=15 a=7 0.0493 2.129914 8 434
FAST f=15 a=8 8.595001 2.127137 8 338
FAST f=15 a=8 0.0474 2.127137 8 338
FAST f=15 a=9 8.356405 2.125512 8 482
FAST f=15 a=9 0.046126 2.125512 8 482
FAST f=15 a=10 8.207111 2.126066 8 338
FAST f=15 a=10 0.043292 2.126066 8 338
FAST f=16 a=1 18.464436 2.144040 8 242
FAST f=16 a=1 0.172156 2.144040 8 242
FAST f=16 a=2 12.844825 2.148171 8 194
FAST f=16 a=2 0.099619 2.148171 8 194
FAST f=16 a=3 11.082568 2.140837 8 290
FAST f=16 a=3 0.079165 2.140837 8 290
FAST f=16 a=4 10.066749 2.144405 8 386
FAST f=16 a=4 0.068411 2.144405 8 386
FAST f=16 a=5 9.501121 2.140720 8 386
FAST f=16 a=5 0.061316 2.140720 8 386
FAST f=16 a=6 9.179332 2.139478 8 386
FAST f=16 a=6 0.056322 2.139478 8 386
FAST f=16 a=7 8.849438 2.142412 8 194
FAST f=16 a=7 0.050493 2.142412 8 194
FAST f=16 a=8 8.810919 2.143454 8 434
FAST f=16 a=8 0.051304 2.143454 8 434
FAST f=16 a=9 8.553900 2.140339 8 194
FAST f=16 a=9 0.047285 2.140339 8 194
FAST f=16 a=10 8.398027 2.143130 8 386
FAST f=16 a=10 0.046386 2.143130 8 386
FAST f=17 a=1 18.644657 2.157192 8 98
FAST f=17 a=1 0.173884 2.157192 8 98
FAST f=17 a=2 13.071242 2.159830 8 146
FAST f=17 a=2 0.10388 2.159830 8 146
FAST f=17 a=3 11.332366 2.153654 6 194
FAST f=17 a=3 0.08983 2.153654 6 194
FAST f=17 a=4 10.362413 2.156813 8 242
FAST f=17 a=4 0.070389 2.156813 8 242
FAST f=17 a=5 9.808159 2.155098 6 338
FAST f=17 a=5 0.072661 2.155098 6 338
FAST f=17 a=6 9.451165 2.153845 6 146
FAST f=17 a=6 0.064959 2.153845 6 146
FAST f=17 a=7 9.163097 2.155424 6 242
FAST f=17 a=7 0.064323 2.155424 6 242
FAST f=17 a=8 9.047276 2.156640 8 242
FAST f=17 a=8 0.053382 2.156640 8 242
FAST f=17 a=9 8.807671 2.152396 8 146
FAST f=17 a=9 0.049617 2.152396 8 146
FAST f=17 a=10 8.649827 2.152370 8 146
FAST f=17 a=10 0.047849 2.152370 8 146
FAST f=18 a=1 18.809502 2.168116 8 98
FAST f=18 a=1 0.175226 2.168116 8 98
FAST f=18 a=2 13.756502 2.170870 6 242
FAST f=18 a=2 0.119507 2.170870 6 242
FAST f=18 a=3 12.059748 2.163094 6 98
FAST f=18 a=3 0.093912 2.163094 6 98
FAST f=18 a=4 11.410294 2.172372 8 98
FAST f=18 a=4 0.073048 2.172372 8 98
FAST f=18 a=5 10.560297 2.166388 8 98
FAST f=18 a=5 0.065136 2.166388 8 98
FAST f=18 a=6 10.071390 2.162672 8 98
FAST f=18 a=6 0.059402 2.162672 8 98
FAST f=18 a=7 10.084214 2.166624 6 194
FAST f=18 a=7 0.073276 2.166624 6 194
FAST f=18 a=8 9.953226 2.167454 8 98
FAST f=18 a=8 0.053659 2.167454 8 98
FAST f=18 a=9 8.982461 2.161593 6 146
FAST f=18 a=9 0.05955 2.161593 6 146
FAST f=18 a=10 8.986092 2.164373 6 242
FAST f=18 a=10 0.059135 2.164373 6 242
FAST f=19 a=1 18.908277 2.176021 8 98
FAST f=19 a=1 0.177316 2.176021 8 98
FAST f=19 a=2 13.471313 2.176103 8 98
FAST f=19 a=2 0.106344 2.176103 8 98
FAST f=19 a=3 11.571406 2.172812 8 98
FAST f=19 a=3 0.083293 2.172812 8 98
FAST f=19 a=4 10.632775 2.177770 6 146
FAST f=19 a=4 0.079864 2.177770 6 146
FAST f=19 a=5 10.030190 2.175574 6 146
FAST f=19 a=5 0.07223 2.175574 6 146
FAST f=19 a=6 9.717818 2.169997 8 98
FAST f=19 a=6 0.060049 2.169997 8 98
FAST f=19 a=7 9.397531 2.172770 8 146
FAST f=19 a=7 0.057188 2.172770 8 146
FAST f=19 a=8 9.281061 2.175822 8 98
FAST f=19 a=8 0.053711 2.175822 8 98
FAST f=19 a=9 9.165242 2.169849 6 146
FAST f=19 a=9 0.059898 2.169849 6 146
FAST f=19 a=10 9.048763 2.173394 8 98
FAST f=19 a=10 0.049757 2.173394 8 98
FAST f=20 a=1 21.166917 2.183923 6 98
FAST f=20 a=1 0.205425 2.183923 6 98
FAST f=20 a=2 15.642753 2.182349 6 98
FAST f=20 a=2 0.135957 2.182349 6 98
FAST f=20 a=3 14.053730 2.173544 6 98
FAST f=20 a=3 0.11266 2.173544 6 98
FAST f=20 a=4 15.270019 2.183656 8 98
FAST f=20 a=4 0.107892 2.183656 8 98
FAST f=20 a=5 15.497927 2.174661 6 98
FAST f=20 a=5 0.100305 2.174661 6 98
FAST f=20 a=6 13.973505 2.172391 8 98
FAST f=20 a=6 0.087565 2.172391 8 98
FAST f=20 a=7 14.083296 2.172443 8 98
FAST f=20 a=7 0.078062 2.172443 8 98
FAST f=20 a=8 12.560048 2.175581 8 98
FAST f=20 a=8 0.070282 2.175581 8 98
FAST f=20 a=9 13.078645 2.173975 6 146
FAST f=20 a=9 0.081041 2.173975 6 146
FAST f=20 a=10 12.823328 2.177778 8 98
FAST f=20 a=10 0.074522 2.177778 8 98
FAST f=21 a=1 29.825370 2.183057 6 98
FAST f=21 a=1 0.334453 2.183057 6 98
FAST f=21 a=2 29.476474 2.182752 8 98
FAST f=21 a=2 0.286602 2.182752 8 98
FAST f=21 a=3 25.937186 2.175867 8 98
FAST f=21 a=3 0.17626 2.175867 8 98
FAST f=21 a=4 20.413865 2.179780 8 98
FAST f=21 a=4 0.206085 2.179780 8 98
FAST f=21 a=5 20.541889 2.178328 6 146
FAST f=21 a=5 0.199157 2.178328 6 146
FAST f=21 a=6 21.090670 2.174443 6 146
FAST f=21 a=6 0.190645 2.174443 6 146
FAST f=21 a=7 20.221569 2.177384 6 146
FAST f=21 a=7 0.184278 2.177384 6 146
FAST f=21 a=8 20.322357 2.179456 6 98
FAST f=21 a=8 0.178458 2.179456 6 98
FAST f=21 a=9 20.683912 2.174396 6 146
FAST f=21 a=9 0.190829 2.174396 6 146
FAST f=21 a=10 20.840865 2.174905 8 98
FAST f=21 a=10 0.172515 2.174905 8 98
FAST f=22 a=1 36.822827 2.181612 6 98
FAST f=22 a=1 0.437389 2.181612 6 98
FAST f=22 a=2 30.616902 2.183142 8 98
FAST f=22 a=2 0.324284 2.183142 8 98
FAST f=22 a=3 28.472482 2.178130 8 98
FAST f=22 a=3 0.236538 2.178130 8 98
FAST f=22 a=4 25.847028 2.181878 8 98
FAST f=22 a=4 0.263744 2.181878 8 98
FAST f=22 a=5 27.095881 2.180775 8 98
FAST f=22 a=5 0.24988 2.180775 8 98
FAST f=22 a=6 25.939172 2.170916 8 98
FAST f=22 a=6 0.240033 2.170916 8 98
FAST f=22 a=7 27.064194 2.177849 8 98
FAST f=22 a=7 0.242383 2.177849 8 98
FAST f=22 a=8 25.140221 2.178216 8 98
FAST f=22 a=8 0.237601 2.178216 8 98
FAST f=22 a=9 25.505283 2.177455 6 146
FAST f=22 a=9 0.223217 2.177455 6 146
FAST f=22 a=10 24.529362 2.176705 6 98
FAST f=22 a=10 0.222876 2.176705 6 98
FAST f=23 a=1 39.127310 2.183006 6 98
FAST f=23 a=1 0.417338 2.183006 6 98
FAST f=23 a=2 32.468161 2.183524 6 98
FAST f=23 a=2 0.351645 2.183524 6 98
FAST f=23 a=3 31.577620 2.172604 6 98
FAST f=23 a=3 0.319659 2.172604 6 98
FAST f=23 a=4 30.129247 2.183932 6 98
FAST f=23 a=4 0.307239 2.183932 6 98
FAST f=23 a=5 29.103376 2.183529 6 146
FAST f=23 a=5 0.285533 2.183529 6 146
FAST f=23 a=6 29.776045 2.174367 8 98
FAST f=23 a=6 0.276846 2.174367 8 98
FAST f=23 a=7 28.940407 2.178022 6 146
FAST f=23 a=7 0.274082 2.178022 6 146
FAST f=23 a=8 29.256009 2.179462 6 98
FAST f=23 a=8 0.26949 2.179462 6 98
FAST f=23 a=9 29.347312 2.170407 8 98
FAST f=23 a=9 0.265034 2.170407 8 98
FAST f=23 a=10 29.140081 2.171762 8 98
FAST f=23 a=10 0.259183 2.171762 8 98
FAST f=24 a=1 44.871179 2.182115 6 98
FAST f=24 a=1 0.509433 2.182115 6 98
FAST f=24 a=2 38.694867 2.180549 8 98
FAST f=24 a=2 0.406695 2.180549 8 98
FAST f=24 a=3 38.363769 2.172821 8 98
FAST f=24 a=3 0.359581 2.172821 8 98
FAST f=24 a=4 36.580797 2.184142 8 98
FAST f=24 a=4 0.340614 2.184142 8 98
FAST f=24 a=5 33.125701 2.183301 8 98
FAST f=24 a=5 0.324874 2.183301 8 98
FAST f=24 a=6 34.776068 2.173019 6 146
FAST f=24 a=6 0.340397 2.173019 6 146
FAST f=24 a=7 34.417625 2.176561 6 146
FAST f=24 a=7 0.308223 2.176561 6 146
FAST f=24 a=8 35.470291 2.182161 6 98
FAST f=24 a=8 0.307724 2.182161 6 98
FAST f=24 a=9 34.927252 2.172682 6 146
FAST f=24 a=9 0.300598 2.172682 6 146
FAST f=24 a=10 33.238355 2.173395 6 98
FAST f=24 a=10 0.249916 2.173395 6 98
hg-manifest:
NODICT 0.000026 1.866385
RANDOM 0.784554 2.309436
LEGACY 10.193714 2.506977
COVER 988.206583 2.582528 8 434
COVER 39.726199 2.582528 8 434
FAST15 168.388819 2.392920 6 1826
FAST15 1.272178 2.392920 6 1826
FAST16 161.822607 2.480762 6 1922
FAST16 1.164908 2.480762 6 1922
FAST17 157.688544 2.548285 6 1682
FAST17 1.222439 2.548285 6 1682
FAST18 154.529585 2.567634 6 386
FAST18 1.217596 2.567634 6 386
FAST19 160.244979 2.581653 8 338
FAST19 1.282450 2.581653 8 338
FAST20 191.503297 2.586881 8 194
FAST20 2.009748 2.586881 8 194
FAST21 226.389709 2.590051 6 242
FAST21 2.494543 2.590051 6 242
FAST22 217.859055 2.591376 6 194
FAST22 2.295693 2.591376 6 194
FAST23 236.819791 2.591131 8 434
FAST23 2.744711 2.591131 8 434
FAST24 269.187800 2.591548 6 290
FAST24 2.923671 2.591548 6 290
NODICT 0.000004 1.866377
RANDOM 0.696346 2.309436
LEGACY 7.064527 2.506977
COVER 876.312865 2.582528 8 434
COVER 35.684533 2.582528 8 434
FAST f=15 a=1 76.618201 2.404013 8 1202
FAST f=15 a=1 0.700722 2.404013 8 1202
FAST f=15 a=2 49.213058 2.409248 6 1826
FAST f=15 a=2 0.473393 2.409248 6 1826
FAST f=15 a=3 41.753197 2.409677 8 1490
FAST f=15 a=3 0.336848 2.409677 8 1490
FAST f=15 a=4 38.648295 2.407996 8 1538
FAST f=15 a=4 0.283952 2.407996 8 1538
FAST f=15 a=5 36.144936 2.402895 8 1874
FAST f=15 a=5 0.270128 2.402895 8 1874
FAST f=15 a=6 35.484675 2.394873 8 1586
FAST f=15 a=6 0.251637 2.394873 8 1586
FAST f=15 a=7 34.280599 2.397311 8 1778
FAST f=15 a=7 0.23984 2.397311 8 1778
FAST f=15 a=8 32.122572 2.396089 6 1490
FAST f=15 a=8 0.251508 2.396089 6 1490
FAST f=15 a=9 29.909842 2.390092 6 1970
FAST f=15 a=9 0.251233 2.390092 6 1970
FAST f=15 a=10 30.102938 2.400086 6 1682
FAST f=15 a=10 0.23688 2.400086 6 1682
FAST f=16 a=1 67.750401 2.475460 6 1346
FAST f=16 a=1 0.796035 2.475460 6 1346
FAST f=16 a=2 52.812027 2.480860 6 1730
FAST f=16 a=2 0.480384 2.480860 6 1730
FAST f=16 a=3 44.179259 2.469304 8 1970
FAST f=16 a=3 0.332657 2.469304 8 1970
FAST f=16 a=4 37.612728 2.478208 6 1970
FAST f=16 a=4 0.32498 2.478208 6 1970
FAST f=16 a=5 35.056222 2.475568 6 1298
FAST f=16 a=5 0.302824 2.475568 6 1298
FAST f=16 a=6 34.713012 2.486079 8 1730
FAST f=16 a=6 0.24755 2.486079 8 1730
FAST f=16 a=7 33.713687 2.477180 6 1682
FAST f=16 a=7 0.280358 2.477180 6 1682
FAST f=16 a=8 31.571412 2.475418 8 1538
FAST f=16 a=8 0.241241 2.475418 8 1538
FAST f=16 a=9 31.608069 2.478263 8 1922
FAST f=16 a=9 0.241764 2.478263 8 1922
FAST f=16 a=10 31.358002 2.472263 8 1442
FAST f=16 a=10 0.221661 2.472263 8 1442
FAST f=17 a=1 66.185775 2.536085 6 1346
FAST f=17 a=1 0.713549 2.536085 6 1346
FAST f=17 a=2 50.365000 2.546105 8 1298
FAST f=17 a=2 0.467846 2.546105 8 1298
FAST f=17 a=3 42.712843 2.536250 8 1298
FAST f=17 a=3 0.34047 2.536250 8 1298
FAST f=17 a=4 39.514227 2.535555 8 1442
FAST f=17 a=4 0.302989 2.535555 8 1442
FAST f=17 a=5 35.189292 2.524925 8 1202
FAST f=17 a=5 0.273451 2.524925 8 1202
FAST f=17 a=6 35.791683 2.523466 8 1202
FAST f=17 a=6 0.268261 2.523466 8 1202
FAST f=17 a=7 37.416136 2.526625 6 1010
FAST f=17 a=7 0.277558 2.526625 6 1010
FAST f=17 a=8 37.084707 2.533274 6 1250
FAST f=17 a=8 0.285104 2.533274 6 1250
FAST f=17 a=9 34.183814 2.532765 8 1298
FAST f=17 a=9 0.235133 2.532765 8 1298
FAST f=17 a=10 31.149235 2.528722 8 1346
FAST f=17 a=10 0.232679 2.528722 8 1346
FAST f=18 a=1 72.942176 2.559857 6 386
FAST f=18 a=1 0.718618 2.559857 6 386
FAST f=18 a=2 51.690440 2.559572 8 290
FAST f=18 a=2 0.403978 2.559572 8 290
FAST f=18 a=3 45.344908 2.561040 8 962
FAST f=18 a=3 0.357205 2.561040 8 962
FAST f=18 a=4 39.804522 2.558446 8 1010
FAST f=18 a=4 0.310526 2.558446 8 1010
FAST f=18 a=5 38.134888 2.561811 8 626
FAST f=18 a=5 0.273743 2.561811 8 626
FAST f=18 a=6 35.091890 2.555518 8 722
FAST f=18 a=6 0.260135 2.555518 8 722
FAST f=18 a=7 34.639523 2.562938 8 290
FAST f=18 a=7 0.234294 2.562938 8 290
FAST f=18 a=8 36.076431 2.563567 8 1586
FAST f=18 a=8 0.274075 2.563567 8 1586
FAST f=18 a=9 36.376433 2.560950 8 722
FAST f=18 a=9 0.240106 2.560950 8 722
FAST f=18 a=10 32.624790 2.559340 8 578
FAST f=18 a=10 0.234704 2.559340 8 578
FAST f=19 a=1 70.513761 2.572441 8 194
FAST f=19 a=1 0.726112 2.572441 8 194
FAST f=19 a=2 59.263032 2.574560 8 482
FAST f=19 a=2 0.451554 2.574560 8 482
FAST f=19 a=3 51.509594 2.571546 6 194
FAST f=19 a=3 0.393014 2.571546 6 194
FAST f=19 a=4 55.393906 2.573386 8 482
FAST f=19 a=4 0.38819 2.573386 8 482
FAST f=19 a=5 43.201736 2.567589 8 674
FAST f=19 a=5 0.292155 2.567589 8 674
FAST f=19 a=6 42.911687 2.572666 6 434
FAST f=19 a=6 0.303988 2.572666 6 434
FAST f=19 a=7 44.687591 2.573613 6 290
FAST f=19 a=7 0.308721 2.573613 6 290
FAST f=19 a=8 37.372868 2.571039 6 194
FAST f=19 a=8 0.287137 2.571039 6 194
FAST f=19 a=9 36.074230 2.566473 6 482
FAST f=19 a=9 0.280721 2.566473 6 482
FAST f=19 a=10 33.731720 2.570306 8 194
FAST f=19 a=10 0.224073 2.570306 8 194
FAST f=20 a=1 79.670634 2.581146 6 290
FAST f=20 a=1 0.899986 2.581146 6 290
FAST f=20 a=2 58.827141 2.579782 8 386
FAST f=20 a=2 0.602288 2.579782 8 386
FAST f=20 a=3 51.289004 2.579627 8 722
FAST f=20 a=3 0.446091 2.579627 8 722
FAST f=20 a=4 47.711068 2.581508 8 722
FAST f=20 a=4 0.473007 2.581508 8 722
FAST f=20 a=5 47.402929 2.578062 6 434
FAST f=20 a=5 0.497131 2.578062 6 434
FAST f=20 a=6 54.797102 2.577365 8 482
FAST f=20 a=6 0.515061 2.577365 8 482
FAST f=20 a=7 51.370877 2.583050 8 386
FAST f=20 a=7 0.402878 2.583050 8 386
FAST f=20 a=8 51.437931 2.574875 6 242
FAST f=20 a=8 0.453094 2.574875 6 242
FAST f=20 a=9 44.105456 2.576700 6 242
FAST f=20 a=9 0.456633 2.576700 6 242
FAST f=20 a=10 44.447580 2.578305 8 338
FAST f=20 a=10 0.409121 2.578305 8 338
FAST f=21 a=1 113.031686 2.582449 6 242
FAST f=21 a=1 1.456971 2.582449 6 242
FAST f=21 a=2 97.700932 2.582124 8 194
FAST f=21 a=2 1.072078 2.582124 8 194
FAST f=21 a=3 96.563648 2.585479 8 434
FAST f=21 a=3 0.949528 2.585479 8 434
FAST f=21 a=4 90.597813 2.582366 6 386
FAST f=21 a=4 0.76944 2.582366 6 386
FAST f=21 a=5 86.815980 2.579043 8 434
FAST f=21 a=5 0.858167 2.579043 8 434
FAST f=21 a=6 91.235820 2.578378 8 530
FAST f=21 a=6 0.684274 2.578378 8 530
FAST f=21 a=7 84.392788 2.581243 8 386
FAST f=21 a=7 0.814386 2.581243 8 386
FAST f=21 a=8 82.052310 2.582547 8 338
FAST f=21 a=8 0.822633 2.582547 8 338
FAST f=21 a=9 74.696074 2.579319 8 194
FAST f=21 a=9 0.811028 2.579319 8 194
FAST f=21 a=10 76.211170 2.578766 8 290
FAST f=21 a=10 0.809715 2.578766 8 290
FAST f=22 a=1 138.976871 2.580478 8 194
FAST f=22 a=1 1.748932 2.580478 8 194
FAST f=22 a=2 120.164097 2.583633 8 386
FAST f=22 a=2 1.333239 2.583633 8 386
FAST f=22 a=3 111.986474 2.582566 6 194
FAST f=22 a=3 1.305734 2.582566 6 194
FAST f=22 a=4 108.548148 2.583068 6 194
FAST f=22 a=4 1.314026 2.583068 6 194
FAST f=22 a=5 103.173017 2.583495 6 290
FAST f=22 a=5 1.228664 2.583495 6 290
FAST f=22 a=6 108.421262 2.582349 8 530
FAST f=22 a=6 1.076773 2.582349 8 530
FAST f=22 a=7 103.284127 2.581022 8 386
FAST f=22 a=7 1.112117 2.581022 8 386
FAST f=22 a=8 96.330279 2.581073 8 290
FAST f=22 a=8 1.109303 2.581073 8 290
FAST f=22 a=9 97.651348 2.580075 6 194
FAST f=22 a=9 0.933032 2.580075 6 194
FAST f=22 a=10 101.660621 2.584886 8 194
FAST f=22 a=10 0.796823 2.584886 8 194
FAST f=23 a=1 159.322978 2.581474 6 242
FAST f=23 a=1 2.015878 2.581474 6 242
FAST f=23 a=2 134.331775 2.581619 8 194
FAST f=23 a=2 1.545845 2.581619 8 194
FAST f=23 a=3 127.724552 2.579888 6 338
FAST f=23 a=3 1.444496 2.579888 6 338
FAST f=23 a=4 126.077675 2.578137 6 242
FAST f=23 a=4 1.364394 2.578137 6 242
FAST f=23 a=5 124.914027 2.580843 8 338
FAST f=23 a=5 1.116059 2.580843 8 338
FAST f=23 a=6 122.874153 2.577637 6 338
FAST f=23 a=6 1.164584 2.577637 6 338
FAST f=23 a=7 123.099257 2.582715 6 386
FAST f=23 a=7 1.354042 2.582715 6 386
FAST f=23 a=8 122.026753 2.577681 8 194
FAST f=23 a=8 1.210966 2.577681 8 194
FAST f=23 a=9 121.164312 2.584599 6 290
FAST f=23 a=9 1.174859 2.584599 6 290
FAST f=23 a=10 117.462222 2.580358 8 194
FAST f=23 a=10 1.075258 2.580358 8 194
FAST f=24 a=1 169.539659 2.581642 6 194
FAST f=24 a=1 1.916804 2.581642 6 194
FAST f=24 a=2 160.539270 2.580421 6 290
FAST f=24 a=2 1.71087 2.580421 6 290
FAST f=24 a=3 155.455874 2.580449 6 242
FAST f=24 a=3 1.60307 2.580449 6 242
FAST f=24 a=4 147.630320 2.582953 6 338
FAST f=24 a=4 1.396364 2.582953 6 338
FAST f=24 a=5 133.767428 2.580589 6 290
FAST f=24 a=5 1.19933 2.580589 6 290
FAST f=24 a=6 146.437535 2.579453 8 194
FAST f=24 a=6 1.385405 2.579453 8 194
FAST f=24 a=7 147.227507 2.584155 8 386
FAST f=24 a=7 1.48942 2.584155 8 386
FAST f=24 a=8 138.005773 2.584115 8 194
FAST f=24 a=8 1.352 2.584115 8 194
FAST f=24 a=9 141.442625 2.582902 8 290
FAST f=24 a=9 1.39647 2.582902 8 290
FAST f=24 a=10 142.157446 2.582701 8 434
FAST f=24 a=10 1.498889 2.582701 8 434

View File

@ -5,7 +5,6 @@
#include <ctype.h>
#include <time.h>
#include "random.h"
#include "fastCover.h"
#include "dictBuilder.h"
#include "zstd_internal.h" /* includes zstd.h */
#include "io.h"
@ -149,7 +148,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
/* Allocate dst with enough space to compress the maximum sized sample */
{
size_t maxSampleSize = 0;
for (int i = 0; i < srcInfo->nbSamples; i++) {
for (i = 0; i < srcInfo->nbSamples; i++) {
maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize);
}
dstCapacity = ZSTD_compressBound(maxSampleSize);
@ -291,6 +290,9 @@ int main(int argCount, const char* argv[])
/* Initialize arguments to default values */
unsigned k = 200;
unsigned d = 8;
unsigned f;
unsigned accel;
unsigned i;
const unsigned cLevel = DEFAULT_CLEVEL;
const unsigned dictID = 0;
const unsigned maxDictSize = g_defaultMaxDictSize;
@ -305,7 +307,7 @@ int main(int argCount, const char* argv[])
const char** extendedFileList = NULL;
/* Parse arguments */
for (int i = 1; i < argCount; i++) {
for (i = 1; i < argCount; i++) {
const char* argument = argv[i];
if (longCommandWArg(&argument, "in=")) {
filenameTable[filenameIdx] = argument;
@ -375,6 +377,7 @@ int main(int argCount, const char* argv[])
/* for cover */
{
/* for cover (optimizing k and d) */
ZDICT_cover_params_t coverParam;
memset(&coverParam, 0, sizeof(coverParam));
coverParam.zParams = zParams;
@ -388,6 +391,7 @@ int main(int argCount, const char* argv[])
goto _cleanup;
}
/* for cover (with k and d provided) */
const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
if(coverResult) {
@ -398,29 +402,34 @@ int main(int argCount, const char* argv[])
}
/* for fastCover */
for (unsigned f = 15; f < 25; f++){
for (f = 15; f < 25; f++){
DISPLAYLEVEL(2, "current f is %u\n", f);
/* for fastCover (optimizing k and d) */
ZDICT_fastCover_params_t fastParam;
memset(&fastParam, 0, sizeof(fastParam));
fastParam.zParams = zParams;
fastParam.splitPoint = 1.0;
fastParam.f = f;
fastParam.steps = 40;
fastParam.nbThreads = 1;
const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
if(fastOptResult) {
result = 1;
goto _cleanup;
}
for (accel = 1; accel < 11; accel++) {
DISPLAYLEVEL(2, "current accel is %u\n", accel);
/* for fastCover (optimizing k and d) */
ZDICT_fastCover_params_t fastParam;
memset(&fastParam, 0, sizeof(fastParam));
fastParam.zParams = zParams;
fastParam.f = f;
fastParam.steps = 40;
fastParam.nbThreads = 1;
fastParam.accel = accel;
const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
if(fastOptResult) {
result = 1;
goto _cleanup;
}
/* for fastCover (with k and d provided) */
const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
if(fastResult) {
result = 1;
goto _cleanup;
/* for fastCover (with k and d provided) */
for (i = 0; i < 5; i++) {
const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
if(fastResult) {
result = 1;
goto _cleanup;
}
}
}
}

View File

@ -197,7 +197,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
bestSegment.end = newEnd;
}
{
/* Half the frequency of hash value of each dmer covered by the chosen segment. */
/* Zero the frequency of hash value of each dmer covered by the chosen segment. */
U32 pos;
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
@ -300,7 +300,7 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
if (totalSamplesSize < MAX(d, sizeof(U64)) ||
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
(U32)(totalSamplesSize>>20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
(U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
return 0;
}
/* Check if there are at least 5 training samples */

View File

@ -69,6 +69,7 @@ cxx_library(
]),
headers=subdir_glob([
('dictBuilder', 'divsufsort.h'),
('dictBuilder', 'cover.h'),
]),
srcs=glob(['dictBuilder/*.c']),
deps=[':common'],

View File

@ -29,6 +29,7 @@
#include "mem.h" /* read */
#include "pool.h"
#include "threading.h"
#include "cover.h"
#include "zstd_internal.h" /* includes zstd.h */
#ifndef ZDICT_STATIC_LINKING_ONLY
#define ZDICT_STATIC_LINKING_ONLY
@ -185,7 +186,7 @@ static void COVER_map_remove(COVER_map_t *map, U32 key) {
}
/**
* Destroyes a map that is inited with COVER_map_init().
* Destroys a map that is inited with COVER_map_init().
*/
static void COVER_map_destroy(COVER_map_t *map) {
if (map->data) {
@ -223,7 +224,7 @@ static COVER_ctx_t *g_ctx = NULL;
/**
* Returns the sum of the sample sizes.
*/
static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
size_t sum = 0;
unsigned i;
for (i = 0; i < nbSamples; ++i) {
@ -380,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
ctx->suffix[dmerId] = freq;
}
/**
* A segment is a range in the source as well as the score of the segment.
*/
typedef struct {
U32 begin;
U32 end;
U32 score;
} COVER_segment_t;
/**
* Selects the best segment in an epoch.
@ -691,7 +684,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
BYTE* const dict = (BYTE*)dictBuffer;
COVER_ctx_t ctx;
COVER_map_t activeDmers;
parameters.splitPoint = 1.0;
parameters.splitPoint = parameters.splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters.splitPoint;
/* Initialize global data */
g_displayLevel = parameters.zParams.notificationLevel;
/* Checks */
@ -737,28 +730,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
}
}
/**
* COVER_best_t is used for two purposes:
* 1. Synchronizing threads.
* 2. Saving the best parameters and dictionary.
*
* All of the methods except COVER_best_init() are thread safe if zstd is
* compiled with multithreaded support.
*/
typedef struct COVER_best_s {
ZSTD_pthread_mutex_t mutex;
ZSTD_pthread_cond_t cond;
size_t liveJobs;
void *dict;
size_t dictSize;
ZDICT_cover_params_t parameters;
size_t compressedSize;
} COVER_best_t;
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
const size_t *samplesSizes, const BYTE *samples,
size_t *offsets,
size_t nbTrainSamples, size_t nbSamples,
BYTE *const dict, size_t dictBufferCapacity) {
size_t totalCompressedSize = ERROR(GENERIC);
/* Pointers */
ZSTD_CCtx *cctx;
ZSTD_CDict *cdict;
void *dst;
/* Local variables */
size_t dstCapacity;
size_t i;
/* Allocate dst with enough space to compress the maximum sized sample */
{
size_t maxSampleSize = 0;
i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
for (; i < nbSamples; ++i) {
maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
}
dstCapacity = ZSTD_compressBound(maxSampleSize);
dst = malloc(dstCapacity);
}
/* Create the cctx and cdict */
cctx = ZSTD_createCCtx();
cdict = ZSTD_createCDict(dict, dictBufferCapacity,
parameters.zParams.compressionLevel);
if (!dst || !cctx || !cdict) {
goto _compressCleanup;
}
/* Compress each sample and sum their sizes (or error) */
totalCompressedSize = dictBufferCapacity;
i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
for (; i < nbSamples; ++i) {
const size_t size = ZSTD_compress_usingCDict(
cctx, dst, dstCapacity, samples + offsets[i],
samplesSizes[i], cdict);
if (ZSTD_isError(size)) {
totalCompressedSize = ERROR(GENERIC);
goto _compressCleanup;
}
totalCompressedSize += size;
}
_compressCleanup:
ZSTD_freeCCtx(cctx);
ZSTD_freeCDict(cdict);
if (dst) {
free(dst);
}
return totalCompressedSize;
}
/**
* Initialize the `COVER_best_t`.
*/
static void COVER_best_init(COVER_best_t *best) {
void COVER_best_init(COVER_best_t *best) {
if (best==NULL) return; /* compatible with init on NULL */
(void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
(void)ZSTD_pthread_cond_init(&best->cond, NULL);
@ -772,7 +802,7 @@ static void COVER_best_init(COVER_best_t *best) {
/**
* Wait until liveJobs == 0.
*/
static void COVER_best_wait(COVER_best_t *best) {
void COVER_best_wait(COVER_best_t *best) {
if (!best) {
return;
}
@ -786,7 +816,7 @@ static void COVER_best_wait(COVER_best_t *best) {
/**
* Call COVER_best_wait() and then destroy the COVER_best_t.
*/
static void COVER_best_destroy(COVER_best_t *best) {
void COVER_best_destroy(COVER_best_t *best) {
if (!best) {
return;
}
@ -802,7 +832,7 @@ static void COVER_best_destroy(COVER_best_t *best) {
* Called when a thread is about to be launched.
* Increments liveJobs.
*/
static void COVER_best_start(COVER_best_t *best) {
void COVER_best_start(COVER_best_t *best) {
if (!best) {
return;
}
@ -816,7 +846,7 @@ static void COVER_best_start(COVER_best_t *best) {
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
* If this dictionary is the best so far save it and its parameters.
*/
static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
ZDICT_cover_params_t parameters, void *dict,
size_t dictSize) {
if (!best) {
@ -847,10 +877,10 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
best->parameters = parameters;
best->compressedSize = compressedSize;
}
ZSTD_pthread_mutex_unlock(&best->mutex);
if (liveJobs == 0) {
ZSTD_pthread_cond_broadcast(&best->cond);
}
ZSTD_pthread_mutex_unlock(&best->mutex);
}
}
@ -904,51 +934,10 @@ static void COVER_tryParameters(void *opaque) {
}
}
/* Check total compressed size */
{
/* Pointers */
ZSTD_CCtx *cctx;
ZSTD_CDict *cdict;
void *dst;
/* Local variables */
size_t dstCapacity;
size_t i;
/* Allocate dst with enough space to compress the maximum sized sample */
{
size_t maxSampleSize = 0;
i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
for (; i < ctx->nbSamples; ++i) {
maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
}
dstCapacity = ZSTD_compressBound(maxSampleSize);
dst = malloc(dstCapacity);
}
/* Create the cctx and cdict */
cctx = ZSTD_createCCtx();
cdict = ZSTD_createCDict(dict, dictBufferCapacity,
parameters.zParams.compressionLevel);
if (!dst || !cctx || !cdict) {
goto _compressCleanup;
}
/* Compress each sample and sum their sizes (or error) */
totalCompressedSize = dictBufferCapacity;
i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
for (; i < ctx->nbSamples; ++i) {
const size_t size = ZSTD_compress_usingCDict(
cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
ctx->samplesSizes[i], cdict);
if (ZSTD_isError(size)) {
totalCompressedSize = ERROR(GENERIC);
goto _compressCleanup;
}
totalCompressedSize += size;
}
_compressCleanup:
ZSTD_freeCCtx(cctx);
ZSTD_freeCDict(cdict);
if (dst) {
free(dst);
}
}
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
ctx->samples, ctx->offsets,
ctx->nbTrainSamples, ctx->nbSamples,
dict, dictBufferCapacity);
_cleanup:
COVER_best_finish(data->best, totalCompressedSize, parameters, dict,

83
lib/dictBuilder/cover.h Normal file
View File

@ -0,0 +1,83 @@
#include <stdio.h> /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h> /* clock */
#include "mem.h" /* read */
#include "pool.h"
#include "threading.h"
#include "zstd_internal.h" /* includes zstd.h */
#ifndef ZDICT_STATIC_LINKING_ONLY
#define ZDICT_STATIC_LINKING_ONLY
#endif
#include "zdict.h"
/**
* COVER_best_t is used for two purposes:
* 1. Synchronizing threads.
* 2. Saving the best parameters and dictionary.
*
* All of the methods except COVER_best_init() are thread safe if zstd is
* compiled with multithreaded support.
*/
typedef struct COVER_best_s {
ZSTD_pthread_mutex_t mutex;
ZSTD_pthread_cond_t cond;
size_t liveJobs;
void *dict;
size_t dictSize;
ZDICT_cover_params_t parameters;
size_t compressedSize;
} COVER_best_t;
/**
* A segment is a range in the source as well as the score of the segment.
*/
typedef struct {
U32 begin;
U32 end;
U32 score;
} COVER_segment_t;
/**
* Checks total compressed size of a dictionary
*/
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
const size_t *samplesSizes, const BYTE *samples,
size_t *offsets,
size_t nbTrainSamples, size_t nbSamples,
BYTE *const dict, size_t dictBufferCapacity);
/**
* Returns the sum of the sample sizes.
*/
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
/**
* Initialize the `COVER_best_t`.
*/
void COVER_best_init(COVER_best_t *best);
/**
* Wait until liveJobs == 0.
*/
void COVER_best_wait(COVER_best_t *best);
/**
* Call COVER_best_wait() and then destroy the COVER_best_t.
*/
void COVER_best_destroy(COVER_best_t *best);
/**
* Called when a thread is about to be launched.
* Increments liveJobs.
*/
void COVER_best_start(COVER_best_t *best);
/**
* Called when a thread finishes executing, both on error or success.
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
* If this dictionary is the best so far save it and its parameters.
*/
void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
ZDICT_cover_params_t parameters, void *dict,
size_t dictSize);

701
lib/dictBuilder/fastcover.c Normal file
View File

@ -0,0 +1,701 @@
/*-*************************************
* Dependencies
***************************************/
#include <stdio.h> /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h> /* clock */
#include "mem.h" /* read */
#include "pool.h"
#include "threading.h"
#include "cover.h"
#include "zstd_internal.h" /* includes zstd.h */
#ifndef ZDICT_STATIC_LINKING_ONLY
#define ZDICT_STATIC_LINKING_ONLY
#endif
#include "zdict.h"
/*-*************************************
* Constants
***************************************/
#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
#define FASTCOVER_MAX_F 31
#define FASTCOVER_MAX_ACCEL 10
#define DEFAULT_SPLITPOINT 0.75
#define DEFAULT_F 18
#define DEFAULT_ACCEL 1
/*-*************************************
* Console display
***************************************/
static int g_displayLevel = 2;
#define DISPLAY(...) \
{ \
fprintf(stderr, __VA_ARGS__); \
fflush(stderr); \
}
#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
if (displayLevel >= l) { \
DISPLAY(__VA_ARGS__); \
} /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
if (displayLevel >= l) { \
if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \
g_time = clock(); \
DISPLAY(__VA_ARGS__); \
} \
}
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
static clock_t g_time = 0;
/*-*************************************
* Hash Functions
***************************************/
static const U64 prime6bytes = 227718039650203ULL;
static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
/**
* Hash the d-byte value pointed to by p and mod 2^f
*/
static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
if (d == 6) {
return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1);
}
return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1);
}
/*-*************************************
* Acceleration
***************************************/
typedef struct {
unsigned finalize; /* Percentage of training samples used for ZDICT_finalizeDictionary */
unsigned skip; /* Number of dmer skipped between each dmer counted in computeFrequency */
} FASTCOVER_accel_t;
static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
{ 100, 0 }, /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
{ 100, 0 }, /* accel = 1 */
{ 50, 1 }, /* accel = 2 */
{ 34, 2 }, /* accel = 3 */
{ 25, 3 }, /* accel = 4 */
{ 20, 4 }, /* accel = 5 */
{ 17, 5 }, /* accel = 6 */
{ 14, 6 }, /* accel = 7 */
{ 13, 7 }, /* accel = 8 */
{ 11, 8 }, /* accel = 9 */
{ 10, 9 }, /* accel = 10 */
};
/*-*************************************
* Context
***************************************/
typedef struct {
const BYTE *samples;
size_t *offsets;
const size_t *samplesSizes;
size_t nbSamples;
size_t nbTrainSamples;
size_t nbTestSamples;
size_t nbDmers;
U32 *freqs;
unsigned d;
unsigned f;
FASTCOVER_accel_t accelParams;
} FASTCOVER_ctx_t;
/*-*************************************
* Helper functions
***************************************/
/**
* Selects the best segment in an epoch.
* Segments of are scored according to the function:
*
* Let F(d) be the frequency of all dmers with hash value d.
* Let S_i be hash value of the dmer at position i of segment S which has length k.
*
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
*
* Once the dmer with hash value d is in the dictionay we set F(d) = 0.
*/
static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
U32 *freqs, U32 begin, U32 end,
ZDICT_cover_params_t parameters,
U16* segmentFreqs) {
/* Constants */
const U32 k = parameters.k;
const U32 d = parameters.d;
const U32 f = ctx->f;
const U32 dmersInK = k - d + 1;
/* Try each segment (activeSegment) and save the best (bestSegment) */
COVER_segment_t bestSegment = {0, 0, 0};
COVER_segment_t activeSegment;
/* Reset the activeDmers in the segment */
/* The activeSegment starts at the beginning of the epoch. */
activeSegment.begin = begin;
activeSegment.end = begin;
activeSegment.score = 0;
/* Slide the activeSegment through the whole epoch.
* Save the best segment in bestSegment.
*/
while (activeSegment.end < end) {
/* Get hash value of current dmer */
const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
/* Add frequency of this index to score if this is the first occurence of index in active segment */
if (segmentFreqs[index] == 0) {
activeSegment.score += freqs[index];
}
/* Increment end of segment and segmentFreqs*/
activeSegment.end += 1;
segmentFreqs[index] += 1;
/* If the window is now too large, drop the first position */
if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
/* Get hash value of the dmer to be eliminated from active segment */
const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
segmentFreqs[delIndex] -= 1;
/* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
if (segmentFreqs[delIndex] == 0) {
activeSegment.score -= freqs[delIndex];
}
/* Increment start of segment */
activeSegment.begin += 1;
}
/* If this segment is the best so far save it */
if (activeSegment.score > bestSegment.score) {
bestSegment = activeSegment;
}
}
/* Zero out rest of segmentFreqs array */
while (activeSegment.begin < end) {
const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
segmentFreqs[delIndex] -= 1;
activeSegment.begin += 1;
}
{
/* Zero the frequency of hash value of each dmer covered by the chosen segment. */
U32 pos;
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
freqs[i] = 0;
}
}
return bestSegment;
}
static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
size_t maxDictSize, unsigned f,
unsigned accel) {
/* k, d, and f are required parameters */
if (parameters.d == 0 || parameters.k == 0) {
return 0;
}
/* d has to be 6 or 8 */
if (parameters.d != 6 && parameters.d != 8) {
return 0;
}
/* k <= maxDictSize */
if (parameters.k > maxDictSize) {
return 0;
}
/* d <= k */
if (parameters.d > parameters.k) {
return 0;
}
/* 0 < f <= FASTCOVER_MAX_F*/
if (f > FASTCOVER_MAX_F || f == 0) {
return 0;
}
/* 0 < splitPoint <= 1 */
if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
return 0;
}
/* 0 < accel <= 10 */
if (accel > 10 || accel == 0) {
return 0;
}
return 1;
}
/**
* Clean up a context initialized with `FASTCOVER_ctx_init()`.
*/
static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) {
if (!ctx) {
return;
}
free(ctx->freqs);
ctx->freqs = NULL;
free(ctx->offsets);
ctx->offsets = NULL;
}
/**
* Calculate for frequency of hash value of each dmer in ctx->samples
*/
static void FASTCOVER_computeFrequency(U32 *freqs, FASTCOVER_ctx_t *ctx){
const unsigned f = ctx->f;
const unsigned d = ctx->d;
const unsigned skip = ctx->accelParams.skip;
const unsigned readLength = MAX(d, 8);
size_t start; /* start of current dmer */
size_t i;
for (i = 0; i < ctx->nbTrainSamples; i++) {
size_t currSampleStart = ctx->offsets[i];
size_t currSampleEnd = ctx->offsets[i+1];
start = currSampleStart;
while (start + readLength <= currSampleEnd) {
const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
freqs[dmerIndex]++;
start = start + skip + 1;
}
}
}
/**
* Prepare a context for dictionary building.
* The context is only dependent on the parameter `d` and can used multiple
* times.
* Returns 1 on success or zero on error.
* The context must be destroyed with `FASTCOVER_ctx_destroy()`.
*/
static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
unsigned d, double splitPoint, unsigned f,
FASTCOVER_accel_t accelParams) {
const BYTE *const samples = (const BYTE *)samplesBuffer;
const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
/* Split samples into testing and training sets */
const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
/* Checks */
if (totalSamplesSize < MAX(d, sizeof(U64)) ||
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
(U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
return 0;
}
/* Check if there are at least 5 training samples */
if (nbTrainSamples < 5) {
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
return 0;
}
/* Check if there's testing sample */
if (nbTestSamples < 1) {
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
return 0;
}
/* Zero the context */
memset(ctx, 0, sizeof(*ctx));
DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
(U32)trainingSamplesSize);
DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
(U32)testSamplesSize);
ctx->samples = samples;
ctx->samplesSizes = samplesSizes;
ctx->nbSamples = nbSamples;
ctx->nbTrainSamples = nbTrainSamples;
ctx->nbTestSamples = nbTestSamples;
ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
ctx->d = d;
ctx->f = f;
ctx->accelParams = accelParams;
/* The offsets of each file */
ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
if (!ctx->offsets) {
DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
FASTCOVER_ctx_destroy(ctx);
return 0;
}
/* Fill offsets from the samplesSizes */
{
U32 i;
ctx->offsets[0] = 0;
for (i = 1; i <= nbSamples; ++i) {
ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
}
}
/* Initialize frequency array of size 2^f */
ctx->freqs = (U32 *)calloc(((U64)1 << f), sizeof(U32));
DISPLAYLEVEL(2, "Computing frequencies\n");
FASTCOVER_computeFrequency(ctx->freqs, ctx);
return 1;
}
/**
* Given the prepared context build the dictionary.
*/
static size_t FASTCOVER_buildDictionary(const FASTCOVER_ctx_t *ctx, U32 *freqs,
void *dictBuffer, size_t dictBufferCapacity,
ZDICT_cover_params_t parameters, U16* segmentFreqs){
BYTE *const dict = (BYTE *)dictBuffer;
size_t tail = dictBufferCapacity;
/* Divide the data up into epochs of equal size.
* We will select at least one segment from each epoch.
*/
const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
const U32 epochSize = (U32)(ctx->nbDmers / epochs);
size_t epoch;
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
epochSize);
/* Loop through the epochs until there are no more segments or the dictionary
* is full.
*/
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
const U32 epochBegin = (U32)(epoch * epochSize);
const U32 epochEnd = epochBegin + epochSize;
size_t segmentSize;
/* Select a segment */
COVER_segment_t segment = FASTCOVER_selectSegment(
ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
/* If the segment covers no dmers, then we are out of content */
if (segment.score == 0) {
break;
}
/* Trim the segment if necessary and if it is too small then we are done */
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
if (segmentSize < parameters.d) {
break;
}
/* We fill the dictionary from the back to allow the best segments to be
* referenced with the smallest offsets.
*/
tail -= segmentSize;
memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
DISPLAYUPDATE(
2, "\r%u%% ",
(U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
}
DISPLAYLEVEL(2, "\r%79s\r", "");
return tail;
}
/**
* Parameters for FASTCOVER_tryParameters().
*/
typedef struct FASTCOVER_tryParameters_data_s {
const FASTCOVER_ctx_t *ctx;
COVER_best_t *best;
size_t dictBufferCapacity;
ZDICT_cover_params_t parameters;
} FASTCOVER_tryParameters_data_t;
/**
* Tries a set of parameters and updates the COVER_best_t with the results.
* This function is thread safe if zstd is compiled with multithreaded support.
* It takes its parameters as an *OWNING* opaque pointer to support threading.
*/
static void FASTCOVER_tryParameters(void *opaque) {
/* Save parameters as local variables */
FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
const FASTCOVER_ctx_t *const ctx = data->ctx;
const ZDICT_cover_params_t parameters = data->parameters;
size_t dictBufferCapacity = data->dictBufferCapacity;
size_t totalCompressedSize = ERROR(GENERIC);
/* Initialize array to keep track of frequency of dmer within activeSegment */
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
/* Allocate space for hash table, dict, and freqs */
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
if (!segmentFreqs || !dict || !freqs) {
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
goto _cleanup;
}
/* Copy the frequencies because we need to modify them */
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
/* Build the dictionary */
{
const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
parameters, segmentFreqs);
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
dictBufferCapacity = ZDICT_finalizeDictionary(
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
if (ZDICT_isError(dictBufferCapacity)) {
DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
goto _cleanup;
}
}
/* Check total compressed size */
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
ctx->samples, ctx->offsets,
ctx->nbTrainSamples, ctx->nbSamples,
dict, dictBufferCapacity);
_cleanup:
COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
dictBufferCapacity);
free(data);
free(segmentFreqs);
free(dict);
free(freqs);
}
static void FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
ZDICT_cover_params_t *coverParams) {
coverParams->k = fastCoverParams.k;
coverParams->d = fastCoverParams.d;
coverParams->steps = fastCoverParams.steps;
coverParams->nbThreads = fastCoverParams.nbThreads;
coverParams->splitPoint = fastCoverParams.splitPoint;
coverParams->zParams = fastCoverParams.zParams;
}
static void FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
ZDICT_fastCover_params_t *fastCoverParams,
unsigned f, unsigned accel) {
fastCoverParams->k = coverParams.k;
fastCoverParams->d = coverParams.d;
fastCoverParams->steps = coverParams.steps;
fastCoverParams->nbThreads = coverParams.nbThreads;
fastCoverParams->splitPoint = coverParams.splitPoint;
fastCoverParams->f = f;
fastCoverParams->accel = accel;
fastCoverParams->zParams = coverParams.zParams;
}
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters) {
BYTE* const dict = (BYTE*)dictBuffer;
FASTCOVER_ctx_t ctx;
ZDICT_cover_params_t coverParams;
FASTCOVER_accel_t accelParams;
/* Initialize global data */
g_displayLevel = parameters.zParams.notificationLevel;
/* Assign splitPoint and f if not provided */
parameters.splitPoint = parameters.splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters.splitPoint;
parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
/* Convert to cover parameter */
memset(&coverParams, 0 , sizeof(coverParams));
FASTCOVER_convertToCoverParams(parameters, &coverParams);
/* Checks */
if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
parameters.accel)) {
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
return ERROR(GENERIC);
}
if (nbSamples == 0) {
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
return ERROR(GENERIC);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
ZDICT_DICTSIZE_MIN);
return ERROR(dstSize_tooSmall);
}
/* Assign corresponding FASTCOVER_accel_t to accelParams*/
accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
/* Initialize context */
if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
coverParams.d, parameters.splitPoint, parameters.f,
accelParams)) {
DISPLAYLEVEL(1, "Failed to initialize context\n");
return ERROR(GENERIC);
}
/* Build the dictionary */
DISPLAYLEVEL(2, "Building dictionary\n");
{
/* Initialize array to keep track of frequency of dmer within activeSegment */
U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
dictBufferCapacity, coverParams, segmentFreqs);
const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
const size_t dictionarySize = ZDICT_finalizeDictionary(
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
if (!ZSTD_isError(dictionarySize)) {
DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
(U32)dictionarySize);
}
FASTCOVER_ctx_destroy(&ctx);
free(segmentFreqs);
return dictionarySize;
}
}
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t *parameters) {
ZDICT_cover_params_t coverParams;
FASTCOVER_accel_t accelParams;
/* constants */
const unsigned nbThreads = parameters->nbThreads;
const double splitPoint =
parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
const unsigned kIterations =
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
/* Local variables */
const int displayLevel = parameters->zParams.notificationLevel;
unsigned iteration = 1;
unsigned d;
unsigned k;
COVER_best_t best;
POOL_ctx *pool = NULL;
/* Checks */
if (splitPoint <= 0 || splitPoint > 1) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
return ERROR(GENERIC);
}
if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
return ERROR(GENERIC);
}
if (kMinK < kMaxD || kMaxK < kMinK) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
return ERROR(GENERIC);
}
if (nbSamples == 0) {
LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
return ERROR(GENERIC);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
ZDICT_DICTSIZE_MIN);
return ERROR(dstSize_tooSmall);
}
if (nbThreads > 1) {
pool = POOL_create(nbThreads, 1);
if (!pool) {
return ERROR(memory_allocation);
}
}
/* Initialization */
COVER_best_init(&best);
memset(&coverParams, 0 , sizeof(coverParams));
FASTCOVER_convertToCoverParams(*parameters, &coverParams);
accelParams = FASTCOVER_defaultAccelParameters[accel];
/* Turn down global display level to clean up display at level 2 and below */
g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
/* Loop through d first because each new value needs a new context */
LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
kIterations);
for (d = kMinD; d <= kMaxD; d += 2) {
/* Initialize the context for this value of d */
FASTCOVER_ctx_t ctx;
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
COVER_best_destroy(&best);
POOL_free(pool);
return ERROR(GENERIC);
}
/* Loop through k reusing the same context */
for (k = kMinK; k <= kMaxK; k += kStepSize) {
/* Prepare the arguments */
FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
sizeof(FASTCOVER_tryParameters_data_t));
LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
if (!data) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
COVER_best_destroy(&best);
FASTCOVER_ctx_destroy(&ctx);
POOL_free(pool);
return ERROR(GENERIC);
}
data->ctx = &ctx;
data->best = &best;
data->dictBufferCapacity = dictBufferCapacity;
data->parameters = coverParams;
data->parameters.k = k;
data->parameters.d = d;
data->parameters.splitPoint = splitPoint;
data->parameters.steps = kSteps;
data->parameters.zParams.notificationLevel = g_displayLevel;
/* Check the parameters */
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
data->ctx->f, accel)) {
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
free(data);
continue;
}
/* Call the function and pass ownership of data to it */
COVER_best_start(&best);
if (pool) {
POOL_add(pool, &FASTCOVER_tryParameters, data);
} else {
FASTCOVER_tryParameters(data);
}
/* Print status */
LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
(U32)((iteration * 100) / kIterations));
++iteration;
}
COVER_best_wait(&best);
FASTCOVER_ctx_destroy(&ctx);
}
LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
/* Fill the output buffer and parameters with output of the best parameters */
{
const size_t dictSize = best.dictSize;
if (ZSTD_isError(best.compressedSize)) {
const size_t compressedSize = best.compressedSize;
COVER_best_destroy(&best);
POOL_free(pool);
return compressedSize;
}
FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
memcpy(dictBuffer, best.dict, dictSize);
COVER_best_destroy(&best);
POOL_free(pool);
return dictSize;
}
}

View File

@ -863,8 +863,8 @@ _cleanup:
size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
const void* customDictContent, size_t dictContentSize,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t params)
const void* samplesBuffer, const size_t* samplesSizes,
unsigned nbSamples, ZDICT_params_t params)
{
size_t hSize;
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
@ -1078,17 +1078,17 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
{
ZDICT_cover_params_t params;
ZDICT_fastCover_params_t params;
DEBUGLOG(3, "ZDICT_trainFromBuffer");
memset(&params, 0, sizeof(params));
params.d = 8;
params.steps = 4;
/* Default to level 6 since no compression level information is available */
params.zParams.compressionLevel = 6;
params.zParams.compressionLevel = 3;
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
params.zParams.notificationLevel = DEBUGLEVEL;
#endif
return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
samplesBuffer, samplesSizes, nbSamples,
&params);
}

View File

@ -39,7 +39,8 @@ extern "C" {
/*! ZDICT_trainFromBuffer():
* Train a dictionary from an array of samples.
* Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
* Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
* f=18, and accel=1.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`.
@ -90,6 +91,16 @@ typedef struct {
ZDICT_params_t zParams;
} ZDICT_cover_params_t;
typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(18)*/
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
ZDICT_params_t zParams;
} ZDICT_fastCover_params_t;
/*! ZDICT_trainFromBuffer_cover():
* Train a dictionary from an array of samples using the COVER algorithm.
@ -116,9 +127,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
* dictionary constructed with those parameters is stored in `dictBuffer`.
*
* All of the parameters d, k, steps are optional.
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
* if steps is zero it defaults to its default value.
* If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
* If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
*
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
@ -130,6 +141,48 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t* parameters);
/*! ZDICT_trainFromBuffer_fastCover():
* Train a dictionary from an array of samples using a modified version of COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* d and k are required.
* All other parameters are optional, will use default values if not provided
* The resulting dictionary will be saved into `dictBuffer`.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* Note: ZDICT_trainFromBuffer_fastCover() requires about 1 bytes of memory for each input byte and additionally another 6 * 2^f bytes of memory .
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t parameters);
/*! ZDICT_optimizeTrainFromBuffer_fastCover():
* The same requirements as above hold for all the parameters except `parameters`.
* This function tries many parameter combinations (specifically, k and d combinations)
* and picks the best parameters. `*parameters` is filled with the best parameters found,
* dictionary constructed with those parameters is stored in `dictBuffer`.
* All of the parameters d, k, steps, f, and accel are optional.
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
* if steps is zero it defaults to its default value.
* If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
* If f is zero, default value of 18 is used.
* If accel is zero, default value of 1 is used.
*
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* On success `*parameters` contains the parameters selected.
* Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory for each thread.
*/
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
size_t dictBufferCapacity, const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t* parameters);
/*! ZDICT_finalizeDictionary():
* Given a custom content as a basis for dictionary, and a set of samples,
* finalize dictionary by adding headers and statistics.

View File

@ -151,6 +151,7 @@ Advanced arguments :
Dictionary builder :
--train ## : create a dictionary from a training set of files
--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args
--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fastcover algorithm with optional args
--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
-o file : `file` is dictionary name (default: dictionary)
--maxdict=# : limit dictionary to specified size (default: 112640)

View File

@ -44,6 +44,7 @@
#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
#define FASTCOVER_MEMMULT 1 /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
#define NOISELENGTH 32
@ -271,16 +272,19 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCa
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
int optimizeCover)
ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
ZDICT_fastCover_params_t* fastCoverParams, int optimize)
{
unsigned const displayLevel = params ? params->zParams.notificationLevel :
coverParams ? coverParams->zParams.notificationLevel :
fastCoverParams ? fastCoverParams->zParams.notificationLevel :
0; /* should never happen */
void* const dictBuffer = malloc(maxDictSize);
fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
size_t const memMult = params ? MEMMULT :
coverParams ? COVER_MEMMULT:
FASTCOVER_MEMMULT;
size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
@ -312,6 +316,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
/* Load input buffer */
DISPLAYLEVEL(3, "Shuffling input files\n");
DiB_shuffle(fileNamesTable, nbFiles);
DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
{ size_t dictSize;
@ -320,19 +325,36 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
srcBuffer, sampleSizes, fs.nbSamples,
*params);
} else if (optimizeCover) {
assert(coverParams != NULL);
dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
srcBuffer, sampleSizes, fs.nbSamples,
coverParams);
if (!ZDICT_isError(dictSize)) {
unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d, coverParams->steps, splitPercentage);
} else if (coverParams) {
if (optimize) {
dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
srcBuffer, sampleSizes, fs.nbSamples,
coverParams);
if (!ZDICT_isError(dictSize)) {
unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
coverParams->steps, splitPercentage);
}
} else {
dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
sampleSizes, fs.nbSamples, *coverParams);
}
} else {
assert(coverParams != NULL);
dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
sampleSizes, fs.nbSamples, *coverParams);
assert(fastCoverParams != NULL);
if (optimize) {
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
srcBuffer, sampleSizes, fs.nbSamples,
fastCoverParams);
if (!ZDICT_isError(dictSize)) {
unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
fastCoverParams->accel);
}
} else {
dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
sampleSizes, fs.nbSamples, *fastCoverParams);
}
}
if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */

View File

@ -34,6 +34,6 @@
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
int optimizeCover);
ZDICT_fastCover_params_t* fastCoverParams, int optimize);
#endif

View File

@ -194,7 +194,7 @@ All arguments after \fB\-\-\fR are treated as files
Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
.
.IP
Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\.
Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\ or \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-fastCover=d=8,steps=4\fR\.
.
.TP
\fB\-o file\fR
@ -240,6 +240,25 @@ Examples:
\fBzstd \-\-train\-cover=k=50 FILEs\fR
.
.TP
\fB\-\-train\-fastcover[=k#,d=#,f=#,steps=#,split=#,accel=#]\fR
Same as cover but with extra parameters \fIf\fR and \fIaccel\fR and different default value of split
.
.IP
If \fIsplit\fR is not specified, then it tries \fIsplit\fR = 75. If \fIf\fR is not specified, then it tries \fIf\fR = 18. Requires that 0 < \fIf\fR < 32. If \fIaccel\fR is not specified, then it tries \fIaccel\fR = 1. Requires that 0 < \fIaccel\fR <= 10. Requires that \fId\fR = 6 or \fId\fR = 8.
.
.IP
\fIf\fR is log of size of array that keeps track of frequency of subsegments of size \fId\fR. The subsegment is hashed to an index in the range [0,2^\fIf\fR - 1]. It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency. Using a higher \fIf\fR reduces collision but takes longer.
.
.IP
Examples:
.
.IP
\fBzstd \-\-train\-fastcover FILEs\fR
.
.IP
\fBzstd \-\-train\-fastcover=d=8,f=15,accel=2 FILEs\fR
.
.TP
\fB\-\-train\-legacy[=selectivity=#]\fR
Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
.

View File

@ -254,6 +254,26 @@ Compression of small files similar to the sample set will be greatly improved.
`zstd --train-cover=k=50,split=60 FILEs`
* `--train-fastcover[=k#,d=#,f=#,steps=#,split=#,accel=#]`:
Same as cover but with extra parameters _f_ and _accel_ and different default value of split
If _split_ is not specified, then it tries _split_ = 75.
If _f_ is not specified, then it tries _f_ = 18.
Requires that 0 < _f_ < 32.
If _accel_ is not specified, then it tries _accel_ = 1.
Requires that 0 < _accel_ <= 10.
Requires that _d_ = 6 or _d_ = 8.
_f_ is log of size of array that keeps track of frequency of subsegments of size _d_.
The subsegment is hashed to an index in the range [0,2^_f_ - 1].
It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency.
Using a higher _f_ reduces collision but takes longer.
Examples:
`zstd --train-fastcover FILEs`
`zstd --train-fastcover=d=8,f=15,accel=2 FILEs`
* `--train-legacy[=selectivity=#]`:
Use legacy dictionary builder algorithm with the given dictionary
_selectivity_ (default: 9).

View File

@ -84,7 +84,10 @@ static U32 g_ldmMinMatch = 0;
static U32 g_ldmHashEveryLog = LDM_PARAM_DEFAULT;
static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT;
#define DEFAULT_SPLITPOINT 1.0
#define DEFAULT_ACCEL 1
typedef enum { cover, fastCover, legacy } dictType;
/*-************************************
* Display Macros
@ -172,6 +175,7 @@ static int usage_advanced(const char* programName)
DISPLAY( "Dictionary builder : \n");
DISPLAY( "--train ## : create a dictionary from a training set of files \n");
DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n");
DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
@ -295,6 +299,33 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
return 1;
}
/**
* parseFastCoverParameters() :
* reads fastcover parameters from *stringPtr (e.g. "--train-fastcover=k=48,d=8,f=20,steps=32,accel=2") into *params
* @return 1 means that fastcover parameters were correct
* @return 0 in case of malformed parameters
*/
static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_params_t* params)
{
memset(params, 0, sizeof(*params));
for (; ;) {
if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "f=")) { params->f = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "accel=")) { params->accel = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "split=")) {
unsigned splitPercentage = readU32FromChar(&stringPtr);
params->splitPoint = (double)splitPercentage / 100.0;
if (stringPtr[0]==',') { stringPtr++; continue; } else break;
}
return 0;
}
if (stringPtr[0] != 0) return 0;
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel);
return 1;
}
/**
* parseLegacyParameters() :
* reads legacy dictioanry builter parameters from *stringPtr (e.g. "--train-legacy=selectivity=8") into *selectivity
@ -316,7 +347,19 @@ static ZDICT_cover_params_t defaultCoverParams(void)
memset(&params, 0, sizeof(params));
params.d = 8;
params.steps = 4;
params.splitPoint = DEFAULT_SPLITPOINT;
params.splitPoint = 1.0;
return params;
}
static ZDICT_fastCover_params_t defaultFastCoverParams(void)
{
ZDICT_fastCover_params_t params;
memset(&params, 0, sizeof(params));
params.d = 8;
params.f = 18;
params.steps = 4;
params.splitPoint = 0.75; /* different from default splitPoint of cover */
params.accel = DEFAULT_ACCEL;
return params;
}
#endif
@ -431,7 +474,8 @@ int main(int argCount, const char* argv[])
#endif
#ifndef ZSTD_NODICT
ZDICT_cover_params_t coverParams = defaultCoverParams();
int cover = 1;
ZDICT_fastCover_params_t fastCoverParams = defaultFastCoverParams();
dictType dict = fastCover;
#endif
#ifndef ZSTD_NOBENCH
BMK_advancedParams_t benchParams = BMK_initAdvancedParams();
@ -530,18 +574,29 @@ int main(int argCount, const char* argv[])
operation = zom_train;
if (outFileName == NULL)
outFileName = g_defaultDictName;
cover = 1;
dict = cover;
/* Allow optional arguments following an = */
if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
continue;
}
if (longCommandWArg(&argument, "--train-fastcover")) {
operation = zom_train;
if (outFileName == NULL)
outFileName = g_defaultDictName;
dict = fastCover;
/* Allow optional arguments following an = */
if (*argument == 0) { memset(&fastCoverParams, 0, sizeof(fastCoverParams)); }
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
else if (!parseFastCoverParameters(argument, &fastCoverParams)) { CLEAN_RETURN(badusage(programName)); }
continue;
}
if (longCommandWArg(&argument, "--train-legacy")) {
operation = zom_train;
if (outFileName == NULL)
outFileName = g_defaultDictName;
cover = 0;
dict = legacy;
/* Allow optional arguments following an = */
if (*argument == 0) { continue; }
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
@ -881,17 +936,22 @@ int main(int argCount, const char* argv[])
zParams.compressionLevel = dictCLevel;
zParams.notificationLevel = g_displayLevel;
zParams.dictID = dictID;
if (cover) {
if (dict == cover) {
int const optimize = !coverParams.k || !coverParams.d;
coverParams.nbThreads = nbWorkers;
coverParams.zParams = zParams;
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, optimize);
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, NULL, optimize);
} else if (dict == fastCover) {
int const optimize = !fastCoverParams.k || !fastCoverParams.d;
fastCoverParams.nbThreads = nbWorkers;
fastCoverParams.zParams = zParams;
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, NULL, &fastCoverParams, optimize);
} else {
ZDICT_legacy_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
dictParams.selectivityLevel = dictSelect;
dictParams.zParams = zParams;
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, 0);
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, NULL, 0);
}
#endif
goto _end;

View File

@ -427,7 +427,7 @@ $ECHO "- Create second (different) dictionary"
$ZSTD --train-cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
$ECHO "- Create dictionary with short dictID"
$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1
$ZSTD --train-cover=k=46,d=8,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
$ECHO "- Create dictionary with size limit"
$ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
@ -444,6 +444,47 @@ $ZSTD --train-cover *.c ../programs/*.c
test -f dictionary
rm tmp* dictionary
$ECHO "\n===> fastCover dictionary builder : advanced options "
TESTFILE=../programs/zstdcli.c
./datagen > tmpDict
$ECHO "- Create first dictionary"
$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c -o tmpDict
cp $TESTFILE tmp
$ZSTD -f tmp -D tmpDict
$ZSTD -d tmp.zst -D tmpDict -fo result
$DIFF $TESTFILE result
$ECHO "- Create second (different) dictionary"
$ZSTD --train-fastcover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
$ECHO "- Create dictionary with short dictID"
$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
$ECHO "- Create dictionary with size limit"
$ZSTD --train-fastcover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
$ECHO "- Compare size of dictionary from 90% training samples with 80% training samples"
$ZSTD --train-fastcover=split=90 -r *.c ../programs/*.c
$ZSTD --train-fastcover=split=80 -r *.c ../programs/*.c
$ECHO "- Create dictionary using all samples for both training and testing"
$ZSTD --train-fastcover=split=100 -r *.c ../programs/*.c
$ECHO "- Create dictionary using f=16"
$ZSTD --train-fastcover=f=16 -r *.c ../programs/*.c
$ECHO "- Create dictionary using accel=2"
$ZSTD --train-fastcover=accel=2 -r *.c ../programs/*.c
$ECHO "- Create dictionary using accel=10"
$ZSTD --train-fastcover=accel=10 -r *.c ../programs/*.c
$ECHO "- Create dictionary with multithreading"
$ZSTD --train-fastcover -T4 -r *.c ../programs/*.c
$ECHO "- Test -o before --train-fastcover"
rm -f tmpDict dictionary
$ZSTD -o tmpDict --train-fastcover *.c ../programs/*.c
test -f tmpDict
$ZSTD --train-fastcover *.c ../programs/*.c
test -f dictionary
rm tmp* dictionary
$ECHO "\n===> legacy dictionary builder "
TESTFILE=../programs/zstdcli.c

View File

@ -144,6 +144,8 @@ static const void *symbols[] = {
/* zdict.h: advanced functions */
&ZDICT_trainFromBuffer_cover,
&ZDICT_optimizeTrainFromBuffer_cover,
&ZDICT_trainFromBuffer_fastCover,
&ZDICT_optimizeTrainFromBuffer_fastCover,
&ZDICT_finalizeDictionary,
&ZDICT_trainFromBuffer_legacy,
&ZDICT_addEntropyTablesFromBuffer,