Merge branch 'dev' into adapt

2018-09-19 12:43:42 -07:00 · 2018-09-19 12:43:42 -07:00 · 2f78228f65
commit 2f78228f65
parent c71c4f23d7 c8ff5200f9
78 changed files with 4963 additions and 1661 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -20,23 +20,25 @@ matrix:
    - env: Cmd='make gcc8install && CC=gcc-8 CFLAGS="-Werror -O3" make -j all'
    - env: Cmd='make clang38install && CC=clang-3.8 make clean msan-test-zstd'

+    - env: Cmd='make staticAnalyze'
+
    - env: Cmd='make gcc6install && CC=gcc-6 make clean uasan-fuzztest'
    - env: Cmd='make gcc6install libc6install
             && make clean && CC=gcc-6 CFLAGS=-m32 make uasan-fuzztest'
    - env: Cmd='make clang38install && CC=clang-3.8 make clean msan-fuzztest'
    - env: Cmd='make clang38install && CC=clang-3.8 make clean tsan-test-zstream'

-    - env: Cmd='make arminstall && make armfuzz'
-    - env: Cmd='make arminstall && make aarch64fuzz'
-    - env: Cmd='make ppcinstall && make ppcfuzz'
-    - env: Cmd='make ppcinstall && make ppc64fuzz'
-
    - env: Cmd='make -j uasanregressiontest
             && make clean && make -j msanregressiontest'

    - env: Cmd='make valgrindinstall && make -C tests clean valgrindTest
             && make clean && make -C tests test-fuzzer-stackmode'

+    - env: Cmd='make arminstall && make armfuzz'
+    - env: Cmd='make arminstall && make aarch64fuzz'
+    - env: Cmd='make ppcinstall && make ppcfuzz'
+    - env: Cmd='make ppcinstall && make ppc64fuzz'
+
    - env: Cmd='make lz4install && make -C tests test-lz4
             && make clean && make -C tests test-pool
             && make clean && bash tests/libzstd_partial_builds.sh'
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -0,0 +1,5 @@
+# Code of Conduct
+
+Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
+Please read the [full text](https://code.fb.com/codeofconduct/)
+so that you can understand what actions will and will not be tolerated.
--- a/44
+++ b/44
@ -23,6 +23,7 @@ else
 EXT =
 endif

+## default: Build lib-release and zstd-release
 .PHONY: default
 default: lib-release zstd-release

@ -30,10 +31,9 @@ default: lib-release zstd-release
 all: allmost examples manual contrib

 .PHONY: allmost
-allmost: allzstd
-	$(MAKE) -C $(ZWRAPDIR) all
+allmost: allzstd zlibwrapper

-#skip zwrapper, can't build that on alternate architectures without the proper zlib installed
+# allzstd skips zlibwrapper, which can't be built on alternate architectures without the proper zlib installed
 .PHONY: allzstd
 allzstd: lib
 	$(MAKE) -C $(PRGDIR) all
@ -44,8 +44,8 @@ all32:
 	$(MAKE) -C $(PRGDIR) zstd32
 	$(MAKE) -C $(TESTDIR) all32

-.PHONY: lib lib-release
-lib lib-release:
+.PHONY: lib lib-release libzstd.a
+lib lib-release :
 	@$(MAKE) -C $(ZSTDDIR) $@

 .PHONY: zstd zstd-release
@ -59,8 +59,8 @@ zstdmt:
 	cp $(PRGDIR)/zstd$(EXT) ./zstdmt$(EXT)

 .PHONY: zlibwrapper
-zlibwrapper:
-	$(MAKE) -C $(ZWRAPDIR) test
+zlibwrapper: lib
+	$(MAKE) -C $(ZWRAPDIR) all

 .PHONY: test
 test: MOREFLAGS += -g -DDEBUGLEVEL=1 -Werror
@ -88,6 +88,7 @@ contrib: lib
 	$(MAKE) -C contrib/pzstd all
 	$(MAKE) -C contrib/seekable_format/examples all
 	$(MAKE) -C contrib/adaptive-compression all
+	$(MAKE) -C contrib/largeNbDicts all

 .PHONY: cleanTabs
 cleanTabs:
@ -104,6 +105,7 @@ clean:
 	@$(MAKE) -C contrib/pzstd $@ > $(VOID)
 	@$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID)
 	@$(MAKE) -C contrib/adaptive-compression $@ > $(VOID)
+	@$(MAKE) -C contrib/largeNbDicts $@ > $(VOID)
 	@$(RM) zstd$(EXT) zstdmt$(EXT) tmp*
 	@$(RM) -r lz4
 	@echo Cleaning completed
@ -114,11 +116,26 @@ clean:
 ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT))

 HOST_OS = POSIX
-CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON -DCMAKE_BUILD_TYPE=Release 
+CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON -DCMAKE_BUILD_TYPE=Release

+# Print a two column output of targets and their description. The recipe sticks to POSIX sh
+# constructs (no brace expansion, no ANSI-C quoting), so it does not need bash. To add a target
+# description, put a comment in the Makefile with the format "## <TARGET>: <DESCRIPTION>", e.g.:
+## list: Print all targets and their descriptions (if provided)
 .PHONY: list
 list:
-	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
+	@TARGETS=$$($(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null \
+		| awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' \
+		| egrep -v  -e '^[^[:alnum:]]' | sort); \
+	{ \
+	    printf 'Target Name\tDescription\n'; \
+	    printf '%s\t%s\n' '----------------' '----------------------------------------'; \
+	    for target in $$TARGETS; do \
+	        line=$$(egrep "^##[[:space:]]+$$target:" $(lastword $(MAKEFILE_LIST))); \
+	        description=$$(echo $$line | awk '{i=index($$0,":"); print substr($$0,i+1)}' | xargs); \
+	        printf '%s\t%s\n' "$$target" "$$description"; \
+	    done \
+	} | column -t -s "$$(printf '\t')"

 .PHONY: install clangtest armtest usan asan uasan
 install:
@ -198,7 +215,7 @@ gcc6test: clean

 clangtest: clean
 	clang -v
-	$(MAKE) all CXX=clang-++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation"
+	$(MAKE) all CXX=clang++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation"

 armtest: clean
 	$(MAKE) -C $(TESTDIR) datagen   # use native, faster
@ -351,7 +368,10 @@ bmi32build: clean
 	$(CC) -v
 	CFLAGS="-O3 -mbmi -m32 -Werror" $(MAKE) -C $(TESTDIR) test

-staticAnalyze: clean
+# static analyzer test uses clang's scan-build
+# does not analyze zlibWrapper, due to detected issues in zlib source code
+staticAnalyze: SCANBUILD ?= scan-build
+staticAnalyze:
 	$(CC) -v
-	CPPFLAGS=-g scan-build --status-bugs -v $(MAKE) all
+	CC=$(CC) CPPFLAGS=-g $(SCANBUILD) --status-bugs -v $(MAKE) allzstd examples contrib
 endif
--- a/5
+++ b/5
@ -1,3 +1,8 @@
+v1.3.6
+perf: much faster dictionary builder, by @jenniferliu
+api : reduced DDict size by 2 KB
+misc: tests/paramgrill, a parameter optimizer, by @GeorgeLu97
+
 v1.3.5
 perf: much faster dictionary compression, by @felixhandte
 perf: small quality improvement for dictionary generation, by @terrelln
--- a/build/VS2008/fuzzer/fuzzer.vcproj
+++ b/build/VS2008/fuzzer/fuzzer.vcproj
@ -336,6 +336,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\cover.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
@ -482,6 +486,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\zdict.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
--- a/build/VS2008/zstd/zstd.vcproj
+++ b/build/VS2008/zstd/zstd.vcproj
@ -348,6 +348,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\cover.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
@ -522,6 +526,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\zdict.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
--- a/build/VS2008/zstdlib/zstdlib.vcproj
+++ b/build/VS2008/zstdlib/zstdlib.vcproj
@ -332,6 +332,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\cover.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
@ -502,6 +506,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\zdict.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
--- a/build/VS2010/fuzzer/fuzzer.vcxproj
+++ b/build/VS2010/fuzzer/fuzzer.vcxproj
@ -176,6 +176,7 @@
    <ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
    <ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
    <ClCompile Include="..\..\..\programs\datagen.c" />
@ -199,6 +200,7 @@
    <ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
    <ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
    <ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
+    <ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
    <ClInclude Include="..\..\..\lib\legacy\zstd_legacy.h" />
    <ClInclude Include="..\..\..\programs\datagen.h" />
    <ClInclude Include="..\..\..\programs\util.h" />
--- a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
+++ b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
@ -43,6 +43,7 @@
    <ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
    <ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
    <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
--- a/build/VS2010/libzstd/libzstd.vcxproj
+++ b/build/VS2010/libzstd/libzstd.vcxproj
@ -43,6 +43,7 @@
    <ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
    <ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
    <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
--- a/build/VS2010/zstd/zstd.vcxproj
+++ b/build/VS2010/zstd/zstd.vcxproj
@ -40,6 +40,7 @@
    <ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
    <ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
    <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
    <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
@ -61,6 +62,7 @@
    <ClInclude Include="..\..\..\lib\common\xxhash.h" />
    <ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
    <ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
+    <ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
    <ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
    <ClInclude Include="..\..\..\lib\common\fse.h" />
    <ClInclude Include="..\..\..\lib\common\huf.h" />
--- a/build/cmake/contrib/gen_html/CMakeLists.txt
+++ b/build/cmake/contrib/gen_html/CMakeLists.txt
@ -27,4 +27,4 @@ ADD_CUSTOM_TARGET(zstd_manual.html ALL
                  ${GENHTML_BINARY} "${LIBVERSION}" "${LIBRARY_DIR}/zstd.h" "${PROJECT_BINARY_DIR}/zstd_manual.html"
                  DEPENDS gen_html COMMENT "Update zstd manual")

-INSTALL(FILES "${PROJECT_BINARY_DIR}/zstd_manual.html" DESTINATION "${CMAKE_INSTALL_PREFIX}/${DOC_INSTALL_DIR}")
+INSTALL(FILES "${PROJECT_BINARY_DIR}/zstd_manual.html" DESTINATION "${CMAKE_INSTALL_DOCDIR}")
--- a/build/cmake/lib/CMakeLists.txt
+++ b/build/cmake/lib/CMakeLists.txt
@ -47,6 +47,7 @@ SET(Sources
        ${LIBRARY_DIR}/decompress/huf_decompress.c
        ${LIBRARY_DIR}/decompress/zstd_decompress.c
        ${LIBRARY_DIR}/dictBuilder/cover.c
+        ${LIBRARY_DIR}/dictBuilder/fastcover.c
        ${LIBRARY_DIR}/dictBuilder/divsufsort.c
        ${LIBRARY_DIR}/dictBuilder/zdict.c
        ${LIBRARY_DIR}/deprecated/zbuff_common.c
@ -74,6 +75,7 @@ SET(Headers
        ${LIBRARY_DIR}/compress/zstd_ldm.h
        ${LIBRARY_DIR}/compress/zstdmt_compress.h
        ${LIBRARY_DIR}/dictBuilder/zdict.h
+        ${LIBRARY_DIR}/dictBuilder/cover.h
        ${LIBRARY_DIR}/deprecated/zbuff.h)

 IF (ZSTD_LEGACY_SUPPORT)
@ -178,6 +180,7 @@ INSTALL(FILES
    ${LIBRARY_DIR}/zstd.h
    ${LIBRARY_DIR}/deprecated/zbuff.h
    ${LIBRARY_DIR}/dictBuilder/zdict.h
+    ${LIBRARY_DIR}/dictBuilder/cover.h
    ${LIBRARY_DIR}/common/zstd_errors.h
    DESTINATION "include")

--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
@ -2,10 +2,9 @@ ARG :=

 CC ?= gcc
 CFLAGS ?= -O3
-INCLUDES := -I ../randomDictBuilder -I ../fastCover -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
+INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder

 RANDOM_FILE := ../randomDictBuilder/random.c
-FAST_FILE := ../fastCover/fastCover.c
 IO_FILE := ../randomDictBuilder/io.c

 all: run clean
@ -22,8 +21,8 @@ test: benchmarkTest clean
 benchmarkTest: benchmark test.sh
 	sh test.sh

-benchmark: benchmark.o io.o random.o fastCover.o libzstd.a
-	$(CC) $(CFLAGS) benchmark.o io.o random.o fastCover.o libzstd.a -o benchmark
+benchmark: benchmark.o io.o random.o libzstd.a
+	$(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark

 benchmark.o: benchmark.c
 	$(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c
@ -31,9 +30,6 @@ benchmark.o: benchmark.c
 random.o: $(RANDOM_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE)

-fastCover.o: $(FAST_FILE)
-	$(CC) $(CFLAGS) $(INCLUDES) -c $(FAST_FILE)
-
 io.o: $(IO_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)

--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@ -14,113 +14,836 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"

 ###Benchmarking Result:
 - First Cover is optimize cover, second Cover uses optimized d and k from first one.
- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
+- For every f value of fastCover, the first row is the optimizing fastCover run and the second row reuses the optimized d and k from the first one. This is repeated for accel values from 1 to 10.
 - Fourth column is chosen d and fifth column is chosen k

 github:
-NODICT       0.000025       2.999642        
-RANDOM       0.030101       8.791189        
-LEGACY       0.913108       8.173529        
-COVER       59.234160       10.652243        8          1298
-COVER       6.258459       10.652243        8          1298
-FAST15       9.959246       10.555630        8          1874
-FAST15       0.077719       10.555630        8          1874
-FAST16       10.028343       10.701698        8          1106
-FAST16       0.078117       10.701698        8          1106
-FAST17       10.567355       10.650652        8          1106
-FAST17       0.124833       10.650652        8          1106
-FAST18       11.795287       10.499142        8          1826
-FAST18       0.086992       10.499142        8          1826
-FAST19       13.132451       10.527140        8          1826
-FAST19       0.134716       10.527140        8          1826
-FAST20       14.366314       10.494710        8          1826
-FAST20       0.128844       10.494710        8          1826
-FAST21       14.941238       10.503488        8          1778
-FAST21       0.134975       10.503488        8          1778
-FAST22       15.146226       10.509284        8          1826
-FAST22       0.146918       10.509284        8          1826
-FAST23       16.260552       10.509284        8          1826
-FAST23       0.158494       10.509284        8          1826
-FAST24       16.806037       10.512369        8          1826
-FAST24       0.190464       10.512369        8          1826
+NODICT       0.000004       2.999642        
+RANDOM       0.024560       8.791189        
+LEGACY       0.727109       8.173529        
+COVER       40.565676       10.652243        8          1298
+COVER       3.608284       10.652243        8          1298
+FAST f=15 a=1       4.181024       10.570882        8          1154
+FAST f=15 a=1       0.040788       10.570882        8          1154
+FAST f=15 a=2       3.548352       10.574287        6          1970
+FAST f=15 a=2       0.035535       10.574287        6          1970
+FAST f=15 a=3       3.287364       10.613950        6          1010
+FAST f=15 a=3       0.032182       10.613950        6          1010
+FAST f=15 a=4       3.184976       10.573883        6          1058
+FAST f=15 a=4       0.029878       10.573883        6          1058
+FAST f=15 a=5       3.045513       10.580640        8          1154
+FAST f=15 a=5       0.022162       10.580640        8          1154
+FAST f=15 a=6       3.003296       10.583677        6          1010
+FAST f=15 a=6       0.028091       10.583677        6          1010
+FAST f=15 a=7       2.952655       10.622551        6          1106
+FAST f=15 a=7       0.02724       10.622551        6          1106
+FAST f=15 a=8       2.945674       10.614657        6          1010
+FAST f=15 a=8       0.027264       10.614657        6          1010
+FAST f=15 a=9       3.153439       10.564018        8          1154
+FAST f=15 a=9       0.020635       10.564018        8          1154
+FAST f=15 a=10       2.950416       10.511454        6          1010
+FAST f=15 a=10       0.026606       10.511454        6          1010
+FAST f=16 a=1       3.970029       10.681035        8          1154
+FAST f=16 a=1       0.038188       10.681035        8          1154
+FAST f=16 a=2       3.422892       10.484978        6          1874
+FAST f=16 a=2       0.034702       10.484978        6          1874
+FAST f=16 a=3       3.215836       10.632631        8          1154
+FAST f=16 a=3       0.026084       10.632631        8          1154
+FAST f=16 a=4       3.081353       10.626533        6          1106
+FAST f=16 a=4       0.030032       10.626533        6          1106
+FAST f=16 a=5       3.041241       10.545027        8          1922
+FAST f=16 a=5       0.022882       10.545027        8          1922
+FAST f=16 a=6       2.989390       10.638284        6          1874
+FAST f=16 a=6       0.028308       10.638284        6          1874
+FAST f=16 a=7       3.001581       10.797136        6          1106
+FAST f=16 a=7       0.027479       10.797136        6          1106
+FAST f=16 a=8       2.984107       10.658356        8          1058
+FAST f=16 a=8       0.021099       10.658356        8          1058
+FAST f=16 a=9       2.925788       10.523869        6          1010
+FAST f=16 a=9       0.026905       10.523869        6          1010
+FAST f=16 a=10       2.889605       10.745841        6          1874
+FAST f=16 a=10       0.026846       10.745841        6          1874
+FAST f=17 a=1       4.031953       10.672080        8          1202
+FAST f=17 a=1       0.040658       10.672080        8          1202
+FAST f=17 a=2       3.458107       10.589352        8          1106
+FAST f=17 a=2       0.02926       10.589352        8          1106
+FAST f=17 a=3       3.291189       10.662714        8          1154
+FAST f=17 a=3       0.026531       10.662714        8          1154
+FAST f=17 a=4       3.154950       10.549456        8          1346
+FAST f=17 a=4       0.024991       10.549456        8          1346
+FAST f=17 a=5       3.092271       10.541670        6          1202
+FAST f=17 a=5       0.038285       10.541670        6          1202
+FAST f=17 a=6       3.166146       10.729112        6          1874
+FAST f=17 a=6       0.038217       10.729112        6          1874
+FAST f=17 a=7       3.035467       10.810485        6          1106
+FAST f=17 a=7       0.036655       10.810485        6          1106
+FAST f=17 a=8       3.035668       10.530532        6          1058
+FAST f=17 a=8       0.037715       10.530532        6          1058
+FAST f=17 a=9       2.987917       10.589802        8          1922
+FAST f=17 a=9       0.02217       10.589802        8          1922
+FAST f=17 a=10       2.981647       10.722579        8          1106
+FAST f=17 a=10       0.021948       10.722579        8          1106
+FAST f=18 a=1       4.067144       10.634943        8          1154
+FAST f=18 a=1       0.041386       10.634943        8          1154
+FAST f=18 a=2       3.507377       10.546230        6          1970
+FAST f=18 a=2       0.037572       10.546230        6          1970
+FAST f=18 a=3       3.323015       10.648061        8          1154
+FAST f=18 a=3       0.028306       10.648061        8          1154
+FAST f=18 a=4       3.216735       10.705402        6          1010
+FAST f=18 a=4       0.030755       10.705402        6          1010
+FAST f=18 a=5       3.175794       10.588154        8          1874
+FAST f=18 a=5       0.025315       10.588154        8          1874
+FAST f=18 a=6       3.127459       10.751104        8          1106
+FAST f=18 a=6       0.023897       10.751104        8          1106
+FAST f=18 a=7       3.083017       10.780402        6          1106
+FAST f=18 a=7       0.029158       10.780402        6          1106
+FAST f=18 a=8       3.069700       10.547226        8          1346
+FAST f=18 a=8       0.024046       10.547226        8          1346
+FAST f=18 a=9       3.056591       10.674759        6          1010
+FAST f=18 a=9       0.028496       10.674759        6          1010
+FAST f=18 a=10       3.063588       10.737578        8          1106
+FAST f=18 a=10       0.023033       10.737578        8          1106
+FAST f=19 a=1       4.164041       10.650333        8          1154
+FAST f=19 a=1       0.042906       10.650333        8          1154
+FAST f=19 a=2       3.585409       10.577066        6          1058
+FAST f=19 a=2       0.038994       10.577066        6          1058
+FAST f=19 a=3       3.439643       10.639403        8          1154
+FAST f=19 a=3       0.028427       10.639403        8          1154
+FAST f=19 a=4       3.268869       10.554410        8          1298
+FAST f=19 a=4       0.026866       10.554410        8          1298
+FAST f=19 a=5       3.238225       10.615109        6          1010
+FAST f=19 a=5       0.03078       10.615109        6          1010
+FAST f=19 a=6       3.199558       10.609782        6          1874
+FAST f=19 a=6       0.030099       10.609782        6          1874
+FAST f=19 a=7       3.132395       10.794753        6          1106
+FAST f=19 a=7       0.028964       10.794753        6          1106
+FAST f=19 a=8       3.148446       10.554842        8          1298
+FAST f=19 a=8       0.024277       10.554842        8          1298
+FAST f=19 a=9       3.108324       10.668763        6          1010
+FAST f=19 a=9       0.02896       10.668763        6          1010
+FAST f=19 a=10       3.159863       10.757347        8          1106
+FAST f=19 a=10       0.023351       10.757347        8          1106
+FAST f=20 a=1       4.462698       10.661788        8          1154
+FAST f=20 a=1       0.047174       10.661788        8          1154
+FAST f=20 a=2       3.820269       10.678612        6          1106
+FAST f=20 a=2       0.040807       10.678612        6          1106
+FAST f=20 a=3       3.644955       10.648424        8          1154
+FAST f=20 a=3       0.031398       10.648424        8          1154
+FAST f=20 a=4       3.546257       10.559756        8          1298
+FAST f=20 a=4       0.029856       10.559756        8          1298
+FAST f=20 a=5       3.485248       10.646637        6          1010
+FAST f=20 a=5       0.033756       10.646637        6          1010
+FAST f=20 a=6       3.490438       10.775824        8          1106
+FAST f=20 a=6       0.028338       10.775824        8          1106
+FAST f=20 a=7       3.631289       10.801795        6          1106
+FAST f=20 a=7       0.035228       10.801795        6          1106
+FAST f=20 a=8       3.758936       10.545116        8          1346
+FAST f=20 a=8       0.027495       10.545116        8          1346
+FAST f=20 a=9       3.707024       10.677454        6          1010
+FAST f=20 a=9       0.031326       10.677454        6          1010
+FAST f=20 a=10       3.586593       10.756017        8          1106
+FAST f=20 a=10       0.027122       10.756017        8          1106
+FAST f=21 a=1       5.701396       10.655398        8          1154
+FAST f=21 a=1       0.067744       10.655398        8          1154
+FAST f=21 a=2       5.270542       10.650743        6          1106
+FAST f=21 a=2       0.052999       10.650743        6          1106
+FAST f=21 a=3       4.945294       10.652380        8          1154
+FAST f=21 a=3       0.052678       10.652380        8          1154
+FAST f=21 a=4       4.894079       10.543185        8          1298
+FAST f=21 a=4       0.04997       10.543185        8          1298
+FAST f=21 a=5       4.785417       10.630321        6          1010
+FAST f=21 a=5       0.045294       10.630321        6          1010
+FAST f=21 a=6       4.789381       10.664477        6          1874
+FAST f=21 a=6       0.046578       10.664477        6          1874
+FAST f=21 a=7       4.302955       10.805179        6          1106
+FAST f=21 a=7       0.041205       10.805179        6          1106
+FAST f=21 a=8       4.034630       10.551211        8          1298
+FAST f=21 a=8       0.040121       10.551211        8          1298
+FAST f=21 a=9       4.523868       10.799114        6          1010
+FAST f=21 a=9       0.043592       10.799114        6          1010
+FAST f=21 a=10       4.760736       10.750255        8          1106
+FAST f=21 a=10       0.043483       10.750255        8          1106
+FAST f=22 a=1       6.743064       10.640537        8          1154
+FAST f=22 a=1       0.086967       10.640537        8          1154
+FAST f=22 a=2       6.121739       10.626638        6          1970
+FAST f=22 a=2       0.066337       10.626638        6          1970
+FAST f=22 a=3       5.248851       10.640688        8          1154
+FAST f=22 a=3       0.054935       10.640688        8          1154
+FAST f=22 a=4       5.436579       10.588333        8          1298
+FAST f=22 a=4       0.064113       10.588333        8          1298
+FAST f=22 a=5       5.812815       10.652653        6          1010
+FAST f=22 a=5       0.058189       10.652653        6          1010
+FAST f=22 a=6       5.745472       10.666437        6          1874
+FAST f=22 a=6       0.057188       10.666437        6          1874
+FAST f=22 a=7       5.716393       10.806911        6          1106
+FAST f=22 a=7       0.056       10.806911        6          1106
+FAST f=22 a=8       5.698799       10.530784        8          1298
+FAST f=22 a=8       0.0583       10.530784        8          1298
+FAST f=22 a=9       5.710533       10.777391        6          1010
+FAST f=22 a=9       0.054945       10.777391        6          1010
+FAST f=22 a=10       5.685395       10.745023        8          1106
+FAST f=22 a=10       0.056526       10.745023        8          1106
+FAST f=23 a=1       7.836923       10.638828        8          1154
+FAST f=23 a=1       0.099522       10.638828        8          1154
+FAST f=23 a=2       6.627834       10.631061        6          1970
+FAST f=23 a=2       0.066769       10.631061        6          1970
+FAST f=23 a=3       5.602533       10.647288        8          1154
+FAST f=23 a=3       0.064513       10.647288        8          1154
+FAST f=23 a=4       6.005580       10.568747        8          1298
+FAST f=23 a=4       0.062022       10.568747        8          1298
+FAST f=23 a=5       5.481816       10.676921        6          1010
+FAST f=23 a=5       0.058959       10.676921        6          1010
+FAST f=23 a=6       5.460444       10.666194        6          1874
+FAST f=23 a=6       0.057687       10.666194        6          1874
+FAST f=23 a=7       5.659822       10.800377        6          1106
+FAST f=23 a=7       0.06783       10.800377        6          1106
+FAST f=23 a=8       6.826940       10.522167        8          1298
+FAST f=23 a=8       0.070533       10.522167        8          1298
+FAST f=23 a=9       6.804757       10.577799        8          1682
+FAST f=23 a=9       0.069949       10.577799        8          1682
+FAST f=23 a=10       6.774933       10.742093        8          1106
+FAST f=23 a=10       0.068395       10.742093        8          1106
+FAST f=24 a=1       8.444110       10.632783        8          1154
+FAST f=24 a=1       0.094357       10.632783        8          1154
+FAST f=24 a=2       7.289578       10.631061        6          1970
+FAST f=24 a=2       0.098515       10.631061        6          1970
+FAST f=24 a=3       8.619780       10.646289        8          1154
+FAST f=24 a=3       0.098041       10.646289        8          1154
+FAST f=24 a=4       8.508455       10.555199        8          1298
+FAST f=24 a=4       0.093885       10.555199        8          1298
+FAST f=24 a=5       8.471145       10.674363        6          1010
+FAST f=24 a=5       0.088676       10.674363        6          1010
+FAST f=24 a=6       8.426727       10.667228        6          1874
+FAST f=24 a=6       0.087247       10.667228        6          1874
+FAST f=24 a=7       8.356826       10.803027        6          1106
+FAST f=24 a=7       0.085835       10.803027        6          1106
+FAST f=24 a=8       6.756811       10.522049        8          1298
+FAST f=24 a=8       0.07107       10.522049        8          1298
+FAST f=24 a=9       6.548169       10.571882        8          1682
+FAST f=24 a=9       0.0713       10.571882        8          1682
+FAST f=24 a=10       8.238079       10.736453        8          1106
+FAST f=24 a=10       0.07004       10.736453        8          1106
+

 hg-commands:
-NODICT       0.000026       2.425291        
-RANDOM       0.046270       3.490331        
-LEGACY       0.847904       3.911682        
-COVER       71.691804       4.132653        8          386
-COVER       3.187085       4.132653        8          386
-FAST15       11.593687       3.920720        6          1106
-FAST15       0.082431       3.920720        6          1106
-FAST16       11.775958       4.033306        8          674
-FAST16       0.092587       4.033306        8          674
-FAST17       11.965064       4.064132        8          1490
-FAST17       0.106382       4.064132        8          1490
-FAST18       11.438197       4.086714        8          290
-FAST18       0.097293       4.086714        8          290
-FAST19       12.292512       4.097947        8          578
-FAST19       0.104406       4.097947        8          578
-FAST20       13.857857       4.102851        8          434
-FAST20       0.139467       4.102851        8          434
-FAST21       14.599613       4.105350        8          530
-FAST21       0.189416       4.105350        8          530
-FAST22       15.966109       4.104100        8          530
-FAST22       0.183817       4.104100        8          530
-FAST23       18.033645       4.098110        8          914
-FAST23       0.246641       4.098110        8          914
-FAST24       22.992891       4.117367        8          722
-FAST24       0.285994       4.117367        8          722
+NODICT       0.000005       2.425276        
+RANDOM       0.046332       3.490331        
+LEGACY       0.720351       3.911682        
+COVER       45.507731       4.132653        8          386
+COVER       1.868810       4.132653        8          386
+FAST f=15 a=1       4.561427       3.866894        8          1202
+FAST f=15 a=1       0.048946       3.866894        8          1202
+FAST f=15 a=2       3.574462       3.892119        8          1538
+FAST f=15 a=2       0.033677       3.892119        8          1538
+FAST f=15 a=3       3.230227       3.888791        6          1346
+FAST f=15 a=3       0.034312       3.888791        6          1346
+FAST f=15 a=4       3.042388       3.899739        8          1010
+FAST f=15 a=4       0.024307       3.899739        8          1010
+FAST f=15 a=5       2.800148       3.896220        8          818
+FAST f=15 a=5       0.022331       3.896220        8          818
+FAST f=15 a=6       2.706518       3.882039        8          578
+FAST f=15 a=6       0.020955       3.882039        8          578
+FAST f=15 a=7       2.701820       3.885430        6          866
+FAST f=15 a=7       0.026074       3.885430        6          866
+FAST f=15 a=8       2.604445       3.906932        8          1826
+FAST f=15 a=8       0.021789       3.906932        8          1826
+FAST f=15 a=9       2.598568       3.870324        6          1682
+FAST f=15 a=9       0.026004       3.870324        6          1682
+FAST f=15 a=10       2.575920       3.920783        8          1442
+FAST f=15 a=10       0.020228       3.920783        8          1442
+FAST f=16 a=1       4.630623       4.001430        8          770
+FAST f=16 a=1       0.047497       4.001430        8          770
+FAST f=16 a=2       3.674721       3.974431        8          1874
+FAST f=16 a=2       0.035761       3.974431        8          1874
+FAST f=16 a=3       3.338384       3.978703        8          1010
+FAST f=16 a=3       0.029436       3.978703        8          1010
+FAST f=16 a=4       3.004412       3.983035        8          1010
+FAST f=16 a=4       0.025744       3.983035        8          1010
+FAST f=16 a=5       2.881892       3.987710        8          770
+FAST f=16 a=5       0.023211       3.987710        8          770
+FAST f=16 a=6       2.807410       3.952717        8          1298
+FAST f=16 a=6       0.023199       3.952717        8          1298
+FAST f=16 a=7       2.819623       3.994627        8          770
+FAST f=16 a=7       0.021806       3.994627        8          770
+FAST f=16 a=8       2.740092       3.954032        8          1826
+FAST f=16 a=8       0.0226       3.954032        8          1826
+FAST f=16 a=9       2.682564       3.969879        6          1442
+FAST f=16 a=9       0.026324       3.969879        6          1442
+FAST f=16 a=10       2.657959       3.969755        8          674
+FAST f=16 a=10       0.020413       3.969755        8          674
+FAST f=17 a=1       4.729228       4.046000        8          530
+FAST f=17 a=1       0.049703       4.046000        8          530
+FAST f=17 a=2       3.764510       3.991519        8          1970
+FAST f=17 a=2       0.038195       3.991519        8          1970
+FAST f=17 a=3       3.416992       4.006296        6          914
+FAST f=17 a=3       0.036244       4.006296        6          914
+FAST f=17 a=4       3.145626       3.979182        8          1970
+FAST f=17 a=4       0.028676       3.979182        8          1970
+FAST f=17 a=5       2.995070       4.050070        8          770
+FAST f=17 a=5       0.025707       4.050070        8          770
+FAST f=17 a=6       2.911833       4.040024        8          770
+FAST f=17 a=6       0.02453       4.040024        8          770
+FAST f=17 a=7       2.894796       4.015884        8          818
+FAST f=17 a=7       0.023956       4.015884        8          818
+FAST f=17 a=8       2.789962       4.039303        8          530
+FAST f=17 a=8       0.023219       4.039303        8          530
+FAST f=17 a=9       2.787625       3.996762        8          1634
+FAST f=17 a=9       0.023651       3.996762        8          1634
+FAST f=17 a=10       2.754796       4.005059        8          1058
+FAST f=17 a=10       0.022537       4.005059        8          1058
+FAST f=18 a=1       4.779117       4.038214        8          242
+FAST f=18 a=1       0.048814       4.038214        8          242
+FAST f=18 a=2       3.829753       4.045768        8          722
+FAST f=18 a=2       0.036541       4.045768        8          722
+FAST f=18 a=3       3.495053       4.021497        8          770
+FAST f=18 a=3       0.032648       4.021497        8          770
+FAST f=18 a=4       3.221395       4.039623        8          770
+FAST f=18 a=4       0.027818       4.039623        8          770
+FAST f=18 a=5       3.059369       4.050414        8          530
+FAST f=18 a=5       0.026296       4.050414        8          530
+FAST f=18 a=6       3.019292       4.010714        6          962
+FAST f=18 a=6       0.031104       4.010714        6          962
+FAST f=18 a=7       2.949322       4.031439        6          770
+FAST f=18 a=7       0.030745       4.031439        6          770
+FAST f=18 a=8       2.876425       4.032088        6          386
+FAST f=18 a=8       0.027407       4.032088        6          386
+FAST f=18 a=9       2.850958       4.053372        8          674
+FAST f=18 a=9       0.023799       4.053372        8          674
+FAST f=18 a=10       2.884352       4.020148        8          1730
+FAST f=18 a=10       0.024401       4.020148        8          1730
+FAST f=19 a=1       4.815669       4.061203        8          674
+FAST f=19 a=1       0.051425       4.061203        8          674
+FAST f=19 a=2       3.951356       4.013822        8          1442
+FAST f=19 a=2       0.039968       4.013822        8          1442
+FAST f=19 a=3       3.554682       4.050425        8          722
+FAST f=19 a=3       0.032725       4.050425        8          722
+FAST f=19 a=4       3.242585       4.054677        8          722
+FAST f=19 a=4       0.028194       4.054677        8          722
+FAST f=19 a=5       3.105909       4.064524        8          818
+FAST f=19 a=5       0.02675       4.064524        8          818
+FAST f=19 a=6       3.059901       4.036857        8          1250
+FAST f=19 a=6       0.026396       4.036857        8          1250
+FAST f=19 a=7       3.016151       4.068234        6          770
+FAST f=19 a=7       0.031501       4.068234        6          770
+FAST f=19 a=8       2.962902       4.077509        8          530
+FAST f=19 a=8       0.023333       4.077509        8          530
+FAST f=19 a=9       2.899607       4.067328        8          530
+FAST f=19 a=9       0.024553       4.067328        8          530
+FAST f=19 a=10       2.950978       4.059901        8          434
+FAST f=19 a=10       0.023852       4.059901        8          434
+FAST f=20 a=1       5.259834       4.027579        8          1634
+FAST f=20 a=1       0.061123       4.027579        8          1634
+FAST f=20 a=2       4.382150       4.025093        8          1634
+FAST f=20 a=2       0.048009       4.025093        8          1634
+FAST f=20 a=3       4.104323       4.060842        8          530
+FAST f=20 a=3       0.040965       4.060842        8          530
+FAST f=20 a=4       3.853340       4.023504        6          914
+FAST f=20 a=4       0.041072       4.023504        6          914
+FAST f=20 a=5       3.728841       4.018089        6          1634
+FAST f=20 a=5       0.037469       4.018089        6          1634
+FAST f=20 a=6       3.683045       4.069138        8          578
+FAST f=20 a=6       0.028011       4.069138        8          578
+FAST f=20 a=7       3.726973       4.063160        8          722
+FAST f=20 a=7       0.028437       4.063160        8          722
+FAST f=20 a=8       3.555073       4.057690        8          386
+FAST f=20 a=8       0.027588       4.057690        8          386
+FAST f=20 a=9       3.551095       4.067253        8          482
+FAST f=20 a=9       0.025976       4.067253        8          482
+FAST f=20 a=10       3.490127       4.068518        8          530
+FAST f=20 a=10       0.025971       4.068518        8          530
+FAST f=21 a=1       7.343816       4.064945        8          770
+FAST f=21 a=1       0.085035       4.064945        8          770
+FAST f=21 a=2       5.930894       4.048206        8          386
+FAST f=21 a=2       0.067349       4.048206        8          386
+FAST f=21 a=3       6.770775       4.063417        8          578
+FAST f=21 a=3       0.077104       4.063417        8          578
+FAST f=21 a=4       6.889409       4.066761        8          626
+FAST f=21 a=4       0.0717       4.066761        8          626
+FAST f=21 a=5       6.714896       4.051813        8          914
+FAST f=21 a=5       0.071026       4.051813        8          914
+FAST f=21 a=6       6.539890       4.047263        8          1922
+FAST f=21 a=6       0.07127       4.047263        8          1922
+FAST f=21 a=7       6.511052       4.068373        8          482
+FAST f=21 a=7       0.065467       4.068373        8          482
+FAST f=21 a=8       6.458788       4.071597        8          482
+FAST f=21 a=8       0.063817       4.071597        8          482
+FAST f=21 a=9       6.377591       4.052905        8          434
+FAST f=21 a=9       0.063112       4.052905        8          434
+FAST f=21 a=10       6.360752       4.047773        8          530
+FAST f=21 a=10       0.063606       4.047773        8          530
+FAST f=22 a=1       10.523471       4.040812        8          962
+FAST f=22 a=1       0.14214       4.040812        8          962
+FAST f=22 a=2       9.454758       4.059396        8          914
+FAST f=22 a=2       0.118343       4.059396        8          914
+FAST f=22 a=3       9.043197       4.043019        8          1922
+FAST f=22 a=3       0.109798       4.043019        8          1922
+FAST f=22 a=4       8.716261       4.044819        8          770
+FAST f=22 a=4       0.099687       4.044819        8          770
+FAST f=22 a=5       8.529472       4.070576        8          530
+FAST f=22 a=5       0.093127       4.070576        8          530
+FAST f=22 a=6       8.424241       4.070565        8          722
+FAST f=22 a=6       0.093703       4.070565        8          722
+FAST f=22 a=7       8.403391       4.070591        8          578
+FAST f=22 a=7       0.089763       4.070591        8          578
+FAST f=22 a=8       8.285221       4.089171        8          530
+FAST f=22 a=8       0.087716       4.089171        8          530
+FAST f=22 a=9       8.282506       4.047470        8          722
+FAST f=22 a=9       0.089773       4.047470        8          722
+FAST f=22 a=10       8.241809       4.064151        8          818
+FAST f=22 a=10       0.090413       4.064151        8          818
+FAST f=23 a=1       12.389208       4.051635        6          530
+FAST f=23 a=1       0.147796       4.051635        6          530
+FAST f=23 a=2       11.300910       4.042835        6          914
+FAST f=23 a=2       0.133178       4.042835        6          914
+FAST f=23 a=3       10.879455       4.047415        8          626
+FAST f=23 a=3       0.129571       4.047415        8          626
+FAST f=23 a=4       10.522718       4.038269        6          914
+FAST f=23 a=4       0.118121       4.038269        6          914
+FAST f=23 a=5       10.348043       4.066884        8          434
+FAST f=23 a=5       0.112098       4.066884        8          434
+FAST f=23 a=6       10.238630       4.048635        8          1010
+FAST f=23 a=6       0.120281       4.048635        8          1010
+FAST f=23 a=7       10.213255       4.061809        8          530
+FAST f=23 a=7       0.1121       4.061809        8          530
+FAST f=23 a=8       10.107879       4.074104        8          818
+FAST f=23 a=8       0.116544       4.074104        8          818
+FAST f=23 a=9       10.063424       4.064811        8          674
+FAST f=23 a=9       0.109045       4.064811        8          674
+FAST f=23 a=10       10.035801       4.054918        8          530
+FAST f=23 a=10       0.108735       4.054918        8          530
+FAST f=24 a=1       14.963878       4.073490        8          722
+FAST f=24 a=1       0.206344       4.073490        8          722
+FAST f=24 a=2       13.833472       4.036100        8          962
+FAST f=24 a=2       0.17486       4.036100        8          962
+FAST f=24 a=3       13.404631       4.026281        6          1106
+FAST f=24 a=3       0.153961       4.026281        6          1106
+FAST f=24 a=4       13.041164       4.065448        8          674
+FAST f=24 a=4       0.155509       4.065448        8          674
+FAST f=24 a=5       12.879412       4.054636        8          674
+FAST f=24 a=5       0.148282       4.054636        8          674
+FAST f=24 a=6       12.773736       4.081376        8          530
+FAST f=24 a=6       0.142563       4.081376        8          530
+FAST f=24 a=7       12.711310       4.059834        8          770
+FAST f=24 a=7       0.149321       4.059834        8          770
+FAST f=24 a=8       12.635459       4.052050        8          1298
+FAST f=24 a=8       0.15095       4.052050        8          1298
+FAST f=24 a=9       12.558104       4.076516        8          722
+FAST f=24 a=9       0.144361       4.076516        8          722
+FAST f=24 a=10       10.661348       4.062137        8          818
+FAST f=24 a=10       0.108232       4.062137        8          818
+

 hg-changelog:
-NODICT       0.000007       1.377613        
-RANDOM       0.297345       2.097487        
-LEGACY       2.633992       2.058907        
-COVER       219.179786       2.189685        8          98
-COVER       6.620852       2.189685        8          98
-FAST15       47.635082       2.130794        6          386
-FAST15       0.321297       2.130794        6          386
-FAST16       43.837676       2.144845        8          194
-FAST16       0.312640       2.144845        8          194
-FAST17       49.349017       2.156099        8          242
-FAST17       0.348459       2.156099        8          242
-FAST18       51.153784       2.172439        6          98
-FAST18       0.353106       2.172439        6          98
-FAST19       52.627045       2.180321        6          98
-FAST19       0.390612       2.180321        6          98
-FAST20       63.748782       2.187431        6          98
-FAST20       0.489544       2.187431        6          98
-FAST21       68.709198       2.184185        6          146
-FAST21       0.530852       2.184185        6          146
-FAST22       68.491639       2.182830        6          98
-FAST22       0.645699       2.182830        6          98
-FAST23       72.558688       2.186399        8          98
-FAST23       0.593539       2.186399        8          98
-FAST24       76.137195       2.185608        6          98
-FAST24       0.680132       2.185608        6          98
+NODICT       0.000017       1.377590        
+RANDOM       0.186171       2.097487        
+LEGACY       1.670867       2.058907        
+COVER       173.561948       2.189685        8          98
+COVER       4.811180       2.189685        8          98
+FAST f=15 a=1       18.685906       2.129682        8          434
+FAST f=15 a=1       0.173376       2.129682        8          434
+FAST f=15 a=2       12.928259       2.131890        8          482
+FAST f=15 a=2       0.102582       2.131890        8          482
+FAST f=15 a=3       11.132343       2.128027        8          386
+FAST f=15 a=3       0.077122       2.128027        8          386
+FAST f=15 a=4       10.120683       2.125797        8          434
+FAST f=15 a=4       0.065175       2.125797        8          434
+FAST f=15 a=5       9.479092       2.127697        8          386
+FAST f=15 a=5       0.057905       2.127697        8          386
+FAST f=15 a=6       9.159523       2.127132        8          1682
+FAST f=15 a=6       0.058604       2.127132        8          1682
+FAST f=15 a=7       8.724003       2.129914        8          434
+FAST f=15 a=7       0.0493       2.129914        8          434
+FAST f=15 a=8       8.595001       2.127137        8          338
+FAST f=15 a=8       0.0474       2.127137        8          338
+FAST f=15 a=9       8.356405       2.125512        8          482
+FAST f=15 a=9       0.046126       2.125512        8          482
+FAST f=15 a=10       8.207111       2.126066        8          338
+FAST f=15 a=10       0.043292       2.126066        8          338
+FAST f=16 a=1       18.464436       2.144040        8          242
+FAST f=16 a=1       0.172156       2.144040        8          242
+FAST f=16 a=2       12.844825       2.148171        8          194
+FAST f=16 a=2       0.099619       2.148171        8          194
+FAST f=16 a=3       11.082568       2.140837        8          290
+FAST f=16 a=3       0.079165       2.140837        8          290
+FAST f=16 a=4       10.066749       2.144405        8          386
+FAST f=16 a=4       0.068411       2.144405        8          386
+FAST f=16 a=5       9.501121       2.140720        8          386
+FAST f=16 a=5       0.061316       2.140720        8          386
+FAST f=16 a=6       9.179332       2.139478        8          386
+FAST f=16 a=6       0.056322       2.139478        8          386
+FAST f=16 a=7       8.849438       2.142412        8          194
+FAST f=16 a=7       0.050493       2.142412        8          194
+FAST f=16 a=8       8.810919       2.143454        8          434
+FAST f=16 a=8       0.051304       2.143454        8          434
+FAST f=16 a=9       8.553900       2.140339        8          194
+FAST f=16 a=9       0.047285       2.140339        8          194
+FAST f=16 a=10       8.398027       2.143130        8          386
+FAST f=16 a=10       0.046386       2.143130        8          386
+FAST f=17 a=1       18.644657       2.157192        8          98
+FAST f=17 a=1       0.173884       2.157192        8          98
+FAST f=17 a=2       13.071242       2.159830        8          146
+FAST f=17 a=2       0.10388       2.159830        8          146
+FAST f=17 a=3       11.332366       2.153654        6          194
+FAST f=17 a=3       0.08983       2.153654        6          194
+FAST f=17 a=4       10.362413       2.156813        8          242
+FAST f=17 a=4       0.070389       2.156813        8          242
+FAST f=17 a=5       9.808159       2.155098        6          338
+FAST f=17 a=5       0.072661       2.155098        6          338
+FAST f=17 a=6       9.451165       2.153845        6          146
+FAST f=17 a=6       0.064959       2.153845        6          146
+FAST f=17 a=7       9.163097       2.155424        6          242
+FAST f=17 a=7       0.064323       2.155424        6          242
+FAST f=17 a=8       9.047276       2.156640        8          242
+FAST f=17 a=8       0.053382       2.156640        8          242
+FAST f=17 a=9       8.807671       2.152396        8          146
+FAST f=17 a=9       0.049617       2.152396        8          146
+FAST f=17 a=10       8.649827       2.152370        8          146
+FAST f=17 a=10       0.047849       2.152370        8          146
+FAST f=18 a=1       18.809502       2.168116        8          98
+FAST f=18 a=1       0.175226       2.168116        8          98
+FAST f=18 a=2       13.756502       2.170870        6          242
+FAST f=18 a=2       0.119507       2.170870        6          242
+FAST f=18 a=3       12.059748       2.163094        6          98
+FAST f=18 a=3       0.093912       2.163094        6          98
+FAST f=18 a=4       11.410294       2.172372        8          98
+FAST f=18 a=4       0.073048       2.172372        8          98
+FAST f=18 a=5       10.560297       2.166388        8          98
+FAST f=18 a=5       0.065136       2.166388        8          98
+FAST f=18 a=6       10.071390       2.162672        8          98
+FAST f=18 a=6       0.059402       2.162672        8          98
+FAST f=18 a=7       10.084214       2.166624        6          194
+FAST f=18 a=7       0.073276       2.166624        6          194
+FAST f=18 a=8       9.953226       2.167454        8          98
+FAST f=18 a=8       0.053659       2.167454        8          98
+FAST f=18 a=9       8.982461       2.161593        6          146
+FAST f=18 a=9       0.05955       2.161593        6          146
+FAST f=18 a=10       8.986092       2.164373        6          242
+FAST f=18 a=10       0.059135       2.164373        6          242
+FAST f=19 a=1       18.908277       2.176021        8          98
+FAST f=19 a=1       0.177316       2.176021        8          98
+FAST f=19 a=2       13.471313       2.176103        8          98
+FAST f=19 a=2       0.106344       2.176103        8          98
+FAST f=19 a=3       11.571406       2.172812        8          98
+FAST f=19 a=3       0.083293       2.172812        8          98
+FAST f=19 a=4       10.632775       2.177770        6          146
+FAST f=19 a=4       0.079864       2.177770        6          146
+FAST f=19 a=5       10.030190       2.175574        6          146
+FAST f=19 a=5       0.07223       2.175574        6          146
+FAST f=19 a=6       9.717818       2.169997        8          98
+FAST f=19 a=6       0.060049       2.169997        8          98
+FAST f=19 a=7       9.397531       2.172770        8          146
+FAST f=19 a=7       0.057188       2.172770        8          146
+FAST f=19 a=8       9.281061       2.175822        8          98
+FAST f=19 a=8       0.053711       2.175822        8          98
+FAST f=19 a=9       9.165242       2.169849        6          146
+FAST f=19 a=9       0.059898       2.169849        6          146
+FAST f=19 a=10       9.048763       2.173394        8          98
+FAST f=19 a=10       0.049757       2.173394        8          98
+FAST f=20 a=1       21.166917       2.183923        6          98
+FAST f=20 a=1       0.205425       2.183923        6          98
+FAST f=20 a=2       15.642753       2.182349        6          98
+FAST f=20 a=2       0.135957       2.182349        6          98
+FAST f=20 a=3       14.053730       2.173544        6          98
+FAST f=20 a=3       0.11266       2.173544        6          98
+FAST f=20 a=4       15.270019       2.183656        8          98
+FAST f=20 a=4       0.107892       2.183656        8          98
+FAST f=20 a=5       15.497927       2.174661        6          98
+FAST f=20 a=5       0.100305       2.174661        6          98
+FAST f=20 a=6       13.973505       2.172391        8          98
+FAST f=20 a=6       0.087565       2.172391        8          98
+FAST f=20 a=7       14.083296       2.172443        8          98
+FAST f=20 a=7       0.078062       2.172443        8          98
+FAST f=20 a=8       12.560048       2.175581        8          98
+FAST f=20 a=8       0.070282       2.175581        8          98
+FAST f=20 a=9       13.078645       2.173975        6          146
+FAST f=20 a=9       0.081041       2.173975        6          146
+FAST f=20 a=10       12.823328       2.177778        8          98
+FAST f=20 a=10       0.074522       2.177778        8          98
+FAST f=21 a=1       29.825370       2.183057        6          98
+FAST f=21 a=1       0.334453       2.183057        6          98
+FAST f=21 a=2       29.476474       2.182752        8          98
+FAST f=21 a=2       0.286602       2.182752        8          98
+FAST f=21 a=3       25.937186       2.175867        8          98
+FAST f=21 a=3       0.17626       2.175867        8          98
+FAST f=21 a=4       20.413865       2.179780        8          98
+FAST f=21 a=4       0.206085       2.179780        8          98
+FAST f=21 a=5       20.541889       2.178328        6          146
+FAST f=21 a=5       0.199157       2.178328        6          146
+FAST f=21 a=6       21.090670       2.174443        6          146
+FAST f=21 a=6       0.190645       2.174443        6          146
+FAST f=21 a=7       20.221569       2.177384        6          146
+FAST f=21 a=7       0.184278       2.177384        6          146
+FAST f=21 a=8       20.322357       2.179456        6          98
+FAST f=21 a=8       0.178458       2.179456        6          98
+FAST f=21 a=9       20.683912       2.174396        6          146
+FAST f=21 a=9       0.190829       2.174396        6          146
+FAST f=21 a=10       20.840865       2.174905        8          98
+FAST f=21 a=10       0.172515       2.174905        8          98
+FAST f=22 a=1       36.822827       2.181612        6          98
+FAST f=22 a=1       0.437389       2.181612        6          98
+FAST f=22 a=2       30.616902       2.183142        8          98
+FAST f=22 a=2       0.324284       2.183142        8          98
+FAST f=22 a=3       28.472482       2.178130        8          98
+FAST f=22 a=3       0.236538       2.178130        8          98
+FAST f=22 a=4       25.847028       2.181878        8          98
+FAST f=22 a=4       0.263744       2.181878        8          98
+FAST f=22 a=5       27.095881       2.180775        8          98
+FAST f=22 a=5       0.24988       2.180775        8          98
+FAST f=22 a=6       25.939172       2.170916        8          98
+FAST f=22 a=6       0.240033       2.170916        8          98
+FAST f=22 a=7       27.064194       2.177849        8          98
+FAST f=22 a=7       0.242383       2.177849        8          98
+FAST f=22 a=8       25.140221       2.178216        8          98
+FAST f=22 a=8       0.237601       2.178216        8          98
+FAST f=22 a=9       25.505283       2.177455        6          146
+FAST f=22 a=9       0.223217       2.177455        6          146
+FAST f=22 a=10       24.529362       2.176705        6          98
+FAST f=22 a=10       0.222876       2.176705        6          98
+FAST f=23 a=1       39.127310       2.183006        6          98
+FAST f=23 a=1       0.417338       2.183006        6          98
+FAST f=23 a=2       32.468161       2.183524        6          98
+FAST f=23 a=2       0.351645       2.183524        6          98
+FAST f=23 a=3       31.577620       2.172604        6          98
+FAST f=23 a=3       0.319659       2.172604        6          98
+FAST f=23 a=4       30.129247       2.183932        6          98
+FAST f=23 a=4       0.307239       2.183932        6          98
+FAST f=23 a=5       29.103376       2.183529        6          146
+FAST f=23 a=5       0.285533       2.183529        6          146
+FAST f=23 a=6       29.776045       2.174367        8          98
+FAST f=23 a=6       0.276846       2.174367        8          98
+FAST f=23 a=7       28.940407       2.178022        6          146
+FAST f=23 a=7       0.274082       2.178022        6          146
+FAST f=23 a=8       29.256009       2.179462        6          98
+FAST f=23 a=8       0.26949       2.179462        6          98
+FAST f=23 a=9       29.347312       2.170407        8          98
+FAST f=23 a=9       0.265034       2.170407        8          98
+FAST f=23 a=10       29.140081       2.171762        8          98
+FAST f=23 a=10       0.259183       2.171762        8          98
+FAST f=24 a=1       44.871179       2.182115        6          98
+FAST f=24 a=1       0.509433       2.182115        6          98
+FAST f=24 a=2       38.694867       2.180549        8          98
+FAST f=24 a=2       0.406695       2.180549        8          98
+FAST f=24 a=3       38.363769       2.172821        8          98
+FAST f=24 a=3       0.359581       2.172821        8          98
+FAST f=24 a=4       36.580797       2.184142        8          98
+FAST f=24 a=4       0.340614       2.184142        8          98
+FAST f=24 a=5       33.125701       2.183301        8          98
+FAST f=24 a=5       0.324874       2.183301        8          98
+FAST f=24 a=6       34.776068       2.173019        6          146
+FAST f=24 a=6       0.340397       2.173019        6          146
+FAST f=24 a=7       34.417625       2.176561        6          146
+FAST f=24 a=7       0.308223       2.176561        6          146
+FAST f=24 a=8       35.470291       2.182161        6          98
+FAST f=24 a=8       0.307724       2.182161        6          98
+FAST f=24 a=9       34.927252       2.172682        6          146
+FAST f=24 a=9       0.300598       2.172682        6          146
+FAST f=24 a=10       33.238355       2.173395        6          98
+FAST f=24 a=10       0.249916       2.173395        6          98
+

 hg-manifest:
-NODICT       0.000026       1.866385        
-RANDOM       0.784554       2.309436        
-LEGACY       10.193714       2.506977        
-COVER       988.206583       2.582528        8          434
-COVER       39.726199       2.582528        8          434
-FAST15       168.388819       2.392920        6          1826
-FAST15       1.272178       2.392920        6          1826
-FAST16       161.822607       2.480762        6          1922
-FAST16       1.164908       2.480762        6          1922
-FAST17       157.688544       2.548285        6          1682
-FAST17       1.222439       2.548285        6          1682
-FAST18       154.529585       2.567634        6          386
-FAST18       1.217596       2.567634        6          386
-FAST19       160.244979       2.581653        8          338
-FAST19       1.282450       2.581653        8          338
-FAST20       191.503297       2.586881        8          194
-FAST20       2.009748       2.586881        8          194
-FAST21       226.389709       2.590051        6          242
-FAST21       2.494543       2.590051        6          242
-FAST22       217.859055       2.591376        6          194
-FAST22       2.295693       2.591376        6          194
-FAST23       236.819791       2.591131        8          434
-FAST23       2.744711       2.591131        8          434
-FAST24       269.187800       2.591548        6          290
-FAST24       2.923671       2.591548        6          290
+NODICT       0.000004       1.866377        
+RANDOM       0.696346       2.309436        
+LEGACY       7.064527       2.506977        
+COVER       876.312865       2.582528        8          434
+COVER       35.684533       2.582528        8          434
+FAST f=15 a=1       76.618201       2.404013        8          1202
+FAST f=15 a=1       0.700722       2.404013        8          1202
+FAST f=15 a=2       49.213058       2.409248        6          1826
+FAST f=15 a=2       0.473393       2.409248        6          1826
+FAST f=15 a=3       41.753197       2.409677        8          1490
+FAST f=15 a=3       0.336848       2.409677        8          1490
+FAST f=15 a=4       38.648295       2.407996        8          1538
+FAST f=15 a=4       0.283952       2.407996        8          1538
+FAST f=15 a=5       36.144936       2.402895        8          1874
+FAST f=15 a=5       0.270128       2.402895        8          1874
+FAST f=15 a=6       35.484675       2.394873        8          1586
+FAST f=15 a=6       0.251637       2.394873        8          1586
+FAST f=15 a=7       34.280599       2.397311        8          1778
+FAST f=15 a=7       0.23984       2.397311        8          1778
+FAST f=15 a=8       32.122572       2.396089        6          1490
+FAST f=15 a=8       0.251508       2.396089        6          1490
+FAST f=15 a=9       29.909842       2.390092        6          1970
+FAST f=15 a=9       0.251233       2.390092        6          1970
+FAST f=15 a=10       30.102938       2.400086        6          1682
+FAST f=15 a=10       0.23688       2.400086        6          1682
+FAST f=16 a=1       67.750401       2.475460        6          1346
+FAST f=16 a=1       0.796035       2.475460        6          1346
+FAST f=16 a=2       52.812027       2.480860        6          1730
+FAST f=16 a=2       0.480384       2.480860        6          1730
+FAST f=16 a=3       44.179259       2.469304        8          1970
+FAST f=16 a=3       0.332657       2.469304        8          1970
+FAST f=16 a=4       37.612728       2.478208        6          1970
+FAST f=16 a=4       0.32498       2.478208        6          1970
+FAST f=16 a=5       35.056222       2.475568        6          1298
+FAST f=16 a=5       0.302824       2.475568        6          1298
+FAST f=16 a=6       34.713012       2.486079        8          1730
+FAST f=16 a=6       0.24755       2.486079        8          1730
+FAST f=16 a=7       33.713687       2.477180        6          1682
+FAST f=16 a=7       0.280358       2.477180        6          1682
+FAST f=16 a=8       31.571412       2.475418        8          1538
+FAST f=16 a=8       0.241241       2.475418        8          1538
+FAST f=16 a=9       31.608069       2.478263        8          1922
+FAST f=16 a=9       0.241764       2.478263        8          1922
+FAST f=16 a=10       31.358002       2.472263        8          1442
+FAST f=16 a=10       0.221661       2.472263        8          1442
+FAST f=17 a=1       66.185775       2.536085        6          1346
+FAST f=17 a=1       0.713549       2.536085        6          1346
+FAST f=17 a=2       50.365000       2.546105        8          1298
+FAST f=17 a=2       0.467846       2.546105        8          1298
+FAST f=17 a=3       42.712843       2.536250        8          1298
+FAST f=17 a=3       0.34047       2.536250        8          1298
+FAST f=17 a=4       39.514227       2.535555        8          1442
+FAST f=17 a=4       0.302989       2.535555        8          1442
+FAST f=17 a=5       35.189292       2.524925        8          1202
+FAST f=17 a=5       0.273451       2.524925        8          1202
+FAST f=17 a=6       35.791683       2.523466        8          1202
+FAST f=17 a=6       0.268261       2.523466        8          1202
+FAST f=17 a=7       37.416136       2.526625        6          1010
+FAST f=17 a=7       0.277558       2.526625        6          1010
+FAST f=17 a=8       37.084707       2.533274        6          1250
+FAST f=17 a=8       0.285104       2.533274        6          1250
+FAST f=17 a=9       34.183814       2.532765        8          1298
+FAST f=17 a=9       0.235133       2.532765        8          1298
+FAST f=17 a=10       31.149235       2.528722        8          1346
+FAST f=17 a=10       0.232679       2.528722        8          1346
+FAST f=18 a=1       72.942176       2.559857        6          386
+FAST f=18 a=1       0.718618       2.559857        6          386
+FAST f=18 a=2       51.690440       2.559572        8          290
+FAST f=18 a=2       0.403978       2.559572        8          290
+FAST f=18 a=3       45.344908       2.561040        8          962
+FAST f=18 a=3       0.357205       2.561040        8          962
+FAST f=18 a=4       39.804522       2.558446        8          1010
+FAST f=18 a=4       0.310526       2.558446        8          1010
+FAST f=18 a=5       38.134888       2.561811        8          626
+FAST f=18 a=5       0.273743       2.561811        8          626
+FAST f=18 a=6       35.091890       2.555518        8          722
+FAST f=18 a=6       0.260135       2.555518        8          722
+FAST f=18 a=7       34.639523       2.562938        8          290
+FAST f=18 a=7       0.234294       2.562938        8          290
+FAST f=18 a=8       36.076431       2.563567        8          1586
+FAST f=18 a=8       0.274075       2.563567        8          1586
+FAST f=18 a=9       36.376433       2.560950        8          722
+FAST f=18 a=9       0.240106       2.560950        8          722
+FAST f=18 a=10       32.624790       2.559340        8          578
+FAST f=18 a=10       0.234704       2.559340        8          578
+FAST f=19 a=1       70.513761       2.572441        8          194
+FAST f=19 a=1       0.726112       2.572441        8          194
+FAST f=19 a=2       59.263032       2.574560        8          482
+FAST f=19 a=2       0.451554       2.574560        8          482
+FAST f=19 a=3       51.509594       2.571546        6          194
+FAST f=19 a=3       0.393014       2.571546        6          194
+FAST f=19 a=4       55.393906       2.573386        8          482
+FAST f=19 a=4       0.38819       2.573386        8          482
+FAST f=19 a=5       43.201736       2.567589        8          674
+FAST f=19 a=5       0.292155       2.567589        8          674
+FAST f=19 a=6       42.911687       2.572666        6          434
+FAST f=19 a=6       0.303988       2.572666        6          434
+FAST f=19 a=7       44.687591       2.573613        6          290
+FAST f=19 a=7       0.308721       2.573613        6          290
+FAST f=19 a=8       37.372868       2.571039        6          194
+FAST f=19 a=8       0.287137       2.571039        6          194
+FAST f=19 a=9       36.074230       2.566473        6          482
+FAST f=19 a=9       0.280721       2.566473        6          482
+FAST f=19 a=10       33.731720       2.570306        8          194
+FAST f=19 a=10       0.224073       2.570306        8          194
+FAST f=20 a=1       79.670634       2.581146        6          290
+FAST f=20 a=1       0.899986       2.581146        6          290
+FAST f=20 a=2       58.827141       2.579782        8          386
+FAST f=20 a=2       0.602288       2.579782        8          386
+FAST f=20 a=3       51.289004       2.579627        8          722
+FAST f=20 a=3       0.446091       2.579627        8          722
+FAST f=20 a=4       47.711068       2.581508        8          722
+FAST f=20 a=4       0.473007       2.581508        8          722
+FAST f=20 a=5       47.402929       2.578062        6          434
+FAST f=20 a=5       0.497131       2.578062        6          434
+FAST f=20 a=6       54.797102       2.577365        8          482
+FAST f=20 a=6       0.515061       2.577365        8          482
+FAST f=20 a=7       51.370877       2.583050        8          386
+FAST f=20 a=7       0.402878       2.583050        8          386
+FAST f=20 a=8       51.437931       2.574875        6          242
+FAST f=20 a=8       0.453094       2.574875        6          242
+FAST f=20 a=9       44.105456       2.576700        6          242
+FAST f=20 a=9       0.456633       2.576700        6          242
+FAST f=20 a=10       44.447580       2.578305        8          338
+FAST f=20 a=10       0.409121       2.578305        8          338
+FAST f=21 a=1       113.031686       2.582449        6          242
+FAST f=21 a=1       1.456971       2.582449        6          242
+FAST f=21 a=2       97.700932       2.582124        8          194
+FAST f=21 a=2       1.072078       2.582124        8          194
+FAST f=21 a=3       96.563648       2.585479        8          434
+FAST f=21 a=3       0.949528       2.585479        8          434
+FAST f=21 a=4       90.597813       2.582366        6          386
+FAST f=21 a=4       0.76944       2.582366        6          386
+FAST f=21 a=5       86.815980       2.579043        8          434
+FAST f=21 a=5       0.858167       2.579043        8          434
+FAST f=21 a=6       91.235820       2.578378        8          530
+FAST f=21 a=6       0.684274       2.578378        8          530
+FAST f=21 a=7       84.392788       2.581243        8          386
+FAST f=21 a=7       0.814386       2.581243        8          386
+FAST f=21 a=8       82.052310       2.582547        8          338
+FAST f=21 a=8       0.822633       2.582547        8          338
+FAST f=21 a=9       74.696074       2.579319        8          194
+FAST f=21 a=9       0.811028       2.579319        8          194
+FAST f=21 a=10       76.211170       2.578766        8          290
+FAST f=21 a=10       0.809715       2.578766        8          290
+FAST f=22 a=1       138.976871       2.580478        8          194
+FAST f=22 a=1       1.748932       2.580478        8          194
+FAST f=22 a=2       120.164097       2.583633        8          386
+FAST f=22 a=2       1.333239       2.583633        8          386
+FAST f=22 a=3       111.986474       2.582566        6          194
+FAST f=22 a=3       1.305734       2.582566        6          194
+FAST f=22 a=4       108.548148       2.583068        6          194
+FAST f=22 a=4       1.314026       2.583068        6          194
+FAST f=22 a=5       103.173017       2.583495        6          290
+FAST f=22 a=5       1.228664       2.583495        6          290
+FAST f=22 a=6       108.421262       2.582349        8          530
+FAST f=22 a=6       1.076773       2.582349        8          530
+FAST f=22 a=7       103.284127       2.581022        8          386
+FAST f=22 a=7       1.112117       2.581022        8          386
+FAST f=22 a=8       96.330279       2.581073        8          290
+FAST f=22 a=8       1.109303       2.581073        8          290
+FAST f=22 a=9       97.651348       2.580075        6          194
+FAST f=22 a=9       0.933032       2.580075        6          194
+FAST f=22 a=10       101.660621       2.584886        8          194
+FAST f=22 a=10       0.796823       2.584886        8          194
+FAST f=23 a=1       159.322978       2.581474        6          242
+FAST f=23 a=1       2.015878       2.581474        6          242
+FAST f=23 a=2       134.331775       2.581619        8          194
+FAST f=23 a=2       1.545845       2.581619        8          194
+FAST f=23 a=3       127.724552       2.579888        6          338
+FAST f=23 a=3       1.444496       2.579888        6          338
+FAST f=23 a=4       126.077675       2.578137        6          242
+FAST f=23 a=4       1.364394       2.578137        6          242
+FAST f=23 a=5       124.914027       2.580843        8          338
+FAST f=23 a=5       1.116059       2.580843        8          338
+FAST f=23 a=6       122.874153       2.577637        6          338
+FAST f=23 a=6       1.164584       2.577637        6          338
+FAST f=23 a=7       123.099257       2.582715        6          386
+FAST f=23 a=7       1.354042       2.582715        6          386
+FAST f=23 a=8       122.026753       2.577681        8          194
+FAST f=23 a=8       1.210966       2.577681        8          194
+FAST f=23 a=9       121.164312       2.584599        6          290
+FAST f=23 a=9       1.174859       2.584599        6          290
+FAST f=23 a=10       117.462222       2.580358        8          194
+FAST f=23 a=10       1.075258       2.580358        8          194
+FAST f=24 a=1       169.539659       2.581642        6          194
+FAST f=24 a=1       1.916804       2.581642        6          194
+FAST f=24 a=2       160.539270       2.580421        6          290
+FAST f=24 a=2       1.71087       2.580421        6          290
+FAST f=24 a=3       155.455874       2.580449        6          242
+FAST f=24 a=3       1.60307       2.580449        6          242
+FAST f=24 a=4       147.630320       2.582953        6          338
+FAST f=24 a=4       1.396364       2.582953        6          338
+FAST f=24 a=5       133.767428       2.580589        6          290
+FAST f=24 a=5       1.19933       2.580589        6          290
+FAST f=24 a=6       146.437535       2.579453        8          194
+FAST f=24 a=6       1.385405       2.579453        8          194
+FAST f=24 a=7       147.227507       2.584155        8          386
+FAST f=24 a=7       1.48942       2.584155        8          386
+FAST f=24 a=8       138.005773       2.584115        8          194
+FAST f=24 a=8       1.352       2.584115        8          194
+FAST f=24 a=9       141.442625       2.582902        8          290
+FAST f=24 a=9       1.39647       2.582902        8          290
+FAST f=24 a=10       142.157446       2.582701        8          434
+FAST f=24 a=10       1.498889       2.582701        8          434
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@ -5,7 +5,6 @@
 #include <ctype.h>
 #include <time.h>
 #include "random.h"
-#include "fastCover.h"
 #include "dictBuilder.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #include "io.h"
@ -149,7 +148,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
  /* Allocate dst with enough space to compress the maximum sized sample */
  {
    size_t maxSampleSize = 0;
-    for (int i = 0; i < srcInfo->nbSamples; i++) {
+    for (i = 0; i < srcInfo->nbSamples; i++) {
      maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize);
    }
    dstCapacity = ZSTD_compressBound(maxSampleSize);
@ -291,6 +290,9 @@ int main(int argCount, const char* argv[])
  /* Initialize arguments to default values */
  unsigned k = 200;
  unsigned d = 8;
+  unsigned f;
+  unsigned accel;
+  unsigned i;
  const unsigned cLevel = DEFAULT_CLEVEL;
  const unsigned dictID = 0;
  const unsigned maxDictSize = g_defaultMaxDictSize;
@ -305,7 +307,7 @@ int main(int argCount, const char* argv[])
  const char** extendedFileList = NULL;

  /* Parse arguments */
-  for (int i = 1; i < argCount; i++) {
+  for (i = 1; i < argCount; i++) {
    const char* argument = argv[i];
    if (longCommandWArg(&argument, "in=")) {
      filenameTable[filenameIdx] = argument;
@ -375,6 +377,7 @@ int main(int argCount, const char* argv[])

  /* for cover */
  {
+    /* for cover (optimizing k and d) */
    ZDICT_cover_params_t coverParam;
    memset(&coverParam, 0, sizeof(coverParam));
    coverParam.zParams = zParams;
@ -388,6 +391,7 @@ int main(int argCount, const char* argv[])
      goto _cleanup;
    }

+    /* for cover (with k and d provided) */
    const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
    if(coverResult) {
@ -398,29 +402,34 @@ int main(int argCount, const char* argv[])
  }

  /* for fastCover */
-  for (unsigned f = 15; f < 25; f++){
+  for (f = 15; f < 25; f++){
    DISPLAYLEVEL(2, "current f is %u\n", f);
-    /* for fastCover (optimizing k and d) */
-    ZDICT_fastCover_params_t fastParam;
-    memset(&fastParam, 0, sizeof(fastParam));
-    fastParam.zParams = zParams;
-    fastParam.splitPoint = 1.0;
-    fastParam.f = f;
-    fastParam.steps = 40;
-    fastParam.nbThreads = 1;
-    const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-    if(fastOptResult) {
-      result = 1;
-      goto _cleanup;
-    }
+    for (accel = 1; accel < 11; accel++) {
+      DISPLAYLEVEL(2, "current accel is %u\n", accel);
+      /* for fastCover (optimizing k and d) */
+      ZDICT_fastCover_params_t fastParam;
+      memset(&fastParam, 0, sizeof(fastParam));
+      fastParam.zParams = zParams;
+      fastParam.f = f;
+      fastParam.steps = 40;
+      fastParam.nbThreads = 1;
+      fastParam.accel = accel;
+      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+      if(fastOptResult) {
+        result = 1;
+        goto _cleanup;
+      }

-    /* for fastCover (with k and d provided) */
-    const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-    if(fastResult) {
-      result = 1;
-      goto _cleanup;
+      /* for fastCover (with k and d provided) */
+      for (i = 0; i < 5; i++) {
+        const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+        DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+        if(fastResult) {
+          result = 1;
+          goto _cleanup;
+        }
+      }
    }
  }

--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@ -197,7 +197,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
    bestSegment.end = newEnd;
  }
  {
-    /* Half the frequency of hash value of each dmer covered by the chosen segment. */
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
@ -300,7 +300,7 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
      totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
-                 (U32)(totalSamplesSize>>20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+                 (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
    return 0;
  }
  /* Check if there are at least 5 training samples */
--- a/contrib/largeNbDicts/.gitignore
+++ b/contrib/largeNbDicts/.gitignore
@ -0,0 +1,2 @@
+# build artifacts
+largeNbDicts
--- a/contrib/largeNbDicts/Makefile
+++ b/contrib/largeNbDicts/Makefile
@ -0,0 +1,49 @@
+# ################################################################
+# Copyright (c) 2018-present, Yann Collet, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under both the BSD-style license (found in the
+# LICENSE file in the root directory of this source tree) and the GPLv2 (found
+# in the COPYING file in the root directory of this source tree).
+# ################################################################
+
+# Locations of the zstd CLI sources and of the library, relative to this directory
+PROGDIR = ../../programs
+LIBDIR  = ../../lib
+
+LIBZSTD = $(LIBDIR)/libzstd.a
+
+# Header search paths : library headers, plus $(PROGDIR) where bench.c and datagen.c live
+CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/common -I$(LIBDIR)/dictBuilder -I$(PROGDIR)
+
+CFLAGS  ?= -O3
+CFLAGS  += -std=gnu99
+DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+            -Wstrict-aliasing=1 -Wswitch-enum \
+            -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
+            -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
+            -Wredundant-decls
+CFLAGS  += $(DEBUGFLAGS) $(MOREFLAGS)
+
+
+# These targets are commands, not files :
+# declare them phony so that a stray file named `all` or `clean` cannot disable them.
+.PHONY: default all clean
+
+default: largeNbDicts
+
+all : largeNbDicts
+
+largeNbDicts: bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
+
+# The library is declared phony on purpose :
+# the sub-make is always invoked, and decides by itself whether libzstd.a is up to date.
+.PHONY: $(LIBZSTD)
+$(LIBZSTD):
+	$(MAKE) -C $(LIBDIR) libzstd.a
+
+bench.o  : $(PROGDIR)/bench.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+
+datagen.o: $(PROGDIR)/datagen.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+
+xxhash.o : $(LIBDIR)/common/xxhash.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+
+clean:
+	$(RM) *.o
+	$(RM) largeNbDicts
--- a/contrib/largeNbDicts/README.md
+++ b/contrib/largeNbDicts/README.md
@ -0,0 +1,25 @@
+largeNbDicts
+=====================
+
+`largeNbDicts` is a benchmark test tool
+dedicated to the specific scenario of
+dictionary decompression using a very large number of dictionaries.
+When dictionaries are constantly changing, they are always "cold",
+suffering from increased latency due to cache misses.
+
+The tool is created in a bid to investigate performance for this scenario,
+and to experiment with mitigation techniques.
+
+Command line :
+```
+largeNbDicts [Options] filename(s)
+
+Options :
+-r           : recursively load all files in subdirectories (default: off)
+-B#          : split input into blocks of size # (default: no split)
+-#           : use compression level # (default: 3)
+-D #         : use # as a dictionary (default: create one)
+-i#          : nb benchmark rounds (default: 6)
+--nbDicts=#  : set nb of dictionaries to # (default: one per block)
+-h           : help (this text)
+```
--- a/contrib/largeNbDicts/largeNbDicts.c
+++ b/contrib/largeNbDicts/largeNbDicts.c
@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2018-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* largeNbDicts
+ * This is a benchmark test tool
+ * dedicated to the specific case of dictionary decompression
+ * using a very large nb of dictionaries
+ * thus suffering latency from lots of cache misses.
+ * It's created in a bid to investigate performance and find optimizations. */
+
+
+/*---  Dependencies  ---*/
+
+#include <stddef.h>   /* size_t */
+#include <stdlib.h>   /* malloc, free, abort */
+#include <stdio.h>    /* fprintf */
+#include <assert.h>   /* assert */
+
+#include "util.h"
+#include "bench.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "zdict.h"
+
+
+/*---  Constants  --- */
+
+/* Postfix unit multipliers : written after a number, as in `4 KB` or `1200 MB`.
+ * They expand to a bare `*(1<<N)`, so they are only valid right after an operand. */
+#define KB  *(1<<10)
+#define MB  *(1<<20)
+
+#define BLOCKSIZE_DEFAULT 0  /* no slicing into blocks */
+#define DICTSIZE  (4 KB)
+#define CLEVEL_DEFAULT 3
+
+#define BENCH_TIME_DEFAULT_S   6
+#define RUN_TIME_DEFAULT_MS    1000
+#define BENCH_TIME_DEFAULT_MS (BENCH_TIME_DEFAULT_S * RUN_TIME_DEFAULT_MS)
+
+#define DISPLAY_LEVEL_DEFAULT 3
+
+#define BENCH_SIZE_MAX (1200 MB)
+
+
+/*---  Macros  ---*/
+/* CONTROL() : runtime check, aborts the program when condition `c` is false.
+ * Unlike assert(), it is not compiled out by NDEBUG. */
+#define CONTROL(c)   { if (!(c)) abort(); }
+/* drop any previous definition of MIN before providing our own */
+#undef MIN
+#define MIN(a,b)     ((a) < (b) ? (a) : (b))
+
+
+/*---  Display Macros  ---*/
+
+/* DISPLAY() : printf-like, writes to stdout.
+ * DISPLAYLEVEL() : same, but only when g_displayLevel is at least `l`. */
+#define DISPLAY(...)         fprintf(stdout, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
+static int g_displayLevel = DISPLAY_LEVEL_DEFAULT;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
+
+
+/*---  buffer_t  ---*/
+
+/* A memory block, with the amount of data it currently holds */
+typedef struct {
+    void* ptr;         /* start of the block (NULL within kBuffNull) */
+    size_t size;       /* nb of bytes filled */
+    size_t capacity;   /* nb of bytes available at `ptr` */
+} buffer_t;
+
+/* value returned by buffer constructors to signal an error */
+static const buffer_t kBuffNull = { NULL, 0, 0 };
+
+/* createBuffer() :
+ * allocates `capacity` bytes (must be > 0). The new buffer starts empty (size == 0).
+ * @return : kBuffNull if allocation fails */
+static buffer_t createBuffer(size_t capacity)
+{
+    assert(capacity > 0);
+    /* field order : ptr, size, capacity */
+    buffer_t newBuffer = { malloc(capacity), 0, capacity };
+    if (newBuffer.ptr == NULL) return kBuffNull;
+    return newBuffer;
+}
+
+/* freeBuffer() :
+ * releases the memory block referenced by `buff`.
+ * Accepts kBuffNull, since free(NULL) is a no-op. */
+static void freeBuffer(buffer_t buff)
+{
+    void* const block = buff.ptr;
+    free(block);
+}
+
+
+/* fillBuffer_fromHandle() :
+ * reads up to `buff->capacity` bytes from `f` into the buffer,
+ * and records into `buff->size` the nb of bytes actually read. */
+static void fillBuffer_fromHandle(buffer_t* buff, FILE* f)
+{
+    buff->size = fread(buff->ptr, 1, buff->capacity, f);
+}
+
+
+/* createBuffer_fromFile() :
+ * loads the entire content of `fileName` into a newly allocated buffer.
+ * @return : kBuffNull if file size is unknown, zero, or does not fit into a size_t,
+ *           or if the file cannot be opened.
+ *           Allocation failure and short reads abort the program (CONTROL). */
+static buffer_t createBuffer_fromFile(const char* fileName)
+{
+    U64 const fileSize = UTIL_getFileSize(fileName);
+    size_t const bufferSize = (size_t) fileSize;
+
+    if (fileSize == UTIL_FILESIZE_UNKNOWN) return kBuffNull;
+    /* runtime checks rather than assert() : they remain effective when built with NDEBUG */
+    if ((U64)bufferSize != fileSize) return kBuffNull;   /* size_t overflow */
+    if (bufferSize == 0) return kBuffNull;               /* createBuffer() requires capacity > 0 */
+
+    {   FILE* const f = fopen(fileName, "rb");
+        if (f == NULL) return kBuffNull;
+
+        buffer_t buff = createBuffer(bufferSize);
+        CONTROL(buff.ptr != NULL);
+
+        fillBuffer_fromHandle(&buff, f);
+        CONTROL(buff.size == buff.capacity);   /* file must be read in full */
+
+        fclose(f);   /* do nothing specific if fclose() fails */
+        return buff;
+    }
+}
+
+
+/* createDictionaryBuffer() :
+ * provides the dictionary content, from one of 2 sources :
+ * - when `dictionaryName` is set : loads that file, other arguments are ignored
+ * - otherwise : trains a dictionary of at most `requestedDictSize` bytes
+ *               from the `nbBlocks` samples stored in `srcBuffer`,
+ *               their sizes being listed in `srcBlockSizes`.
+ * @return : kBuffNull if the dictionary file cannot be loaded.
+ *           Allocation or training failure aborts the program (CONTROL). */
+static buffer_t
+createDictionaryBuffer(const char* dictionaryName,
+                       const void* srcBuffer,
+                       const size_t* srcBlockSizes, unsigned nbBlocks,
+                       size_t requestedDictSize)
+{
+    if (dictionaryName) {
+        DISPLAYLEVEL(3, "loading dictionary %s \n", dictionaryName);
+        return createBuffer_fromFile(dictionaryName);  /* note : result might be kBuffNull */
+
+    } else {
+
+        DISPLAYLEVEL(3, "creating dictionary, of target size %u bytes \n",
+                        (unsigned)requestedDictSize);
+        void* const dictBuffer = malloc(requestedDictSize);
+        CONTROL(dictBuffer != NULL);
+
+        size_t const dictSize = ZDICT_trainFromBuffer(dictBuffer, requestedDictSize,
+                                                      srcBuffer,
+                                                      srcBlockSizes, nbBlocks);
+        /* ZDICT_* results are checked with the ZDICT API's own error test */
+        CONTROL(!ZDICT_isError(dictSize));
+
+        buffer_t result;
+        result.ptr = dictBuffer;
+        result.capacity = requestedDictSize;
+        result.size = dictSize;   /* trained dictionary may be smaller than requested */
+        return result;
+    }
+}
+
+
+/*! loadFiles() :
+ *  Loads `buffer` with the concatenated content of files listed within `fileNamesTable`,
+ *  and records the size of each file into `fileSizes`.
+ *  Directories and files of unknown size are skipped : their size is recorded as 0.
+ *  Fills `buffer` entirely : the sum of loaded sizes must be exactly `bufferSize`.
+ *  Any I/O failure or size mismatch aborts the program (CONTROL),
+ *  so that these checks remain effective when built with NDEBUG.
+ * @return : 0 on success (errors do not return) */
+static int loadFiles(void* buffer, size_t bufferSize,
+                     size_t* fileSizes,
+                     const char* const * fileNamesTable, unsigned nbFiles)
+{
+    size_t pos = 0, totalSize = 0;
+
+    for (unsigned n=0; n<nbFiles; n++) {
+        U64 fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        if (UTIL_isDirectory(fileNamesTable[n])) {
+            fileSizes[n] = 0;
+            continue;
+        }
+        if (fileSize == UTIL_FILESIZE_UNKNOWN) {
+            fileSizes[n] = 0;
+            continue;
+        }
+
+        FILE* const f = fopen(fileNamesTable[n], "rb");
+        CONTROL(f!=NULL);
+
+        assert(pos <= bufferSize);   /* internal invariant, maintained by the check below */
+        CONTROL(fileSize <= bufferSize - pos);   /* never write beyond end of buffer */
+
+        {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
+            CONTROL(readSize == fileSize);   /* short read */
+            pos += readSize;
+        }
+        fileSizes[n] = (size_t)fileSize;
+        totalSize += (size_t)fileSize;
+        fclose(f);
+    }
+
+    CONTROL(totalSize == bufferSize);
+    return 0;
+}
+
+
+
+/*---  slice_collection_t  ---*/
+
+/* A set of `nbSlices` memory areas, described by 2 parallel tables */
+typedef struct {
+    void** slicePtrs;     /* start address of each slice */
+    size_t* capacities;   /* size of each slice, in bytes */
+    size_t nbSlices;      /* nb of entries in both tables */
+} slice_collection_t;
+
+/* value returned to signal an error */
+static const slice_collection_t kNullCollection = { NULL, NULL, 0 };
+
+/* freeSliceCollection() :
+ * releases both tables of the collection.
+ * The memory areas referenced by the slices are not freed here. */
+static void freeSliceCollection(slice_collection_t collection)
+{
+    free(collection.capacities);
+    free(collection.slicePtrs);
+}
+
+/* shrinkSizes() :
+ * overwrites the size of each slice of `collection` with its `newSizes` counterpart.
+ * Sizes can only go down : each `newSizes` entry must be <= the size it replaces. */
+void shrinkSizes(slice_collection_t collection,
+                 const size_t* newSizes)  /* presumed same size as collection */
+{
+    size_t* const sizes = collection.capacities;
+    size_t n;
+    for (n = 0; n < collection.nbSlices; n++) {
+        size_t const updated = newSizes[n];
+        assert(updated <= sizes[n]);
+        sizes[n] = updated;
+    }
+}
+
+
+/* splitSlices() :
+ * cuts the slices of `srcSlices` into blocks of at most `blockSize` bytes
+ * (blockSize == 0 means "do not cut").
+ * The resulting slices reference the memory of `srcSlices` : nothing is copied.
+ * nbSlices : if == 0, nbSlices is automatically determined from srcSlices and blockSize.
+ *            otherwise, creates exactly nbSlices slices,
+ *            by either truncating input (when smaller)
+ *            or repeating input from beginning
+ * @return : kNullCollection if allocation fails, or if `srcSlices` contains no data */
+static slice_collection_t
+splitSlices(slice_collection_t srcSlices, size_t blockSize, size_t nbSlices)
+{
+    if (blockSize==0) blockSize = (size_t)(-1);   /* means "do not cut" */
+
+    /* count blocks really present in source :
+     * ceil(capacity / blockSize) per slice, hence 0 for an empty slice.
+     * Computed by division, so it cannot overflow, whatever blockSize is. */
+    size_t nbSrcBlocks = 0;
+    for (size_t ssnb=0; ssnb < srcSlices.nbSlices; ssnb++) {
+        size_t const capacity = srcSlices.capacities[ssnb];
+        nbSrcBlocks += (capacity / blockSize) + ((capacity % blockSize) != 0);
+    }
+
+    /* no data at all : nothing to slice.
+     * This also protects the fill loop below, which would otherwise
+     * never complete (or divide by zero when srcSlices.nbSlices == 0). */
+    if (nbSrcBlocks == 0) return kNullCollection;
+
+    if (nbSlices == 0) nbSlices = nbSrcBlocks;
+
+    void** const sliceTable = (void**)malloc(nbSlices * sizeof(*sliceTable));
+    size_t* const capacities = (size_t*)malloc(nbSlices * sizeof(*capacities));
+    if (sliceTable == NULL || capacities == NULL) {
+        free(sliceTable);
+        free(capacities);
+        return kNullCollection;
+    }
+
+    /* walk source slices in a cycle, until the requested nb of slices is produced */
+    size_t ssnb = 0;
+    for (size_t sliceNb=0; sliceNb < nbSlices; ) {
+        /* NOTE(review): index is advanced before its first use, so with several
+         * source slices the walk starts at slice 1, not 0 - confirm this is intended */
+        ssnb = (ssnb + 1) % srcSlices.nbSlices;
+        size_t pos = 0;
+        char* const ptr = (char*)srcSlices.slicePtrs[ssnb];
+        while (pos < srcSlices.capacities[ssnb] && sliceNb < nbSlices) {
+            size_t const size = MIN(blockSize, srcSlices.capacities[ssnb] - pos);
+            sliceTable[sliceNb] = ptr + pos;
+            capacities[sliceNb] = size;
+            sliceNb++;
+            pos += size;   /* never goes beyond capacity, so it cannot wrap around */
+        }
+    }
+
+    slice_collection_t result;
+    result.nbSlices = nbSlices;
+    result.slicePtrs = sliceTable;
+    result.capacities = capacities;
+    return result;
+}
+
+
+/* sliceCollection_totalCapacity() :
+ * @return : sum of the sizes of all slices within `sc` */
+static size_t sliceCollection_totalCapacity(slice_collection_t sc)
+{
+    size_t sum = 0;
+    size_t sliceNb = 0;
+    while (sliceNb < sc.nbSlices) {
+        sum += sc.capacities[sliceNb];
+        sliceNb++;
+    }
+    return sum;
+}
+
+
+/* ---  buffer collection  --- */
+
+/* A memory buffer, together with the slices that partition it */
+typedef struct {
+    buffer_t buffer;             /* the underlying memory block */
+    slice_collection_t slices;   /* slices pointing into `buffer` */
+} buffer_collection_t;
+
+
+/* freeBufferCollection() :
+ * releases the memory buffer, then the slice tables describing it */
+static void freeBufferCollection(buffer_collection_t bc)
+{
+    buffer_t const storage = bc.buffer;
+    slice_collection_t const layout = bc.slices;
+    freeBuffer(storage);
+    freeSliceCollection(layout);
+}
+
+
+/* createBufferCollection_fromSliceCollectionSizes() :
+ * allocates a single buffer, large enough to hold every slice of `sc`,
+ * and partitions it into slices of the same sizes, laid out back to back.
+ * Only sizes are taken from `sc` : its content is not copied.
+ * Any allocation failure aborts the program (CONTROL). */
+static buffer_collection_t
+createBufferCollection_fromSliceCollectionSizes(slice_collection_t sc)
+{
+    size_t const nbSlices = sc.nbSlices;
+    buffer_collection_t result;
+
+    result.buffer = createBuffer(sliceCollection_totalCapacity(sc));
+    CONTROL(result.buffer.ptr != NULL);
+
+    result.slices.slicePtrs = (void**)malloc(nbSlices * sizeof(*result.slices.slicePtrs));
+    CONTROL(result.slices.slicePtrs != NULL);
+
+    result.slices.capacities = (size_t*)malloc(nbSlices * sizeof(*result.slices.capacities));
+    CONTROL(result.slices.capacities != NULL);
+
+    result.slices.nbSlices = nbSlices;
+
+    /* each slice starts where the previous one ends */
+    char* cursor = (char*)result.buffer.ptr;
+    for (size_t n=0; n < nbSlices; n++) {
+        size_t const sliceSize = sc.capacities[n];
+        result.slices.slicePtrs[n] = cursor;
+        result.slices.capacities[n] = sliceSize;
+        cursor += sliceSize;
+    }
+
+    return result;
+}
+
+
+/* createBufferCollection_fromFiles() :
+ * loads all files listed in `fileNamesTable` into a single contiguous buffer,
+ * and describes each file as one slice of that buffer.
+ * Files recorded with a size of 0 by loadFiles() become zero-sized slices.
+ * Does not return on error :
+ * invalid input (nothing to load, or more than BENCH_SIZE_MAX),
+ * allocation failure and load failure all abort the program (CONTROL),
+ * so that these checks remain effective when built with NDEBUG. */
+static buffer_collection_t
+createBufferCollection_fromFiles(const char* const * fileNamesTable, unsigned nbFiles)
+{
+    U64 const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
+    CONTROL(totalSizeToLoad != UTIL_FILESIZE_UNKNOWN);
+    CONTROL(totalSizeToLoad <= BENCH_SIZE_MAX);
+    size_t const loadedSize = (size_t)totalSizeToLoad;
+    CONTROL(loadedSize > 0);
+    void* const srcBuffer = malloc(loadedSize);
+    CONTROL(srcBuffer != NULL);
+
+    CONTROL(nbFiles > 0);
+    size_t* const fileSizes = (size_t*)calloc(nbFiles, sizeof(*fileSizes));
+    CONTROL(fileSizes != NULL);
+
+    /* Load input buffer */
+    int const errorCode = loadFiles(srcBuffer, loadedSize,
+                                    fileSizes,
+                                    fileNamesTable, nbFiles);
+    CONTROL(errorCode == 0);
+
+    void** sliceTable = (void**)malloc(nbFiles * sizeof(*sliceTable));
+    CONTROL(sliceTable != NULL);
+
+    /* one slice per file, empty ones included :
+     * every entry of sliceTable gets initialized,
+     * even when the last files of the list are empty */
+    char* const ptr = (char*)srcBuffer;
+    size_t pos = 0;
+    for (unsigned fileNb = 0; fileNb < nbFiles; fileNb++) {
+        sliceTable[fileNb] = ptr + pos;
+        pos += fileSizes[fileNb];
+    }
+    CONTROL(pos == loadedSize);
+
+
+    buffer_t buffer;
+    buffer.ptr = srcBuffer;
+    buffer.capacity = loadedSize;
+    buffer.size = loadedSize;
+
+    slice_collection_t slices;
+    slices.slicePtrs = sliceTable;
+    slices.capacities = fileSizes;
+    slices.nbSlices = nbFiles;
+
+    buffer_collection_t bc;
+    bc.buffer = buffer;
+    bc.slices = slices;
+    return bc;
+}
+
+
+
+
+/*---  ddict_collection_t  ---*/
+
+/* A table of decompression dictionaries */
+typedef struct {
+    ZSTD_DDict** ddicts;   /* table of `nbDDict` DDict pointers */
+    size_t nbDDict;        /* nb of entries in `ddicts` */
+} ddict_collection_t;
+
+/* value returned to signal an error */
+static const ddict_collection_t kNullDDictCollection = { NULL, 0 };
+
+/* freeDDictCollection() :
+ * releases every ZSTD_DDict referenced by the collection, first to last,
+ * then releases the table itself. */
+static void freeDDictCollection(ddict_collection_t ddictc)
+{
+    size_t n = 0;
+    while (n < ddictc.nbDDict) {
+        ZSTD_freeDDict(ddictc.ddicts[n]);
+        n++;
+    }
+    free(ddictc.ddicts);
+}
+
+/* createDDictCollection() :
+ * creates nbDDict distinct ZSTD_DDict objects, all built from the same dictionary content.
+ * @return : a collection holding the nbDDict dictionaries,
+ *           or kNullDDictCollection (.ddicts==NULL) if the table allocation
+ *           or any dictionary creation fails.
+ *           On failure, everything created so far is released. */
+static ddict_collection_t createDDictCollection(const void* dictBuffer, size_t dictSize, size_t nbDDict)
+{
+    ZSTD_DDict** const ddicts = malloc(nbDDict * sizeof(ZSTD_DDict*));
+    if (ddicts==NULL) return kNullDDictCollection;
+    for (size_t dictNb=0; dictNb < nbDDict; dictNb++) {
+        ddicts[dictNb] = ZSTD_createDDict(dictBuffer, dictSize);
+        if (ddicts[dictNb] == NULL) {
+            /* roll back : never hand a partially filled table to the caller */
+            while (dictNb > 0) ZSTD_freeDDict(ddicts[--dictNb]);
+            free(ddicts);
+            return kNullDDictCollection;
+        }
+    }
+    ddict_collection_t ddictc;
+    ddictc.ddicts = ddicts;
+    ddictc.nbDDict = nbDDict;
+    return ddictc;
+}
+
+
+/* shuffleDictionaries() :
+ * mess with addresses, by swapping entries of the table using rand(),
+ * so that linear scanning dictionaries != linear address scanning */
+void shuffleDictionaries(ddict_collection_t dicts)
+{
+    ZSTD_DDict** const table = dicts.ddicts;
+    size_t const total = dicts.nbDDict;
+    size_t pass;
+
+    /* first pass : swap each entry with a randomly selected one */
+    for (pass = 0; pass < total; pass++) {
+        size_t const other = rand() % total;
+        ZSTD_DDict* const saved = table[other];
+        table[other] = table[pass];
+        table[pass] = saved;
+    }
+
+    /* second pass : swap 2 randomly selected entries, once per entry in the table */
+    for (pass = 0; pass < total; pass++) {
+        size_t const first = rand() % total;
+        size_t const second = rand() % total;
+        ZSTD_DDict* const saved = table[first];
+        table[first] = table[second];
+        table[second] = saved;
+    }
+}
+
+
+/* ---   Compression  --- */
+
+/* compressBlocks() :
+ * compresses each source block into the destination block of same rank,
+ * with ZSTD_compress_usingCDict() when cdict is provided,
+ * or with ZSTD_compressCCtx() at level cLevel otherwise
+ * (cLevel is not used on the cdict path).
+ * dstBlockBuffers and srcBlockBuffers must contain the same number of slices.
+ * Each block result is checked with CONTROL(!ZSTD_isError()).
+ * @return : total compressed size of all blocks
+ *           (0 when there is no block to compress).
+ */
+static size_t compressBlocks(size_t* cSizes,   /* optional (can be NULL). If present, must contain at least nbBlocks fields */
+                             slice_collection_t dstBlockBuffers,
+                             slice_collection_t srcBlockBuffers,
+                             ZSTD_CDict* cdict, int cLevel)
+{
+    size_t const nbBlocks = srcBlockBuffers.nbSlices;
+    assert(dstBlockBuffers.nbSlices == srcBlockBuffers.nbSlices);
+
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    assert(cctx != NULL);
+
+    size_t totalCSize = 0;
+    for (size_t blockNb=0; blockNb < nbBlocks; blockNb++) {
+        size_t cBlockSize;
+        if (cdict == NULL) {
+            cBlockSize = ZSTD_compressCCtx(cctx,
+                            dstBlockBuffers.slicePtrs[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.slicePtrs[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cLevel);
+        } else {
+            cBlockSize = ZSTD_compress_usingCDict(cctx,
+                            dstBlockBuffers.slicePtrs[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.slicePtrs[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cdict);
+        }
+        CONTROL(!ZSTD_isError(cBlockSize));
+        if (cSizes) cSizes[blockNb] = cBlockSize;
+        totalCSize += cBlockSize;
+    }
+
+    /* the context is only needed for the duration of this call : do not leak it */
+    ZSTD_freeCCtx(cctx);
+    return totalCSize;
+}
+
+
+/* ---  Benchmark  --- */
+
+/* State handed to decompress() through its `payload` argument. */
+typedef struct {
+    ZSTD_DCtx* dctx;   /* decompression context, released by freeDecompressInstructions() */
+    size_t nbDicts;   /* number of dictionaries; dictNb wraps back to 0 when reaching it */
+    size_t dictNb;   /* index of the dictionary used by the next decompress() call */
+    ddict_collection_t dictionaries;   /* table indexed by dictNb; not released by freeDecompressInstructions() */
+} decompressInstructions;
+
+/* createDecompressInstructions() :
+ * builds the state consumed by decompress() :
+ * a newly created decompression context (checked with assert()),
+ * plus the dictionaries to cycle through, starting from the first one.
+ * `dictionaries` is copied by value, so its table remains shared with the caller. */
+decompressInstructions createDecompressInstructions(ddict_collection_t dictionaries)
+{
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    assert(dctx != NULL);
+
+    decompressInstructions instructions;
+    instructions.dictionaries = dictionaries;
+    instructions.nbDicts = dictionaries.nbDDict;
+    instructions.dictNb = 0;
+    instructions.dctx = dctx;
+    return instructions;
+}
+
+/* freeDecompressInstructions() :
+ * releases the decompression context only.
+ * di.dictionaries is left untouched by this function. */
+void freeDecompressInstructions(decompressInstructions di)
+{
+    ZSTD_freeDCtx(di.dctx);
+}
+
+/* decompress() : benched function.
+ * `payload` must point to a decompressInstructions.
+ * Decompresses src into dst with the dictionary currently selected in payload,
+ * then selects the following dictionary, going back to the first one after the last one.
+ * @return : the result of ZSTD_decompress_usingDDict() */
+size_t decompress(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* payload)
+{
+    decompressInstructions* const state = (decompressInstructions*) payload;
+    const ZSTD_DDict* const ddict = state->dictionaries.ddicts[state->dictNb];
+
+    size_t const dSize = ZSTD_decompress_usingDDict(state->dctx,
+                                        dst, dstCapacity,
+                                        src, srcSize,
+                                        ddict);
+
+    /* rotate to next dictionary */
+    state->dictNb++;
+    if (state->dictNb >= state->nbDicts) state->dictNb = 0;
+
+    return dSize;
+}
+
+
+/* benchMem() :
+ * measures decompression speed using BMK_benchTimedFn() :
+ * decompress() is invoked on each slice of srcBlocks,
+ * writing into the slice of same rank in dstBlocks,
+ * while cycling through `dictionaries`.
+ * Loops until BMK_isCompleted_TimedFn() reports completion,
+ * displaying the best speed observed so far after each run.
+ * dstBlocks and srcBlocks must contain the same number of slices.
+ * Total time budget is nbRounds * RUN_TIME_DEFAULT_MS.
+ * @return : 0 (failures are caught by assert() or CONTROL()) */
+static int benchMem(slice_collection_t dstBlocks,
+                    slice_collection_t srcBlocks,
+                    ddict_collection_t dictionaries,
+                    int nbRounds)
+{
+    assert(dstBlocks.nbSlices == srcBlocks.nbSlices);
+
+    unsigned const ms_per_round = RUN_TIME_DEFAULT_MS;
+    unsigned const total_time_ms = nbRounds * ms_per_round;
+
+    double bestSpeed = 0.;
+
+    BMK_timedFnState_t* const benchState =
+            BMK_createTimedFnState(total_time_ms, ms_per_round);
+    CONTROL(benchState != NULL);   /* state creation can fail : do not run with a NULL state */
+    decompressInstructions di = createDecompressInstructions(dictionaries);
+
+    for (;;) {
+        BMK_runOutcome_t const outcome = BMK_benchTimedFn(benchState,
+                                decompress, &di,
+                                NULL, NULL,
+                                dstBlocks.nbSlices,
+                                (const void* const *)srcBlocks.slicePtrs, srcBlocks.capacities,
+                                dstBlocks.slicePtrs, dstBlocks.capacities,
+                                NULL);
+        CONTROL(BMK_isSuccessful_runOutcome(outcome));
+
+        BMK_runTime_t const result = BMK_extract_runTime(outcome);
+        U64 const dTime_ns = result.nanoSecPerRun;
+        double const dTime_sec = (double)dTime_ns / 1000000000;
+        /* sum of decompress() return values, used as the amount of data for the speed */
+        size_t const srcSize = result.sumOfReturn;
+        double const dSpeed_MBps = (double)srcSize / dTime_sec / (1 MB);
+        if (dSpeed_MBps > bestSpeed) bestSpeed = dSpeed_MBps;
+        DISPLAY("Decompression Speed : %.1f MB/s \r", bestSpeed);
+        fflush(stdout);
+        if (BMK_isCompleted_TimedFn(benchState)) break;
+    }
+    DISPLAY("\n");
+
+    freeDecompressInstructions(di);
+    BMK_freeTimedFnState(benchState);
+
+    return 0;   /* success */
+}
+
+
+/*! bench() :
+ *  fileNameTable : names of the nbFiles files to load for benchmarking purpose
+ *  dictionary : optional (can be NULL), file to load as dictionary,
+ *              if none provided : will be calculated on the fly by the program.
+ *  blockSize : forwarded to splitSlices(); when non-zero, displayed as max block size
+ *  clevel : compression level, used to create the CDict and to compress without dictionary
+ *  nbDictMax : nb of DDict to create; 0 means one per block
+ *  nbBlocks : forwarded to splitSlices(), then replaced by the nb of slices actually produced
+ *  nbRounds : forwarded to benchMem()
+ * @return : 0 is success, 1+ otherwise */
+int bench(const char** fileNameTable, unsigned nbFiles,
+          const char* dictionary,
+          size_t blockSize, int clevel,
+          unsigned nbDictMax, unsigned nbBlocks,
+          int nbRounds)
+{
+    int result = 0;
+
+    DISPLAYLEVEL(3, "loading %u files... \n", nbFiles);
+    buffer_collection_t const srcs = createBufferCollection_fromFiles(fileNameTable, nbFiles);
+    CONTROL(srcs.buffer.ptr != NULL);
+    buffer_t srcBuffer = srcs.buffer;
+    size_t const srcSize = srcBuffer.size;
+    DISPLAYLEVEL(3, "created src buffer of size %.1f MB \n",
+                    (double)srcSize / (1 MB));
+
+    slice_collection_t const srcSlices = splitSlices(srcs.slices, blockSize, nbBlocks);
+    nbBlocks = (unsigned)(srcSlices.nbSlices);
+    DISPLAYLEVEL(3, "split input into %u blocks ", nbBlocks);
+    if (blockSize)
+        DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
+    DISPLAYLEVEL(3, "\n");
+
+
+    /* destination : one slice per block, each sized to the worst-case compressed size */
+    size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
+    CONTROL(dstCapacities != NULL);
+    size_t dstBufferCapacity = 0;
+    for (size_t bnb=0; bnb<nbBlocks; bnb++) {
+        dstCapacities[bnb] = ZSTD_compressBound(srcSlices.capacities[bnb]);
+        dstBufferCapacity += dstCapacities[bnb];
+    }
+
+    buffer_t dstBuffer = createBuffer(dstBufferCapacity);
+    CONTROL(dstBuffer.ptr != NULL);
+
+    void** const sliceTable = malloc(nbBlocks * sizeof(*sliceTable));
+    CONTROL(sliceTable != NULL);
+
+    {   char* const ptr = dstBuffer.ptr;
+        size_t pos = 0;
+        for (size_t snb=0; snb < nbBlocks; snb++) {
+            sliceTable[snb] = ptr + pos;
+            pos += dstCapacities[snb];
+    }   }
+
+    slice_collection_t dstSlices;
+    dstSlices.capacities = dstCapacities;
+    dstSlices.slicePtrs = sliceTable;
+    dstSlices.nbSlices = nbBlocks;
+
+
+    /* dictionary determination */
+    buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
+                                srcBuffer.ptr,
+                                srcSlices.capacities, nbBlocks,
+                                DICTSIZE);
+    CONTROL(dictBuffer.ptr != NULL);
+
+    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuffer.ptr, dictBuffer.size, clevel);
+    CONTROL(cdict != NULL);
+
+    /* first pass, without dictionary : only used to display a reference ratio */
+    size_t const cTotalSizeNoDict = compressBlocks(NULL, dstSlices, srcSlices, NULL, clevel);
+    CONTROL(cTotalSizeNoDict != 0);
+    /* clevel is an int : display it with a signed conversion */
+    DISPLAYLEVEL(3, "compressing at level %i without dictionary : Ratio=%.2f  (%u bytes) \n",
+                    clevel,
+                    (double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+
+    size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
+    CONTROL(cSizes != NULL);
+
+    /* second pass, with dictionary : its output is the data decompressed by the benchmark */
+    size_t const cTotalSize = compressBlocks(cSizes, dstSlices, srcSlices, cdict, clevel);
+    CONTROL(cTotalSize != 0);
+    DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f  (%u bytes) \n",
+                    (unsigned)dictBuffer.size,
+                    (double)srcSize / cTotalSize, (unsigned)cTotalSize);
+
+    /* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
+    shrinkSizes(dstSlices, cSizes);
+
+    size_t const dictMem = ZSTD_estimateDDictSize(dictBuffer.size, ZSTD_dlm_byCopy);
+    unsigned const nbDicts = nbDictMax ? nbDictMax : nbBlocks;
+    size_t const allDictMem = dictMem * nbDicts;
+    DISPLAYLEVEL(3, "generating %u dictionaries, using %.1f MB of memory \n",
+                    nbDicts, (double)allDictMem / (1 MB));
+
+    ddict_collection_t const dictionaries = createDDictCollection(dictBuffer.ptr, dictBuffer.size, nbDicts);
+    CONTROL(dictionaries.ddicts != NULL);
+
+    shuffleDictionaries(dictionaries);
+
+    /* decompression destination : same slice sizes as the original source blocks */
+    buffer_collection_t resultCollection = createBufferCollection_fromSliceCollectionSizes(srcSlices);
+    CONTROL(resultCollection.buffer.ptr != NULL);
+
+    /* note : for benchMem(), compressed blocks (dstSlices) are the source */
+    result = benchMem(resultCollection.slices, dstSlices, dictionaries, nbRounds);
+
+    /* free all heap objects in reverse order */
+    freeBufferCollection(resultCollection);
+    freeDDictCollection(dictionaries);
+    free(cSizes);
+    ZSTD_freeCDict(cdict);
+    freeBuffer(dictBuffer);
+    freeSliceCollection(dstSlices);
+    freeBuffer(dstBuffer);
+    freeSliceCollection(srcSlices);
+    freeBufferCollection(srcs);
+
+    return result;
+}
+
+
+
+/* ---  Command Line  --- */
+
+/*! readU32FromChar() :
+ * @return : unsigned integer value read from input in `char` format.
+ *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
+ *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+ *  Note : overflow, of the digit sequence or of the suffix multiplication,
+ *         is only detected through assert() */
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    unsigned const maxBeforeDigit = (((unsigned)(-1)) / 10) - 1;
+    unsigned const maxBeforeShift = ((unsigned)(-1)) >> 10;
+    const char* p = *stringPtr;
+    unsigned value = 0;
+
+    /* decimal digits */
+    for ( ; (*p >= '0') && (*p <= '9'); p++) {
+        assert(value <= maxBeforeDigit);   /* check overflow */
+        value = (value * 10) + (unsigned)(*p - '0');
+    }
+
+    /* optional suffix : K or M, optionally followed by `i`, then optionally by `B` */
+    if ((*p == 'K') || (*p == 'M')) {
+        int const isMega = (*p == 'M');
+        assert(value <= maxBeforeShift);   /* check overflow */
+        value <<= 10;
+        if (isMega) {
+            assert(value <= maxBeforeShift);   /* check overflow */
+            value <<= 10;
+        }
+        p++;  /* skip `K` or `M` */
+        if (*p == 'i') p++;
+        if (*p == 'B') p++;
+    }
+
+    *stringPtr = p;
+    return value;
+}
+
+/** longCommandWArg() :
+ *  tests whether *stringPtr starts with longCommand.
+ * @return 1 when it does, after moving *stringPtr just past longCommand;
+ * @return 0 otherwise, leaving *stringPtr unmodified.
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const prefixLen = strlen(longCommand);
+    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) return 0;
+    *stringPtr += prefixLen;
+    return 1;
+}
+
+
+/* usage() :
+ * displays the command line syntax and the list of documented options.
+ * exeName : program name, inserted in the syntax line.
+ * NOTE(review): main() also accepts --blockSize=, --dictionary= and --clevel=,
+ *               which are not listed here.
+ * @return : 0 */
+int usage(const char* exeName)
+{
+    DISPLAY (" \n");
+    DISPLAY (" %s [Options] filename(s) \n", exeName);
+    DISPLAY (" \n");
+    DISPLAY ("Options : \n");
+    DISPLAY ("-r          : recursively load all files in subdirectories (default: off) \n");
+    DISPLAY ("-B#         : split input into blocks of size # (default: no split) \n");
+    DISPLAY ("-#          : use compression level # (default: %u) \n", CLEVEL_DEFAULT);
+    DISPLAY ("-D #        : use # as a dictionary (default: create one) \n");
+    DISPLAY ("-i#         : nb benchmark rounds (default: %u) \n", BENCH_TIME_DEFAULT_S);
+    DISPLAY ("--nbBlocks=#: use # blocks for bench (default: one per file) \n");
+    DISPLAY ("--nbDicts=# : create # dictionaries for bench (default: one per block) \n");
+    DISPLAY ("-h          : help (this text) \n");
+    return 0;
+}
+
+/* bad_usage() :
+ * displays a "bad usage" notice followed by the regular usage() text.
+ * @return : 1, so that it can be directly used as the program's exit code */
+int bad_usage(const char* exeName)
+{
+    DISPLAY (" bad usage : \n");
+    usage(exeName);
+    return 1;
+}
+
+/* main() :
+ * parses the command line, collects file names (any argument which is not a command),
+ * optionally expands them into a recursive file list, then runs bench().
+ * @return : result of bench(),
+ *           or 0 after displaying help,
+ *           or 1 on bad usage or when the file list cannot be created */
+int main (int argc, const char** argv)
+{
+    int recursiveMode = 0;
+    int nbRounds = BENCH_TIME_DEFAULT_S;
+    const char* const exeName = argv[0];
+
+    if (argc < 2) return bad_usage(exeName);
+
+    /* argc entries is an upper bound of the nb of file names */
+    const char** nameTable = (const char**)malloc(argc * sizeof(const char*));
+    assert(nameTable != NULL);
+    unsigned nameIdx = 0;
+
+    const char* dictionary = NULL;
+    int cLevel = CLEVEL_DEFAULT;
+    size_t blockSize = BLOCKSIZE_DEFAULT;
+    unsigned nbDicts = 0;  /* determine nbDicts automatically: 1 dictionary per block */
+    unsigned nbBlocks = 0; /* determine nbBlocks automatically, from source and blockSize */
+
+    for (int argNb = 1; argNb < argc ; argNb++) {
+        const char* argument = argv[argNb];
+        if (!strcmp(argument, "-h")) { free(nameTable); return usage(exeName); }
+        if (!strcmp(argument, "-r")) { recursiveMode = 1; continue; }
+        if (!strcmp(argument, "-D")) {
+            argNb++;
+            /* `-D` requires a following argument */
+            if (argNb >= argc) { free(nameTable); return bad_usage(exeName); }
+            dictionary = argv[argNb];
+            continue;
+        }
+        if (longCommandWArg(&argument, "-i")) { nbRounds = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--dictionary=")) { dictionary = argument; continue; }
+        if (longCommandWArg(&argument, "-B")) { blockSize = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--blockSize=")) { blockSize = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--nbDicts=")) { nbDicts = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--nbBlocks=")) { nbBlocks = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--clevel=")) { cLevel = readU32FromChar(&argument); continue; }
+        /* must remain last : matches any remaining argument starting with `-` */
+        if (longCommandWArg(&argument, "-")) { cLevel = readU32FromChar(&argument); continue; }
+        /* anything that's not a command is a filename */
+        nameTable[nameIdx++] = argument;
+    }
+
+    /* at least one file name is required */
+    if (nameIdx == 0) { free(nameTable); return bad_usage(exeName); }
+
+    const char** filenameTable = nameTable;
+    unsigned nbFiles = nameIdx;
+    char* buffer_containing_filenames = NULL;
+
+    if (recursiveMode) {
+#ifndef UTIL_HAS_CREATEFILELIST
+        assert(0);   /* missing capability, do not run */
+#endif
+        filenameTable = UTIL_createFileList(nameTable, nameIdx, &buffer_containing_filenames, &nbFiles, 1 /* follow_links */);
+        if (filenameTable == NULL) {
+            DISPLAY("error : unable to create the list of files \n");
+            free(nameTable);
+            return 1;
+        }
+    }
+
+    int result = bench(filenameTable, nbFiles, dictionary, blockSize, cLevel, nbDicts, nbBlocks, nbRounds);
+
+    if (recursiveMode) {
+        /* releases both the table returned by UTIL_createFileList() and its names buffer */
+        UTIL_freeFileList(filenameTable, buffer_containing_filenames);
+    }
+    free(nameTable);
+
+    return result;
+}
--- a/contrib/seekable_format/examples/seekable_compression.c
+++ b/contrib/seekable_format/examples/seekable_compression.c
@ -101,7 +101,7 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
    free(buffOut);
 }

-static const char* createOutFilename_orDie(const char* filename)
+static char* createOutFilename_orDie(const char* filename)
 {
    size_t const inL = strlen(filename);
    size_t const outL = inL + 5;
@ -109,7 +109,7 @@ static const char* createOutFilename_orDie(const char* filename)
    memset(outSpace, 0, outL);
    strcat(outSpace, filename);
    strcat(outSpace, ".zst");
-    return (const char*)outSpace;
+    return (char*)outSpace;
 }

 int main(int argc, const char** argv) {
@ -124,8 +124,9 @@ int main(int argc, const char** argv) {
    {   const char* const inFileName = argv[1];
        unsigned const frameSize = (unsigned)atoi(argv[2]);

-        const char* const outFileName = createOutFilename_orDie(inFileName);
+        char* const outFileName = createOutFilename_orDie(inFileName);
        compressFile_orDie(inFileName, outFileName, 5, frameSize);
+        free(outFileName);
    }

    return 0;
--- a/contrib/seekable_format/examples/seekable_decompression.c
+++ b/contrib/seekable_format/examples/seekable_decompression.c
@ -84,7 +84,7 @@ static void fseek_orDie(FILE* file, long int offset, int origin) {
 }


-static void decompressFile_orDie(const char* fname, unsigned startOffset, unsigned endOffset)
+static void decompressFile_orDie(const char* fname, off_t startOffset, off_t endOffset)
 {
    FILE* const fin  = fopen_orDie(fname, "rb");
    FILE* const fout = stdout;
@ -129,8 +129,8 @@ int main(int argc, const char** argv)

    {
        const char* const inFilename = argv[1];
-        unsigned const startOffset = (unsigned) atoi(argv[2]);
-        unsigned const endOffset = (unsigned) atoi(argv[3]);
+        off_t const startOffset = atoll(argv[2]);
+        off_t const endOffset = atoll(argv[3]);
        decompressFile_orDie(inFilename, startOffset, endOffset);
    }

--- a/contrib/seekable_format/zstdseek_decompress.c
+++ b/contrib/seekable_format/zstdseek_decompress.c
@ -56,6 +56,7 @@

 #include <stdlib.h> /* malloc, free */
 #include <stdio.h>  /* FILE* */
+#include <assert.h>

 #define XXH_STATIC_LINKING_ONLY
 #define XXH_NAMESPACE ZSTD_
@ -112,7 +113,7 @@ static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n)

 static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin)
 {
-    buffWrapper_t* buff = (buffWrapper_t*) opaque;
+    buffWrapper_t* const buff = (buffWrapper_t*) opaque;
    unsigned long long newOffset;
    switch (origin) {
    case SEEK_SET:
@ -124,6 +125,8 @@ static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin)
    case SEEK_END:
        newOffset = (unsigned long long)buff->size - offset;
        break;
+    default:
+        assert(0);  /* not possible */
    }
    if (newOffset > buff->size) {
        return -1;
@ -310,8 +313,8 @@ static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs)
            /* compute cumulative positions */
            for (; idx < numFrames; idx++) {
                if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) {
-                    U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE);
                    U32 const offset = SEEKABLE_BUFF_SIZE - pos;
+                    U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset);
                    memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */
                    CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead));
                    remaining -= toRead;
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@ -16,7 +16,7 @@ Distribution of this document is unlimited.

 ### Version

-0.2.8 (30/05/18)
+0.2.9 (05/09/18)


 Introduction
@ -1192,6 +1192,8 @@ Number_of_Bits = Weight ? (Max_Number_of_Bits + 1 - Weight) : 0
 The last symbol's `Weight` is deduced from previously decoded ones,
 by completing to the nearest power of 2.
 This power of 2 gives `Max_Number_of_Bits`, the depth of the current tree.
+`Max_Number_of_Bits` must be <= 11,
+otherwise the representation is considered corrupted.

 __Example__ :
 Let's presume the following Huffman tree must be described :
@ -1216,12 +1218,12 @@ It gives the following series of weights :
 |   `Weight`    |  4  |  3  |  2  |  0  |  1  |

 The decoder will do the inverse operation :
-having collected weights of literals from `0` to `4`,
+having collected weights of literal symbols from `0` to `4`,
 it knows the last literal, `5`, is present with a non-zero weight.
 The weight of `5` can be determined by advancing to the next power of 2.
 The sum of `2^(Weight-1)` (excluding 0's) is :
 `8 + 4 + 2 + 0 + 1 = 15`.
-Nearest power of 2 is 16.
+Nearest larger power of 2 value is 16.
 Therefore, `Max_Number_of_Bits = 4` and `Weight[5] = 16-15 = 1`.

 #### Huffman Tree header
@ -1233,18 +1235,24 @@ which describes how the series of weights is encoded.
  the series of weights is compressed using FSE (see below).
  The length of the FSE-compressed series is equal to `headerByte` (0-127).

- if `headerByte` >= 128 : this is a direct representation,
-  where each `Weight` is written directly as a 4 bits field (0-15).
-  They are encoded forward, 2 weights to a byte with the first weight taking
-  the top four bits and the second taking the bottom four (e.g. the following
-  operations could be used to read the weights:
-  `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc.).
-  The full representation occupies `Ceiling(Number_of_Symbols/2)` bytes,
-  meaning it uses only full bytes even if `Number_of_Symbols` is odd.
-  `Number_of_Symbols = headerByte - 127`.
-  Note that maximum `Number_of_Symbols` is 255-127 = 128.
-  If any literal has a value > 128, raw header mode is not possible.
-  In such case, it's necessary to use FSE compression.
+- if `headerByte` >= 128 :
+  + the series of weights uses a direct representation,
+    where each `Weight` is encoded directly as a 4 bits field (0-15).
+  + They are encoded forward, 2 weights to a byte,
+    first weight taking the top four bits and second one taking the bottom four.
+    * e.g. the following operations could be used to read the weights:
+      `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc.
+  + The full representation occupies `Ceiling(Number_of_Weights/2)` bytes,
+    meaning it uses only full bytes even if `Number_of_Weights` is odd.
+  + `Number_of_Weights = headerByte - 127`.
+    * Note that maximum `Number_of_Weights` is 255-127 = 128,
+      therefore, only up to 128 `Weight` can be encoded using direct representation.
+    * Since the last non-zero `Weight` is _not_ encoded,
+      this scheme is compatible with alphabet sizes of up to 129 symbols,
+      hence including literal symbol 128.
+    * If any literal symbol > 128 has a non-zero `Weight`,
+      direct representation is not possible.
+      In such case, it's necessary to use FSE compression.


 #### Finite State Entropy (FSE) compression of Huffman weights
@ -1621,6 +1629,7 @@ or at least provide a meaningful error code explaining for which reason it canno

 Version changes
 ---------------
+- 0.2.9 : clarifications for huffman weights direct representation, by Ulrich Kunitz
 - 0.2.8 : clarifications for IETF RFC discuss
 - 0.2.7 : clarifications from IETF RFC review, by Vijay Gurbani and Nick Terrell
 - 0.2.6 : fixed an error in huffman example, by Ulrich Kunitz
--- a/doc/zstd_manual.html
+++ b/doc/zstd_manual.html
@ -35,22 +35,34 @@
 </ol>
 <hr>
 <a name="Chapter1"></a><h2>Introduction</h2><pre>
-  zstd, short for Zstandard, is a fast lossless compression algorithm,
-  targeting real-time compression scenarios at zlib-level and better compression ratios.
-  The zstd compression library provides in-memory compression and decompression functions.
-  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
-  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
  Compression can be done in:
    - a single step (described as Simple API)
    - a single step, reusing a context (described as Explicit context)
    - unbounded multiple steps (described as Streaming compression)
-  The compression ratio achievable on small data can be highly improved using a dictionary in:
-    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)

-  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
-  Advanced experimental APIs shall never be used with a dynamic library.
-  They are not "stable", their definition may change in the future. Only static linking is allowed.
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
 <BR></pre>

 <a name="Chapter2"></a><h2>Version</h2><pre></pre>
@ -181,7 +193,8 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 </b><p>  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
  ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
-  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict 
+  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict
+  Note : A ZSTD_CDict can be created with an empty dictionary, but it is inefficient for small data. 
 </p></pre><BR>

 <pre><b>size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
@ -195,7 +208,9 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 </b><p>  Compression using a digested Dictionary.
  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
  Note that compression level is decided during dictionary creation.
-  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) 
+  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no)
+  Note : ZSTD_compress_usingCDict() can be used with a ZSTD_CDict created from an empty dictionary.
+         But it is inefficient for small data, and it is recommended to use ZSTD_compressCCtx(). 
 </p></pre><BR>

 <pre><b>ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
@ -965,16 +980,21 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx,
                           const void* prefix, size_t prefixSize,
                           ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next compression job.
-  Decompression need same prefix to properly regenerate data.
-  Prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
+  Decompression will need same prefix to properly regenerate data.
+  Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+  Note that prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
 @result : 0, or an error code (which can be tested with ZSTD_isError()).
  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  Note 1 : Prefix buffer is referenced. It **must** outlive compression job.
           Its content must remain unmodified up to end of compression (ZSTD_e_end).
-  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+           ensure that the window size is large enough to contain the entire source.
+           See ZSTD_p_windowLog.
+  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
           It's a CPU consuming operation, with non-negligible impact on latency.
           If there is a need to use same prefix multiple times, consider loadDictionary instead.
-  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+  Note 4 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. 
 </p></pre><BR>

@ -1141,6 +1161,8 @@ size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx,
                        const void* prefix, size_t prefixSize,
                        ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next decompression job.
+  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+  and must use the same prefix as the one used during compression.
  Prefix is **only used once**. Reference is discarded at end of frame.
  End of frame is reached when ZSTD_DCtx_decompress_generic() returns 0.
 @result : 0, or an error code (which can be tested with ZSTD_isError()).
--- a/examples/multiple_streaming_compression.c
+++ b/examples/multiple_streaming_compression.c
@ -158,7 +158,8 @@ int main(int argc, const char** argv)
    }

    freeResources(ress);
-    /* success */
+    free(ofnBuffer);
+
    printf("compressed %i files \n", argc-1);

    return 0;
--- a/examples/streaming_compression.c
+++ b/examples/streaming_compression.c
@ -73,7 +73,11 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
    ZSTD_CStream* const cstream = ZSTD_createCStream();
    if (cstream==NULL) { fprintf(stderr, "ZSTD_createCStream() error \n"); exit(10); }
    size_t const initResult = ZSTD_initCStream(cstream, cLevel);
-    if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_initCStream() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); }
+    if (ZSTD_isError(initResult)) {
+        fprintf(stderr, "ZSTD_initCStream() error : %s \n",
+                    ZSTD_getErrorName(initResult));
+        exit(11);
+    }

    size_t read, toRead = buffInSize;
    while( (read = fread_orDie(buffIn, toRead, fin)) ) {
@ -81,7 +85,11 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
        while (input.pos < input.size) {
            ZSTD_outBuffer output = { buffOut, buffOutSize, 0 };
            toRead = ZSTD_compressStream(cstream, &output , &input);   /* toRead is guaranteed to be <= ZSTD_CStreamInSize() */
-            if (ZSTD_isError(toRead)) { fprintf(stderr, "ZSTD_compressStream() error : %s \n", ZSTD_getErrorName(toRead)); exit(12); }
+            if (ZSTD_isError(toRead)) {
+                fprintf(stderr, "ZSTD_compressStream() error : %s \n",
+                                ZSTD_getErrorName(toRead));
+                exit(12);
+            }
            if (toRead > buffInSize) toRead = buffInSize;   /* Safely handle case when `buffInSize` is manually changed to a value < ZSTD_CStreamInSize()*/
            fwrite_orDie(buffOut, output.pos, fout);
        }
@ -100,15 +108,15 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
 }


-static const char* createOutFilename_orDie(const char* filename)
+static char* createOutFilename_orDie(const char* filename)
 {
    size_t const inL = strlen(filename);
    size_t const outL = inL + 5;
-    void* outSpace = malloc_orDie(outL);
+    void* const outSpace = malloc_orDie(outL);
    memset(outSpace, 0, outL);
    strcat(outSpace, filename);
    strcat(outSpace, ".zst");
-    return (const char*)outSpace;
+    return (char*)outSpace;
 }

 int main(int argc, const char** argv)
@ -124,8 +132,10 @@ int main(int argc, const char** argv)

    const char* const inFilename = argv[1];

-    const char* const outFilename = createOutFilename_orDie(inFilename);
+    char* const outFilename = createOutFilename_orDie(inFilename);
    compressFile_orDie(inFilename, outFilename, 1);

+    free(outFilename);   /* not strictly required, since program execution stops there,
+                          * but some static analyzers may complain otherwise */
    return 0;
 }
--- a/lib/BUCK
+++ b/lib/BUCK
@ -69,6 +69,7 @@ cxx_library(
    ]),
    headers=subdir_glob([
        ('dictBuilder', 'divsufsort.h'),
+        ('dictBuilder', 'cover.h'),
    ]),
    srcs=glob(['dictBuilder/*.c']),
    deps=[':common'],
--- a/lib/Makefile
+++ b/lib/Makefile
@ -23,7 +23,7 @@ ifeq ($(OS),Windows_NT)   # MinGW assumed
 CPPFLAGS   += -D__USE_MINGW_ANSI_STDIO   # compatibility with %zu formatting
 endif
 CFLAGS  ?= -O3
-DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
            -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
            -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
            -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@ -89,20 +89,37 @@
 #endif

 /* prefetch
- * can be disabled, by declaring NO_PREFETCH macro */
+ * can be disabled, by declaring NO_PREFETCH macro
+ * All prefetch invocations use a single default locality 2,
+ * generating instruction prefetcht1,
+ * which, according to Intel, means "load data into L2 cache".
+ * This is a good enough "middle ground" for the time being,
+ * though in theory, it would be better to specialize locality depending on data being prefetched.
+ * Tests could not detect any measurable difference based on locality value. */
 #if defined(NO_PREFETCH)
-#  define PREFETCH(ptr)   /* disabled */
+#  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
 #else
 #  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
 #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#    define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
+#    define PREFETCH(ptr)   _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
 #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
-#    define PREFETCH(ptr)   __builtin_prefetch(ptr, 0, 0)
+#    define PREFETCH(ptr)   __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
 #  else
-#    define PREFETCH(ptr)   /* disabled */
+#    define PREFETCH(ptr)   (void)(ptr)  /* disabled */
 #  endif
 #endif  /* NO_PREFETCH */

+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s)  {            \
+    const char* const _ptr = (const char*)(p);  \
+    size_t const _size = (size_t)(s);     \
+    size_t _pos;                          \
+    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
+        PREFETCH(_ptr + _pos);            \
+    }                                     \
+}
+
 /* disable warnings */
 #ifdef _MSC_VER    /* Visual Studio */
 #  include <intrin.h>                    /* For Visual 2005 */
--- a/lib/common/cpu.h
+++ b/lib/common/cpu.h
@ -36,7 +36,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
    U32 f1d = 0;
    U32 f7b = 0;
    U32 f7c = 0;
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
    int reg[4];
    __cpuid((int*)reg, 0);
    {
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@ -79,8 +79,7 @@ static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
 static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
 static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };

-#define ZSTD_FRAMEIDSIZE 4
-static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE;  /* magic number size */
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */

 #define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
 static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
@ -193,6 +192,8 @@ typedef struct {
    BYTE* llCode;
    BYTE* mlCode;
    BYTE* ofCode;
+    size_t maxNbSeq;
+    size_t maxNbLit;
    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
    U32   longLengthPos;
 } seqStore_t;
--- a/lib/compress/fse_compress.c
+++ b/lib/compress/fse_compress.c
@ -83,7 +83,9 @@
 * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
 * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
 */
-size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+                      const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                            void* workSpace, size_t wkspSize)
 {
    U32 const tableSize = 1 << tableLog;
    U32 const tableMask = tableSize - 1;
@ -101,10 +103,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
    if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
    tableU16[-2] = (U16) tableLog;
    tableU16[-1] = (U16) maxSymbolValue;
-    assert(tableLog < 16);   /* required for the threshold strategy to work */
+    assert(tableLog < 16);   /* required for threshold strategy to work */

    /* For explanations on how to distribute symbol values over the table :
-    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+     #ifdef __clang_analyzer__
+     memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+     #endif

    /* symbol start positions */
    {   U32 u;
@ -124,13 +130,15 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
        U32 symbol;
        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
            int nbOccurences;
-            for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
+            int const freq = normalizedCounter[symbol];
+            for (nbOccurences=0; nbOccurences<freq; nbOccurences++) {
                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
                position = (position + step) & tableMask;
-                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
        }   }

-        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+        assert(position==0);  /* Must have initialized all positions */
    }

    /* Build table */
@ -201,9 +209,10 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
    return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
 }

-static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
-                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
-                                       unsigned writeIsSafe)
+static size_t
+FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                   const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                         unsigned writeIsSafe)
 {
    BYTE* const ostart = (BYTE*) header;
    BYTE* out = ostart;
@ -212,13 +221,12 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
    const int tableSize = 1 << tableLog;
    int remaining;
    int threshold;
-    U32 bitStream;
-    int bitCount;
-    unsigned charnum = 0;
-    int previous0 = 0;
+    U32 bitStream = 0;
+    int bitCount = 0;
+    unsigned symbol = 0;
+    unsigned const alphabetSize = maxSymbolValue + 1;
+    int previousIs0 = 0;

-    bitStream = 0;
-    bitCount  = 0;
    /* Table Size */
    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
    bitCount  += 4;
@ -228,48 +236,53 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
    threshold = tableSize;
    nbBits = tableLog+1;

-    while (remaining>1) {  /* stops at 1 */
-        if (previous0) {
-            unsigned start = charnum;
-            while (!normalizedCounter[charnum]) charnum++;
-            while (charnum >= start+24) {
+    while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
+        if (previousIs0) {
+            unsigned start = symbol;
+            while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;
+            if (symbol == alphabetSize) break;   /* incorrect distribution */
+            while (symbol >= start+24) {
                start+=24;
                bitStream += 0xFFFFU << bitCount;
-                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend-2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                out[0] = (BYTE) bitStream;
                out[1] = (BYTE)(bitStream>>8);
                out+=2;
                bitStream>>=16;
            }
-            while (charnum >= start+3) {
+            while (symbol >= start+3) {
                start+=3;
                bitStream += 3 << bitCount;
                bitCount += 2;
            }
-            bitStream += (charnum-start) << bitCount;
+            bitStream += (symbol-start) << bitCount;
            bitCount += 2;
            if (bitCount>16) {
-                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend - 2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                out[0] = (BYTE)bitStream;
                out[1] = (BYTE)(bitStream>>8);
                out += 2;
                bitStream >>= 16;
                bitCount -= 16;
        }   }
-        {   int count = normalizedCounter[charnum++];
-            int const max = (2*threshold-1)-remaining;
+        {   int count = normalizedCounter[symbol++];
+            int const max = (2*threshold-1) - remaining;
            remaining -= count < 0 ? -count : count;
            count++;   /* +1 for extra accuracy */
-            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            if (count>=threshold)
+                count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
            bitStream += count << bitCount;
            bitCount  += nbBits;
            bitCount  -= (count<max);
-            previous0  = (count==1);
+            previousIs0  = (count==1);
            if (remaining<1) return ERROR(GENERIC);
            while (remaining<threshold) { nbBits--; threshold>>=1; }
        }
        if (bitCount>16) {
-            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
            out[0] = (BYTE)bitStream;
            out[1] = (BYTE)(bitStream>>8);
            out += 2;
@ -277,19 +290,23 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
            bitCount -= 16;
    }   }

+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
    /* flush remaining bitStream */
-    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
    out[0] = (BYTE)bitStream;
    out[1] = (BYTE)(bitStream>>8);
    out+= (bitCount+7) /8;

-    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);
-
    return (out-ostart);
 }


-size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
@ -297,7 +314,7 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalized
    if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);

-    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
 }


--- a/lib/compress/hist.h
+++ b/lib/compress/hist.h
@ -50,7 +50,7 @@
 size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
                  const void* src, size_t srcSize);

-unsigned HIST_isError(size_t code);  /*< tells if a return value is an error code */
+unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */


 /* --- advanced histogram functions --- */
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@ -805,7 +805,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
        U32    const divider = (cParams.searchLength==3) ? 3 : 4;
        size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
        size_t const entropySpace = HUF_WORKSPACE_SIZE;
        size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
        size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
@ -949,33 +949,51 @@ typedef enum { ZSTDb_not_buffered, ZSTDb_buffered } ZSTD_buffered_policy_e;
 /* ZSTD_sufficientBuff() :
 * check internal buffers exist for streaming if buffPol == ZSTDb_buffered .
 * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */
-static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1,
+static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t maxNbSeq1,
+                            size_t maxNbLit1,
                            ZSTD_buffered_policy_e buffPol2,
                            ZSTD_compressionParameters cParams2,
                            U64 pledgedSrcSize)
 {
    size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize));
    size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2);
+    size_t const maxNbSeq2 = blockSize2 / ((cParams2.searchLength == 3) ? 3 : 4);
+    size_t const maxNbLit2 = blockSize2;
    size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0;
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u",
-                (U32)windowSize2, cParams2.windowLog);
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u",
-                (U32)blockSize2, (U32)blockSize1);
-    return (blockSize2 <= blockSize1) /* seqStore space depends on blockSize */
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is neededBufferSize2=%u <= bufferSize1=%u",
+                (U32)neededBufferSize2, (U32)bufferSize1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbSeq2=%u <= maxNbSeq1=%u",
+                (U32)maxNbSeq2, (U32)maxNbSeq1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbLit2=%u <= maxNbLit1=%u",
+                (U32)maxNbLit2, (U32)maxNbLit1);
+    return (maxNbLit2 <= maxNbLit1)
+         & (maxNbSeq2 <= maxNbSeq1)
         & (neededBufferSize2 <= bufferSize1);
 }

 /** Equivalence for resetCCtx purposes */
 static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1,
                                 ZSTD_CCtx_params params2,
-                                 size_t buffSize1, size_t blockSize1,
+                                 size_t buffSize1,
+                                 size_t maxNbSeq1, size_t maxNbLit1,
                                 ZSTD_buffered_policy_e buffPol2,
                                 U64 pledgedSrcSize)
 {
    DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize);
-    return ZSTD_equivalentCParams(params1.cParams, params2.cParams) &&
-           ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams) &&
-           ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize);
+    if (!ZSTD_equivalentCParams(params1.cParams, params2.cParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentCParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentLdmParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_sufficientBuff(buffSize1, maxNbSeq1, maxNbLit1, buffPol2,
+                             params2.cParams, pledgedSrcSize)) {
+      DEBUGLOG(4, "ZSTD_sufficientBuff() == 0");
+      return 0;
+    }
+    return 1;
 }

 static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
@ -1103,8 +1121,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,

    if (crp == ZSTDcrp_continue) {
        if (ZSTD_equivalentParams(zc->appliedParams, params,
-                                zc->inBuffSize, zc->blockSize,
-                                zbuff, pledgedSrcSize)) {
+                                  zc->inBuffSize,
+                                  zc->seqStore.maxNbSeq, zc->seqStore.maxNbLit,
+                                  zbuff, pledgedSrcSize)) {
            DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%zu)",
                        zc->appliedParams.cParams.windowLog, zc->blockSize);
            zc->workSpaceOversizedDuration += (zc->workSpaceOversizedDuration > 0);   /* if it was too large, it still is */
@ -1125,7 +1144,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
        U32    const divider = (params.cParams.searchLength==3) ? 3 : 4;
        size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
        size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
        size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
        size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
@ -1165,7 +1184,6 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
                if (zc->workSpace == NULL) return ERROR(memory_allocation);
                zc->workSpaceSize = neededSpace;
                zc->workSpaceOversizedDuration = 0;
-                ptr = zc->workSpace;

                /* Statically sized space.
                 * entropyWorkspace never moves,
@ -1216,13 +1234,18 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
        ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, &params.cParams, crp, /* forCCtx */ 1);

        /* sequences storage */
+        zc->seqStore.maxNbSeq = maxNbSeq;
        zc->seqStore.sequencesStart = (seqDef*)ptr;
        ptr = zc->seqStore.sequencesStart + maxNbSeq;
        zc->seqStore.llCode = (BYTE*) ptr;
        zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
        zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
        zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
-        ptr = zc->seqStore.litStart + blockSize;
+        /* ZSTD_wildcopy() is used to copy into the literals buffer,
+         * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+         */
+        zc->seqStore.maxNbLit = blockSize;
+        ptr = zc->seqStore.litStart + blockSize + WILDCOPY_OVERLENGTH;

        /* ldm bucketOffsets table */
        if (params.ldmParams.enableLdm) {
@ -1341,8 +1364,7 @@ static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
        }

        /* copy dictionary offsets */
-        {
-            ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+        {   ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
            ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
            dstMatchState->window       = srcMatchState->window;
            dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
@ -1666,6 +1688,7 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
    BYTE* const mlCodeTable = seqStorePtr->mlCode;
    U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
    U32 u;
+    assert(nbSeq <= seqStorePtr->maxNbSeq);
    for (u=0; u<nbSeq; u++) {
        U32 const llv = sequences[u].litLength;
        U32 const mlv = sequences[u].matchLength;
@ -2254,13 +2277,6 @@ MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr,
        if (cSize >= maxCSize) return 0;  /* block not compressed */
    }

-    /* We check that dictionaries have offset codes available for the first
-     * block. After the first block, the offcode table might not have large
-     * enough codes to represent the offsets in the data.
-     */
-    if (nextEntropy->fse.offcode_repeatMode == FSE_repeat_valid)
-        nextEntropy->fse.offcode_repeatMode = FSE_repeat_check;
-
    return cSize;
 }

@ -2399,12 +2415,20 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                                &zc->appliedParams,
                                dst, dstCapacity,
                                srcSize, zc->entropyWorkspace, zc->bmi2);
-        if (ZSTD_isError(cSize) || cSize == 0) return cSize;
-        /* confirm repcodes and entropy tables */
-        {   ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+        if (!ZSTD_isError(cSize) && cSize != 0) {
+            /* confirm repcodes and entropy tables */
+            ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
            zc->blockState.prevCBlock = zc->blockState.nextCBlock;
            zc->blockState.nextCBlock = tmp;
        }
+
+        /* We check that dictionaries have offset codes available for the first
+         * block. After the first block, the offcode table might not have large
+         * enough codes to represent the offsets in the data.
+         */
+        if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+            zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
        return cSize;
    }
 }
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@ -314,8 +314,10 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const v
               pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode);
    }
 #endif
+    assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
    /* copy Literals */
-    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB);
+    assert(seqStorePtr->maxNbLit <= 128 KB);
+    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
    ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
    seqStorePtr->lit += litLength;

--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@ -970,7 +970,7 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
            U32 seqPos = cur;

            DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
-                        last_pos, cur);
+                        last_pos, cur); (void)last_pos;
            assert(storeEnd < ZSTD_OPT_NUM);
            DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
                        storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@ -320,7 +320,8 @@ static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq)

 static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
 {
-    ZSTDMT_seqPool* seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    if (seqPool == NULL) return NULL;
    ZSTDMT_setNbSeq(seqPool, 0);
    return seqPool;
 }
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@ -533,9 +533,9 @@ static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
    }
 }

-size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src,
-                             size_t srcSize, void* workSpace,
-                             size_t wkspSize)
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+                       const void* src, size_t srcSize,
+                             void* workSpace, size_t wkspSize)
 {
    U32 tableLog, maxW, sizeOfSort, nbSymbols;
    DTableDesc dtd = HUF_getDTableDesc(DTable);
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@ -40,7 +40,6 @@
 #  define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1)
 #endif

-
 /*!
 *  NO_FORWARD_PROGRESS_MAX :
 *  maximum allowed nb of calls to ZSTD_decompressStream() and ZSTD_decompress_generic()
@ -52,11 +51,13 @@
 #  define ZSTD_NO_FORWARD_PROGRESS_MAX 16
 #endif

+
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "cpu.h"
+#include "compiler.h"    /* prefetch */
+#include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
@ -68,6 +69,9 @@
 #  include "zstd_legacy.h"
 #endif

+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
+

 /*-*************************************
 *  Errors
@ -110,11 +114,10 @@ typedef struct {
 #define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))

 typedef struct {
-    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];
-    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];
-    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
-    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
    U32 rep[ZSTD_REP_NUM];
 } ZSTD_entropyDTables_t;

@ -125,6 +128,7 @@ struct ZSTD_DCtx_s
    const ZSTD_seqSymbol* OFTptr;
    const HUF_DTable* HUFptr;
    ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
    const void* previousDstEnd;   /* detect continuity */
    const void* prefixStart;      /* start of current segment */
    const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
@ -138,7 +142,6 @@ struct ZSTD_DCtx_s
    U32 fseEntropy;
    XXH64_state_t xxhState;
    size_t headerSize;
-    U32 dictID;
    ZSTD_format_e format;
    const BYTE* litPtr;
    ZSTD_customMem customMem;
@ -147,9 +150,13 @@ struct ZSTD_DCtx_s
    size_t staticSize;
    int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */

-    /* streaming */
+    /* dictionary */
    ZSTD_DDict* ddictLocal;
-    const ZSTD_DDict* ddict;
+    const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+
+    /* streaming */
    ZSTD_dStreamStage streamStage;
    char*  inBuff;
    size_t inBuffSize;
@ -185,7 +192,7 @@ size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
 static size_t ZSTD_startingInputLength(ZSTD_format_e format)
 {
    size_t const startingInputLength = (format==ZSTD_f_zstd1_magicless) ?
-                    ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize :
+                    ZSTD_frameHeaderSize_prefix - ZSTD_FRAMEIDSIZE :
                    ZSTD_frameHeaderSize_prefix;
    ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE);
    /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
@ -200,6 +207,8 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
    dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
    dctx->ddict       = NULL;
    dctx->ddictLocal  = NULL;
+    dctx->dictEnd     = NULL;
+    dctx->ddictIsCold = 0;
    dctx->inBuff      = NULL;
    dctx->inBuffSize  = 0;
    dctx->outBuffSize = 0;
@ -278,7 +287,7 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
 *  Note 3 : Skippable Frame Identifiers are considered valid. */
 unsigned ZSTD_isFrame(const void* buffer, size_t size)
 {
-    if (size < ZSTD_frameIdSize) return 0;
+    if (size < ZSTD_FRAMEIDSIZE) return 0;
    {   U32 const magic = MEM_readLE32(buffer);
        if (magic == ZSTD_MAGICNUMBER) return 1;
        if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
@ -330,7 +339,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s
    const BYTE* ip = (const BYTE*)src;
    size_t const minInputSize = ZSTD_startingInputLength(format);

+    memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers do not understand that zfhPtr will only be read if the return value is zero, since they are 2 different signals */
    if (srcSize < minInputSize) return minInputSize;
+    if (src==NULL) return ERROR(GENERIC);   /* invalid parameter */

    if ( (format != ZSTD_f_zstd1_magicless)
      && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
@ -339,7 +350,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s
            if (srcSize < ZSTD_skippableHeaderSize)
                return ZSTD_skippableHeaderSize; /* magic number + frame length */
            memset(zfhPtr, 0, sizeof(*zfhPtr));
-            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize);
+            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE);
            zfhPtr->frameType = ZSTD_skippableFrame;
            return 0;
        }
@ -451,7 +462,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
            size_t skippableSize;
            if (srcSize < ZSTD_skippableHeaderSize)
                return ERROR(srcSize_wrong);
-            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize)
+            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_FRAMEIDSIZE)
                          + ZSTD_skippableHeaderSize;
            if (srcSize < skippableSize) {
                return ZSTD_CONTENTSIZE_ERROR;
@ -540,6 +551,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
 static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
                          const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
    if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
    memcpy(dst, src, srcSize);
    return srcSize;
@ -572,6 +584,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
        case set_repeat:
            if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
            /* fall-through */
+
        case set_compressed:
            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
            {   size_t lhSize, litSize, litCSize;
@ -603,15 +616,20 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
                if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);

+                /* prefetch huffman table if cold */
+                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
+                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+                }
+
                if (HUF_isError((litEncType==set_repeat) ?
                                    ( singleStream ?
                                        HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) :
                                        HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) :
                                    ( singleStream ?
                                        HUF_decompress1X1_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                         dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) :
+                                                                         dctx->workspace, sizeof(dctx->workspace), dctx->bmi2) :
                                        HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                           dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2))))
+                                                                           dctx->workspace, sizeof(dctx->workspace), dctx->bmi2))))
                    return ERROR(corruption_detected);

                dctx->litPtr = dctx->litBuffer;
@ -883,7 +901,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
                                 symbolEncodingType_e type, U32 max, U32 maxLog,
                                 const void* src, size_t srcSize,
                                 const U32* baseValue, const U32* nbAdditionalBits,
-                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable)
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+                                 int ddictIsCold, int nbSeq)
 {
    switch(type)
    {
@ -902,6 +921,12 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
        return 0;
    case set_repeat:
        if (!flagRepeatTable) return ERROR(corruption_detected);
+        /* prefetch FSE table if used */
+        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+            const void* const pStart = *DTablePtr;
+            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+            PREFETCH_AREA(pStart, pSize);
+        }
        return 0;
    case set_compressed :
        {   U32 tableLog;
@ -954,25 +979,25 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
    const BYTE* const istart = (const BYTE* const)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;
+    int nbSeq;
    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

    /* check */
    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);

    /* SeqHead */
-    {   int nbSeq = *ip++;
-        if (!nbSeq) { *nbSeqPtr=0; return 1; }
-        if (nbSeq > 0x7F) {
-            if (nbSeq == 0xFF) {
-                if (ip+2 > iend) return ERROR(srcSize_wrong);
-                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
-            } else {
-                if (ip >= iend) return ERROR(srcSize_wrong);
-                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
-            }
+    nbSeq = *ip++;
+    if (!nbSeq) { *nbSeqPtr=0; return 1; }
+    if (nbSeq > 0x7F) {
+        if (nbSeq == 0xFF) {
+            if (ip+2 > iend) return ERROR(srcSize_wrong);
+            nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+        } else {
+            if (ip >= iend) return ERROR(srcSize_wrong);
+            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
-        *nbSeqPtr = nbSeq;
    }
+    *nbSeqPtr = nbSeq;

    /* FSE table descriptors */
    if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
@ -986,7 +1011,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                      LLtype, MaxLL, LLFSELog,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
-                                                      LL_defaultDTable, dctx->fseEntropy);
+                                                      LL_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
            if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
            ip += llhSize;
        }
@ -995,7 +1021,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                      OFtype, MaxOff, OffFSELog,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
-                                                      OF_defaultDTable, dctx->fseEntropy);
+                                                      OF_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
            if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
            ip += ofhSize;
        }
@ -1004,12 +1031,23 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                      MLtype, MaxML, MLFSELog,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
-                                                      ML_defaultDTable, dctx->fseEntropy);
+                                                      ML_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
            if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
            ip += mlhSize;
        }
    }

+    /* prefetch dictionary content */
+    if (dctx->ddictIsCold) {
+        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
+        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
+        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
+        const void* const pStart = (const char*)dctx->dictEnd - pSize;
+        PREFETCH_AREA(pStart, pSize);
+        dctx->ddictIsCold = 0;
+    }
+
    return ip-istart;
 }

@ -1676,7 +1714,8 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
    /* isLongOffset must be true if there are long offsets.
     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
     * We don't expect that to be the case in 64-bit mode.
-     * In block mode, window size is not known, so we have to be conservative. (note: but it could be evaluated from current-lowLimit)
+     * In block mode, window size is not known, so we have to be conservative.
+     * (note: but it could be evaluated from current-lowLimit)
     */
    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
@ -1763,7 +1802,7 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
 #endif
    if ( (srcSize >= ZSTD_skippableHeaderSize)
      && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) {
-        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize);
+        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE);
    } else {
        const BYTE* ip = (const BYTE*)src;
        const BYTE* const ipstart = ip;
@ -1797,7 +1836,6 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
        if (zfh.checksumFlag) {   /* Final frame content checksum */
            if (remainingSize < 4) return ERROR(srcSize_wrong);
            ip += 4;
-            remainingSize -= 4;
        }

        return ip - ipstart;
@ -1885,9 +1923,6 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
    return op-ostart;
 }

-static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
-static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
-
 static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                                        void* dst, size_t dstCapacity,
                                  const void* src, size_t srcSize,
@ -1896,6 +1931,8 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
 {
    void* const dststart = dst;
    int moreThan1Frame = 0;
+
+    DEBUGLOG(5, "ZSTD_decompressMultiFrame");
    assert(dict==NULL || ddict==NULL);  /* either dict or ddict set, not both */

    if (ddict) {
@ -1932,7 +1969,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                size_t skippableSize;
                if (srcSize < ZSTD_skippableHeaderSize)
                    return ERROR(srcSize_wrong);
-                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize)
+                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE)
                              + ZSTD_skippableHeaderSize;
                if (srcSize < skippableSize) return ERROR(srcSize_wrong);

@ -2057,7 +2094,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
    case ZSTDds_getFrameHeaderSize :
        assert(src != NULL);
        if (dctx->format == ZSTD_f_zstd1) {  /* allows header */
-            assert(srcSize >= ZSTD_frameIdSize);  /* to read skippable magic number */
+            assert(srcSize >= ZSTD_FRAMEIDSIZE);  /* to read skippable magic number */
            if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
                memcpy(dctx->headerBuffer, src, srcSize);
                dctx->expected = ZSTD_skippableHeaderSize - srcSize;  /* remaining to load to get full skippable frame header */
@ -2167,7 +2204,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
        assert(src != NULL);
        assert(srcSize <= ZSTD_skippableHeaderSize);
        memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize);   /* complete skippable header */
-        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
        dctx->stage = ZSTDds_skipFrame;
        return 0;

@ -2191,21 +2228,27 @@ static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dict
    return 0;
 }

-/* ZSTD_loadEntropy() :
- * dict : must point at beginning of a valid zstd dictionary
+/*! ZSTD_loadEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
 * @return : size of entropy tables read */
-static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize)
+static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy,
+                         const void* const dict, size_t const dictSize)
 {
    const BYTE* dictPtr = (const BYTE*)dict;
    const BYTE* const dictEnd = dictPtr + dictSize;

    if (dictSize <= 8) return ERROR(dictionary_corrupted);
+    assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY);   /* dict must be valid */
    dictPtr += 8;   /* skip header = magic + dictID */

-
-    {   size_t const hSize = HUF_readDTableX2_wksp(
-            entropy->hufTable, dictPtr, dictEnd - dictPtr,
-            entropy->workspace, sizeof(entropy->workspace));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
+    ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
+    {   void* const workspace = &entropy->LLTable;   /* use fse tables as temporary workspace; implies fse tables are grouped together */
+        size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
+        size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+                                                dictPtr, dictEnd - dictPtr,
+                                                workspace, workspaceSize);
        if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
        dictPtr += hSize;
    }
@ -2216,7 +2259,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
        if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
        if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted);
        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->OFTable,
+        ZSTD_buildFSETable( entropy->OFTable,
                            offcodeNCount, offcodeMaxValue,
                            OF_base, OF_bits,
                            offcodeLog);
@ -2229,7 +2272,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
        if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
        if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted);
        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->MLTable,
+        ZSTD_buildFSETable( entropy->MLTable,
                            matchlengthNCount, matchlengthMaxValue,
                            ML_base, ML_bits,
                            matchlengthLog);
@ -2242,7 +2285,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
        if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
        if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted);
        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->LLTable,
+        ZSTD_buildFSETable( entropy->LLTable,
                            litlengthNCount, litlengthMaxValue,
                            LL_base, LL_bits,
                            litlengthLog);
@ -2268,7 +2311,7 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict
        if (magic != ZSTD_MAGIC_DICTIONARY) {
            return ZSTD_refDictContent(dctx, dict, dictSize);   /* pure content mode */
    }   }
-    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);

    /* load entropy tables */
    {   size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
@ -2282,7 +2325,6 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict
    return ZSTD_refDictContent(dctx, dict, dictSize);
 }

-/* Note : this function cannot fail */
 size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
 {
    assert(dctx != NULL);
@ -2328,42 +2370,53 @@ struct ZSTD_DDict_s {

 static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);   /* ddict is dereferenced on the next line */
    return ddict->dictContent;
 }

 static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);   /* ddict is dereferenced on the next line */
    return ddict->dictSize;
 }

-size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict)
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)   /* starts a decompression session; when ddict is not NULL, its content and entropy tables get referenced by dctx */
 {
-    CHECK_F( ZSTD_decompressBegin(dstDCtx) );
-    if (ddict) {   /* support begin on NULL */
-        dstDCtx->dictID = ddict->dictID;
-        dstDCtx->prefixStart = ddict->dictContent;
-        dstDCtx->virtualStart = ddict->dictContent;
-        dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
-        dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+    DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
+    assert(dctx != NULL);
+    if (ddict) {   /* cold : dctx->dictEnd is not already the end of this ddict content */
+        dctx->ddictIsCold = (dctx->dictEnd != (const char*)ddict->dictContent + ddict->dictSize);
+        DEBUGLOG(4, "DDict is %s",
+                    dctx->ddictIsCold ? "~cold~" : "hot!");
+    }
+    CHECK_F( ZSTD_decompressBegin(dctx) );   /* NOTE(review): ddictIsCold is computed ahead of this call, presumably because it resets dctx->dictEnd - confirm */
+    if (ddict) {   /* NULL ddict is equivalent to no dictionary */
+        dctx->dictID = ddict->dictID;
+        dctx->prefixStart = ddict->dictContent;
+        dctx->virtualStart = ddict->dictContent;
+        dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;   /* one past the last byte of dictionary content */
+        dctx->previousDstEnd = dctx->dictEnd;
        if (ddict->entropyPresent) {
-            dstDCtx->litEntropy = 1;
-            dstDCtx->fseEntropy = 1;
-            dstDCtx->LLTptr = ddict->entropy.LLTable;
-            dstDCtx->MLTptr = ddict->entropy.MLTable;
-            dstDCtx->OFTptr = ddict->entropy.OFTable;
-            dstDCtx->HUFptr = ddict->entropy.hufTable;
-            dstDCtx->entropy.rep[0] = ddict->entropy.rep[0];
-            dstDCtx->entropy.rep[1] = ddict->entropy.rep[1];
-            dstDCtx->entropy.rep[2] = ddict->entropy.rep[2];
+            dctx->litEntropy = 1;
+            dctx->fseEntropy = 1;
+            dctx->LLTptr = ddict->entropy.LLTable;   /* the 4 table pointers target tables stored inside ddict : no table is copied */
+            dctx->MLTptr = ddict->entropy.MLTable;
+            dctx->OFTptr = ddict->entropy.OFTable;
+            dctx->HUFptr = ddict->entropy.hufTable;
+            dctx->entropy.rep[0] = ddict->entropy.rep[0];   /* the 3 rep values are copied by value into dctx */
+            dctx->entropy.rep[1] = ddict->entropy.rep[1];
+            dctx->entropy.rep[2] = ddict->entropy.rep[2];
        } else {
-            dstDCtx->litEntropy = 0;
-            dstDCtx->fseEntropy = 0;
+            dctx->litEntropy = 0;   /* ddict carries no entropy tables : both flags are cleared */
+            dctx->fseEntropy = 0;
        }
    }
    return 0;
 }

-static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType)
+static size_t
+ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict,
+                         ZSTD_dictContentType_e dictContentType)
 {
    ddict->dictID = 0;
    ddict->entropyPresent = 0;
@ -2381,10 +2434,12 @@ static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e
            return 0;   /* pure content mode */
        }
    }
-    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize);
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);

    /* load entropy tables */
-    CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted );
+    CHECK_E( ZSTD_loadEntropy(&ddict->entropy,
+                              ddict->dictContent, ddict->dictSize),
+             dictionary_corrupted );
    ddict->entropyPresent = 1;
    return 0;
 }
@ -2398,6 +2453,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
    if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
        ddict->dictBuffer = NULL;
        ddict->dictContent = dict;
+        if (!dict) dictSize = 0;
    } else {
        void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem);
        ddict->dictBuffer = internalBuffer;
@ -2422,14 +2478,15 @@ ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
    if (!customMem.customAlloc ^ !customMem.customFree) return NULL;

    {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
-        if (!ddict) return NULL;
+        if (ddict == NULL) return NULL;
        ddict->cMem = customMem;
-
-        if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType) )) {
-            ZSTD_freeDDict(ddict);
-            return NULL;
-        }
-
+        {   size_t const initResult = ZSTD_initDDict_internal(ddict,
+                                            dict, dictSize,
+                                            dictLoadMethod, dictContentType);
+            if (ZSTD_isError(initResult)) {
+                ZSTD_freeDDict(ddict);
+                return NULL;
+        }   }
        return ddict;
    }
 }
@ -2456,23 +2513,25 @@ ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize


 const ZSTD_DDict* ZSTD_initStaticDDict(
-                                void* workspace, size_t workspaceSize,
+                                void* sBuffer, size_t sBufferSize,   /* caller-provided memory : the ZSTD_DDict is built at its start */
                                const void* dict, size_t dictSize,
                                ZSTD_dictLoadMethod_e dictLoadMethod,
                                ZSTD_dictContentType_e dictContentType)
 {
-    size_t const neededSpace =
-            sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
-    ZSTD_DDict* const ddict = (ZSTD_DDict*)workspace;
-    assert(workspace != NULL);
+    size_t const neededSpace = sizeof(ZSTD_DDict)
+                             + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);   /* room for a dictionary copy, unless loading by reference */
+    ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+    assert(sBuffer != NULL);
    assert(dict != NULL);
-    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
-    if (workspaceSize < neededSpace) return NULL;
+    if ((size_t)sBuffer & 7) return NULL;   /* sBuffer address must be a multiple of 8 */
+    if (sBufferSize < neededSpace) return NULL;   /* sBuffer is too small : nothing has been written yet */
    if (dictLoadMethod == ZSTD_dlm_byCopy) {
        memcpy(ddict+1, dict, dictSize);  /* local copy */
        dict = ddict+1;
    }
-    if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) ))
+    if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType) ))   /* always by reference : in byCopy mode, dict already points at the copy placed after ddict */
        return NULL;
    return ddict;
 }
@ -2510,7 +2569,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
 {
    if (dictSize < 8) return 0;
    if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
-    return MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
 }

 /*! ZSTD_getDictID_fromDDict() :
@ -2586,12 +2645,15 @@ size_t ZSTD_freeDStream(ZSTD_DStream* zds)
 }


-/* *** Initialization *** */
+/* ***  Initialization  *** */

 size_t ZSTD_DStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
 size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }

-size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+                                   const void* dict, size_t dictSize,
+                                         ZSTD_dictLoadMethod_e dictLoadMethod,
+                                         ZSTD_dictContentType_e dictContentType)
 {
    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
    ZSTD_freeDDict(dctx->ddictLocal);
@ -2645,13 +2707,6 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
    return ZSTD_initDStream_usingDict(zds, NULL, 0);
 }

-size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
-{
-    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
-    dctx->ddict = ddict;
-    return 0;
-}
-
 /* ZSTD_initDStream_usingDDict() :
 * ddict will just be referenced, and must outlive decompression session
 * this function cannot fail */
@ -2690,6 +2745,13 @@ size_t ZSTD_setDStreamParameter(ZSTD_DStream* dctx,
    return 0;
 }

+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)   /* records ddict inside dctx; @return : 0, or an error code if the stream stage is not zdss_init */
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);   /* refused once streaming has left its init stage */
+    dctx->ddict = ddict;   /* only the pointer is stored : nothing is copied or freed here */
+    return 0;
+}
+
 size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
 {
    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
@ -2855,7 +2917,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
            CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict));

            if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
-                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize);
+                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
                zds->stage = ZSTDds_skipFrame;
            } else {
                CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize));
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@ -29,6 +29,7 @@
 #include "mem.h" /* read */
 #include "pool.h"
 #include "threading.h"
+#include "cover.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
@ -185,7 +186,7 @@ static void COVER_map_remove(COVER_map_t *map, U32 key) {
 }

 /**
- * Destroyes a map that is inited with COVER_map_init().
+ * Destroys a map that is inited with COVER_map_init().
 */
 static void COVER_map_destroy(COVER_map_t *map) {
  if (map->data) {
@ -223,7 +224,7 @@ static COVER_ctx_t *g_ctx = NULL;
 /**
 * Returns the sum of the sample sizes.
 */
-static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  size_t sum = 0;
  unsigned i;
  for (i = 0; i < nbSamples; ++i) {
@ -380,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
  ctx->suffix[dmerId] = freq;
 }

-/**
- * A segment is a range in the source as well as the score of the segment.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-  U32 score;
-} COVER_segment_t;

 /**
 * Selects the best segment in an epoch.
@ -737,28 +730,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
  }
 }

-/**
- * COVER_best_t is used for two purposes:
- * 1. Synchronizing threads.
- * 2. Saving the best parameters and dictionary.
- *
- * All of the methods except COVER_best_init() are thread safe if zstd is
- * compiled with multithreaded support.
- */
-typedef struct COVER_best_s {
-  ZSTD_pthread_mutex_t mutex;
-  ZSTD_pthread_cond_t cond;
-  size_t liveJobs;
-  void *dict;
-  size_t dictSize;
-  ZDICT_cover_params_t parameters;
-  size_t compressedSize;
-} COVER_best_t;
+/* COVER_checkTotalCompressedSize() : builds a CDict from dict, then compresses samples with it (samples [nbTrainSamples, nbSamples) when splitPoint < 1.0, all samples otherwise). */
+/* @return : dictBufferCapacity + sum of the compressed sizes, or ERROR(GENERIC) when an allocation or a compression fails. */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                    const size_t *samplesSizes, const BYTE *samples,
+                                    size_t *offsets,
+                                    size_t nbTrainSamples, size_t nbSamples,
+                                    BYTE *const dict, size_t dictBufferCapacity) {
+  size_t totalCompressedSize = ERROR(GENERIC);   /* result remains an error until all resources are available */
+  /* Resources, all released at _compressCleanup */
+  ZSTD_CCtx *cctx;
+  ZSTD_CDict *cdict;
+  void *dst;
+  /* Size of dst, and sample index shared by both loops */
+  size_t dstCapacity;
+  size_t i;
+  /* Allocate dst with enough space to compress the largest sample that will be compressed below */
+  {
+    size_t maxSampleSize = 0;
+    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;   /* same starting index as the compression loop */
+    for (; i < nbSamples; ++i) {
+      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);   /* result is checked together with cctx and cdict */
+  }
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                           parameters.zParams.compressionLevel);
+  if (!dst || !cctx || !cdict) {   /* any failed allocation : leave with the initial error value */
+    goto _compressCleanup;
+  }
+  /* Compress each selected sample and sum the sizes; the first failure turns the result into an error */
+  totalCompressedSize = dictBufferCapacity;   /* the dictionary capacity is counted in the total */
+  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+  for (; i < nbSamples; ++i) {
+    const size_t size = ZSTD_compress_usingCDict(
+        cctx, dst, dstCapacity, samples + offsets[i],
+        samplesSizes[i], cdict);   /* dst is overwritten at each iteration : only the size is kept */
+    if (ZSTD_isError(size)) {
+      totalCompressedSize = ERROR(GENERIC);
+      goto _compressCleanup;
+    }
+    totalCompressedSize += size;
+  }
+_compressCleanup:   /* reached on success and on failure; cctx and cdict may be NULL here */
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  if (dst) {
+    free(dst);
+  }
+  return totalCompressedSize;
+}
+

 /**
 * Initialize the `COVER_best_t`.
 */
-static void COVER_best_init(COVER_best_t *best) {
+void COVER_best_init(COVER_best_t *best) {
  if (best==NULL) return; /* compatible with init on NULL */
  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
@ -772,7 +802,7 @@ static void COVER_best_init(COVER_best_t *best) {
 /**
 * Wait until liveJobs == 0.
 */
-static void COVER_best_wait(COVER_best_t *best) {
+void COVER_best_wait(COVER_best_t *best) {
  if (!best) {
    return;
  }
@ -786,7 +816,7 @@ static void COVER_best_wait(COVER_best_t *best) {
 /**
 * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
-static void COVER_best_destroy(COVER_best_t *best) {
+void COVER_best_destroy(COVER_best_t *best) {
  if (!best) {
    return;
  }
@ -802,7 +832,7 @@ static void COVER_best_destroy(COVER_best_t *best) {
 * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
-static void COVER_best_start(COVER_best_t *best) {
+void COVER_best_start(COVER_best_t *best) {
  if (!best) {
    return;
  }
@ -816,7 +846,7 @@ static void COVER_best_start(COVER_best_t *best) {
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
-static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
                              ZDICT_cover_params_t parameters, void *dict,
                              size_t dictSize) {
  if (!best) {
@ -847,10 +877,10 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
      best->parameters = parameters;
      best->compressedSize = compressedSize;
    }
-    ZSTD_pthread_mutex_unlock(&best->mutex);
    if (liveJobs == 0) {
      ZSTD_pthread_cond_broadcast(&best->cond);
    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
  }
 }

@ -904,51 +934,10 @@ static void COVER_tryParameters(void *opaque) {
    }
  }
  /* Check total compressed size */
-  {
-    /* Pointers */
-    ZSTD_CCtx *cctx;
-    ZSTD_CDict *cdict;
-    void *dst;
-    /* Local variables */
-    size_t dstCapacity;
-    size_t i;
-    /* Allocate dst with enough space to compress the maximum sized sample */
-    {
-      size_t maxSampleSize = 0;
-      i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
-      for (; i < ctx->nbSamples; ++i) {
-        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
-      }
-      dstCapacity = ZSTD_compressBound(maxSampleSize);
-      dst = malloc(dstCapacity);
-    }
-    /* Create the cctx and cdict */
-    cctx = ZSTD_createCCtx();
-    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
-                             parameters.zParams.compressionLevel);
-    if (!dst || !cctx || !cdict) {
-      goto _compressCleanup;
-    }
-    /* Compress each sample and sum their sizes (or error) */
-    totalCompressedSize = dictBufferCapacity;
-    i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
-    for (; i < ctx->nbSamples; ++i) {
-      const size_t size = ZSTD_compress_usingCDict(
-          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
-          ctx->samplesSizes[i], cdict);
-      if (ZSTD_isError(size)) {
-        totalCompressedSize = ERROR(GENERIC);
-        goto _compressCleanup;
-      }
-      totalCompressedSize += size;
-    }
-  _compressCleanup:
-    ZSTD_freeCCtx(cctx);
-    ZSTD_freeCDict(cdict);
-    if (dst) {
-      free(dst);
-    }
-  }
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);

 _cleanup:
  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
--- a/lib/dictBuilder/cover.h
+++ b/lib/dictBuilder/cover.h
@ -0,0 +1,83 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_cover_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} COVER_segment_t;
+
+/**
+ *  Checks total compressed size of a dictionary
+ *
+ *  The result is used by the parameter-search workers as the score handed to
+ *  COVER_best_finish(); callers initialise that score to ERROR(GENERIC), so
+ *  the returned value should be tested with ZSTD_isError() before being
+ *  compared as a size.
+ *
+ *  NOTE(review): the implementation is not declared here; presumably it
+ *  compresses the samples in [nbTrainSamples, nbSamples) (or all samples when
+ *  parameters.splitPoint is 1.0) with `dict` and sums the sizes -- confirm
+ *  against the definition in cover.c.
+ */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity);
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+void COVER_best_init(COVER_best_t *best);
+
+/**
+ * Wait until liveJobs == 0.
+ */
+void COVER_best_wait(COVER_best_t *best);
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+void COVER_best_destroy(COVER_best_t *best);
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+void COVER_best_start(COVER_best_t *best);
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                       ZDICT_cover_params_t parameters, void *dict,
+                       size_t dictSize);
--- a/lib/dictBuilder/divsufsort.c
+++ b/lib/dictBuilder/divsufsort.c
@ -1637,7 +1637,7 @@ construct_SA(const unsigned char *T, int *SA,
            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
            k = SA + BUCKET_B(c2 = c0, c1);
          }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
          *k-- = s;
        } else {
          assert(((s == 0) && (T[s] == c1)) || (s < 0));
@ -1701,7 +1701,7 @@ construct_BWT(const unsigned char *T, int *SA,
            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
            k = SA + BUCKET_B(c2 = c0, c1);
          }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
          *k-- = s;
        } else if(s != 0) {
          *j = ~s;
@ -1785,7 +1785,7 @@ construct_BWT_indexes(const unsigned char *T, int *SA,
            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
            k = SA + BUCKET_B(c2 = c0, c1);
          }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
          *k-- = s;
        } else if(s != 0) {
          *j = ~s;
--- a/lib/dictBuilder/fastcover.c
+++ b/lib/dictBuilder/fastcover.c
@ -0,0 +1,701 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "cover.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define FASTCOVER_MAX_F 31
+#define FASTCOVER_MAX_ACCEL 10
+#define DEFAULT_SPLITPOINT 0.75
+#define DEFAULT_F 20
+#define DEFAULT_ACCEL 1
+
+
+/*-*************************************
+*  Console display
+***************************************/
+/* Verbosity consulted by DISPLAYLEVEL() and DISPLAYUPDATE(). The public entry
+ * points of this file overwrite it from the caller's notificationLevel. */
+static int g_displayLevel = 2;
+/* Print a formatted message to stderr and flush it immediately.
+ * NOTE: expands to a bare `{ ... }` block rather than a single statement. */
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+/* Print only when `displayLevel` is at least `l`.
+ * NOTE: expands to a bare `if`, so avoid using it as the body of an
+ * un-braced if/else. */
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+/* Same as LOCALDISPLAYLEVEL, using the file-wide g_displayLevel. */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+/* Rate-limited progress output: prints when `displayLevel` is at least `l` and
+ * either more than `refreshRate` clock() ticks have elapsed since the previous
+ * update or `displayLevel` is 4 or higher. Updates g_time when it prints. */
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+/* Same as LOCALDISPLAYUPDATE, using the file-wide g_displayLevel. */
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+/* Minimum spacing between two progress updates: 15% of CLOCKS_PER_SEC. */
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+/* clock() value recorded at the last progress update. */
+static clock_t g_time = 0;
+
+
+/*-*************************************
+* Hash Functions
+***************************************/
+/* Multiplier used when hashing the low 48 bits of a 64-bit word. */
+static const U64 prime6bytes = 227718039650203ULL;
+
+/* Hash the low 48 bits of `u` and keep the top `h` bits of the product. */
+static size_t ZSTD_hash6(U64 u, U32 h) {
+  const U64 shifted = u << (64-48);   /* discard the 16 high bits */
+  const U64 product = shifted * prime6bytes;
+  return (size_t)(product >> (64-h));
+}
+
+/* Read 8 little-endian bytes at `p` and hash them with ZSTD_hash6(). */
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) {
+  const U64 word = MEM_readLE64(p);
+  return ZSTD_hash6(word, h);
+}
+
+/* Multiplier used when hashing a full 64-bit word. */
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+
+/* Hash all 64 bits of `u` and keep the top `h` bits of the product. */
+static size_t ZSTD_hash8(U64 u, U32 h) {
+  const U64 product = u * prime8bytes;
+  return (size_t)(product >> (64-h));
+}
+
+/* Read 8 little-endian bytes at `p` and hash them with ZSTD_hash8(). */
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) {
+  const U64 word = MEM_readLE64(p);
+  return ZSTD_hash8(word, h);
+}
+
+
+/**
+ * Hash the d-byte value pointed to by p and reduce it mod 2^h.
+ *
+ * p : start of the dmer; 8 bytes are read from it whatever the value of d.
+ * h : log2 of the table size; callers keep it in [1, FASTCOVER_MAX_F].
+ * d : dmer length; 6 selects the 6-byte hash, any other value the 8-byte hash.
+ *
+ * Returns an index in [0, 2^h).
+ */
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
+  /* Build the mask in size_t: `1 << h` evaluated as int is undefined when
+   * h == 31, which FASTCOVER_MAX_F allows. */
+  const size_t mask = ((size_t)1 << h) - 1;
+  if (d == 6) {
+    return ZSTD_hash6Ptr(p, h) & mask;
+  }
+  return ZSTD_hash8Ptr(p, h) & mask;
+}
+
+
+/*-*************************************
+* Acceleration
+***************************************/
+/* Tuning knobs selected by the `accel` parameter. */
+typedef struct {
+  unsigned finalize;    /* Percentage of training samples used for ZDICT_finalizeDictionary */
+  unsigned skip;        /* Number of dmer skipped between each dmer counted in computeFrequency */
+} FASTCOVER_accel_t;
+
+
+/* Lookup table indexed directly by `accel` (0..FASTCOVER_MAX_ACCEL).
+ * For accel >= 1, `skip` is accel - 1 and `finalize` is roughly 100 / accel. */
+static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
+  { 100, 0 },   /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
+  { 100, 0 },   /* accel = 1 */
+  { 50, 1 },   /* accel = 2 */
+  { 34, 2 },   /* accel = 3 */
+  { 25, 3 },   /* accel = 4 */
+  { 20, 4 },   /* accel = 5 */
+  { 17, 5 },   /* accel = 6 */
+  { 14, 6 },   /* accel = 7 */
+  { 13, 7 },   /* accel = 8 */
+  { 11, 8 },   /* accel = 9 */
+  { 10, 9 },   /* accel = 10 */
+};
+
+
+/*-*************************************
+* Context
+***************************************/
+/* State shared by every dictionary build that uses the same `d` and `f`.
+ * Filled by FASTCOVER_ctx_init() and released by FASTCOVER_ctx_destroy(). */
+typedef struct {
+  const BYTE *samples;          /* Caller's concatenated sample buffer (not owned) */
+  size_t *offsets;              /* nbSamples + 1 start offsets into `samples` (owned) */
+  const size_t *samplesSizes;   /* Caller's array of sample sizes (not owned) */
+  size_t nbSamples;             /* Total number of samples */
+  size_t nbTrainSamples;        /* Number of leading samples used for training */
+  size_t nbTestSamples;         /* Number of samples used for evaluation */
+  size_t nbDmers;               /* Training bytes minus MAX(d, 8), plus 1 */
+  U32 *freqs;                   /* 2^f counters indexed by dmer hash (owned) */
+  unsigned d;                   /* Dmer length */
+  unsigned f;                   /* log2 of the size of `freqs` */
+  FASTCOVER_accel_t accelParams; /* Speed/quality settings derived from accel */
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
+ *
+ * ctx          : supplies the sample buffer and the hash log `f`.
+ * freqs        : per-hash frequencies; entries covered by the chosen segment
+ *                are zeroed before returning.
+ * begin, end   : dmer positions delimiting the epoch, [begin, end).
+ * parameters   : only `k` and `d` are read.
+ * segmentFreqs : scratch counters indexed by hash value. Every increment made
+ *                here is undone before returning, so an array that is all-zero
+ *                on entry is all-zero on exit (callers allocate it with calloc).
+ *
+ * Returns the best-scoring window; its score is 0 when no window scored above 0.
+ */
+static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                              U32 *freqs, U32 begin, U32 end,
+                                              ZDICT_cover_params_t parameters,
+                                              U16* segmentFreqs) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 f = ctx->f;
+  const U32 dmersInK = k - d + 1;   /* number of dmers in a k-byte segment */
+
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+
+  /* Start with an empty window at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* Get hash value of current dmer */
+    const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
+
+    /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+    if (segmentFreqs[index] == 0) {
+      activeSegment.score += freqs[index];
+    }
+    /* Increment end of segment and segmentFreqs*/
+    activeSegment.end += 1;
+    segmentFreqs[index] += 1;
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      /* Get hash value of the dmer to be eliminated from active segment */
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+      segmentFreqs[delIndex] -= 1;
+      /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+      if (segmentFreqs[delIndex] == 0) {
+        activeSegment.score -= freqs[delIndex];
+      }
+      /* Increment start of segment */
+      activeSegment.begin += 1;
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+
+  /* Undo the counts of the dmers still inside the final window so that
+   * segmentFreqs is restored for the next call. */
+  while (activeSegment.begin < end) {
+    const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+    segmentFreqs[delIndex] -= 1;
+    activeSegment.begin += 1;
+  }
+
+  {
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
+      freqs[i] = 0;
+    }
+  }
+
+  return bestSegment;
+}
+
+
+/**
+ * Validate a parameter set before building a dictionary.
+ *
+ * parameters  : `k`, `d` and `splitPoint` are checked.
+ * maxDictSize : capacity of the destination dictionary buffer.
+ * f           : log2 of the frequency table size.
+ * accel       : acceleration level, used to index FASTCOVER_defaultAccelParameters.
+ *
+ * Returns 1 when every parameter is acceptable, 0 otherwise.
+ */
+static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
+                                     size_t maxDictSize, unsigned f,
+                                     unsigned accel) {
+  /* k, d, and f are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* d has to be 6 or 8 */
+  if (parameters.d != 6 && parameters.d != 8) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  /* 0 < f <= FASTCOVER_MAX_F*/
+  if (f > FASTCOVER_MAX_F || f == 0) {
+    return 0;
+  }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+    return 0;
+  }
+  /* 0 < accel <= FASTCOVER_MAX_ACCEL. Use the named constant (not a literal)
+   * because it also sizes the accel lookup table. */
+  if (accel > FASTCOVER_MAX_ACCEL || accel == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+
+/**
+ * Release the buffers owned by a context set up by `FASTCOVER_ctx_init()`.
+ * Accepts NULL. The freed pointers are reset to NULL, so calling it a second
+ * time on the same context is harmless.
+ */
+static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) {
+  if (ctx != NULL) {
+    free(ctx->freqs);
+    free(ctx->offsets);
+    ctx->freqs = NULL;
+    ctx->offsets = NULL;
+  }
+}
+
+
+/**
+ * Count, for every training sample, how often each dmer hash value occurs.
+ *
+ * freqs : table of 2^f counters, incremented in place.
+ * ctx   : provides the samples, their offsets, d, f and the skip distance.
+ *
+ * Only positions with at least MAX(d, 8) bytes left in their own sample are
+ * hashed; after each counted dmer, `skip` positions are passed over.
+ */
+static void FASTCOVER_computeFrequency(U32 *freqs, FASTCOVER_ctx_t *ctx){
+  const unsigned hashLog = ctx->f;
+  const unsigned dmerLen = ctx->d;
+  const unsigned readLength = MAX(dmerLen, 8);
+  const size_t stride = (size_t)ctx->accelParams.skip + 1;
+  size_t sampleIdx;
+  for (sampleIdx = 0; sampleIdx < ctx->nbTrainSamples; ++sampleIdx) {
+    const size_t sampleEnd = ctx->offsets[sampleIdx + 1];
+    size_t pos; /* start of current dmer */
+    for (pos = ctx->offsets[sampleIdx]; pos + readLength <= sampleEnd; pos += stride) {
+      freqs[FASTCOVER_hashPtrToIndex(ctx->samples + pos, hashLog, dmerLen)] += 1;
+    }
+  }
+}
+
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can be used multiple
+ * times.
+ *
+ * ctx           : context to fill; it is zeroed once the input checks pass.
+ * samplesBuffer : all samples, concatenated.
+ * samplesSizes  : size of each sample, `nbSamples` entries.
+ * d             : dmer length.
+ * splitPoint    : fraction of the samples used for training; when it is not
+ *                 below 1.0 every sample is used for both training and testing.
+ * f             : log2 of the frequency table size.
+ * accelParams   : stored in the context for later use.
+ *
+ * Returns 1 on success or zero on error (total size out of range, fewer than
+ * 5 training samples, no test sample, or allocation failure).
+ * On success the context must be destroyed with `FASTCOVER_ctx_destroy()`;
+ * on failure nothing is left allocated.
+ */
+static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
+                              const size_t *samplesSizes, unsigned nbSamples,
+                              unsigned d, double splitPoint, unsigned f,
+                              FASTCOVER_accel_t accelParams) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+  /* Checks */
+  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+      totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
+    return 0;
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (U32)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (U32)testSamplesSize);
+
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
+  ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->d = d;
+  ctx->f = f;
+  ctx->accelParams = accelParams;
+
+  /* The offsets of each file */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  if (!ctx->offsets) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    FASTCOVER_ctx_destroy(ctx);
+    return 0;
+  }
+
+  /* Fill offsets from the samplesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+
+  /* Initialize frequency array of size 2^f */
+  ctx->freqs = (U32 *)calloc(((U64)1 << f), sizeof(U32));
+  /* The table can be large (2^f entries); bail out cleanly rather than let
+   * FASTCOVER_computeFrequency() write through a NULL pointer. */
+  if (!ctx->freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate frequency table\n");
+    FASTCOVER_ctx_destroy(ctx);
+    return 0;
+  }
+
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  FASTCOVER_computeFrequency(ctx->freqs, ctx);
+
+  return 1;
+}
+
+
+/**
+ * Given the prepared context build the dictionary.
+ *
+ * ctx                : prepared context (samples, nbDmers, f).
+ * freqs              : per-hash frequencies; modified as segments are selected,
+ *                      so pass a private copy when the original must survive.
+ * dictBuffer         : destination, filled from the end towards the start.
+ * dictBufferCapacity : size of `dictBuffer` in bytes.
+ * parameters         : `k` and `d` are read.
+ * segmentFreqs       : scratch counters handed to FASTCOVER_selectSegment().
+ *
+ * Returns `tail`, the offset of the first byte written: the selected content
+ * occupies dictBuffer[tail .. dictBufferCapacity).
+ */
+static size_t FASTCOVER_buildDictionary(const FASTCOVER_ctx_t *ctx, U32 *freqs,
+                                        void *dictBuffer, size_t dictBufferCapacity,
+                                        ZDICT_cover_params_t parameters, U16* segmentFreqs){
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
+  const U32 epochSize = (U32)(ctx->nbDmers / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full. The epoch index wraps around, so epochs are revisited in
+   * round-robin order.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
+
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+
+    /* Trim the segment if necessary and if it is too small then we are done.
+     * segment.begin/end are dmer positions, hence the extra d - 1 bytes. */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  /* Erase the progress line */
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ * Allocated with malloc() by the caller; FASTCOVER_tryParameters() frees it.
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+  const FASTCOVER_ctx_t *ctx;        /* Shared read-only context for this value of d */
+  COVER_best_t *best;                /* Where the result is reported */
+  size_t dictBufferCapacity;         /* Size of the dictionary to build */
+  ZDICT_cover_params_t parameters;   /* Parameter set to evaluate */
+} FASTCOVER_tryParameters_data_t;
+
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ *
+ * opaque : a malloc()ed FASTCOVER_tryParameters_data_t; freed before returning.
+ *
+ * COVER_best_finish() is called exactly once on every path, including the
+ * error paths, in which case the reported size is still ERROR(GENERIC).
+ */
+static void FASTCOVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  /* Stays an error code unless the whole evaluation succeeds */
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Initialize array to keep track of frequency of dmer within activeSegment */
+  U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
+  /* Allocate space for hash table, dict, and freqs */
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
+  if (!segmentFreqs || !dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
+                                                  parameters, segmentFreqs);
+    /* Only the first `finalize` percent of the training samples are passed on */
+    const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
+    /* From here on dictBufferCapacity holds the finalized size, or an error code */
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
+_cleanup:
+  /* Report the result (or the error) before releasing the local buffers */
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  free(segmentFreqs);
+  free(dict);
+  free(freqs);
+}
+
+
+
+/**
+ * Copy the fields shared with ZDICT_cover_params_t out of a fastCover
+ * parameter set. Fields of `*coverParams` not listed here are left untouched.
+ */
+static void FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
+                                          ZDICT_cover_params_t *coverParams) {
+    ZDICT_cover_params_t *const out = coverParams;
+    /* Segment shape */
+    out->d = fastCoverParams.d;
+    out->k = fastCoverParams.k;
+    /* Search and threading controls */
+    out->splitPoint = fastCoverParams.splitPoint;
+    out->nbThreads = fastCoverParams.nbThreads;
+    out->steps = fastCoverParams.steps;
+    /* Generic dictionary parameters */
+    out->zParams = fastCoverParams.zParams;
+}
+
+
+/**
+ * Build a fastCover parameter set from cover parameters plus the two
+ * fastCover-only values `f` and `accel`.
+ */
+static void FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
+                                          ZDICT_fastCover_params_t *fastCoverParams,
+                                          unsigned f, unsigned accel) {
+    ZDICT_fastCover_params_t *const out = fastCoverParams;
+    /* fastCover-specific values */
+    out->accel = accel;
+    out->f = f;
+    /* Values carried over from the cover parameters */
+    out->d = coverParams.d;
+    out->k = coverParams.k;
+    out->splitPoint = coverParams.splitPoint;
+    out->nbThreads = coverParams.nbThreads;
+    out->steps = coverParams.steps;
+    out->zParams = coverParams.zParams;
+}
+
+
+/**
+ * Train a dictionary from the samples with one fixed set of fastCover
+ * parameters.
+ *
+ * dictBuffer, dictBufferCapacity : destination buffer; the capacity must be at
+ *                                  least ZDICT_DICTSIZE_MIN.
+ * samplesBuffer, samplesSizes    : concatenated samples and their sizes.
+ * nbSamples                      : number of samples, must not be 0.
+ * parameters                     : `k` and `d` are required; `f` and `accel`
+ *                                  fall back to their defaults when 0;
+ *                                  `splitPoint` is ignored (forced to 1.0).
+ *
+ * Returns the size of the dictionary written to dictBuffer, or an error code
+ * that can be tested with ZDICT_isError().
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters) {
+    BYTE* const dict = (BYTE*)dictBuffer;
+    FASTCOVER_ctx_t ctx;
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* Initialize global data */
+    g_displayLevel = parameters.zParams.notificationLevel;
+    /* Always train on every sample; default f and accel if not provided */
+    parameters.splitPoint = 1.0;
+    parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
+    parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
+    /* Convert to cover parameter */
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(parameters, &coverParams);
+    /* Checks */
+    if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
+                                   parameters.accel)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    /* Assign corresponding FASTCOVER_accel_t to accelParams*/
+    accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
+    /* Initialize context */
+    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                            coverParams.d, parameters.splitPoint, parameters.f,
+                            accelParams)) {
+      DISPLAYLEVEL(1, "Failed to initialize context\n");
+      return ERROR(GENERIC);
+    }
+    /* Build the dictionary */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      /* Initialize array to keep track of frequency of dmer within activeSegment */
+      U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
+      /* The segment selection indexes this array unconditionally, so an
+       * allocation failure must be reported here instead of crashing later. */
+      if (!segmentFreqs) {
+        DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+        FASTCOVER_ctx_destroy(&ctx);
+        return ERROR(memory_allocation);
+      }
+      {
+        const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+                                                  dictBufferCapacity, coverParams, segmentFreqs);
+        /* Only the first `finalize` percent of the training samples are passed on */
+        const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
+        const size_t dictionarySize = ZDICT_finalizeDictionary(
+            dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+            samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
+        if (!ZSTD_isError(dictionarySize)) {
+            DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                        (U32)dictionarySize);
+        }
+        FASTCOVER_ctx_destroy(&ctx);
+        free(segmentFreqs);
+        return dictionarySize;
+      }
+    }
+}
+
+
+/**
+ * Search over `d` and `k`, train a dictionary for each combination and keep
+ * the one with the smallest total compressed size.
+ *
+ * dictBuffer, dictBufferCapacity : destination buffer; the capacity must be at
+ *                                  least ZDICT_DICTSIZE_MIN.
+ * samplesBuffer, samplesSizes    : concatenated samples and their sizes.
+ * nbSamples                      : number of samples, must not be 0.
+ * parameters                     : in/out. A zero field selects the default:
+ *                                  d tries 6 and 8, k spans 50..2000,
+ *                                  steps = 40, f = DEFAULT_F,
+ *                                  accel = DEFAULT_ACCEL; a splitPoint <= 0
+ *                                  selects DEFAULT_SPLITPOINT. When nbThreads
+ *                                  is above 1 the candidates are evaluated on
+ *                                  a thread pool. On success it is overwritten
+ *                                  with the best parameters found.
+ *
+ * Returns the size of the dictionary copied into dictBuffer, or an error code
+ * that can be tested with ZDICT_isError().
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_fastCover_params_t *parameters) {
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint =
+        parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations =
+        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
+    const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+    /* Local variables */
+    const int displayLevel = parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    COVER_best_t best;
+    POOL_ctx *pool = NULL;
+    /* Checks */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+      return ERROR(GENERIC);
+    }
+    if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
+      return ERROR(GENERIC);
+    }
+    /* f sizes the frequency table (2^f entries) and is used as a shift count
+     * while hashing, so it has to be rejected before any context is built. */
+    if (f == 0 || f > FASTCOVER_MAX_F) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect f\n");
+      return ERROR(GENERIC);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    COVER_best_init(&best);
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(*parameters, &coverParams);
+    accelParams = FASTCOVER_defaultAccelParameters[accel];
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                      kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
+            sizeof(FASTCOVER_tryParameters_data_t));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          COVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(GENERIC);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = coverParams;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.zParams.notificationLevel = g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
+                                       data->ctx->f, accel)) {
+          DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        COVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                           (U32)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      /* Every job launched for this d reads ctx, so wait before destroying it */
+      COVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    /* Fill the output buffer and parameters with output of the best parameters */
+    {
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
+      memcpy(dictBuffer, best.dict, dictSize);
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+
+}
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@ -698,7 +698,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
    short litLengthNCount[MaxLL+1];
    U32 repOffset[MAXREPOFFSET];
    offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
-    EStats_ress_t esr;
+    EStats_ress_t esr = { NULL, NULL, NULL };
    ZSTD_parameters params;
    U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
    size_t pos = 0, errorCode;
@ -863,8 +863,8 @@ _cleanup:

 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
                          const void* customDictContent, size_t dictContentSize,
-                          const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                          ZDICT_params_t params)
+                          const void* samplesBuffer, const size_t* samplesSizes,
+                          unsigned nbSamples, ZDICT_params_t params)
 {
    size_t hSize;
 #define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
@ -987,8 +987,10 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
            U32 const pos = dictList[u].pos;
            U32 const length = dictList[u].length;
            U32 const printedLength = MIN(40, length);
-            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
+                free(dictList);
                return ERROR(GENERIC);   /* should never happen */
+            }
            DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
                         u, length, pos, dictList[u].savings);
            ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
@ -1078,17 +1080,17 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
-    ZDICT_cover_params_t params;
+    ZDICT_fastCover_params_t params;
    DEBUGLOG(3, "ZDICT_trainFromBuffer");
    memset(&params, 0, sizeof(params));
    params.d = 8;
    params.steps = 4;
    /* Default to level 6 since no compression level information is available */
-    params.zParams.compressionLevel = 6;
+    params.zParams.compressionLevel = 3;
 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
    params.zParams.notificationLevel = DEBUGLEVEL;
 #endif
-    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+    return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
                                               samplesBuffer, samplesSizes, nbSamples,
                                               &params);
 }
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@ -39,7 +39,8 @@ extern "C" {

 /*! ZDICT_trainFromBuffer():
 *  Train a dictionary from an array of samples.
- *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
+ *  f=20, and accel=1.
 *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
 *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
 *  The resulting dictionary will be saved into `dictBuffer`.
@ -52,7 +53,8 @@ extern "C" {
 *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
 */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+                                    const void* samplesBuffer,
+                                    const size_t* samplesSizes, unsigned nbSamples);


 /*======   Helper functions   ======*/
@ -84,12 +86,22 @@ typedef struct {
 typedef struct {
    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
-    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
-    double splitPoint;           /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
    ZDICT_params_t zParams;
 } ZDICT_cover_params_t;

+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned f;                  /* log of size of frequency array : constraint: 0 < f <= 31 : 0 means default (20) */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
+    unsigned accel;              /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
+    ZDICT_params_t zParams;
+} ZDICT_fastCover_params_t;

 /*! ZDICT_trainFromBuffer_cover():
 *  Train a dictionary from an array of samples using the COVER algorithm.
@ -116,9 +128,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
 * dictionary constructed with those parameters is stored in `dictBuffer`.
 *
 * All of the parameters d, k, steps are optional.
- * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
 * if steps is zero it defaults to its default value.
- * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
 *
 * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
 *           or an error code, which can be tested with ZDICT_isError().
@ -130,6 +142,48 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
          ZDICT_cover_params_t* parameters);

+/*! ZDICT_trainFromBuffer_fastCover():
+ *  Train a dictionary from an array of samples using a modified version of COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  d and k are required.
+ *  All other parameters are optional, will use default values if not provided
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
+                    size_t dictBufferCapacity, const void *samplesBuffer,
+                    const size_t *samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t parameters);
+
+/*! ZDICT_optimizeTrainFromBuffer_fastCover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations (specifically, k and d combinations)
+ * and picks the best parameters. `*parameters` is filled with the best parameters found,
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
+ * All of the parameters d, k, steps, f, and accel are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
+ * if steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
+ * If f is zero, default value of 20 is used.
+ * If accel is zero, default value of 1 is used.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
+                    size_t dictBufferCapacity, const void* samplesBuffer,
+                    const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters);
+
 /*! ZDICT_finalizeDictionary():
 * Given a custom content as a basis for dictionary, and a set of samples,
 * finalize dictionary by adding headers and statistics.
--- a/lib/legacy/zstd_v04.c
+++ b/lib/legacy/zstd_v04.c
@ -1093,6 +1093,7 @@ static size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, un
    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);

    /* Init, lay down lowprob symbols */
+    memset(tableDecode, 0, sizeof(FSE_DECODE_TYPE) * (maxSymbolValue+1) );   /* useless init, but keep static analyzer happy, and we don't need to performance optimize legacy decoders */
    DTableH.tableLog = (U16)tableLog;
    for (s=0; s<=maxSymbolValue; s++)
    {
--- a/lib/legacy/zstd_v05.c
+++ b/lib/legacy/zstd_v05.c
@ -1224,6 +1224,7 @@ size_t FSEv05_buildDTable(FSEv05_DTable* dt, const short* normalizedCounter, uns
    if (tableLog > FSEv05_MAX_TABLELOG) return ERROR(tableLog_tooLarge);

    /* Init, lay down lowprob symbols */
+    memset(tableDecode, 0, sizeof(FSEv05_FUNCTION_TYPE) * (maxSymbolValue+1) );   /* useless init, but keep static analyzer happy, and we don't need to performance optimize legacy decoders */
    DTableH.tableLog = (U16)tableLog;
    for (s=0; s<=maxSymbolValue; s++) {
        if (normalizedCounter[s]==-1) {
@ -2845,6 +2846,7 @@ size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t*

 static size_t ZSTDv05_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
    memcpy(dst, src, srcSize);
    return srcSize;
--- a/lib/legacy/zstd_v06.c
+++ b/lib/legacy/zstd_v06.c
@ -3041,6 +3041,7 @@ size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t*

 static size_t ZSTDv06_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
    if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
    memcpy(dst, src, srcSize);
    return srcSize;
@ -4006,7 +4007,7 @@ size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* zbd,
                    if (ZSTDv06_isError(hSize)) return hSize;
                    if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
                        memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
-                        zbd->lhSize += iend-ip; ip = iend; notDone = 0;
+                        zbd->lhSize += iend-ip;
                        *dstCapacityPtr = 0;
                        return (hSize - zbd->lhSize) + ZSTDv06_blockHeaderSize;   /* remaining header bytes + next block header */
                    }
--- a/lib/legacy/zstd_v07.c
+++ b/lib/legacy/zstd_v07.c
@ -3150,10 +3150,10 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
    const BYTE* ip = (const BYTE*)src;

    if (srcSize < ZSTDv07_frameHeaderSize_min) return ZSTDv07_frameHeaderSize_min;
+    memset(fparamsPtr, 0, sizeof(*fparamsPtr));
    if (MEM_readLE32(src) != ZSTDv07_MAGICNUMBER) {
        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTDv07_MAGIC_SKIPPABLE_START) {
            if (srcSize < ZSTDv07_skippableHeaderSize) return ZSTDv07_skippableHeaderSize; /* magic number + skippable frame length */
-            memset(fparamsPtr, 0, sizeof(*fparamsPtr));
            fparamsPtr->frameContentSize = MEM_readLE32((const char *)src + 4);
            fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */
            return 0;
@ -3175,11 +3175,13 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
        U32 windowSize = 0;
        U32 dictID = 0;
        U64 frameContentSize = 0;
-        if ((fhdByte & 0x08) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits, which must be zero */
+        if ((fhdByte & 0x08) != 0)   /* reserved bits, which must be zero */
+            return ERROR(frameParameter_unsupported);
        if (!directMode) {
            BYTE const wlByte = ip[pos++];
            U32 const windowLog = (wlByte >> 3) + ZSTDv07_WINDOWLOG_ABSOLUTEMIN;
-            if (windowLog > ZSTDv07_WINDOWLOG_MAX) return ERROR(frameParameter_unsupported);
+            if (windowLog > ZSTDv07_WINDOWLOG_MAX)
+                return ERROR(frameParameter_unsupported);
            windowSize = (1U << windowLog);
            windowSize += (windowSize >> 3) * (wlByte&7);
        }
@ -3201,7 +3203,8 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
            case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
        }
        if (!windowSize) windowSize = (U32)frameContentSize;
-        if (windowSize > windowSizeMax) return ERROR(frameParameter_unsupported);
+        if (windowSize > windowSizeMax)
+            return ERROR(frameParameter_unsupported);
        fparamsPtr->frameContentSize = frameContentSize;
        fparamsPtr->windowSize = windowSize;
        fparamsPtr->dictID = dictID;
@ -3220,11 +3223,10 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
                   - frame header not completely provided (`srcSize` too small) */
 unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize)
 {
-    {   ZSTDv07_frameParams fparams;
-        size_t const frResult = ZSTDv07_getFrameParams(&fparams, src, srcSize);
-        if (frResult!=0) return 0;
-        return fparams.frameContentSize;
-    }
+    ZSTDv07_frameParams fparams;
+    size_t const frResult = ZSTDv07_getFrameParams(&fparams, src, srcSize);
+    if (frResult!=0) return 0;
+    return fparams.frameContentSize;
 }


--- a/lib/zstd.h
+++ b/lib/zstd.h
@ -35,26 +35,38 @@ extern "C" {
 #endif


-/*******************************************************************************************************
+/*******************************************************************************
  Introduction

-  zstd, short for Zstandard, is a fast lossless compression algorithm,
-  targeting real-time compression scenarios at zlib-level and better compression ratios.
-  The zstd compression library provides in-memory compression and decompression functions.
-  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
-  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
  Compression can be done in:
    - a single step (described as Simple API)
    - a single step, reusing a context (described as Explicit context)
    - unbounded multiple steps (described as Streaming compression)
-  The compression ratio achievable on small data can be highly improved using a dictionary in:
-    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)

-  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
-  Advanced experimental APIs shall never be used with a dynamic library.
-  They are not "stable", their definition may change in the future. Only static linking is allowed.
-*********************************************************************************************************/
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
+*******************************************************************************/

 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
@ -211,7 +223,8 @@ typedef struct ZSTD_CDict_s ZSTD_CDict;
 *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
 *  ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
 *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
- *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict */
+ *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict
+ *  Note : A ZSTD_CDict can be created with an empty dictionary, but it is inefficient for small data. */
 ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
                                         int compressionLevel);

@ -223,7 +236,9 @@ ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
 *  Compression using a digested Dictionary.
 *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
 *  Note that compression level is decided during dictionary creation.
- *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no)
+ *  Note : ZSTD_compress_usingCDict() can be used with a ZSTD_CDict created from an empty dictionary.
+ *         But it is inefficient for small data, and it is recommended to use ZSTD_compressCCtx(). */
 ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
                                            void* dst, size_t dstCapacity,
                                      const void* src, size_t srcSize,
@ -1161,16 +1176,21 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);

 /*! ZSTD_CCtx_refPrefix() :
 *  Reference a prefix (single-usage dictionary) for next compression job.
- *  Decompression need same prefix to properly regenerate data.
- *  Prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
+ *  Decompression will need same prefix to properly regenerate data.
+ *  Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+ *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ *  Note that prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
 * @result : 0, or an error code (which can be tested with ZSTD_isError()).
 *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
 *  Note 1 : Prefix buffer is referenced. It **must** outlive compression job.
 *           Its contain must remain unmodified up to end of compression (ZSTD_e_end).
- *  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ *           ensure that the window size is large enough to contain the entire source.
+ *           See ZSTD_p_windowLog.
+ *  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
 *           It's a CPU consuming operation, with non-negligible impact on latency.
 *           If there is a need to use same prefix multiple times, consider loadDictionary instead.
- *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *  Note 4 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
 *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. */
 ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
                                       const void* prefix, size_t prefixSize);
@ -1353,6 +1373,8 @@ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);

 /*! ZSTD_DCtx_refPrefix() :
 *  Reference a prefix (single-usage dictionary) for next compression job.
+ *  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ *  and must use the same prefix as the one used during compression.
 *  Prefix is **only used once**. Reference is discarded at end of frame.
 *  End of frame is reached when ZSTD_DCtx_decompress_generic() returns 0.
 * @result : 0, or an error code (which can be tested with ZSTD_isError()).
--- a/programs/Makefile
+++ b/programs/Makefile
@ -132,6 +132,15 @@ else
 LZ4_MSG := $(NO_LZ4_MSG)
 endif

+# enable backtrace symbol names for Linux/Darwin
+ALL_SYMBOLS := 0
+ifeq (,$(filter Windows%, $(OS)))
+ifeq ($(ALL_SYMBOLS), 1)
+DEBUGFLAGS_LD+=-rdynamic
+endif
+endif
+
+
 .PHONY: default
 default: zstd-release

@ -144,7 +153,7 @@ allVariants: zstd zstd-compress zstd-decompress zstd-small zstd-nolegacy
 $(ZSTDDECOMP_O): CFLAGS += $(ALIGN_LOOP)

 zstd : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP) $(LZMACPP) $(LZ4CPP)
-zstd : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD)
+zstd : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD) $(DEBUGFLAGS_LD)
 zstd : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
 zstd : $(ZSTDLIB_FILES) zstdcli.o fileio.o bench.o datagen.o dibio.o
 	@echo "$(THREAD_MSG)"
@ -158,6 +167,7 @@ endif

 .PHONY: zstd-release
 zstd-release: DEBUGFLAGS :=
+zstd-release: DEBUGFLAGS_LD :=
 zstd-release: zstd

 zstd32 : CPPFLAGS += $(THREAD_CPP)
--- a/programs/README.md
+++ b/programs/README.md
@ -61,6 +61,13 @@ There are however other Makefile targets that create different variations of CLI
  In which case, linking stage will fail if `lz4` library cannot be found.
  This is useful to prevent silent feature disabling.

+- __ALL_SYMBOLS__ : `zstd` can display a stack backtrace if the execution
+  generates a runtime exception. By default, this feature may be
+  degraded/disabled on some platforms unless additional compiler directives are
+  applied. When triaging a runtime issue, enabling this feature can provide
+  more context to determine the location of the fault.
+  Example : `make zstd ALL_SYMBOLS=1`
+

 #### Aggregation of parameters
 CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.
@ -151,6 +158,7 @@ Advanced arguments :
 Dictionary builder :
 --train ## : create a dictionary from a training set of files
 --train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args
+--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fastcover algorithm with optional args
 --train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
 -o file : `file` is dictionary name (default: dictionary)
 --maxdict=# : limit dictionary to specified size (default: 112640)
--- a/programs/bench.c
+++ b/programs/bench.c
--- a/programs/bench.h
+++ b/programs/bench.h
@ -15,59 +15,82 @@ extern "C" {
 #ifndef BENCH_H_121279284357
 #define BENCH_H_121279284357

+/* ===  Dependencies  === */
 #include <stddef.h>   /* size_t */
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_compressionParameters */
 #include "zstd.h"     /* ZSTD_compressionParameters */

-/* Creates a struct of type typeName with an int type .error field
- * and a .result field of some baseType. Functions with return
- * typeName pass a successful result with .error = 0 and .result
- * with the intended result, while returning an error will result
- * in .error != 0. 
+
+/* ===  Constants  === */
+
+#define MB_UNIT 1000000
+
+
+/* ===  Benchmark functions  === */
+
+/* Creates a variant `typeName`, able to express "error or valid result".
+ * Functions with return type `typeName`
+ * must first check if result is valid, using BMK_isSuccessful_*(),
+ * and only then can extract `baseType`.
 */
-#define ERROR_STRUCT(baseType, typeName) typedef struct { \
-    baseType result; \
-    int error;       \
-} typeName
+#define VARIANT_ERROR_RESULT(baseType, variantName)  \
+                                             \
+typedef struct {                             \
+    baseType internal_never_use_directly;    \
+    int tag;                                 \
+} variantName
+

 typedef struct {
    size_t cSize;
-    U64 cSpeed;   /* bytes / sec */
-    U64 dSpeed;
-    size_t cMem;
-} BMK_result_t;
+    unsigned long long cSpeed;   /* bytes / sec */
+    unsigned long long dSpeed;
+    size_t cMem;                 /* memory budget required for the compression context */
+} BMK_benchResult_t;

-ERROR_STRUCT(BMK_result_t, BMK_return_t);
+VARIANT_ERROR_RESULT(BMK_benchResult_t, BMK_benchOutcome_t);

-/* called in cli */
-/* Loads files in fileNamesTable into memory, as well as a dictionary 
- * from dictFileName, and then uses benchMem */
-/* fileNamesTable - name of files to benchmark
- * nbFiles - number of files (size of fileNamesTable), must be > 0
- * dictFileName - name of dictionary file to load
- * cLevel - compression level to benchmark, errors if invalid
- * compressionParams - basic compression Parameters
- * displayLevel - what gets printed
- *      0 : no display;   
- *      1 : errors;   
- *      2 : + result + interaction + warnings;   
- *      3 : + progression;   
- *      4 : + information
- * return 
- *      .error will give a nonzero error value if an error has occured
- *      .result - if .error = 0, .result will return the time taken to compression speed
- *          (.cSpeed), decompression speed (.dSpeed), and compressed size (.cSize) of the original
- *          file
+/* check first if the return structure represents an error or a valid result */
+int BMK_isSuccessful_benchOutcome(BMK_benchOutcome_t outcome);
+
+/* extract result from variant type.
+ * note : this function will abort() program execution if result is not valid
+ *        check result validity first, by using BMK_isSuccessful_benchOutcome()
 */
-BMK_return_t BMK_benchFiles(const char* const * const fileNamesTable, unsigned const nbFiles,
-                   const char* const dictFileName, 
-                   int const cLevel, const ZSTD_compressionParameters* const compressionParams, 
+BMK_benchResult_t BMK_extract_benchResult(BMK_benchOutcome_t outcome);
+
+
+/*! BMK_benchFiles() -- called by zstdcli */
+/*  Loads files from fileNamesTable into memory,
+ *  and an optional dictionary from dictFileName (can be NULL),
+ *  then uses benchMem().
+ *  fileNamesTable - name of files to benchmark.
+ *  nbFiles - number of files (size of fileNamesTable), must be > 0.
+ *  dictFileName - name of dictionary file to load.
+ *  cLevel - compression level to benchmark, errors if invalid.
+ *  compressionParams - advanced compression Parameters.
+ *  displayLevel - what gets printed:
+ *      0 : no display;
+ *      1 : errors;
+ *      2 : + result + interaction + warnings;
+ *      3 : + information;
+ *      4 : + debug
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
+ */
+BMK_benchOutcome_t BMK_benchFiles(
+                   const char* const * fileNamesTable, unsigned nbFiles,
+                   const char* dictFileName,
+                   int cLevel, const ZSTD_compressionParameters* compressionParams,
                   int displayLevel);

-typedef enum {
-    BMK_timeMode = 0,
-    BMK_iterMode = 1
-} BMK_loopMode_t;

 typedef enum {
    BMK_both = 0,
@ -77,15 +100,14 @@ typedef enum {

 typedef struct {
    BMK_mode_t mode;            /* 0: all, 1: compress only 2: decode only */
-    BMK_loopMode_t loopMode;    /* if loopmode, then nbSeconds = nbLoops */
    unsigned nbSeconds;         /* default timing is in nbSeconds */
-    size_t blockSize;           /* Maximum allowable size of a block*/
+    size_t blockSize;           /* Maximum size of each block*/
    unsigned nbWorkers;         /* multithreading */
    unsigned realTime;          /* real time priority */
    int additionalParam;        /* used by python speed benchmark */
    unsigned ldmFlag;           /* enables long distance matching */
-    unsigned ldmMinMatch;       /* below: parameters for long distance matching, see zstd.1.md for meaning */
-    unsigned ldmHashLog; 
+    unsigned ldmMinMatch;       /* below: parameters for long distance matching, see zstd.1.md */
+    unsigned ldmHashLog;
    unsigned ldmBucketSizeLog;
    unsigned ldmHashEveryLog;
 } BMK_advancedParams_t;
@ -93,132 +115,186 @@ typedef struct {
 /* returns default parameters used by nonAdvanced functions */
 BMK_advancedParams_t BMK_initAdvancedParams(void);

-/* See benchFiles for normal parameter uses and return, see advancedParams_t for adv */
-BMK_return_t BMK_benchFilesAdvanced(const char* const * const fileNamesTable, unsigned const nbFiles,
-                   const char* const dictFileName, 
-                   int const cLevel, const ZSTD_compressionParameters* const compressionParams, 
-                   int displayLevel, const BMK_advancedParams_t* const adv);
+/*! BMK_benchFilesAdvanced():
+ *  Same as BMK_benchFiles(),
+ *  with more controls, provided through advancedParams_t structure */
+BMK_benchOutcome_t BMK_benchFilesAdvanced(
+                   const char* const * fileNamesTable, unsigned nbFiles,
+                   const char* dictFileName,
+                   int cLevel, const ZSTD_compressionParameters* compressionParams,
+                   int displayLevel, const BMK_advancedParams_t* adv);

-/* called in cli */
-/* Generates a sample with datagen with the compressibility argument*/
-/* cLevel - compression level to benchmark, errors if invalid
- * compressibility - determines compressibility of sample
- * compressionParams - basic compression Parameters
- * displayLevel - see benchFiles
- * adv - see advanced_Params_t
- * return 
- *      .error will give a nonzero error value if an error has occured
- *      .result - if .error = 0, .result will return the time taken to compression speed
- *          (.cSpeed), decompression speed (.dSpeed), and compressed size (.cSize) of the original
- *          file
+/*! BMK_syntheticTest() -- called from zstdcli */
+/*  Generates a sample with datagen, using compressibility argument */
+/*  cLevel - compression level to benchmark, errors if invalid
+ *  compressibility - determines compressibility of sample
+ *  compressionParams - basic compression Parameters
+ *  displayLevel - see benchFiles
+ *  adv - see advanced_Params_t
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
 */
-BMK_return_t BMK_syntheticTest(int cLevel, double compressibility,
+BMK_benchOutcome_t BMK_syntheticTest(
+                              int cLevel, double compressibility,
                              const ZSTD_compressionParameters* compressionParams,
-                              int displayLevel, const BMK_advancedParams_t * const adv);
+                              int displayLevel, const BMK_advancedParams_t* adv);

-/* basic benchmarking function, called in paramgrill 
- * applies ZSTD_compress_generic() and ZSTD_decompress_generic() on data in srcBuffer
- * with specific compression parameters specified by other arguments using benchFunction
- * (cLevel, comprParams + adv in advanced Mode) */
-/* srcBuffer - data source, expected to be valid compressed data if in Decode Only Mode
- * srcSize - size of data in srcBuffer
- * cLevel - compression level  
- * comprParams - basic compression parameters
- * dictBuffer - a dictionary if used, null otherwise
- * dictBufferSize - size of dictBuffer, 0 otherwise
- * diplayLevel - see BMK_benchFiles
- * displayName - name used by display
- * return
- *      .error will give a nonzero value if an error has occured
- *      .result - if .error = 0, will give the same results as benchFiles
- *          but for the data stored in srcBuffer
+
+
+/* ===  Benchmark Zstandard in a memory-to-memory scenario  === */
+
+/** BMK_benchMem() -- core benchmarking function, called in paramgrill
+ *  applies ZSTD_compress_generic() and ZSTD_decompress_generic() on data in srcBuffer
+ *  with specific compression parameters provided by other arguments using benchFunction
+ *  (cLevel, comprParams + adv in advanced Mode) */
+/*  srcBuffer - data source, expected to be valid compressed data if in Decode Only Mode
+ *  srcSize - size of data in srcBuffer
+ *  fileSizes - srcBuffer is considered cut into 1+ segments, to compress separately.
+ *              note : sum(fileSizes) must be == srcSize.  (<== ensure it's properly checked)
+ *  nbFiles - nb of segments
+ *  cLevel - compression level
+ *  comprParams - basic compression parameters
+ *  dictBuffer - a dictionary if used, null otherwise
+ *  dictBufferSize - size of dictBuffer, 0 otherwise
+ *  displayLevel - see BMK_benchFiles
+ *  displayName - name used by display
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
 */
-BMK_return_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
+BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
                        const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                        const void* dictBuffer, size_t dictBufferSize,
                        int displayLevel, const char* displayName);

-/* See benchMem for normal parameter uses and return, see advancedParams_t for adv 
+/* BMK_benchMemAdvanced() : same as BMK_benchMem()
+ * with following additional options :
 * dstBuffer - destination buffer to write compressed output in, NULL if none provided.
 * dstCapacity - capacity of destination buffer, give 0 if dstBuffer = NULL
+ * adv = see advancedParams_t
 */
-BMK_return_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
-                        void* dstBuffer, size_t dstCapacity, 
+BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
+                        void* dstBuffer, size_t dstCapacity,
                        const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                        const void* dictBuffer, size_t dictBufferSize,
                        int displayLevel, const char* displayName,
                        const BMK_advancedParams_t* adv);

+
+
+/* ====  Benchmarking any function, iterated on a set of blocks  ==== */
+
 typedef struct {
-    size_t sumOfReturn;    /* sum of return values */
-    U64 nanoSecPerRun;     /* time per iteration */
-} BMK_customResult_t;
+    unsigned long long nanoSecPerRun;  /* time per iteration */
+    size_t sumOfReturn;       /* sum of return values */
+} BMK_runTime_t;

-ERROR_STRUCT(BMK_customResult_t, BMK_customReturn_t);
+VARIANT_ERROR_RESULT(BMK_runTime_t, BMK_runOutcome_t);

-typedef size_t (*BMK_benchFn_t)(const void*, size_t, void*, size_t, void*);
-typedef size_t (*BMK_initFn_t)(void*);
+/* check first if the return structure represents an error or a valid result */
+int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome);

-/* This function times the execution of 2 argument functions, benchFn and initFn  */
+/* extract result from variant type.
+ * note : this function will abort() program execution if result is not valid
+ *        check result validity first, by using BMK_isSuccessful_runOutcome()
+ */
+BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome);
+
+
+
+typedef size_t (*BMK_benchFn_t)(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* customPayload);
+typedef size_t (*BMK_initFn_t)(void* initPayload);
+
+
+/* BMK_benchFunction() :
+ * This function times the execution of 2 argument functions, benchFn and initFn  */

 /* benchFn - (*benchFn)(srcBuffers[i], srcSizes[i], dstBuffers[i], dstCapacities[i], benchPayload)
 *      is run nbLoops times
- * initFn - (*initFn)(initPayload) is run once per benchmark at the beginning. This argument can 
- *          be NULL, in which case nothing is run.
- * blockCount - number of blocks (size of srcBuffers, srcSizes, dstBuffers, dstCapacities)
+ * initFn - (*initFn)(initPayload) is run once per benchmark, at the beginning.
+ *      This argument can be NULL, in which case nothing is run.
+ * blockCount - number of blocks. Size of all array parameters : srcBuffers, srcSizes, dstBuffers, dstCapacities, blockResults
 * srcBuffers - an array of buffers to be operated on by benchFn
 * srcSizes - an array of the sizes of above buffers
 * dstBuffers - an array of buffers to be written into by benchFn
 * dstCapacities - an array of the capacities of above buffers
- * blockResults - the return value of benchFn called on each block.
+ * blockResults - Optional: store the return value of benchFn for each block. Use NULL if this result is not requested.
 * nbLoops - defines number of times benchFn is run.
- * assumed array of size blockCount, will have compressed size of each block written to it.
- * return 
- *      .error will give a nonzero value if ZSTD_isError() is nonzero for any of the return
- *          of the calls to initFn and benchFn, or if benchFunction errors internally
- *      .result - if .error = 0, then .result will contain the sum of all return values of 
- *          benchFn on the first iteration through all of the blocks (.sumOfReturn) and also 
- *          the time per run of benchFn (.nanoSecPerRun). For the former, this
- *          is generally intended to be used on functions which return the # of bytes written 
- *          into dstBuffer, hence this value will be the total amount of bytes written to 
- *          dstBuffer.
+ * @return: a variant, which expresses either an error, or can generate a valid BMK_runTime_t result.
+ *          Use BMK_isSuccessful_runOutcome() to check if function was successful.
+ *          If yes, extract the result with BMK_extract_runTime(),
+ *          it will contain :
+ *              .sumOfReturn : the sum of all return values of benchFn through all of blocks
+ *              .nanoSecPerRun : time per run of benchFn + (time for initFn / nbLoops)
+ *          .sumOfReturn is generally intended for functions which return a # of bytes written into dstBuffer,
+ *              in which case, this value will be the total amount of bytes written into dstBuffer.
 */
-BMK_customReturn_t BMK_benchFunction(BMK_benchFn_t benchFn, void* benchPayload,
+BMK_runOutcome_t BMK_benchFunction(
+                        BMK_benchFn_t benchFn, void* benchPayload,
                        BMK_initFn_t initFn, void* initPayload,
                        size_t blockCount,
-                        const void* const * const srcBuffers, const size_t* srcSizes,
-                        void * const * const dstBuffers, const size_t* dstCapacities, size_t* blockResults,  
+                        const void *const * srcBuffers, const size_t* srcSizes,
+                        void *const * dstBuffers, const size_t* dstCapacities,
+                        size_t* blockResults,
                        unsigned nbLoops);


-/* state information needed to advance computation for benchFunctionTimed */
-typedef struct BMK_timeState_t BMK_timedFnState_t;
-/* initializes timeState object with desired number of seconds */
-BMK_timedFnState_t* BMK_createTimeState(unsigned nbSeconds);
-/* resets existing timeState object */
-void BMK_resetTimeState(BMK_timedFnState_t*, unsigned nbSeconds);
-/* deletes timeState object */
-void BMK_freeTimeState(BMK_timedFnState_t* state);

-typedef struct {
-    BMK_customReturn_t result;
-    int completed;
-} BMK_customTimedReturn_t;
+/* ====  Benchmark any function, providing intermediate results  ==== */

-/* 
- * Benchmarks custom functions like BMK_benchFunction(), but runs for nbSeconds seconds rather than a fixed number of loops
- * arguments mostly the same other than BMK_benchFunction()
- * Usage - benchFunctionTimed will return in approximately one second. Keep calling BMK_benchFunctionTimed() until the return's completed field = 1. 
- * to continue updating intermediate result. Intermediate return values are returned by the function.
+/* state information tracking benchmark session */
+typedef struct BMK_timedFnState_s BMK_timedFnState_t;
+
+/* BMK_createTimedFnState() and BMK_resetTimedFnState() :
+ * Create/Set BMK_timedFnState_t for next benchmark session,
+ * which shall last a minimum of total_ms milliseconds,
+ * producing intermediate results, paced at interval of (approximately) run_ms.
 */
-BMK_customTimedReturn_t BMK_benchFunctionTimed(BMK_timedFnState_t* cont,
-    BMK_benchFn_t benchFn, void* benchPayload,
-    BMK_initFn_t initFn, void* initPayload,
-    size_t blockCount,
-    const void* const * const srcBlockBuffers, const size_t* srcBlockSizes,
-    void* const * const dstBlockBuffers, const size_t* dstBlockCapacities, size_t* blockResults);
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms);
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms);
+void BMK_freeTimedFnState(BMK_timedFnState_t* state);
+
+
+/* Tells if duration of all benchmark runs has exceeded total_ms
+ */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState);
+
+
+/* BMK_benchTimedFn() :
+ * Similar to BMK_benchFunction(), most arguments being identical.
+ * Automatically determines `nbLoops` so that each result is regularly produced at interval of about run_ms.
+ * Note : minimum `nbLoops` is 1, therefore a run may last more than run_ms, and possibly even more than total_ms.
+ * Usage - initialize timedFnState, select benchmark duration (total_ms) and each measurement duration (run_ms)
+ *         call BMK_benchTimedFn() repeatedly, each measurement is supposed to last about run_ms
+ *         Check if total time budget is spent or exceeded, using BMK_isCompleted_TimedFn()
+ */
+BMK_runOutcome_t BMK_benchTimedFn(
+                    BMK_timedFnState_t* timedFnState,
+                    BMK_benchFn_t benchFn, void* benchPayload,
+                    BMK_initFn_t initFn, void* initPayload,
+                    size_t blockCount,
+                    const void *const * srcBlockBuffers, const size_t* srcBlockSizes,
+                    void *const * dstBlockBuffers, const size_t* dstBlockCapacities,
+                    size_t* blockResults);
+
+
+
+

 #endif   /* BENCH_H_121279284357 */

--- a/programs/dibio.c
+++ b/programs/dibio.c
@ -27,6 +27,7 @@
 #include <string.h>         /* memset */
 #include <stdio.h>          /* fprintf, fopen, ftello64 */
 #include <errno.h>          /* errno */
+#include <assert.h>

 #include "mem.h"            /* read */
 #include "error_private.h"
@ -43,6 +44,7 @@
 #define SAMPLESIZE_MAX (128 KB)
 #define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
 #define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
+#define FASTCOVER_MEMMULT 1    /* rough estimation : memory cost to analyze 1 byte of sample */
 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));

 #define NOISELENGTH 32
@ -165,6 +167,7 @@ static U32 DiB_rand(U32* src)
 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
    U32 seed = 0xFD2FB528;
    unsigned i;
+    assert(nbFiles >= 1);
    for (i = nbFiles - 1; i > 0; --i) {
        unsigned const j = DiB_rand(&seed) % (i + 1);
        const char* const tmp = fileNamesTable[j];
@ -269,16 +272,19 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCa

 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                       const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
-                       int optimizeCover)
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
 {
    unsigned const displayLevel = params ? params->zParams.notificationLevel :
                        coverParams ? coverParams->zParams.notificationLevel :
+                        fastCoverParams ? fastCoverParams->zParams.notificationLevel :
                        0;   /* should never happen */
    void* const dictBuffer = malloc(maxDictSize);
    fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
+    size_t const memMult = params ? MEMMULT :
+                           coverParams ? COVER_MEMMULT:
+                           FASTCOVER_MEMMULT;
    size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
@ -310,7 +316,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
    /* Load input buffer */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);
-    nbFiles = DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+
+    DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);

    {   size_t dictSize;
        if (params) {
@ -318,17 +325,36 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
            dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
                                                           srcBuffer, sampleSizes, fs.nbSamples,
                                                           *params);
-        } else if (optimizeCover) {
-            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
-                                                           srcBuffer, sampleSizes, fs.nbSamples,
-                                                           coverParams);
-            if (!ZDICT_isError(dictSize)) {
-                unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
-                DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d, coverParams->steps, splitPercentage);
+        } else if (coverParams) {
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
+                                                             srcBuffer, sampleSizes, fs.nbSamples,
+                                                             coverParams);
+              if (!ZDICT_isError(dictSize)) {
+                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
+                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
+                              coverParams->steps, splitPercentage);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+                                                     sampleSizes, fs.nbSamples, *coverParams);
            }
        } else {
-            dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
-                                                   sampleSizes, fs.nbSamples, *coverParams);
+            assert(fastCoverParams != NULL);
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
+                                                              srcBuffer, sampleSizes, fs.nbSamples,
+                                                              fastCoverParams);
+              if (!ZDICT_isError(dictSize)) {
+                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
+                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
+                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
+                            fastCoverParams->accel);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
+                                                        sampleSizes, fs.nbSamples, *fastCoverParams);
+            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
--- a/programs/dibio.h
+++ b/programs/dibio.h
@ -33,7 +33,7 @@
 */
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                       const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
-                       int optimizeCover);
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);

 #endif
--- a/programs/fileio.c
+++ b/programs/fileio.c
@ -30,6 +30,10 @@
 #include <stdlib.h>     /* malloc, free */
 #include <string.h>     /* strcmp, strlen */
 #include <errno.h>      /* errno */
+#include <signal.h>
+#ifndef _WIN32
+#include <execinfo.h>   /* backtrace, backtrace_symbols */
+#endif

 #if defined (_MSC_VER)
 #  include <sys/stat.h>
@ -125,8 +129,6 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 /*-************************************
 *  Signal (Ctrl-C trapping)
 **************************************/
-#include  <signal.h>
-
 static const char* g_artefact = NULL;
 static void INThandler(int sig)
 {
@ -158,7 +160,60 @@ static void clearHandler(void)
 }


-/* ************************************************************
+/*-*********************************************************
+*  Termination signal trapping (Print debug stack trace)
+***********************************************************/
+#define MAX_STACK_FRAMES    50
+
+#ifndef _WIN32
+/* ABRThandler() :
+ * Crash-signal handler, installed by FIO_addAbortHandler().
+ * Prints the name of the received signal, followed by a symbolic stack trace,
+ * then restores the default disposition and re-raises the signal,
+ * so that the process still terminates the way it would without this handler.
+ * note : backtrace_symbols() allocates memory (hence the free() below),
+ *        which is not async-signal-safe : this is a best-effort debugging aid.
+ * @sig : number of the signal being handled */
+static void ABRThandler(int sig) {
+    const char* name;
+    void* addrlist[MAX_STACK_FRAMES];
+    char** symbollist;
+    int addrlen, i;   /* backtrace() returns an int, backtrace_symbols() takes an int */
+
+    switch (sig) {
+        case SIGABRT: name = "SIGABRT"; break;
+        case SIGBUS: name = "SIGBUS"; break;
+        case SIGFPE: name = "SIGFPE"; break;
+        case SIGILL: name = "SIGILL"; break;
+        case SIGINT: name = "SIGINT"; break;
+        case SIGSEGV: name = "SIGSEGV"; break;
+        default: name = "UNKNOWN";
+    }
+
+    DISPLAY("Caught %s signal, printing stack:\n", name);
+    /* Retrieve current stack addresses. */
+    addrlen = backtrace(addrlist, MAX_STACK_FRAMES);
+    /* Create readable strings to each frame (NULL when nothing could be generated). */
+    symbollist = (addrlen > 0) ? backtrace_symbols(addrlist, addrlen) : NULL;
+    if (symbollist == NULL) {
+        DISPLAY("\n");
+    } else {
+        /* Print the stack trace, excluding calls handling the signal. */
+        for (i = ZSTD_START_SYMBOLLIST_FRAME; i < addrlen; i++) {
+            DISPLAY("%s\n", symbollist[i]);
+        }
+        free(symbollist);
+    }
+    /* Always reset and raise the signal so default handler runs :
+     * merely returning from a SIGSEGV / SIGBUS / SIGFPE / SIGILL handler
+     * would restart the faulting instruction and re-enter this handler. */
+    signal(sig, SIG_DFL);
+    raise(sig);
+}
+#endif
+
+/* FIO_addAbortHandler() :
+ * Registers ABRThandler() for SIGABRT, SIGFPE, SIGILL, SIGSEGV and SIGBUS.
+ * Does nothing when compiled for Windows (_WIN32).
+ * The parameter list is spelled (void) to match the prototype in fileio.h. */
+void FIO_addAbortHandler(void)
+{
+#ifndef _WIN32
+    signal(SIGABRT, ABRThandler);
+    signal(SIGFPE, ABRThandler);
+    signal(SIGILL, ABRThandler);
+    signal(SIGSEGV, ABRThandler);
+    signal(SIGBUS, ABRThandler);
+#endif
+}
+
+
+/*-************************************************************
 * Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW
 ***************************************************************/
 #if defined(_MSC_VER) && _MSC_VER >= 1400
@ -1119,8 +1174,8 @@ int FIO_compressMultipleFilenames(const char** inFileNamesTable, unsigned nbFile
                if (!dstFileName) {
                    EXM_THROW(30, "zstd: %s", strerror(errno));
            }   }
-            strcpy(dstFileName, inFileNamesTable[u]);
-            strcat(dstFileName, suffix);
+            strncpy(dstFileName, inFileNamesTable[u], ifnSize+1 /* Include null */);
+            strncat(dstFileName, suffix, suffixSize);
            missed_files += FIO_compressFilename_dstFile(ress, dstFileName, inFileNamesTable[u], compressionLevel);
    }   }

--- a/programs/fileio.h
+++ b/programs/fileio.h
@ -96,6 +96,9 @@ int FIO_decompressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles
                                    const char* dictFileName);


+/* custom crash signal handler */
+void FIO_addAbortHandler(void);
+
 #if defined (__cplusplus)
 }
 #endif
--- a/programs/platform.h
+++ b/programs/platform.h
@ -148,6 +148,17 @@ static __inline int IS_CONSOLE(FILE* stdStream) {
 #endif


+/* ZSTD_START_SYMBOLLIST_FRAME :
+ * index of the first backtrace frame printed by the crash-signal handler (fileio.c).
+ * Frames below this index are skipped, to exclude the calls handling the signal itself.
+ * The value depends on the platform, and can be overridden at build time
+ * by defining the macro beforehand. */
+#ifndef ZSTD_START_SYMBOLLIST_FRAME
+#  ifdef __linux__
+#    define ZSTD_START_SYMBOLLIST_FRAME 2
+#  elif defined __APPLE__
+#    define ZSTD_START_SYMBOLLIST_FRAME 4
+#  else
+#    define ZSTD_START_SYMBOLLIST_FRAME 0
+#  endif
+#endif
+
+
 #if defined (__cplusplus)
 }
 #endif
--- a/programs/util.h
+++ b/programs/util.h
@ -319,15 +319,18 @@ UTIL_STATIC U32 UTIL_isDirectory(const char* infilename)

 UTIL_STATIC U32 UTIL_isLink(const char* infilename)
 {
-#if defined(_WIN32)
-    /* no symlinks on windows */
-    (void)infilename;
-#else
+/* macro guards, as defined in : https://linux.die.net/man/2/lstat */
+#if defined(_BSD_SOURCE) \
+    || (defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE >= 500)) \
+    || (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) \
+    || (defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) \
+    || (defined(__APPLE__) && defined(__MACH__))
    int r;
    stat_t statbuf;
    r = lstat(infilename, &statbuf);
    if (!r && S_ISLNK(statbuf.st_mode)) return 1;
 #endif
+    (void)infilename;
    return 0;
 }

@ -526,7 +529,10 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
 * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer)
 * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called.
 */
-UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb, int followLinks)
+UTIL_STATIC const char**
+UTIL_createFileList(const char **inputNames, unsigned inputNamesNb,
+                    char** allocatedBuffer, unsigned* allocatedNamesNb,
+                    int followLinks)
 {
    size_t pos;
    unsigned i, nbFiles;
--- a/programs/zstd.1
+++ b/programs/zstd.1
@ -194,7 +194,7 @@ All arguments after \fB\-\-\fR are treated as files
 Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
 .
 .IP
-Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\.
+Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\.  The cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Equivalent to \fB\-\-train\-fastcover=d=8,steps=4\fR\.
 .
 .TP
 \fB\-o file\fR
@ -240,6 +240,25 @@ Examples:
 \fBzstd \-\-train\-cover=k=50 FILEs\fR
 .
 .TP
+\fB\-\-train\-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#]\fR
+Same as cover but with extra parameters \fIf\fR and \fIaccel\fR and different default value of split
+.
+.IP
+If \fIsplit\fR is not specified, then it tries \fIsplit\fR = 75. If \fIf\fR is not specified, then it tries \fIf\fR = 20. Requires that 0 < \fIf\fR < 32. If \fIaccel\fR is not specified, then it tries \fIaccel\fR = 1. Requires that 0 < \fIaccel\fR <= 10. Requires that \fId\fR = 6 or \fId\fR = 8.
+.
+.IP
+\fIf\fR is log of size of array that keeps track of frequency of subsegments of size \fId\fR. The subsegment is hashed to an index in the range [0,2^\fIf\fR - 1]. It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency. Using a higher \fIf\fR reduces collision but takes longer.
+.
+.IP
+Examples:
+.
+.IP
+\fBzstd \-\-train\-fastcover FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-fastcover=d=8,f=15,accel=2 FILEs\fR
+.
+.TP
 \fB\-\-train\-legacy[=selectivity=#]\fR
 Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
 .
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@ -207,9 +207,10 @@ Compression of small files similar to the sample set will be greatly improved.
    (for example, 10 MB for a 100 KB dictionary).

    Supports multithreading if `zstd` is compiled with threading support.
-    Additional parameters can be specified with `--train-cover`.
+    Additional parameters can be specified with `--train-fastcover`.
    The legacy dictionary builder can be accessed with `--train-legacy`.
-    Equivalent to `--train-cover=d=8,steps=4`.
+    The cover dictionary builder can be accessed with `--train-cover`.
+    Equivalent to `--train-fastcover=d=8,steps=4`.
 * `-o file`:
    Dictionary saved into `file` (default name: dictionary).
 * `--maxdict=#`:
@ -261,6 +262,26 @@ Compression of small files similar to the sample set will be greatly improved.

    `zstd --train-cover=k=50,split=60 FILEs`

+* `--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#]`:
+    Same as cover but with extra parameters _f_ and _accel_ and different default value of split.
+    If _split_ is not specified, then it tries _split_ = 75.
+    If _f_ is not specified, then it tries _f_ = 20.
+    Requires that 0 < _f_ < 32.
+    If _accel_ is not specified, then it tries _accel_ = 1.
+    Requires that 0 < _accel_ <= 10.
+    Requires that _d_ = 6 or _d_ = 8.
+
+    _f_ is log of size of array that keeps track of frequency of subsegments of size _d_.
+    The subsegment is hashed to an index in the range [0,2^_f_ - 1].
+    It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency.
+    Using a higher _f_ reduces collision but takes longer.
+
+    Examples:
+
+    `zstd --train-fastcover FILEs`
+
+    `zstd --train-fastcover=d=8,f=15,accel=2 FILEs`
+
 * `--train-legacy[=selectivity=#]`:
    Use legacy dictionary builder algorithm with the given dictionary
    _selectivity_ (default: 9).
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@ -84,7 +84,10 @@ static U32 g_ldmMinMatch = 0;
 static U32 g_ldmHashEveryLog = LDM_PARAM_DEFAULT;
 static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT;

-#define DEFAULT_SPLITPOINT 1.0
+
+#define DEFAULT_ACCEL 1
+
+typedef enum { cover, fastCover, legacy } dictType;

 /*-************************************
 *  Display Macros
@ -173,6 +176,7 @@ static int usage_advanced(const char* programName)
    DISPLAY( "Dictionary builder : \n");
    DISPLAY( "--train ## : create a dictionary from a training set of files \n");
    DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n");
    DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
    DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
    DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
@ -296,6 +300,33 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
    return 1;
 }

+/**
+ * parseFastCoverParameters() :
+ * Parses the comma-separated `key=value` list that follows `--train-fastcover=`
+ * (e.g. "k=48,d=8,f=20,steps=32,accel=2") into *params.
+ * *params is zeroed first, so any key that is not listed stays at 0.
+ * `split` is read as a percentage and stored as a fraction in params->splitPoint.
+ * @return 1 if the whole string was consumed as valid parameters, 0 otherwise
+ */
+static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_params_t* params)
+{
+    memset(params, 0, sizeof(*params));
+    for (;;) {
+        if      (longCommandWArg(&stringPtr, "k="))     params->k     = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "d="))     params->d     = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "f="))     params->f     = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "steps=")) params->steps = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "accel=")) params->accel = readU32FromChar(&stringPtr);
+        else if (longCommandWArg(&stringPtr, "split=")) params->splitPoint = (double)readU32FromChar(&stringPtr) / 100.0;
+        else return 0;   /* unrecognized key */
+        if (stringPtr[0] != ',') break;   /* no further parameter */
+        stringPtr++;   /* skip the ',' separator */
+    }
+    if (stringPtr[0] != 0) return 0;   /* characters left over after the last value */
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel);
+    return 1;
+}
+
 /**
 * parseLegacyParameters() :
 * reads legacy dictionary builder parameters from *stringPtr (e.g. "--train-legacy=selectivity=8") into *selectivity
@ -317,7 +348,19 @@ static ZDICT_cover_params_t defaultCoverParams(void)
    memset(&params, 0, sizeof(params));
    params.d = 8;
    params.steps = 4;
-    params.splitPoint = DEFAULT_SPLITPOINT;
+    params.splitPoint = 1.0;
+    return params;
+}
+
+static ZDICT_fastCover_params_t defaultFastCoverParams(void)   /* initial fastcover parameter set; main() starts from this value */
+{
+    ZDICT_fastCover_params_t params;
+    memset(&params, 0, sizeof(params));   /* every field not assigned below (e.g. k) stays 0 */
+    params.d = 8;   /* same value as defaultCoverParams() */
+    params.f = 20;   /* fastcover-only field */
+    params.steps = 4;   /* same value as defaultCoverParams() */
+    params.splitPoint = 0.75; /* different from default splitPoint of cover */
+    params.accel = DEFAULT_ACCEL;   /* fastcover-only field */
     return params;
 }
 #endif
@ -433,7 +476,8 @@ int main(int argCount, const char* argv[])
 #endif
 #ifndef ZSTD_NODICT
    ZDICT_cover_params_t coverParams = defaultCoverParams();
-    int cover = 1;
+    ZDICT_fastCover_params_t fastCoverParams = defaultFastCoverParams();
+    dictType dict = fastCover;
 #endif
 #ifndef ZSTD_NOBENCH
    BMK_advancedParams_t benchParams = BMK_initAdvancedParams();
@ -469,6 +513,9 @@ int main(int argCount, const char* argv[])
    if (exeNameMatch(programName, ZSTD_UNLZ4)) { operation=zom_decompress; FIO_setCompressionType(FIO_lz4Compression); }                                   /* behave like unlz4, also supports multiple formats */
    memset(&compressionParams, 0, sizeof(compressionParams));

+    /* init crash handler */
+    FIO_addAbortHandler();
+
    /* command switches */
    for (argNb=1; argNb<argCount; argNb++) {
        const char* argument = argv[argNb];
@ -533,18 +580,29 @@ int main(int argCount, const char* argv[])
                      operation = zom_train;
                      if (outFileName == NULL)
                          outFileName = g_defaultDictName;
-                      cover = 1;
+                      dict = cover;
                      /* Allow optional arguments following an = */
                      if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
                      else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
                      continue;
                    }
+                    if (longCommandWArg(&argument, "--train-fastcover")) {
+                      operation = zom_train;
+                      if (outFileName == NULL)
+                          outFileName = g_defaultDictName;
+                      dict = fastCover;
+                      /* Allow optional arguments following an = */
+                      if (*argument == 0) { memset(&fastCoverParams, 0, sizeof(fastCoverParams)); }
+                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
+                      else if (!parseFastCoverParameters(argument, &fastCoverParams)) { CLEAN_RETURN(badusage(programName)); }
+                      continue;
+                    }
                    if (longCommandWArg(&argument, "--train-legacy")) {
                      operation = zom_train;
                      if (outFileName == NULL)
                          outFileName = g_defaultDictName;
-                      cover = 0;
+                      dict = legacy;
                      /* Allow optional arguments following an = */
                      if (*argument == 0) { continue; }
                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
@ -849,13 +907,13 @@ int main(int argCount, const char* argv[])
        if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel();
        if (cLevelLast < cLevel) cLevelLast = cLevel;
        if (cLevelLast > cLevel)
-            DISPLAYLEVEL(2, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
+            DISPLAYLEVEL(3, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
        if(filenameIdx) {
            if(separateFiles) {
                unsigned i;
                for(i = 0; i < filenameIdx; i++) {
                    int c;
-                    DISPLAYLEVEL(2, "Benchmarking %s \n", filenameTable[i]);
+                    DISPLAYLEVEL(3, "Benchmarking %s \n", filenameTable[i]);
                    for(c = cLevel; c <= cLevelLast; c++) {
                        BMK_benchFilesAdvanced(&filenameTable[i], 1, dictFileName, c, &compressionParams, g_displayLevel, &benchParams);
                    }
@ -884,17 +942,22 @@ int main(int argCount, const char* argv[])
        zParams.compressionLevel = dictCLevel;
        zParams.notificationLevel = g_displayLevel;
        zParams.dictID = dictID;
-        if (cover) {
+        if (dict == cover) {
            int const optimize = !coverParams.k || !coverParams.d;
            coverParams.nbThreads = nbWorkers;
            coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, NULL, optimize);
+        } else if (dict == fastCover) {
+            int const optimize = !fastCoverParams.k || !fastCoverParams.d;
+            fastCoverParams.nbThreads = nbWorkers;
+            fastCoverParams.zParams = zParams;
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, NULL, &fastCoverParams, optimize);
        } else {
            ZDICT_legacy_params_t dictParams;
            memset(&dictParams, 0, sizeof(dictParams));
            dictParams.selectivityLevel = dictSelect;
            dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, NULL, 0);
        }
 #endif
        goto _end;
--- a/tests/.gitignore
+++ b/tests/.gitignore
@ -26,6 +26,7 @@ invalidDictionaries
 checkTag
 zcat
 zstdcat
+tm

 # Tmp test directory
 zstdtest
--- a/tests/Makefile
+++ b/tests/Makefile
@ -200,7 +200,7 @@ zstreamtest-dll : $(ZSTDDIR)/common/xxhash.c  # xxh symbols not exposed from dll
 zstreamtest-dll : $(ZSTREAM_LOCAL_FILES)
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)

-paramgrill : DEBUGFLAGS = -DNDEBUG  # turn off assert() for speed measurements
+paramgrill : DEBUGFLAGS =  # turn off assert() by default for speed measurements
 paramgrill : $(ZSTD_FILES) $(PRGDIR)/bench.c $(PRGDIR)/datagen.c paramgrill.c
 	$(CC) $(FLAGS) $^ -lm -o $@$(EXT)

--- a/tests/decodecorpus.c
+++ b/tests/decodecorpus.c
@ -620,6 +620,8 @@ static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize)
 }

 static inline void initSeqStore(seqStore_t *seqStore) {
+    seqStore->maxNbSeq = MAX_NB_SEQ;
+    seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX;
    seqStore->sequencesStart = SEQUENCE_BUFFER;
    seqStore->litStart = SEQUENCE_LITERAL_BUFFER;
    seqStore->llCode = SEQUENCE_LLCODE;
--- a/tests/fullbench.c
+++ b/tests/fullbench.c
@ -51,6 +51,8 @@
 #define COMPRESSIBILITY_DEFAULT 0.50
 static const size_t g_sampleSize = 10000000;

+#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
+

 /*_************************************
 *  Macros
@ -92,63 +94,30 @@ static size_t BMK_findMaxMem(U64 requiredMem)
    return (size_t) requiredMem;
 }

-/*_*******************************************************
-*  Argument Parsing
-*********************************************************/
-
-#define ERROR_OUT(msg) { DISPLAY("%s \n", msg); exit(1); }
-
- static unsigned readU32FromChar(const char** stringPtr)
-{
-    const char errorMsg[] = "error: numeric value too large";
-    unsigned result = 0;
-    while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        unsigned const max = (((unsigned)(-1)) / 10) - 1;
-        if (result > max) ERROR_OUT(errorMsg);
-        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
-    }
-    if ((**stringPtr=='K') || (**stringPtr=='M')) {
-        unsigned const maxK = ((unsigned)(-1)) >> 10;
-        if (result > maxK) ERROR_OUT(errorMsg);
-        result <<= 10;
-        if (**stringPtr=='M') {
-            if (result > maxK) ERROR_OUT(errorMsg);
-            result <<= 10;
-        }
-        (*stringPtr)++;  /* skip `K` or `M` */
-        if (**stringPtr=='i') (*stringPtr)++;
-        if (**stringPtr=='B') (*stringPtr)++;
-    }
-    return result;
-}
-
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
-{
-    size_t const comSize = strlen(longCommand);
-    int const result = !strncmp(*stringPtr, longCommand, comSize);
-    if (result) *stringPtr += comSize;
-    return result;
-}

 /*_*******************************************************
 *  Benchmark wrappers
 *********************************************************/

-
 static ZSTD_CCtx* g_zcc = NULL;

-size_t local_ZSTD_compress(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
+static size_t
+local_ZSTD_compress(const void* src, size_t srcSize,
+                    void* dst, size_t dstSize,
+                    void* buff2)
 {
    ZSTD_parameters p;
-    ZSTD_frameParameters f = {1 /* contentSizeHeader*/, 0, 0};
+    ZSTD_frameParameters f = { 1 /* contentSizeHeader*/, 0, 0 };
    p.fParams = f;
    p.cParams = *(ZSTD_compressionParameters*)buff2;
-    return ZSTD_compress_advanced (g_zcc,dst, dstSize, src, srcSize, NULL ,0, p);
+    return ZSTD_compress_advanced (g_zcc, dst, dstSize, src, srcSize, NULL ,0, p);
    //return ZSTD_compress(dst, dstSize, src, srcSize, cLevel);
 }

 static size_t g_cSize = 0;
-size_t local_ZSTD_decompress(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
+static size_t local_ZSTD_decompress(const void* src, size_t srcSize,
+                                    void* dst, size_t dstSize,
+                                    void* buff2)
 {
    (void)src; (void)srcSize;
    return ZSTD_decompress(dst, dstSize, buff2, g_cSize);
@ -174,7 +143,10 @@ size_t local_ZSTD_decodeSeqHeaders(const void* src, size_t srcSize, void* dst, s
 #endif

 static ZSTD_CStream* g_cstream= NULL;
-size_t local_ZSTD_compressStream(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compressStream(const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity,
+                          void* buff2)
 {
    ZSTD_outBuffer buffOut;
    ZSTD_inBuffer buffIn;
@ -194,7 +166,10 @@ size_t local_ZSTD_compressStream(const void* src, size_t srcSize, void* dst, siz
    return buffOut.pos;
 }

-static size_t local_ZSTD_compress_generic_end(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_end(const void* src, size_t srcSize,
+                                void* dst, size_t dstCapacity,
+                                void* buff2)
 {
    ZSTD_outBuffer buffOut;
    ZSTD_inBuffer buffIn;
@ -209,7 +184,10 @@ static size_t local_ZSTD_compress_generic_end(const void* src, size_t srcSize, v
    return buffOut.pos;
 }

-static size_t local_ZSTD_compress_generic_continue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_continue(const void* src, size_t srcSize,
+                                     void* dst, size_t dstCapacity,
+                                     void* buff2)
 {
    ZSTD_outBuffer buffOut;
    ZSTD_inBuffer buffIn;
@ -225,7 +203,10 @@ static size_t local_ZSTD_compress_generic_continue(const void* src, size_t srcSi
    return buffOut.pos;
 }

-static size_t local_ZSTD_compress_generic_T2_end(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_T2_end(const void* src, size_t srcSize,
+                                   void* dst, size_t dstCapacity,
+                                   void* buff2)
 {
    ZSTD_outBuffer buffOut;
    ZSTD_inBuffer buffIn;
@ -241,7 +222,10 @@ static size_t local_ZSTD_compress_generic_T2_end(const void* src, size_t srcSize
    return buffOut.pos;
 }

-static size_t local_ZSTD_compress_generic_T2_continue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_T2_continue(const void* src, size_t srcSize,
+                                        void* dst, size_t dstCapacity,
+                                        void* buff2)
 {
    ZSTD_outBuffer buffOut;
    ZSTD_inBuffer buffIn;
@ -259,7 +243,10 @@ static size_t local_ZSTD_compress_generic_T2_continue(const void* src, size_t sr
 }

 static ZSTD_DStream* g_dstream= NULL;
-static size_t local_ZSTD_decompressStream(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_decompressStream(const void* src, size_t srcSize,
+                            void* dst, size_t dstCapacity,
+                            void* buff2)
 {
    ZSTD_outBuffer buffOut;
    ZSTD_inBuffer buffIn;
@ -276,10 +263,12 @@ static size_t local_ZSTD_decompressStream(const void* src, size_t srcSize, void*
 }

 #ifndef ZSTD_DLL_IMPORT
-size_t local_ZSTD_compressContinue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+size_t local_ZSTD_compressContinue(const void* src, size_t srcSize,
+                                   void* dst, size_t dstCapacity,
+                                   void* buff2)
 {
    ZSTD_parameters p;
-    ZSTD_frameParameters f = {1 /* contentSizeHeader*/, 0, 0};
+    ZSTD_frameParameters f = { 1 /* contentSizeHeader*/, 0, 0 };
    p.fParams = f;
    p.cParams = *(ZSTD_compressionParameters*)buff2;
    ZSTD_compressBegin_advanced(g_zcc, NULL, 0, p, srcSize);
@ -287,26 +276,38 @@ size_t local_ZSTD_compressContinue(const void* src, size_t srcSize, void* dst, s
 }

 #define FIRST_BLOCK_SIZE 8
-size_t local_ZSTD_compressContinue_extDict(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+size_t local_ZSTD_compressContinue_extDict(const void* src, size_t srcSize,
+                                           void* dst, size_t dstCapacity,
+                                           void* buff2)
 {
    BYTE firstBlockBuf[FIRST_BLOCK_SIZE];
-    
+
    ZSTD_parameters p;
-    ZSTD_frameParameters f = {1 , 0, 0};
+    ZSTD_frameParameters f = { 1, 0, 0 };
    p.fParams = f;
    p.cParams = *(ZSTD_compressionParameters*)buff2;
    ZSTD_compressBegin_advanced(g_zcc, NULL, 0, p, srcSize);
    memcpy(firstBlockBuf, src, FIRST_BLOCK_SIZE);

-    {   size_t const compressResult = ZSTD_compressContinue(g_zcc, dst, dstCapacity, firstBlockBuf, FIRST_BLOCK_SIZE);
-        if (ZSTD_isError(compressResult)) { DISPLAY("local_ZSTD_compressContinue_extDict error : %s\n", ZSTD_getErrorName(compressResult)); return compressResult; }
+    {   size_t const compressResult = ZSTD_compressContinue(g_zcc,
+                                            dst, dstCapacity,
+                                            firstBlockBuf, FIRST_BLOCK_SIZE);
+        if (ZSTD_isError(compressResult)) {
+            DISPLAY("local_ZSTD_compressContinue_extDict error : %s\n",
+                    ZSTD_getErrorName(compressResult));
+            return compressResult;
+        }
        dst = (BYTE*)dst + compressResult;
        dstCapacity -= compressResult;
    }
-    return ZSTD_compressEnd(g_zcc, dst, dstCapacity, (const BYTE*)src + FIRST_BLOCK_SIZE, srcSize - FIRST_BLOCK_SIZE);
+    return ZSTD_compressEnd(g_zcc, dst, dstCapacity,
+                            (const BYTE*)src + FIRST_BLOCK_SIZE,
+                            srcSize - FIRST_BLOCK_SIZE);
 }

-size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize,
+                                           void* dst, size_t dstCapacity,
+                                           void* buff2)
 {
    size_t regeneratedSize = 0;
    const BYTE* ip = (const BYTE*)buff2;
@ -314,7 +315,7 @@ size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize, void* dst,
    BYTE* op = (BYTE*)dst;
    size_t remainingCapacity = dstCapacity;

-    (void)src; (void)srcSize;
+    (void)src; (void)srcSize;  /* unused */
    ZSTD_decompressBegin(g_zdc);
    while (ip < iend) {
        size_t const iSize = ZSTD_nextSrcSizeToDecompress(g_zdc);
@ -333,14 +334,16 @@ size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize, void* dst,
 /*_*******************************************************
 *  Bench functions
 *********************************************************/
-static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel, ZSTD_compressionParameters* cparams)
+static size_t benchMem(U32 benchNb,
+                       const void* src, size_t srcSize,
+                       int cLevel, ZSTD_compressionParameters cparams)
 {
-    BYTE*  dstBuff;
    size_t dstBuffSize = ZSTD_compressBound(srcSize);
-    void*  buff2, *buff1;
+    BYTE*  dstBuff;
+    void*  dstBuff2;
+    void*  buff2;
    const char* benchName;
    BMK_benchFn_t benchFunction;
-    BMK_customReturn_t r;
    int errorcode = 0;

    /* Selection */
@ -393,56 +396,56 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel,

    /* Allocation */
    dstBuff = (BYTE*)malloc(dstBuffSize);
-    buff2 = malloc(dstBuffSize);
-    if ((!dstBuff) || (!buff2)) {
+    dstBuff2 = malloc(dstBuffSize);
+    if ((!dstBuff) || (!dstBuff2)) {
        DISPLAY("\nError: not enough memory!\n");
-        free(dstBuff); free(buff2);
+        free(dstBuff); free(dstBuff2);
        return 12;
    }
-    buff1 = buff2;
+    buff2 = dstBuff2;
    if (g_zcc==NULL) g_zcc = ZSTD_createCCtx();
    if (g_zdc==NULL) g_zdc = ZSTD_createDCtx();
    if (g_cstream==NULL) g_cstream = ZSTD_createCStream();
    if (g_dstream==NULL) g_dstream = ZSTD_createDStream();

-    /* DISPLAY("params: cLevel %d, wlog %d hlog %d clog %d slog %d slen %d tlen %d strat %d \n"
-        , cLevel, cparams->windowLog, cparams->hashLog, cparams->chainLog, cparams->searchLog, 
-          cparams->searchLength, cparams->targetLength, cparams->strategy);*/
+    /* DISPLAY("params: cLevel %d, wlog %d hlog %d clog %d slog %d slen %d tlen %d strat %d \n",
+          cLevel, cparams->windowLog, cparams->hashLog, cparams->chainLog, cparams->searchLog,
+          cparams->searchLength, cparams->targetLength, cparams->strategy); */

    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionLevel, cLevel);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_windowLog, cparams->windowLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_hashLog, cparams->hashLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_chainLog, cparams->chainLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_searchLog, cparams->searchLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_minMatch, cparams->searchLength);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_targetLength, cparams->targetLength);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionStrategy, cparams->strategy);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_windowLog, cparams.windowLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_hashLog, cparams.hashLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_chainLog, cparams.chainLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_searchLog, cparams.searchLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_minMatch, cparams.searchLength);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_targetLength, cparams.targetLength);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionStrategy, cparams.strategy);


    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, cLevel);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_windowLog, cparams->windowLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_hashLog, cparams->hashLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_chainLog, cparams->chainLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_searchLog, cparams->searchLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_minMatch, cparams->searchLength);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_targetLength, cparams->targetLength);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionStrategy, cparams->strategy);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_windowLog, cparams.windowLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_hashLog, cparams.hashLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_chainLog, cparams.chainLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_searchLog, cparams.searchLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_minMatch, cparams.searchLength);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_targetLength, cparams.targetLength);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionStrategy, cparams.strategy);

    /* Preparation */
    switch(benchNb)
    {
    case 1:
-        buff2 = (void*)cparams;
+        buff2 = &cparams;
        break;
    case 2:
        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
        break;
 #ifndef ZSTD_DLL_IMPORT
    case 11:
-        buff2 = (void*)cparams;
+        buff2 = &cparams;
        break;
    case 12:
-        buff2 = (void*)cparams;
+        buff2 = &cparams;
        break;
    case 13 :
        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
@ -494,8 +497,8 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel,
    case 31:
        goto _cleanOut;
 #endif
-    case 41 : 
-        buff2 = (void*)cparams;
+    case 41 :
+        buff2 = &cparams;
        break;
    case 42 :
        g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
@ -507,29 +510,50 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel,
    default : ;
    }

-
-     /* warming up memory */
+     /* warming up dstBuff */
    { size_t i; for (i=0; i<dstBuffSize; i++) dstBuff[i]=(BYTE)i; }

-
    /* benchmark loop */
-    {
-        void* dstBuffv = (void*)dstBuff;
-        r = BMK_benchFunction(benchFunction, buff2, 
-            NULL, NULL,  1, &src, &srcSize, 
-            &dstBuffv, &dstBuffSize, NULL, g_nbIterations);
-        if(r.error) {
-            DISPLAY("ERROR %d ! ! \n", r.error);
-            errorcode = r.error;
-            goto _cleanOut;
-        }
+    {   BMK_timedFnState_t* const tfs = BMK_createTimedFnState(g_nbIterations * 1000, 1000);
+        BMK_runTime_t bestResult;
+        bestResult.sumOfReturn = 0;
+        bestResult.nanoSecPerRun = (unsigned long long)(-1LL);
+        assert(tfs != NULL);
+        for (;;) {
+            void* const dstBuffv = dstBuff;
+            BMK_runOutcome_t const bOutcome =
+                    BMK_benchTimedFn( tfs,
+                            benchFunction, buff2,
+                            NULL, NULL,   /* initFn */
+                            1,  /* blockCount */
+                            &src, &srcSize,
+                            &dstBuffv, &dstBuffSize,
+                            NULL);

-        DISPLAY("%2u#Speed: %f MB/s - Size: %f MB - %s\n", benchNb, (double)srcSize / r.result.nanoSecPerRun * 1000, (double)r.result.sumOfReturn / 1000000, benchName);
+            if (!BMK_isSuccessful_runOutcome(bOutcome)) {
+                DISPLAY("ERROR benchmarking function ! ! \n");
+                errorcode = 1;
+                goto _cleanOut;
+            }
+
+            {   BMK_runTime_t const newResult = BMK_extract_runTime(bOutcome);
+                if (newResult.nanoSecPerRun < bestResult.nanoSecPerRun )
+                    bestResult.nanoSecPerRun = newResult.nanoSecPerRun;
+                DISPLAY("\r%2u#%-29.29s:%8.1f MB/s  (%8u) ",
+                        benchNb, benchName,
+                        (double)srcSize * TIMELOOP_NANOSEC / bestResult.nanoSecPerRun / MB_UNIT,
+                        (unsigned)newResult.sumOfReturn );
+            }
+
+            if ( BMK_isCompleted_TimedFn(tfs) ) break;
+        }
+        BMK_freeTimedFnState(tfs);
    }
-    
+    DISPLAY("\n");
+
 _cleanOut:
-    free(buff1);
    free(dstBuff);
+    free(dstBuff2);
    ZSTD_freeCCtx(g_zcc); g_zcc=NULL;
    ZSTD_freeDCtx(g_zdc); g_zdc=NULL;
    ZSTD_freeCStream(g_cstream); g_cstream=NULL;
@ -538,87 +562,138 @@ _cleanOut:
 }


-static int benchSample(U32 benchNb, int cLevel, ZSTD_compressionParameters* cparams)
+static int benchSample(U32 benchNb,
+                       int cLevel, ZSTD_compressionParameters cparams)
 {
    size_t const benchedSize = g_sampleSize;
-    const char* name = "Sample 10MiB";
+    const char* const name = "Sample 10MiB";

    /* Allocation */
-    void* origBuff = malloc(benchedSize);
+    void* const origBuff = malloc(benchedSize);
    if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); return 12; }

    /* Fill buffer */
    RDG_genBuffer(origBuff, benchedSize, g_compressibility, 0.0, 0);

    /* bench */
-    DISPLAY("\r%79s\r", "");
+    DISPLAY("\r%70s\r", "");
    DISPLAY(" %s : \n", name);
-    if (benchNb)
-        benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
-    else
-        for (benchNb=0; benchNb<100; benchNb++) benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
+    if (benchNb) {
+        benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+    } else {  /* 0 == run all tests */
+        for (benchNb=0; benchNb<100; benchNb++) {
+            benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+    }   }

    free(origBuff);
    return 0;
 }


-static int benchFiles(const char** fileNamesTable, const int nbFiles, U32 benchNb, int cLevel, ZSTD_compressionParameters* cparams)
+static int benchFiles(U32 benchNb,
+                      const char** fileNamesTable, const int nbFiles,
+                      int cLevel, ZSTD_compressionParameters cparams)
 {
    /* Loop for each file */
    int fileIdx;
    for (fileIdx=0; fileIdx<nbFiles; fileIdx++) {
        const char* const inFileName = fileNamesTable[fileIdx];
        FILE* const inFile = fopen( inFileName, "rb" );
-        U64   inFileSize;
        size_t benchedSize;
-        void* origBuff;

        /* Check file existence */
        if (inFile==NULL) { DISPLAY( "Pb opening %s\n", inFileName); return 11; }

        /* Memory allocation & restrictions */
-        inFileSize = UTIL_getFileSize(inFileName);
-        if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
-            DISPLAY( "Cannot measure size of %s\n", inFileName);
-            fclose(inFile);
-            return 11;
-        }
-        benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
-        if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
-        if (benchedSize < inFileSize)
-            DISPLAY("Not enough memory for '%s' full size; testing %u MB only...\n", inFileName, (U32)(benchedSize>>20));
-
-        /* Alloc */
-        origBuff = malloc(benchedSize);
-        if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); fclose(inFile); return 12; }
-
-        /* Fill input buffer */
-        DISPLAY("Loading %s...       \r", inFileName);
-        {
-            size_t readSize = fread(origBuff, 1, benchedSize, inFile);
-            fclose(inFile);
-            if (readSize != benchedSize) {
-                DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
-                free(origBuff);
-                return 13;
+        {   U64 const inFileSize = UTIL_getFileSize(inFileName);
+            if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
+                DISPLAY( "Cannot measure size of %s\n", inFileName);
+                fclose(inFile);
+                return 11;
+            }
+            benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
+            if ((U64)benchedSize > inFileSize)
+                benchedSize = (size_t)inFileSize;
+            if ((U64)benchedSize < inFileSize) {
+                DISPLAY("Not enough memory for '%s' full size; testing %u MB only... \n",
+                        inFileName, (U32)(benchedSize>>20));
        }   }

-        /* bench */
-        DISPLAY("\r%79s\r", "");
-        DISPLAY(" %s : \n", inFileName);
-        if (benchNb)
-            benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
-        else
-            for (benchNb=0; benchNb<100; benchNb++) benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
+        /* Alloc */
+        {   void* const origBuff = malloc(benchedSize);
+            if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); fclose(inFile); return 12; }

-        free(origBuff);
-    }
+            /* Fill input buffer */
+            DISPLAY("Loading %s...       \r", inFileName);
+            {   size_t const readSize = fread(origBuff, 1, benchedSize, inFile);
+                fclose(inFile);
+                if (readSize != benchedSize) {
+                    DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
+                    free(origBuff);
+                    return 13;
+            }   }
+
+            /* bench */
+            DISPLAY("\r%70s\r", "");   /* blank line */
+            DISPLAY(" %s : \n", inFileName);
+            if (benchNb) {
+                benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+            } else {
+                for (benchNb=0; benchNb<100; benchNb++) {
+                    benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+            }   }
+
+            free(origBuff);
+    }   }

    return 0;
 }


+
+/*_*******************************************************
+*  Argument Parsing
+*********************************************************/
+
+#define ERROR_OUT(msg) { DISPLAY("%s \n", msg); exit(1); }
+
+/* readU32FromChar() :
+ * Reads an unsigned decimal number starting at *stringPtr, optionally followed by
+ * a `K` or `M` suffix (value shifted left by 10 or 20 bits), itself optionally
+ * followed by `i` and/or `B`. *stringPtr is advanced past everything consumed.
+ * Terminates the program through ERROR_OUT() when the value is considered too large. */
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned const maxBeforeDigit = (((unsigned)(-1)) / 10) - 1;   /* bound checked before appending a digit */
+    unsigned const maxBeforeShift = ((unsigned)(-1)) >> 10;        /* bound checked before each <<10 */
+    const char* cursor = *stringPtr;
+    unsigned value = 0;
+
+    for ( ; (*cursor >= '0') && (*cursor <= '9'); cursor++) {
+        if (value > maxBeforeDigit) ERROR_OUT(errorMsg);
+        value = value * 10 + (unsigned)(*cursor - '0');
+    }
+
+    if ((*cursor == 'K') || (*cursor == 'M')) {
+        if (value > maxBeforeShift) ERROR_OUT(errorMsg);
+        value <<= 10;
+        if (*cursor == 'M') {
+            if (value > maxBeforeShift) ERROR_OUT(errorMsg);
+            value <<= 10;
+        }
+        cursor++;   /* skip `K` or `M` */
+        if (*cursor == 'i') cursor++;
+        if (*cursor == 'B') cursor++;
+    }
+
+    *stringPtr = cursor;
+    return value;
+}
+
+/* longCommandWArg() :
+ * If *stringPtr starts with longCommand, moves *stringPtr just past it and returns 1.
+ * Otherwise returns 0 and leaves *stringPtr unchanged. */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const prefixLen = strlen(longCommand);
+    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) return 0;
+    *stringPtr += prefixLen;   /* consume the matched prefix */
+    return 1;
+}
+
+
+/*_*******************************************************
+*  Command line
+*********************************************************/
+
 static int usage(const char* exename)
 {
    DISPLAY( "Usage :\n");
@ -649,8 +724,8 @@ static int badusage(const char* exename)

 int main(int argc, const char** argv)
 {
-    int i, filenamesStart=0, result;
-    const char* exename = argv[0];
+    int argNb, filenamesStart=0, result;
+    const char* const exename = argv[0];
    const char* input_filename = NULL;
    U32 benchNb = 0, main_pause = 0;
    int cLevel = DEFAULT_CLEVEL;
@ -659,8 +734,8 @@ int main(int argc, const char** argv)
    DISPLAY(WELCOME_MESSAGE);
    if (argc<1) return badusage(exename);

-    for(i=1; i<argc; i++) {
-        const char* argument = argv[i];
+    for (argNb=1; argNb<argc; argNb++) {
+        const char* argument = argv[argNb];
        assert(argument != NULL);

        if (longCommandWArg(&argument, "--zstd=")) {
@ -677,12 +752,14 @@ int main(int argc, const char** argv)
                return 1;
            }

+            /* check end of string */
            if (argument[0] != 0) {
-                DISPLAY("invalid --zstd= format\n");
-                return 1; // check the end of string 
+                DISPLAY("invalid --zstd= format \n");
+                return 1;
            } else {
                continue;
            }
+
        } else if (argument[0]=='-') { /* Commands (note : aggregated commands are allowed) */
            argument++;
            while (argument[0]!=0) {
@ -698,35 +775,27 @@ int main(int argc, const char** argv)

                    /* Select specific algorithm to bench */
                case 'b':
-                    {
-                        argument++;
-                        benchNb = readU32FromChar(&argument);
-                        break;
-                    }
+                    argument++;
+                    benchNb = readU32FromChar(&argument);
+                    break;

                    /* Modify Nb Iterations */
                case 'i':
-                    {
-                        argument++;
-                        BMK_SetNbIterations((int)readU32FromChar(&argument));
-                    }
+                    argument++;
+                    BMK_SetNbIterations((int)readU32FromChar(&argument));
                    break;

                    /* Select compressibility of synthetic sample */
                case 'P':
-                    {   argument++;
-                        g_compressibility = (double)readU32FromChar(&argument) / 100.;
-                    }
+                    argument++;
+                    g_compressibility = (double)readU32FromChar(&argument) / 100.;
                    break;
                case 'l':
-                    {   argument++;
-                        cLevel = readU32FromChar(&argument);
-                        cparams = ZSTD_getCParams(cLevel, 0, 0);
-                    }
+                    argument++;
+                    cLevel = readU32FromChar(&argument);
+                    cparams = ZSTD_getCParams(cLevel, 0, 0);
                    break;

-
-
                    /* Unknown command */
                default : return badusage(exename);
                }
@ -735,15 +804,15 @@ int main(int argc, const char** argv)
        }

        /* first provided filename is input */
-        if (!input_filename) { input_filename=argument; filenamesStart=i; continue; }
+        if (!input_filename) { input_filename=argument; filenamesStart=argNb; continue; }
    }



    if (filenamesStart==0)   /* no input file */
-        result = benchSample(benchNb, cLevel, &cparams);
+        result = benchSample(benchNb, cLevel, cparams);
    else
-        result = benchFiles(argv+filenamesStart, argc-filenamesStart, benchNb, cLevel, &cparams);
+        result = benchFiles(benchNb, argv+filenamesStart, argc-filenamesStart, cLevel, cparams);

    if (main_pause) { int unused; printf("press enter...\n"); unused = getchar(); (void)unused; }

--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@ -179,13 +179,9 @@ static void FUZ_displayMallocStats(mallocCounter_t count)
        (U32)(count.totalMalloc >> 10));
 }

-static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
+static int FUZ_mallocTests_internal(unsigned seed, double compressibility, unsigned part,
+                void* inBuffer, size_t inSize, void* outBuffer, size_t outSize)
 {
-    size_t const inSize = 64 MB + 16 MB + 4 MB + 1 MB + 256 KB + 64 KB; /* 85.3 MB */
-    size_t const outSize = ZSTD_compressBound(inSize);
-    void* const inBuffer = malloc(inSize);
-    void* const outBuffer = malloc(outSize);
-
    /* test only played in verbose mode, as they are long */
    if (g_displayLevel<3) return 0;

@ -270,6 +266,28 @@ static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
    return 0;
 }

+/* FUZ_mallocTests() :
+ * Allocates the input and output buffers needed by the malloc tests,
+ * runs FUZ_mallocTests_internal() on them, then releases both buffers.
+ * @return : 0 when the tests are skipped (display level below 3),
+ *           otherwise the result of FUZ_mallocTests_internal().
+ * Note : exits the program if one of the buffers cannot be allocated.
+ */
+static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
+{
+    size_t const inSize = 64 MB + 16 MB + 4 MB + 1 MB + 256 KB + 64 KB; /* 85.3 MB */
+    size_t const outSize = ZSTD_compressBound(inSize);
+    int result;
+
+    /* tests are only played in verbose mode :
+     * do not allocate the large buffers (nor risk aborting) when nothing will run */
+    if (g_displayLevel<3) return 0;
+
+    {   void* const inBuffer = malloc(inSize);
+        void* const outBuffer = malloc(outSize);
+
+        /* both buffers are required by the tests */
+        if (!inBuffer || !outBuffer) {
+            DISPLAY("Not enough memory, aborting \n");
+            exit(1);
+        }
+
+        result = FUZ_mallocTests_internal(seed, compressibility, part,
+                        inBuffer, inSize, outBuffer, outSize);
+
+        free(inBuffer);
+        free(outBuffer);
+    }
+    return result;
+}
+
 #else

 static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
@ -1357,6 +1375,24 @@ static int basicUnitTests(U32 seed, double compressibility)
                ((BYTE*)CNBuffer)[i+1] = _3BytesSeqs[id][1];
                ((BYTE*)CNBuffer)[i+2] = _3BytesSeqs[id][2];
    }   }   }
+    DISPLAYLEVEL(3, "test%3i : growing nbSeq : ", testNb++);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const maxNbSeq = _3BYTESTESTLENGTH / 3;
+        size_t const bound = ZSTD_compressBound(_3BYTESTESTLENGTH);
+        size_t nbSeq = 1;
+        while (nbSeq <= maxNbSeq) {
+          CHECK(ZSTD_compressCCtx(cctx, compressedBuffer, bound, CNBuffer, nbSeq * 3, 19));
+          /* Check every sequence for the first 100, then skip more rapidly. */
+          if (nbSeq < 100) {
+            ++nbSeq;
+          } else {
+            nbSeq += (nbSeq >> 2);
+          }
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
    DISPLAYLEVEL(3, "test%3i : compress lots 3-bytes sequences : ", testNb++);
    { CHECK_V(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(_3BYTESTESTLENGTH),
                                 CNBuffer, _3BYTESTESTLENGTH, 19) );
@ -1368,8 +1404,26 @@ static int basicUnitTests(U32 seed, double compressibility)
      if (r != _3BYTESTESTLENGTH) goto _output_error; }
    DISPLAYLEVEL(3, "OK \n");

-    DISPLAYLEVEL(3, "test%3i : incompressible data and ill suited dictionary : ", testNb++);
+
+    DISPLAYLEVEL(3, "test%3i : growing literals buffer : ", testNb++);
    RDG_genBuffer(CNBuffer, CNBuffSize, 0.0, 0.1, seed);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const bound = ZSTD_compressBound(CNBuffSize);
+        size_t size = 1;
+        while (size <= CNBuffSize) {
+          CHECK(ZSTD_compressCCtx(cctx, compressedBuffer, bound, CNBuffer, size, 3));
+          /* Check every size for the first 100, then skip more rapidly. */
+          if (size < 100) {
+            ++size;
+          } else {
+            size += (size >> 2);
+          }
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3i : incompressible data and ill suited dictionary : ", testNb++);
    {   /* Train a dictionary on low characters */
        size_t dictSize = 16 KB;
        void* const dictBuffer = malloc(dictSize);
@ -1535,7 +1589,6 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
    size_t const dstBufferSize = (size_t)1<<maxSampleLog;
    size_t const cBufferSize   = ZSTD_compressBound(dstBufferSize);
    BYTE* cNoiseBuffer[5];
-    BYTE* srcBuffer;   /* jumping pointer */
    BYTE* const cBuffer = (BYTE*) malloc (cBufferSize);
    BYTE* const dstBuffer = (BYTE*) malloc (dstBufferSize);
    BYTE* const mirrorBuffer = (BYTE*) malloc (dstBufferSize);
@ -1544,7 +1597,7 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    U32 result = 0;
    U32 testNb = 0;
-    U32 coreSeed = seed, lseed = 0;
+    U32 coreSeed = seed;
    UTIL_time_t const startClock = UTIL_getTime();
    U64 const maxClockSpan = maxDurationS * SEC_TO_MICRO;
    int const cLevelLimiter = bigTests ? 3 : 2;
@ -1565,13 +1618,14 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
    RDG_genBuffer(cNoiseBuffer[2], srcBufferSize, compressibility, 0., coreSeed);
    RDG_genBuffer(cNoiseBuffer[3], srcBufferSize, 0.95, 0., coreSeed);    /* highly compressible */
    RDG_genBuffer(cNoiseBuffer[4], srcBufferSize, 1.00, 0., coreSeed);    /* sparse content */
-    srcBuffer = cNoiseBuffer[2];

    /* catch up testNb */
    for (testNb=1; testNb < startTest; testNb++) FUZ_rand(&coreSeed);

    /* main test loop */
    for ( ; (testNb <= nbTests) || (UTIL_clockSpanMicro(startClock) < maxClockSpan); testNb++ ) {
+        BYTE* srcBuffer;   /* jumping pointer */
+        U32 lseed;
        size_t sampleSize, maxTestSize, totalTestSize;
        size_t cSize, totalCSize, totalGenSize;
        U64 crcOrig;
@ -1802,11 +1856,9 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
        CHECK (totalGenSize != totalTestSize, "streaming decompressed data : wrong size")
        CHECK (totalCSize != cSize, "compressed data should be fully read")
        {   U64 const crcDest = XXH64(dstBuffer, totalTestSize, 0);
-            if (crcDest!=crcOrig) {
-                size_t const errorPos = findDiff(mirrorBuffer, dstBuffer, totalTestSize);
-                CHECK (1, "streaming decompressed data corrupted : byte %u / %u  (%02X!=%02X)",
-                   (U32)errorPos, (U32)totalTestSize, dstBuffer[errorPos], mirrorBuffer[errorPos]);
-        }   }
+            CHECK(crcOrig != crcDest, "streaming decompressed data corrupted (pos %u / %u)",
+                (U32)findDiff(mirrorBuffer, dstBuffer, totalTestSize), (U32)totalTestSize);
+        }
    }   /* for ( ; (testNb <= nbTests) */
    DISPLAY("\r%u fuzzer tests completed   \n", testNb-1);

--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@ -48,6 +48,10 @@ fileRoundTripTest() {
    $DIFF -q tmp.md5.1 tmp.md5.2
 }

+# truncateLastByte FILE :
+# writes the content of FILE to stdout, minus its last byte.
+# dd's transfer statistics are silenced by redirecting stderr :
+# `status=none` is a GNU extension, and a dd which rejects it outputs nothing at all,
+# which would let the "incomplete frame" checks pass without testing anything.
+truncateLastByte() {
+	dd bs=1 count=$(($(wc -c < "$1") - 1)) if="$1" 2> "${INTOVOID:-/dev/null}"
+}
+
 UNAME=$(uname)

 isTerminal=false
@ -427,7 +431,7 @@ $ECHO "- Create second (different) dictionary"
 $ZSTD --train-cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
 $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
 $ECHO "- Create dictionary with short dictID"
-$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1
+$ZSTD --train-cover=k=46,d=8,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
 $ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
@ -444,6 +448,47 @@ $ZSTD --train-cover *.c ../programs/*.c
 test -f dictionary
 rm tmp* dictionary

+
+$ECHO "\n===>  fastCover dictionary builder : advanced options "
+
+TESTFILE=../programs/zstdcli.c
+./datagen > tmpDict
+$ECHO "- Create first dictionary"
+$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c -o tmpDict
+cp $TESTFILE tmp
+$ZSTD -f tmp -D tmpDict
+$ZSTD -d tmp.zst -D tmpDict -fo result
+$DIFF $TESTFILE result
+$ECHO "- Create second (different) dictionary"
+$ZSTD --train-fastcover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
+$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
+$ECHO "- Create dictionary with short dictID"
+$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
+cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
+$ECHO "- Create dictionary with size limit"
+$ZSTD --train-fastcover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
+$ECHO "- Compare size of dictionary from 90% training samples with 80% training samples"
+$ZSTD --train-fastcover=split=90 -r *.c ../programs/*.c
+$ZSTD --train-fastcover=split=80 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using all samples for both training and testing"
+$ZSTD --train-fastcover=split=100 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using f=16"
+$ZSTD --train-fastcover=f=16 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using accel=2"
+$ZSTD --train-fastcover=accel=2 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using accel=10"
+$ZSTD --train-fastcover=accel=10 -r *.c ../programs/*.c
+$ECHO "- Create dictionary with multithreading"
+$ZSTD --train-fastcover -T4 -r *.c ../programs/*.c
+$ECHO "- Test -o before --train-fastcover"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train-fastcover *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train-fastcover *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
+
+
 $ECHO "\n===>  legacy dictionary builder "

 TESTFILE=../programs/zstdcli.c
@ -551,7 +596,7 @@ if [ $GZIPMODE -eq 1 ]; then
    $ZSTD -f --format=gzip tmp
    $ZSTD -f tmp
    cat tmp.gz tmp.zst tmp.gz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.gz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.gz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
    rm tmp*
 else
    $ECHO "gzip mode not supported"
@ -618,8 +663,8 @@ if [ $LZMAMODE -eq 1 ]; then
    $ZSTD -f --format=lzma tmp
    $ZSTD -f tmp
    cat tmp.xz tmp.lzma tmp.zst tmp.lzma tmp.xz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.xz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
-    head -c -1 tmp.lzma | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.xz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.lzma | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
    rm tmp*
 else
    $ECHO "xz mode not supported"
@ -655,7 +700,7 @@ if [ $LZ4MODE -eq 1 ]; then
    $ZSTD -f --format=lz4 tmp
    $ZSTD -f tmp
    cat tmp.lz4 tmp.zst tmp.lz4 tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.lz4 | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.lz4 | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
    rm tmp*
 else
    $ECHO "lz4 mode not supported"
--- a/tests/roundTripCrash.c
+++ b/tests/roundTripCrash.c
@ -212,7 +212,7 @@ static void loadFile(void* buffer, const char* fileName, size_t fileSize)
 static void fileCheck(const char* fileName, int testCCtxParams)
 {
    size_t const fileSize = getFileSize(fileName);
-    void* buffer = malloc(fileSize);
+    void* const buffer = malloc(fileSize + !fileSize /* avoid 0 */);
    if (!buffer) {
        fprintf(stderr, "not enough memory \n");
        exit(4);
--- a/tests/symbols.c
+++ b/tests/symbols.c
@ -144,6 +144,8 @@ static const void *symbols[] = {
 /* zdict.h: advanced functions */
  &ZDICT_trainFromBuffer_cover,
  &ZDICT_optimizeTrainFromBuffer_cover,
+  &ZDICT_trainFromBuffer_fastCover,
+  &ZDICT_optimizeTrainFromBuffer_fastCover,
  &ZDICT_finalizeDictionary,
  &ZDICT_trainFromBuffer_legacy,
  &ZDICT_addEntropyTablesFromBuffer,
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@ -135,34 +135,34 @@ typedef struct {
    size_t filled;
 } buffer_t;

-static const buffer_t g_nullBuffer = { NULL, 0 , 0 };
+static const buffer_t kBuffNull = { NULL, 0 , 0 };
+
+/* FUZ_freeDictionary() :
+ * releases the memory area referenced by dict.start.
+ * dict is received by value : the caller's own copy is not modified,
+ * so its .start pointer must not be used after this call.
+ * Accepts kBuffNull, since free(NULL) does nothing. */
+static void FUZ_freeDictionary(buffer_t dict)
+{
+    free(dict.start);
+}

+/* FUZ_createDictionary() :
+ * trains a dictionary from src, which is presented to ZDICT_trainFromBuffer()
+ * as consecutive samples of blockSize bytes (the last sample takes the remainder).
+ * @return : a buffer_t whose .start is a malloc'ed area of requestedDictSize bytes,
+ *           with .size == requestedDictSize and .filled == size reported by the trainer;
+ *           or kBuffNull when an allocation or the training fails.
+ *           The result is released with FUZ_freeDictionary().
+ * NOTE(review) : srcSize==0 gives nbBlocks==0, in which case `nbBlocks-1` wraps around;
+ *           callers presumably always provide a non-empty src -- to be confirmed. */
 static buffer_t FUZ_createDictionary(const void* src, size_t srcSize, size_t blockSize, size_t requestedDictSize)
 {
-    buffer_t dict = { NULL, 0, 0 };
+    buffer_t dict = kBuffNull;
    size_t const nbBlocks = (srcSize + (blockSize-1)) / blockSize;
-    size_t* const blockSizes = (size_t*) malloc(nbBlocks * sizeof(size_t));
-    if (!blockSizes) return dict;
+    size_t* const blockSizes = (size_t*)malloc(nbBlocks * sizeof(size_t));
+    if (!blockSizes) return kBuffNull;
    dict.start = malloc(requestedDictSize);
-    if (!dict.start) { free(blockSizes); return dict; }
+    if (!dict.start) { free(blockSizes); return kBuffNull; }
    {   size_t nb;
        for (nb=0; nb<nbBlocks-1; nb++) blockSizes[nb] = blockSize;
        blockSizes[nbBlocks-1] = srcSize - (blockSize * (nbBlocks-1));
    }
    {   size_t const dictSize = ZDICT_trainFromBuffer(dict.start, requestedDictSize, src, blockSizes, (unsigned)nbBlocks);
        free(blockSizes);
-        if (ZDICT_isError(dictSize)) { free(dict.start); return g_nullBuffer; }
+        if (ZDICT_isError(dictSize)) { FUZ_freeDictionary(dict); return kBuffNull; }
        dict.size = requestedDictSize;
        dict.filled = dictSize;
-        return dict;   /* how to return dictSize ? */
+        return dict;
    }
 }

-static void FUZ_freeDictionary(buffer_t dict)
-{
-    free(dict.start);
-}
-
 /* Round trips data and updates xxh with the decompressed data produced */
 static size_t SEQ_roundTrip(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
                            XXH64_state_t* xxh, void* data, size_t size,
@ -276,7 +276,7 @@ static int basicUnitTests(U32 seed, double compressibility)

    ZSTD_inBuffer  inBuff, inBuff2;
    ZSTD_outBuffer outBuff;
-    buffer_t dictionary = g_nullBuffer;
+    buffer_t dictionary = kBuffNull;
    size_t const dictSize = 128 KB;
    unsigned dictID = 0;

@ -600,7 +600,6 @@ static int basicUnitTests(U32 seed, double compressibility)
        size_t const initError = ZSTD_initCStream_usingCDict(zc, cdict);
        DISPLAYLEVEL(5, "ZSTD_initCStream_usingCDict result : %u ", (U32)initError);
        if (ZSTD_isError(initError)) goto _output_error;
-        cSize = 0;
        outBuff.dst = compressedBuffer;
        outBuff.size = compressedBufferSize;
        outBuff.pos = 0;
@ -718,7 +717,6 @@ static int basicUnitTests(U32 seed, double compressibility)
        ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem);
        size_t const initError = ZSTD_initCStream_usingCDict_advanced(zc, cdict, fParams, CNBufferSize);
        if (ZSTD_isError(initError)) goto _output_error;
-        cSize = 0;
        outBuff.dst = compressedBuffer;
        outBuff.size = compressedBufferSize;
        outBuff.pos = 0;
@ -1022,6 +1020,59 @@ static int basicUnitTests(U32 seed, double compressibility)
    }
    DISPLAYLEVEL(3, "OK \n");

+    /* Round trip through a CDict :
+     * the input is built from one uncompressible block followed by one compressible block
+     * ending with two segments copied from the dictionary content.
+     * It is compressed with ZSTD_compress_usingCDict(), decompressed with the same dictionary,
+     * and the regenerated data is compared with the input. */
+    DISPLAYLEVEL(3, "test%3i : dictionary + uncompressible block + reusing tables checks offset table validity: ", testNb++);
+    {   ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(
+            dictionary.start, dictionary.filled,
+            ZSTD_dlm_byRef, ZSTD_dct_fullDict,
+            ZSTD_getCParams(3, 0, dictionary.filled),
+            ZSTD_defaultCMem);
+        const size_t inbufsize = 2 * 128 * 1024; /* 2 blocks */
+        const size_t outbufsize = ZSTD_compressBound(inbufsize);
+        size_t inbufpos = 0;
+        size_t cursegmentlen;
+        BYTE *inbuf = (BYTE *)malloc(inbufsize);
+        BYTE *outbuf = (BYTE *)malloc(outbufsize);
+        BYTE *checkbuf = (BYTE *)malloc(inbufsize);
+        /* byte-typed view of the dictionary : keeps the offset arithmetic below standard C */
+        const BYTE* const dictBytes = (const BYTE*)dictionary.start;
+        size_t ret;
+
+        CHECK(cdict == NULL, "failed to alloc cdict");
+        CHECK(inbuf == NULL, "failed to alloc input buffer");
+        CHECK(outbuf == NULL, "failed to alloc output buffer");
+        CHECK(checkbuf == NULL, "failed to alloc check buffer");
+
+        /* first block is uncompressible */
+        cursegmentlen = 128 * 1024;
+        RDG_genBuffer(inbuf + inbufpos, cursegmentlen, 0., 0., seed);
+        inbufpos += cursegmentlen;
+
+        /* second block is compressible */
+        cursegmentlen = 128 * 1024 - 256;
+        RDG_genBuffer(inbuf + inbufpos, cursegmentlen, 0.05, 0., seed);
+        inbufpos += cursegmentlen;
+
+        /* and includes a very long backref */
+        cursegmentlen = 128;
+        memcpy(inbuf + inbufpos, dictBytes + 256, cursegmentlen);
+        inbufpos += cursegmentlen;
+
+        /* followed by a second very long backref, taken from another dictionary position */
+        cursegmentlen = 128;
+        memcpy(inbuf + inbufpos, dictBytes + 128, cursegmentlen);
+        inbufpos += cursegmentlen;
+
+        ret = ZSTD_compress_usingCDict(zc, outbuf, outbufsize, inbuf, inbufpos, cdict);
+        CHECK_Z(ret);
+
+        ret = ZSTD_decompress_usingDict(zd, checkbuf, inbufsize, outbuf, ret, dictionary.start, dictionary.filled);
+        CHECK_Z(ret);
+
+        CHECK(memcmp(inbuf, checkbuf, inbufpos), "start and finish buffers don't match");
+
+        ZSTD_freeCDict(cdict);
+        free(inbuf);
+        free(outbuf);
+        free(checkbuf);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
 _end:
    FUZ_freeDictionary(dictionary);
    ZSTD_freeCStream(zc);
--- a/zlibWrapper/examples/zwrapbench.c
+++ b/zlibWrapper/examples/zwrapbench.c
@ -573,10 +573,10 @@ static size_t BMK_findMaxMem(U64 requiredMem)
    do {
        testmem = (BYTE*)malloc((size_t)requiredMem);
        requiredMem -= step;
-    } while (!testmem);
+    } while (!testmem && requiredMem);   /* do not allocate zero bytes */

    free(testmem);
-    return (size_t)(requiredMem);
+    return (size_t)(requiredMem+1);  /* avoid zero */
 }

 static void BMK_benchCLevel(void* srcBuffer, size_t benchedSize,
@ -734,7 +734,7 @@ static void BMK_benchFileTable(const char** fileNamesTable, unsigned nbFiles,
    if ((U64)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
    if (benchedSize < totalSizeToLoad)
        DISPLAY("Not enough memory; testing %u MB only...\n", (U32)(benchedSize >> 20));
-    srcBuffer = malloc(benchedSize);
+    srcBuffer = malloc(benchedSize + !benchedSize);
    if (!srcBuffer) EXM_THROW(12, "not enough memory");

    /* Load input buffer */
--- a/zlibWrapper/gzlib.c
+++ b/zlibWrapper/gzlib.c
@ -111,7 +111,7 @@ local gzFile gz_open(path, fd, mode)
        return NULL;

    /* allocate gzFile structure to return */
-    state = (gz_statep)(gz_state*)malloc(sizeof(gz_state));
+    state.state = (gz_state*)malloc(sizeof(gz_state));
    if (state.state == NULL)
        return NULL;
    state.state->size = 0;            /* no buffers allocated yet */
@ -266,7 +266,7 @@ local gzFile gz_open(path, fd, mode)
    gz_reset(state);

    /* return stream */
-    return (gzFile)state.file;
+    return state.file;
 }

 /* -- see zlib.h -- */
--- a/zlibWrapper/gzwrite.c
+++ b/zlibWrapper/gzwrite.c
@ -6,6 +6,8 @@
 * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
 */

+#include <assert.h>
+
 #include "gzguts.h"

 /* Local functions */
@ -24,7 +26,7 @@ local int gz_init(state)
    z_streamp strm = &(state.state->strm);

    /* allocate input buffer (double size for gzprintf) */
-    state.state->in = (unsigned char *)malloc(state.state->want << 1);
+    state.state->in = (unsigned char*)malloc(state.state->want << 1);
    if (state.state->in == NULL) {
        gz_error(state, Z_MEM_ERROR, "out of memory");
        return -1;
@ -33,7 +35,7 @@ local int gz_init(state)
    /* only need output buffer and deflate state if compressing */
    if (!state.state->direct) {
        /* allocate output buffer */
-        state.state->out = (unsigned char *)malloc(state.state->want);
+        state.state->out = (unsigned char*)malloc(state.state->want);
        if (state.state->out == NULL) {
            free(state.state->in);
            gz_error(state, Z_MEM_ERROR, "out of memory");
@ -284,6 +286,7 @@ z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
    gz_statep state;

    /* get internal structure */
+    assert(size != 0);
    if (file == NULL)
        return 0;
    state = (gz_statep)file;
@ -294,7 +297,7 @@ z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)

    /* compute bytes to read -- error on overflow */
    len = nitems * size;
-    if (size && len / size != nitems) {
+    if (size && (len / size != nitems)) {
        gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
        return 0;
    }