zstd/lib/Makefile
mgrice 812e8f2a16 perf improvements for zstd decode (#1668)
* perf improvements for zstd decode

tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)

Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand.  The sites where wildcopy is invoked have an interesting distribution of lengths to be copied.  The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.

See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0

Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8

Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
 via attribute and flag.  I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.

    silesia benchmark data - second triad of each file is with the original code:

    file      orig        compressedratio     encode              decode           change
    1#dickens   10192446->   4268865(2.388),       198.9MB/s           709.6MB/s
    2#dickens   10192446->   3876126(2.630),       128.7MB/s           552.5MB/s
    3#dickens   10192446->   3682956(2.767),       104.6MB/s             537MB/s
    1#dickens   10192446->   4268865(2.388),       195.4MB/s           659.5MB/s     7.60%
    2#dickens   10192446->   3876126(2.630),         127MB/s           516.3MB/s     7.01%
    3#dickens   10192446->   3682956(2.767),         105MB/s           479.5MB/s    11.99%
    1#mozilla   51220480->  20117517(2.546),       285.4MB/s           734.9MB/s
    2#mozilla   51220480->  19067018(2.686),       220.8MB/s           686.3MB/s
    3#mozilla   51220480->  18508283(2.767),       152.2MB/s           669.4MB/s
    1#mozilla   51220480->  20117517(2.546),       283.4MB/s           697.9MB/s     5.30%
    2#mozilla   51220480->  19067018(2.686),       225.9MB/s             665MB/s     3.20%
    3#mozilla   51220480->  18508283(2.767),       154.5MB/s           640.6MB/s     4.50%
    1#mr         9970564->   3840242(2.596),       262.4MB/s           899.8MB/s
    2#mr         9970564->   3600976(2.769),       181.2MB/s           717.9MB/s
    3#mr         9970564->   3563987(2.798),       116.3MB/s             620MB/s
    1#mr         9970564->   3840242(2.596),       253.2MB/s           827.3MB/s     8.76%
    2#mr         9970564->   3600976(2.769),       177.4MB/s           655.4MB/s     9.54%
    3#mr         9970564->   3563987(2.798),       111.2MB/s           564.2MB/s     9.89%
    1#nci       33553445->   2849306(11.78),       575.2MB/s ,        1335.8MB/s
    2#nci       33553445->   2890166(11.61),       509.3MB/s ,        1238.1MB/s
    3#nci       33553445->   2857408(11.74),         431MB/s ,        1210.7MB/s
    1#nci       33553445->   2849306(11.78),       565.4MB/s ,        1220.2MB/s     9.47%
    2#nci       33553445->   2890166(11.61),       508.2MB/s ,        1128.4MB/s     9.72%
    3#nci       33553445->   2857408(11.74),       429.1MB/s ,        1097.7MB/s    10.29%
    1#ooffice    6152192->   3590954(1.713),       231.4MB/s ,         662.6MB/s
    2#ooffice    6152192->   3323931(1.851),       162.8MB/s ,         592.6MB/s
    3#ooffice    6152192->   3145625(1.956),        99.9MB/s ,         549.6MB/s
    1#ooffice    6152192->   3590954(1.713),       224.7MB/s ,         624.2MB/s     6.15%
    2#ooffice    6152192->   3323931 (1.851),        155MB/s ,         564.5MB/s     4.98%
    3#ooffice    6152192->   3145625(1.956),       101.1MB/s ,         521.2MB/s     5.45%
    1#osdb      10085684->   3739042(2.697),       271.9MB/s           876.4MB/s
    2#osdb      10085684->   3493875(2.887),       208.2MB/s             857MB/s
    3#osdb      10085684->   3515831(2.869),       135.3MB/s           805.4MB/s
    1#osdb      10085684->   3739042(2.697),       257.4MB/s           793.8MB/s    10.41%
    2#osdb      10085684->   3493875(2.887),       209.7MB/s           776.1MB/s    10.42%
    3#osdb      10085684->   3515831(2.869),       130.6MB/s           727.7MB/s    10.68%
    1#reymont    6627202->   2152771(3.078),       198.9MB/s           696.2MB/s
    2#reymont    6627202->   2071140(3.200),         170MB/s           595.2MB/s
    3#reymont    6627202->   1953597(3.392),       128.5MB/s           609.7MB/s
    1#reymont    6627202->   2152771(3.078),       199.6MB/s           655.2MB/s     6.26%
    2#reymont    6627202->   2071140(3.200),       168.2MB/s           554.4MB/s     7.36%
    3#reymont    6627202->   1953597(3.392),       128.7MB/s           557.4MB/s     9.38%
    1#samba     21606400->   5510994(3.921),       338.1MB/s            1066MB/s
    2#samba     21606400->   5240208(4.123),       258.7MB/s           992.3MB/s
    3#samba     21606400->   5003358(4.318),       200.2MB/s           991.1MB/s
    1#samba     21606400->   5510994(3.921),       330.8MB/s             974MB/s     9.45%
    2#samba     21606400->   5240208(4.123),       257.9MB/s           919.4MB/s     7.93%
    3#samba     21606400->   5003358(4.318),       198.5MB/s           908.9MB/s     9.04%
    1#sao        7251944->   6256401(1.159),       194.6MB/s           602.2MB/s
    2#sao        7251944->   5808761(1.248),       128.2MB/s           532.1MB/s
    3#sao        7251944->   5556318(1.305),          73MB/s           509.4MB/s
    1#sao        7251944->   6256401(1.159),       198.7MB/s           580.7MB/s     3.70%
    2#sao        7251944->   5808761(1.248),       129.1MB/s           502.7MB/s     5.85%
    3#sao        7251944->   5556318(1.305),        74.6MB/s           493.1MB/s     3.31%
    1#webster   41458703->  13692222(3.028),       222.3MB/s             752MB/s
    2#webster   41458703->  12842646(3.228),       157.6MB/s           532.2MB/s
    3#webster   41458703->  12191964(3.400),         124MB/s           468.5MB/s
    1#webster   41458703->  13692222(3.028),       219.7MB/s             697MB/s     7.89%
    2#webster   41458703->  12842646(3.228),       153.9MB/s           495.4MB/s     7.43%
    3#webster   41458703->  12191964(3.400),       124.8MB/s           444.8MB/s     5.33%
    1#xml        5345280->    696652(7.673),         485MB/s ,        1333.9MB/s
    2#xml        5345280->    681492(7.843),       405.2MB/s ,        1237.5MB/s
    3#xml        5345280->    639057(8.364),       328.5MB/s ,        1281.3MB/s
    1#xml        5345280->    696652(7.673),       473.1MB/s ,        1232.4MB/s     8.24%
    2#xml        5345280->    681492(7.843),       398.6MB/s ,        1145.9MB/s     7.99%
    3#xml        5345280->    639057(8.364),       327.1MB/s ,          1175MB/s     9.05%
    1#x-ray      8474240->   6772557(1.251),       521.3MB/s           762.6MB/s
    2#x-ray      8474240->   6684531(1.268),       230.5MB/s           688.5MB/s
    3#x-ray      8474240->   6166679(1.374),        68.7MB/s           478.8MB/s
    1#x-ray      8474240->   6772557(1.251),       502.8MB/s           736.7MB/s     3.52%
    2#x-ray      8474240->   6684531(1.268),       224.4MB/s             662MB/s     4.00%
    3#x-ray      8474240->   6166679(1.374),        67.3MB/s           437.8MB/s     9.37%

                                                                                     7.51%

* makefile changed to only pass -fno-tree-vectorize to gcc

* <Replace this line with a title. Use 1 line only, 67 chars or less>

Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)

* fix for warning/error with subtraction of void* pointers

* fix c90 conformance issue - ISO C90 forbids mixed declarations and code

* Fix assert for negative diff, only when there is no overlap

* fix overflow revealed in fuzzing tests

* tweak for small speed increase
2019-07-11 18:31:07 -04:00

292 lines
9.2 KiB
Makefile

# ################################################################
# Copyright (c) 2015-present, Yann Collet, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under both the BSD-style license (found in the
# LICENSE file in the root directory of this source tree) and the GPLv2 (found
# in the COPYING file in the root directory of this source tree).
# ################################################################
# Version numbers
LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < ./zstd.h`
LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < ./zstd.h`
LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < ./zstd.h`
LIBVER_SCRIPT:= $(LIBVER_MAJOR_SCRIPT).$(LIBVER_MINOR_SCRIPT).$(LIBVER_PATCH_SCRIPT)
LIBVER_MAJOR := $(shell echo $(LIBVER_MAJOR_SCRIPT))
LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
LIBVER := $(shell echo $(LIBVER_SCRIPT))
VERSION?= $(LIBVER)
CCVER := $(shell $(CC) --version)
CPPFLAGS+= -I. -I./common -DXXH_NAMESPACE=ZSTD_
ifeq ($(OS),Windows_NT) # MinGW assumed
CPPFLAGS += -D__USE_MINGW_ANSI_STDIO # compatibility with %zu formatting
endif
CFLAGS ?= -O3
DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
-Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
-Wstrict-prototypes -Wundef -Wpointer-arith \
-Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
-Wredundant-decls -Wmissing-prototypes -Wc++-compat
CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS)
FLAGS = $(CPPFLAGS) $(CFLAGS)
HAVE_COLORNEVER = $(shell echo a | grep --color=never a > /dev/null 2> /dev/null && echo 1 || echo 0)
GREP_OPTIONS ?=
ifeq ($HAVE_COLORNEVER, 1)
GREP_OPTIONS += --color=never
endif
GREP = grep $(GREP_OPTIONS)
ZSTDCOMMON_FILES := $(sort $(wildcard common/*.c))
ZSTDCOMP_FILES := $(sort $(wildcard compress/*.c))
ZSTDDECOMP_FILES := $(sort $(wildcard decompress/*.c))
ZDICT_FILES := $(sort $(wildcard dictBuilder/*.c))
ZDEPR_FILES := $(sort $(wildcard deprecated/*.c))
ZSTD_FILES := $(ZSTDCOMMON_FILES)
ifeq ($(findstring GCC,$(CCVER)),GCC)
decompress/zstd_decompress_block.o : CFLAGS+=-fno-tree-vectorize
endif
ZSTD_LEGACY_SUPPORT ?= 5
ZSTD_LIB_COMPRESSION ?= 1
ZSTD_LIB_DECOMPRESSION ?= 1
ZSTD_LIB_DICTBUILDER ?= 1
ZSTD_LIB_DEPRECATED ?= 1
HUF_FORCE_DECOMPRESS_X1 ?= 0
HUF_FORCE_DECOMPRESS_X2 ?= 0
ZSTD_FORCE_DECOMPRESS_SHORT ?= 0
ZSTD_FORCE_DECOMPRESS_LONG ?= 0
ZSTD_NO_INLINE ?= 0
ZSTD_STRIP_ERROR_STRINGS ?= 0
ZSTD_LEGACY_MULTITHREADED_API ?= 0
ifeq ($(ZSTD_LIB_COMPRESSION), 0)
ZSTD_LIB_DICTBUILDER = 0
ZSTD_LIB_DEPRECATED = 0
endif
ifeq ($(ZSTD_LIB_DECOMPRESSION), 0)
ZSTD_LEGACY_SUPPORT = 0
ZSTD_LIB_DEPRECATED = 0
endif
ifneq ($(ZSTD_LIB_COMPRESSION), 0)
ZSTD_FILES += $(ZSTDCOMP_FILES)
endif
ifneq ($(ZSTD_LIB_DECOMPRESSION), 0)
ZSTD_FILES += $(ZSTDDECOMP_FILES)
endif
ifneq ($(ZSTD_LIB_DEPRECATED), 0)
ZSTD_FILES += $(ZDEPR_FILES)
endif
ifneq ($(ZSTD_LIB_DICTBUILDER), 0)
ZSTD_FILES += $(ZDICT_FILES)
endif
ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0)
CFLAGS += -DHUF_FORCE_DECOMPRESS_X1
endif
ifneq ($(HUF_FORCE_DECOMPRESS_X2), 0)
CFLAGS += -DHUF_FORCE_DECOMPRESS_X2
endif
ifneq ($(ZSTD_FORCE_DECOMPRESS_SHORT), 0)
CFLAGS += -DZSTD_FORCE_DECOMPRESS_SHORT
endif
ifneq ($(ZSTD_FORCE_DECOMPRESS_LONG), 0)
CFLAGS += -DZSTD_FORCE_DECOMPRESS_LONG
endif
ifneq ($(ZSTD_NO_INLINE), 0)
CFLAGS += -DZSTD_NO_INLINE
endif
ifneq ($(ZSTD_STRIP_ERROR_STRINGS), 0)
CFLAGS += -DZSTD_STRIP_ERROR_STRINGS
endif
ifneq ($(ZSTD_LEGACY_MULTITHREADED_API), 0)
CFLAGS += -DZSTD_LEGACY_MULTITHREADED_API
endif
ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
ZSTD_FILES += $(shell ls legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
endif
CPPFLAGS += -I./legacy
endif
CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
ZSTD_OBJ := $(patsubst %.c,%.o,$(ZSTD_FILES))
# macOS linker doesn't support -soname, and use different extension
# see : https://developer.apple.com/library/mac/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html
ifeq ($(shell uname), Darwin)
SHARED_EXT = dylib
SHARED_EXT_MAJOR = $(LIBVER_MAJOR).$(SHARED_EXT)
SHARED_EXT_VER = $(LIBVER).$(SHARED_EXT)
SONAME_FLAGS = -install_name $(LIBDIR)/libzstd.$(SHARED_EXT_MAJOR) -compatibility_version $(LIBVER_MAJOR) -current_version $(LIBVER)
else
SONAME_FLAGS = -Wl,-soname=libzstd.$(SHARED_EXT).$(LIBVER_MAJOR)
SHARED_EXT = so
SHARED_EXT_MAJOR = $(SHARED_EXT).$(LIBVER_MAJOR)
SHARED_EXT_VER = $(SHARED_EXT).$(LIBVER)
endif
.PHONY: default all clean install uninstall
default: lib-release
all: lib
libzstd.a: ARFLAGS = rcs
libzstd.a: $(ZSTD_OBJ)
@echo compiling static library
@$(AR) $(ARFLAGS) $@ $^
libzstd.a-mt: CPPFLAGS += -DZSTD_MULTITHREAD
libzstd.a-mt: libzstd.a
ifneq (,$(filter Windows%,$(OS)))
LIBZSTD = dll\libzstd.dll
$(LIBZSTD): $(ZSTD_FILES)
@echo compiling dynamic library $(LIBVER)
$(CC) $(FLAGS) -DZSTD_DLL_EXPORT=1 -Wl,--out-implib,dll\libzstd.lib -shared $^ -o $@
else
LIBZSTD = libzstd.$(SHARED_EXT_VER)
$(LIBZSTD): LDFLAGS += -shared -fPIC -fvisibility=hidden
$(LIBZSTD): $(ZSTD_FILES)
@echo compiling dynamic library $(LIBVER)
@$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
@echo creating versioned links
@ln -sf $@ libzstd.$(SHARED_EXT_MAJOR)
@ln -sf $@ libzstd.$(SHARED_EXT)
endif
libzstd : $(LIBZSTD)
libzstd-mt : CPPFLAGS += -DZSTD_MULTITHREAD
libzstd-mt : libzstd
lib: libzstd.a libzstd
lib-mt: CPPFLAGS += -DZSTD_MULTITHREAD
lib-mt: lib
lib-release lib-release-mt: DEBUGFLAGS :=
lib-release: lib
lib-release-mt: lib-mt
# Special case : building library in single-thread mode _and_ without zstdmt_compress.c
ZSTDMT_FILES = compress/zstdmt_compress.c
ZSTD_NOMT_FILES = $(filter-out $(ZSTDMT_FILES),$(ZSTD_FILES))
libzstd-nomt: LDFLAGS += -shared -fPIC -fvisibility=hidden
libzstd-nomt: $(ZSTD_NOMT_FILES)
@echo compiling single-thread dynamic library $(LIBVER)
@echo files : $(ZSTD_NOMT_FILES)
@$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
clean:
@$(RM) -r *.dSYM # macOS-specific
@$(RM) core *.o *.a *.gcda *.$(SHARED_EXT) *.$(SHARED_EXT).* libzstd.pc
@$(RM) dll/libzstd.dll dll/libzstd.lib libzstd-nomt*
@$(RM) common/*.o compress/*.o decompress/*.o dictBuilder/*.o legacy/*.o deprecated/*.o
@echo Cleaning library completed
#-----------------------------------------------------------------------------
# make install is validated only for Linux, macOS, BSD, Hurd and Solaris targets
#-----------------------------------------------------------------------------
ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku))
DESTDIR ?=
# directory variables : GNU conventions prefer lowercase
# see https://www.gnu.org/prep/standards/html_node/Makefile-Conventions.html
# support both lower and uppercase (BSD), use uppercase in script
prefix ?= /usr/local
PREFIX ?= $(prefix)
exec_prefix ?= $(PREFIX)
libdir ?= $(exec_prefix)/lib
LIBDIR ?= $(libdir)
includedir ?= $(PREFIX)/include
INCLUDEDIR ?= $(includedir)
ifneq (,$(filter $(shell uname),FreeBSD NetBSD DragonFly))
PKGCONFIGDIR ?= $(PREFIX)/libdata/pkgconfig
else
PKGCONFIGDIR ?= $(LIBDIR)/pkgconfig
endif
ifneq (,$(filter $(shell uname),SunOS))
INSTALL ?= ginstall
else
INSTALL ?= install
endif
INSTALL_PROGRAM ?= $(INSTALL)
INSTALL_DATA ?= $(INSTALL) -m 644
libzstd.pc:
libzstd.pc: libzstd.pc.in
@echo creating pkgconfig
@sed -e 's|@PREFIX@|$(PREFIX)|' \
-e 's|@LIBDIR@|$(LIBDIR)|' \
-e 's|@INCLUDEDIR@|$(INCLUDEDIR)|' \
-e 's|@VERSION@|$(VERSION)|' \
$< >$@
install: install-pc install-static install-shared install-includes
@echo zstd static and shared library installed
install-pc: libzstd.pc
@$(INSTALL) -d -m 755 $(DESTDIR)$(PKGCONFIGDIR)/
@$(INSTALL_DATA) libzstd.pc $(DESTDIR)$(PKGCONFIGDIR)/
install-static: libzstd.a
@echo Installing static library
@$(INSTALL) -d -m 755 $(DESTDIR)$(LIBDIR)/
@$(INSTALL_DATA) libzstd.a $(DESTDIR)$(LIBDIR)
install-shared: libzstd
@echo Installing shared library
@$(INSTALL) -d -m 755 $(DESTDIR)$(LIBDIR)/
@$(INSTALL_PROGRAM) $(LIBZSTD) $(DESTDIR)$(LIBDIR)
@ln -sf $(LIBZSTD) $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT_MAJOR)
@ln -sf $(LIBZSTD) $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT)
install-includes:
@echo Installing includes
@$(INSTALL) -d -m 755 $(DESTDIR)$(INCLUDEDIR)/
@$(INSTALL_DATA) zstd.h $(DESTDIR)$(INCLUDEDIR)
@$(INSTALL_DATA) common/zstd_errors.h $(DESTDIR)$(INCLUDEDIR)
@$(INSTALL_DATA) deprecated/zbuff.h $(DESTDIR)$(INCLUDEDIR) # prototypes generate deprecation warnings
@$(INSTALL_DATA) dictBuilder/zdict.h $(DESTDIR)$(INCLUDEDIR)
uninstall:
@$(RM) $(DESTDIR)$(LIBDIR)/libzstd.a
@$(RM) $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT)
@$(RM) $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT_MAJOR)
@$(RM) $(DESTDIR)$(LIBDIR)/$(LIBZSTD)
@$(RM) $(DESTDIR)$(PKGCONFIGDIR)/libzstd.pc
@$(RM) $(DESTDIR)$(INCLUDEDIR)/zstd.h
@$(RM) $(DESTDIR)$(INCLUDEDIR)/zstd_errors.h
@$(RM) $(DESTDIR)$(INCLUDEDIR)/zbuff.h # Deprecated streaming functions
@$(RM) $(DESTDIR)$(INCLUDEDIR)/zdict.h
@echo zstd libraries successfully uninstalled
endif