Merge pull request #185 from jwerner-chromium/dev

Add support for safe in-place decoding
This commit is contained in:
Yann Collet 2016-02-16 22:29:10 +01:00
commit 2995a45e53
12 changed files with 79 additions and 76 deletions

View File

@ -2,18 +2,18 @@
# LZ4 - Makefile
# Copyright (C) Yann Collet 2011-2015
# All rights reserved.
#
#
# BSD license
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
#
# * Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@ -24,7 +24,7 @@
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# You can contact the author at :
# - LZ4 source repository : https://github.com/Cyan4973/lz4
# - LZ4 forum froup : https://groups.google.com/forum/#!forum/lz4c

View File

@ -1,11 +1,11 @@
LZ4 - Extremely fast compression
================================
LZ4 is lossless compression algorithm,
providing compression speed at 400 MB/s per core,
scalable with multi-cores CPU.
It features an extremely fast decoder,
with speed in multiple GB/s per core,
LZ4 is lossless compression algorithm,
providing compression speed at 400 MB/s per core,
scalable with multi-cores CPU.
It features an extremely fast decoder,
with speed in multiple GB/s per core,
typically reaching RAM speed limits on multi-core systems.
Speed can be tuned dynamically, selecting an "acceleration" factor
@ -77,7 +77,7 @@ Interoperable versions of LZ4 must respect this frame format.
Other source versions
-------------------------
Beyond the C reference source,
Beyond the C reference source,
many contributors have created versions of lz4 in multiple languages
(Java, C#, Python, Perl, Ruby, etc.).
A list of known source ports is maintained on the [LZ4 Homepage].

View File

@ -42,9 +42,9 @@ if(BUILD_TOOLS)
install(TARGETS lz4 RUNTIME DESTINATION "bin/")
endif()
if(BUILD_LIBS)
if(BUILD_LIBS)
SET(LIBS_TARGETS "")
IF(NOT WIN32)
add_library(liblz4 SHARED ${LZ4_SRCS_LIB})
@ -55,12 +55,12 @@ if(BUILD_LIBS)
add_library(liblz4 STATIC ${LZ4_SRCS_LIB})
SET(LIBS_TARGETS liblz4)
ENDIF(NOT WIN32)
set_target_properties(liblz4 PROPERTIES
OUTPUT_NAME lz4
SOVERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}"
)
install(TARGETS ${LIBS_TARGETS}
RUNTIME DESTINATION lib #on Windows: cmake considers dlls as runtime component
LIBRARY DESTINATION lib
@ -73,7 +73,7 @@ if(BUILD_LIBS)
${LZ4_DIR}/lz4frame.h
DESTINATION include
)
if(BUILD_TOOLS)
target_link_libraries(lz4 liblz4)
endif()

View File

@ -43,7 +43,7 @@ void test_compress(FILE* outFp, FILE* inpFp)
char inpBuf[2][BLOCK_BYTES];
int inpBufIndex = 0;
LZ4_resetStream(lz4Stream);
for(;;) {

View File

@ -39,7 +39,7 @@ Example (1) : Block API, 4KiB Block
+---------------+---------------+----+----+----+
| Block #1 | Block #2 | #3 | #4 |... |
+---------------+---------------+----+----+----+
(No Dependency)

View File

@ -2,18 +2,18 @@
# LZ4 library - Makefile
# Copyright (C) Yann Collet 2011-2015
# All rights reserved.
#
#
# BSD license
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
#
# * Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@ -24,7 +24,7 @@
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# You can contact the author at :
# - LZ4 source repository : https://github.com/Cyan4973/lz4
# - LZ4 forum froup : https://groups.google.com/forum/#!forum/lz4c

View File

@ -1182,6 +1182,7 @@ FORCE_INLINE int LZ4_decompress_generic(
const int safeDecode = (endOnInput==endOnInputSize);
const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
const int inPlaceDecode = ((ip >= op) && (ip < oend));
/* Special cases */
@ -1198,6 +1199,8 @@ FORCE_INLINE int LZ4_decompress_generic(
const BYTE* match;
size_t offset;
if (unlikely((inPlaceDecode) && (op + WILDCOPYLENGTH > ip))) goto _output_error; /* output stream ran over input stream */
/* get literal length */
token = *ip++;
if ((length=(token>>ML_BITS)) == RUN_MASK)
@ -1228,7 +1231,7 @@ FORCE_INLINE int LZ4_decompress_generic(
if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */
if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */
}
memcpy(op, ip, length);
memmove(op, ip, length);
ip += length;
op += length;
break; /* Necessarily EOF, due to parsing restrictions */

View File

@ -258,7 +258,7 @@ size_t LZ4F_getFrameInfo(LZ4F_decompressionContext_t dctx,
/* LZ4F_getFrameInfo()
* This function decodes frame header information (such as max blockSize, frame checksum, etc.).
* Its usage is optional. The objective is to extract frame header information, typically for allocation purposes.
* A header size is variable and can be from 7 to 15 bytes. It's also possible to input more bytes than that.
* A header size is variable and can be from 7 to 15 bytes. It's also possible to input more bytes than that.
* The number of bytes read from srcBuffer will be updated within *srcSizePtr (necessarily <= original value).
* (note that LZ4F_getFrameInfo() can also be used anytime *after* starting decompression, in this case 0 input byte is enough)
* Frame header info is *copied into* an already allocated LZ4F_frameInfo_t structure.

View File

@ -75,15 +75,15 @@ This is a 2 bytes value, in little endian format
The offset represents the position of the match to be copied from.
1 means "current position - 1 byte".
The maximum offset value is 65535, 65536 cannot be coded.
Note that 0 is an invalid value, not used.
Note that 0 is an invalid value, not used.
Then we need to extract the match length.
For this, we use the second token field, the low 4-bits.
Value, obviously, ranges from 0 to 15.
However here, 0 means that the copy operation will be minimal.
The minimum length of a match, called minmatch, is 4.
The minimum length of a match, called minmatch, is 4.
As a consequence, a 0 value means 4 bytes, and a value of 15 means 19+ bytes.
Similar to literal length, on reaching the highest possible value (15),
Similar to literal length, on reaching the highest possible value (15),
we output additional bytes, one at a time, with values ranging from 0 to 255.
They are added to total to provide the final match length.
A 255 value means there is another byte to read and add.
@ -102,7 +102,7 @@ There are specific parsing rules to respect in order to remain compatible
with assumptions made by the decoder :
1. The last 5 bytes are always literals
2. The last match must start at least 12 bytes before end of block.
2. The last match must start at least 12 bytes before end of block.
Consequently, a block with less than 13 bytes cannot be compressed.
These rules are in place to ensure that the decoder

View File

@ -5,12 +5,12 @@ LZ4 Frame Format Description
Copyright (c) 2013-2015 Yann Collet
Permission is granted to copy and distribute this document
for any purpose and without charge,
including translations into other languages
and incorporation into compilations,
provided that the copyright notice and this notice are preserved,
and that any substantive changes or deletions from the original
Permission is granted to copy and distribute this document
for any purpose and without charge,
including translations into other languages
and incorporation into compilations,
provided that the copyright notice and this notice are preserved,
and that any substantive changes or deletions from the original
are clearly marked.
Distribution of this document is unlimited.
@ -22,13 +22,13 @@ Distribution of this document is unlimited.
Introduction
------------
The purpose of this document is to define a lossless compressed data format,
that is independent of CPU type, operating system,
file system and character set, suitable for
File compression, Pipe and streaming compression
The purpose of this document is to define a lossless compressed data format,
that is independent of CPU type, operating system,
file system and character set, suitable for
File compression, Pipe and streaming compression
using the [LZ4 algorithm](http://www.lz4.org).
The data can be produced or consumed,
The data can be produced or consumed,
even for an arbitrarily long sequentially presented input data stream,
using only an a priori bounded amount of intermediate storage,
and hence can be used in data communications.
@ -36,7 +36,7 @@ The format uses the LZ4 compression method,
and optional [xxHash-32 checksum method](https://github.com/Cyan4973/xxHash),
for detection of data corruption.
The data format defined by this specification
The data format defined by this specification
does not attempt to allow random access to compressed data.
This specification is intended for use by implementers of software
@ -63,7 +63,7 @@ General Structure of LZ4 Frame format
| MagicNb | F. Descriptor | Block | (...) | EndMark | C. Checksum |
|:-------:|:-------------:| ----- | ----- | ------- | ----------- |
| 4 bytes | 3-11 bytes | | | 4 bytes | 0-4 bytes |
| 4 bytes | 3-11 bytes | | | 4 bytes | 0-4 bytes |
__Magic Number__
@ -88,11 +88,11 @@ The size is expressed as a 32-bits value.
__Content Checksum__
Content Checksum verify that the full content has been decoded correctly.
The content checksum is the result
The content checksum is the result
of [xxh32() hash function](https://github.com/Cyan4973/xxHash)
digesting the original (decoded) data as input, and a seed of zero.
Content checksum is only present when its associated flag
is set in the frame descriptor.
is set in the frame descriptor.
Content Checksum validates the result,
that all blocks were fully transmitted in the correct order and without error,
and also that the encoding/decoding process itself generated no distortion.
@ -108,19 +108,19 @@ In such case, each frame has its own set of descriptor flags.
Each frame is considered independent.
The only relation between frames is their sequential order.
The ability to decode multiple concatenated frames
The ability to decode multiple concatenated frames
within a single stream or file
is left outside of this specification.
is left outside of this specification.
As an example, the reference lz4 command line utility behavior is
to decode all concatenated frames in their sequential order.
Frame Descriptor
----------------
| FLG | BD | (Content Size) | HC |
| ------- | ------- |:--------------:| ------- |
| 1 byte | 1 byte | 0 - 8 bytes | 1 byte |
| 1 byte | 1 byte | 0 - 8 bytes | 1 byte |
The descriptor uses a minimum of 3 bytes,
and up to 11 bytes depending on optional parameters.
@ -148,7 +148,7 @@ Other version numbers will use different flag layouts.
__Block Independence flag__
If this flag is set to “1”, blocks are independent.
If this flag is set to “1”, blocks are independent.
If this flag is set to “0”, each block depends on previous ones
(up to LZ4 window size, which is 64 KB).
In such case, its necessary to decode all blocks in sequence.
@ -160,13 +160,13 @@ __Block checksum flag__
If this flag is set, each data block will be followed by a 4-bytes checksum,
calculated by using the xxHash-32 algorithm on the raw (compressed) data block.
The intention is to detect data corruption (storage or transmission errors)
The intention is to detect data corruption (storage or transmission errors)
immediately, before decoding.
Block checksum usage is optional.
__Content Size flag__
If this flag is set, the uncompressed size of data included within the frame
If this flag is set, the uncompressed size of data included within the frame
will be present as an 8 bytes unsigned little endian value, after the flags.
Content Size usage is optional.
@ -182,9 +182,9 @@ This information is intended to help the decoder allocate memory.
Size here refers to the original (uncompressed) data size.
Block Maximum Size is one value among the following table :
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| --- | --- | --- | --- | ----- | ------ | ---- | ---- |
| N/A | N/A | N/A | N/A | 64 KB | 256 KB | 1 MB | 4 MB |
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| --- | --- | --- | --- | ----- | ------ | ---- | ---- |
| N/A | N/A | N/A | N/A | 64 KB | 256 KB | 1 MB | 4 MB |
The decoder may refuse to allocate block sizes above a (system-specific) size.
Unused values may be used in a future revision of the spec.
@ -224,7 +224,7 @@ Data Blocks
| Block Size | data | (Block Checksum) |
|:----------:| ------ |:----------------:|
| 4 bytes | | 0 - 4 bytes |
| 4 bytes | | 0 - 4 bytes |
__Block Size__
@ -239,7 +239,7 @@ All other bits give the size, in bytes, of the following data block
(the size does not include the block checksum if present).
Block Size shall never be larger than Block Maximum Size.
Such a thing could happen for incompressible source data.
Such a thing could happen for incompressible source data.
In such case, such a data block shall be passed in uncompressed format.
__Data__
@ -247,7 +247,7 @@ __Data__
Where the actual data to decode stands.
It might be compressed or not, depending on previous field indications.
Uncompressed size of Data can be any size, up to “block maximum size”.
Note that data block is not necessarily full :
Note that data block is not necessarily full :
an arbitrary “flush” may happen anytime. Any block can be “partially filled”.
__Block checksum__
@ -256,7 +256,7 @@ Only present if the associated flag is set.
This is a 4-bytes checksum value, in little endian format,
calculated by using the xxHash-32 algorithm on the raw (undecoded) data block,
and a seed of zero.
The intention is to detect data corruption (storage or transmission errors)
The intention is to detect data corruption (storage or transmission errors)
before decoding.
Block checksum is cumulative with Content checksum.
@ -267,12 +267,12 @@ Skippable Frames
| Magic Number | Frame Size | User Data |
|:------------:|:----------:| --------- |
| 4 bytes | 4 bytes | |
| 4 bytes | 4 bytes | |
Skippable frames allow the integration of user-defined data
into a flow of concatenated frames.
Its design is pretty straightforward,
with the sole objective to allow the decoder to quickly skip
with the sole objective to allow the decoder to quickly skip
over user-defined data and continue decoding.
For the purpose of facilitating identification,
@ -283,14 +283,14 @@ its recommended to start with a zero-byte LZ4 frame
followed by a skippable frame.
This will make it easier for file type identifiers.
__Magic Number__
4 Bytes, Little endian format.
Value : 0x184D2A5X, which means any value from 0x184D2A50 to 0x184D2A5F.
All 16 values are valid to identify a skippable frame.
__Frame Size__
__Frame Size__
This is the size, in bytes, of the following User Data
(without including the magic number nor the size field itself).
@ -313,7 +313,7 @@ Main characteristics of the legacy format :
- Fixed block size : 8 MB.
- All blocks must be completely filled, except the last one.
- All blocks are always compressed, even when compression is detrimental.
- The last block is detected either because
- The last block is detected either because
it is followed by the “EOF” (End of File) mark,
or because it is followed by a known Frame Magic Number.
- No checksum

View File

@ -18,11 +18,11 @@
.PP
.B unlz4
is equivalent to
.BR "lz4 \-d"
.BR "lz4 \-d"
.br
.B lz4cat
is equivalent to
.BR "lz4 \-dc"
.BR "lz4 \-dc"
.br
.PP
When writing scripts that need to decompress files,
@ -42,17 +42,17 @@ and
.PP
\fBlz4\fR is an extremely fast lossless compression algorithm,
based on \fBbyte-aligned LZ77\fR family of compression scheme.
\fBlz4\fR offers compression speeds of 400 MB/s per core, linearly scalable with multi-core CPUs.
\fBlz4\fR offers compression speeds of 400 MB/s per core, linearly scalable with multi-core CPUs.
It features an extremely fast decoder, with speed in multiple GB/s per core,
typically reaching RAM speed limit on multi-core systems.
typically reaching RAM speed limit on multi-core systems.
The native file format is the
.B .lz4
format.
.B lz4
supports a command line syntax similar but not identical to
.BR gzip (1).
Differences are :
.B lz4
supports a command line syntax similar but not identical to
.BR gzip (1).
Differences are :
\fBlz4\fR preserve original files ;
\fBlz4 file1 file2\fR means : compress file1 \fIinto\fR file2 ;
\fBlz4 file\fR shows real-time statistics during compression .
@ -98,9 +98,9 @@ only the latest one will be applied.
.TP
.BR \-z ", " \-\-compress
Compress.
This is the default operation mode
This is the default operation mode
when no operation mode option is specified ,
no other operation mode is implied from the command name
no other operation mode is implied from the command name
(for example,
.B unlz4
implies
@ -192,7 +192,7 @@ independently, and the resulting name of each compressed file will be
Note : this option can only be activated when the original size can be determined,
hence for a file. It won't work with unknown source size, such as stdin or pipe.
.TP
.B \--[no-]sparse
.B \--[no-]sparse
sparse mode support (default:enabled on file, disabled on stdout)
.TP
.B \-l

View File

@ -21,7 +21,7 @@ def proc(cmd_args, pipe=True, dummy=False):
return
if pipe:
subproc = subprocess.Popen(cmd_args,
stdout=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
else:
subproc = subprocess.Popen(cmd_args)