2018-10-25 23:28:41 +00:00
/*
* Copyright ( c ) 2016 - present , Yann Collet , Facebook , Inc .
* All rights reserved .
*
* This source code is licensed under both the BSD - style license ( found in the
* LICENSE file in the root directory of this source tree ) and the GPLv2 ( found
* in the COPYING file in the root directory of this source tree ) .
* You may select , at your option , one of the above - listed licenses .
*/
/* zstd_decompress_block :
* this module takes care of decompressing _compressed_ block */
/*-*******************************************************
* Dependencies
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
# include <string.h> /* memcpy, memmove, memset */
# include "compiler.h" /* prefetch */
# include "cpu.h" /* bmi2 */
# include "mem.h" /* low level memory routines */
# define FSE_STATIC_LINKING_ONLY
# include "fse.h"
# define HUF_STATIC_LINKING_ONLY
# include "huf.h"
# include "zstd_internal.h"
# include "zstd_decompress_internal.h" /* ZSTD_DCtx */
# include "zstd_ddict.h" /* ZSTD_DDictDictContent */
# include "zstd_decompress_block.h"
2018-12-04 01:36:24 +00:00
/*_*******************************************************
* Macros
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/* These two optional macros force the use one way or another of the two
* ZSTD_decompressSequences implementations . You can ' t force in both directions
* at the same time .
*/
# if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
defined ( ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG )
# error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
# endif
2018-10-25 23:28:41 +00:00
/*_*******************************************************
* Memory operations
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
static void ZSTD_copy4 ( void * dst , const void * src ) { memcpy ( dst , src , 4 ) ; }
/*-*************************************************************
* Block decoding
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*! ZSTD_getcBlockSize() :
2018-10-26 22:00:00 +00:00
* Provides the size of compressed block from block header ` src ` */
2018-10-25 23:28:41 +00:00
size_t ZSTD_getcBlockSize ( const void * src , size_t srcSize ,
blockProperties_t * bpPtr )
{
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( srcSize < ZSTD_blockHeaderSize , srcSize_wrong ) ;
2018-10-25 23:28:41 +00:00
{ U32 const cBlockHeader = MEM_readLE24 ( src ) ;
U32 const cSize = cBlockHeader > > 3 ;
bpPtr - > lastBlock = cBlockHeader & 1 ;
bpPtr - > blockType = ( blockType_e ) ( ( cBlockHeader > > 1 ) & 3 ) ;
bpPtr - > origSize = cSize ; /* only useful for RLE */
if ( bpPtr - > blockType = = bt_rle ) return 1 ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( bpPtr - > blockType = = bt_reserved , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
return cSize ;
}
}
/* Hidden declaration for fullbench */
size_t ZSTD_decodeLiteralsBlock ( ZSTD_DCtx * dctx ,
const void * src , size_t srcSize ) ;
/*! ZSTD_decodeLiteralsBlock() :
* @ return : nb of bytes read from src ( < srcSize )
* note : symbol not declared but exposed for fullbench */
size_t ZSTD_decodeLiteralsBlock ( ZSTD_DCtx * dctx ,
const void * src , size_t srcSize ) /* note : srcSize < BLOCKSIZE */
{
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( srcSize < MIN_CBLOCK_SIZE , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
{ const BYTE * const istart = ( const BYTE * ) src ;
symbolEncodingType_e const litEncType = ( symbolEncodingType_e ) ( istart [ 0 ] & 3 ) ;
switch ( litEncType )
{
case set_repeat :
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( dctx - > litEntropy = = 0 , dictionary_corrupted ) ;
2018-10-25 23:28:41 +00:00
/* fall-through */
case set_compressed :
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( srcSize < 5 , corruption_detected , " srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 " ) ;
2018-10-25 23:28:41 +00:00
{ size_t lhSize , litSize , litCSize ;
U32 singleStream = 0 ;
U32 const lhlCode = ( istart [ 0 ] > > 2 ) & 3 ;
U32 const lhc = MEM_readLE32 ( istart ) ;
2018-12-04 19:24:36 +00:00
size_t hufSuccess ;
2018-10-25 23:28:41 +00:00
switch ( lhlCode )
{
case 0 : case 1 : default : /* note : default is impossible, since lhlCode into [0..3] */
/* 2 - 2 - 10 - 10 */
singleStream = ! lhlCode ;
lhSize = 3 ;
litSize = ( lhc > > 4 ) & 0x3FF ;
litCSize = ( lhc > > 14 ) & 0x3FF ;
break ;
case 2 :
/* 2 - 2 - 14 - 14 */
lhSize = 4 ;
litSize = ( lhc > > 4 ) & 0x3FFF ;
litCSize = lhc > > 18 ;
break ;
case 3 :
/* 2 - 2 - 18 - 18 */
lhSize = 5 ;
litSize = ( lhc > > 4 ) & 0x3FFFF ;
litCSize = ( lhc > > 22 ) + ( istart [ 4 ] < < 10 ) ;
break ;
}
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( litSize > ZSTD_BLOCKSIZE_MAX , corruption_detected ) ;
RETURN_ERROR_IF ( litCSize + lhSize > srcSize , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
/* prefetch huffman table if cold */
if ( dctx - > ddictIsCold & & ( litSize > 768 /* heuristic */ ) ) {
PREFETCH_AREA ( dctx - > HUFptr , sizeof ( dctx - > entropy . hufTable ) ) ;
}
2018-12-04 19:24:36 +00:00
if ( litEncType = = set_repeat ) {
if ( singleStream ) {
hufSuccess = HUF_decompress1X_usingDTable_bmi2 (
dctx - > litBuffer , litSize , istart + lhSize , litCSize ,
dctx - > HUFptr , dctx - > bmi2 ) ;
} else {
hufSuccess = HUF_decompress4X_usingDTable_bmi2 (
dctx - > litBuffer , litSize , istart + lhSize , litCSize ,
dctx - > HUFptr , dctx - > bmi2 ) ;
}
} else {
if ( singleStream ) {
2018-12-04 19:44:02 +00:00
# if defined(HUF_FORCE_DECOMPRESS_X2)
hufSuccess = HUF_decompress1X_DCtx_wksp (
dctx - > entropy . hufTable , dctx - > litBuffer , litSize ,
istart + lhSize , litCSize , dctx - > workspace ,
sizeof ( dctx - > workspace ) ) ;
# else
2018-12-04 19:24:36 +00:00
hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2 (
dctx - > entropy . hufTable , dctx - > litBuffer , litSize ,
istart + lhSize , litCSize , dctx - > workspace ,
sizeof ( dctx - > workspace ) , dctx - > bmi2 ) ;
2018-12-04 19:44:02 +00:00
# endif
2018-12-04 19:24:36 +00:00
} else {
hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2 (
dctx - > entropy . hufTable , dctx - > litBuffer , litSize ,
istart + lhSize , litCSize , dctx - > workspace ,
sizeof ( dctx - > workspace ) , dctx - > bmi2 ) ;
}
}
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( HUF_isError ( hufSuccess ) , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
dctx - > litPtr = dctx - > litBuffer ;
dctx - > litSize = litSize ;
dctx - > litEntropy = 1 ;
if ( litEncType = = set_compressed ) dctx - > HUFptr = dctx - > entropy . hufTable ;
memset ( dctx - > litBuffer + dctx - > litSize , 0 , WILDCOPY_OVERLENGTH ) ;
return litCSize + lhSize ;
}
case set_basic :
{ size_t litSize , lhSize ;
U32 const lhlCode = ( ( istart [ 0 ] ) > > 2 ) & 3 ;
switch ( lhlCode )
{
case 0 : case 2 : default : /* note : default is impossible, since lhlCode into [0..3] */
lhSize = 1 ;
litSize = istart [ 0 ] > > 3 ;
break ;
case 1 :
lhSize = 2 ;
litSize = MEM_readLE16 ( istart ) > > 4 ;
break ;
case 3 :
lhSize = 3 ;
litSize = MEM_readLE24 ( istart ) > > 4 ;
break ;
}
if ( lhSize + litSize + WILDCOPY_OVERLENGTH > srcSize ) { /* risk reading beyond src buffer with wildcopy */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( litSize + lhSize > srcSize , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
memcpy ( dctx - > litBuffer , istart + lhSize , litSize ) ;
dctx - > litPtr = dctx - > litBuffer ;
dctx - > litSize = litSize ;
memset ( dctx - > litBuffer + dctx - > litSize , 0 , WILDCOPY_OVERLENGTH ) ;
return lhSize + litSize ;
}
/* direct reference into compressed stream */
dctx - > litPtr = istart + lhSize ;
dctx - > litSize = litSize ;
return lhSize + litSize ;
}
case set_rle :
{ U32 const lhlCode = ( ( istart [ 0 ] ) > > 2 ) & 3 ;
size_t litSize , lhSize ;
switch ( lhlCode )
{
case 0 : case 2 : default : /* note : default is impossible, since lhlCode into [0..3] */
lhSize = 1 ;
litSize = istart [ 0 ] > > 3 ;
break ;
case 1 :
lhSize = 2 ;
litSize = MEM_readLE16 ( istart ) > > 4 ;
break ;
case 3 :
lhSize = 3 ;
litSize = MEM_readLE24 ( istart ) > > 4 ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( srcSize < 4 , corruption_detected , " srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 " ) ;
2018-10-25 23:28:41 +00:00
break ;
}
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( litSize > ZSTD_BLOCKSIZE_MAX , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
memset ( dctx - > litBuffer , istart [ lhSize ] , litSize + WILDCOPY_OVERLENGTH ) ;
dctx - > litPtr = dctx - > litBuffer ;
dctx - > litSize = litSize ;
return lhSize + 1 ;
}
default :
2019-01-28 17:35:56 +00:00
RETURN_ERROR ( corruption_detected , " impossible " ) ;
2018-10-25 23:28:41 +00:00
}
}
}
/* Default FSE distribution tables.
* These are pre - calculated FSE decoding tables using default distributions as defined in specification :
* https : //github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
* They were generated programmatically with following method :
* - start from default distributions , present in / lib / common / zstd_internal . h
* - generate tables normally , using ZSTD_buildFSETable ( )
* - printout the content of tables
* - pretify output , report below , test with fuzzer to ensure it ' s correct */
/* Default FSE distribution table for Literal Lengths */
static const ZSTD_seqSymbol LL_defaultDTable [ ( 1 < < LL_DEFAULTNORMLOG ) + 1 ] = {
{ 1 , 1 , 1 , LL_DEFAULTNORMLOG } , /* header : fastMode, tableLog */
/* nextState, nbAddBits, nbBits, baseVal */
{ 0 , 0 , 4 , 0 } , { 16 , 0 , 4 , 0 } ,
{ 32 , 0 , 5 , 1 } , { 0 , 0 , 5 , 3 } ,
{ 0 , 0 , 5 , 4 } , { 0 , 0 , 5 , 6 } ,
{ 0 , 0 , 5 , 7 } , { 0 , 0 , 5 , 9 } ,
{ 0 , 0 , 5 , 10 } , { 0 , 0 , 5 , 12 } ,
{ 0 , 0 , 6 , 14 } , { 0 , 1 , 5 , 16 } ,
{ 0 , 1 , 5 , 20 } , { 0 , 1 , 5 , 22 } ,
{ 0 , 2 , 5 , 28 } , { 0 , 3 , 5 , 32 } ,
{ 0 , 4 , 5 , 48 } , { 32 , 6 , 5 , 64 } ,
{ 0 , 7 , 5 , 128 } , { 0 , 8 , 6 , 256 } ,
{ 0 , 10 , 6 , 1024 } , { 0 , 12 , 6 , 4096 } ,
{ 32 , 0 , 4 , 0 } , { 0 , 0 , 4 , 1 } ,
{ 0 , 0 , 5 , 2 } , { 32 , 0 , 5 , 4 } ,
{ 0 , 0 , 5 , 5 } , { 32 , 0 , 5 , 7 } ,
{ 0 , 0 , 5 , 8 } , { 32 , 0 , 5 , 10 } ,
{ 0 , 0 , 5 , 11 } , { 0 , 0 , 6 , 13 } ,
{ 32 , 1 , 5 , 16 } , { 0 , 1 , 5 , 18 } ,
{ 32 , 1 , 5 , 22 } , { 0 , 2 , 5 , 24 } ,
{ 32 , 3 , 5 , 32 } , { 0 , 3 , 5 , 40 } ,
{ 0 , 6 , 4 , 64 } , { 16 , 6 , 4 , 64 } ,
{ 32 , 7 , 5 , 128 } , { 0 , 9 , 6 , 512 } ,
{ 0 , 11 , 6 , 2048 } , { 48 , 0 , 4 , 0 } ,
{ 16 , 0 , 4 , 1 } , { 32 , 0 , 5 , 2 } ,
{ 32 , 0 , 5 , 3 } , { 32 , 0 , 5 , 5 } ,
{ 32 , 0 , 5 , 6 } , { 32 , 0 , 5 , 8 } ,
{ 32 , 0 , 5 , 9 } , { 32 , 0 , 5 , 11 } ,
{ 32 , 0 , 5 , 12 } , { 0 , 0 , 6 , 15 } ,
{ 32 , 1 , 5 , 18 } , { 32 , 1 , 5 , 20 } ,
{ 32 , 2 , 5 , 24 } , { 32 , 2 , 5 , 28 } ,
{ 32 , 3 , 5 , 40 } , { 32 , 4 , 5 , 48 } ,
{ 0 , 16 , 6 , 65536 } , { 0 , 15 , 6 , 32768 } ,
{ 0 , 14 , 6 , 16384 } , { 0 , 13 , 6 , 8192 } ,
} ; /* LL_defaultDTable */
/* Default FSE distribution table for Offset Codes */
static const ZSTD_seqSymbol OF_defaultDTable [ ( 1 < < OF_DEFAULTNORMLOG ) + 1 ] = {
{ 1 , 1 , 1 , OF_DEFAULTNORMLOG } , /* header : fastMode, tableLog */
/* nextState, nbAddBits, nbBits, baseVal */
{ 0 , 0 , 5 , 0 } , { 0 , 6 , 4 , 61 } ,
{ 0 , 9 , 5 , 509 } , { 0 , 15 , 5 , 32765 } ,
{ 0 , 21 , 5 , 2097149 } , { 0 , 3 , 5 , 5 } ,
{ 0 , 7 , 4 , 125 } , { 0 , 12 , 5 , 4093 } ,
{ 0 , 18 , 5 , 262141 } , { 0 , 23 , 5 , 8388605 } ,
{ 0 , 5 , 5 , 29 } , { 0 , 8 , 4 , 253 } ,
{ 0 , 14 , 5 , 16381 } , { 0 , 20 , 5 , 1048573 } ,
{ 0 , 2 , 5 , 1 } , { 16 , 7 , 4 , 125 } ,
{ 0 , 11 , 5 , 2045 } , { 0 , 17 , 5 , 131069 } ,
{ 0 , 22 , 5 , 4194301 } , { 0 , 4 , 5 , 13 } ,
{ 16 , 8 , 4 , 253 } , { 0 , 13 , 5 , 8189 } ,
{ 0 , 19 , 5 , 524285 } , { 0 , 1 , 5 , 1 } ,
{ 16 , 6 , 4 , 61 } , { 0 , 10 , 5 , 1021 } ,
{ 0 , 16 , 5 , 65533 } , { 0 , 28 , 5 , 268435453 } ,
{ 0 , 27 , 5 , 134217725 } , { 0 , 26 , 5 , 67108861 } ,
{ 0 , 25 , 5 , 33554429 } , { 0 , 24 , 5 , 16777213 } ,
} ; /* OF_defaultDTable */
/* Default FSE distribution table for Match Lengths */
static const ZSTD_seqSymbol ML_defaultDTable [ ( 1 < < ML_DEFAULTNORMLOG ) + 1 ] = {
{ 1 , 1 , 1 , ML_DEFAULTNORMLOG } , /* header : fastMode, tableLog */
/* nextState, nbAddBits, nbBits, baseVal */
{ 0 , 0 , 6 , 3 } , { 0 , 0 , 4 , 4 } ,
{ 32 , 0 , 5 , 5 } , { 0 , 0 , 5 , 6 } ,
{ 0 , 0 , 5 , 8 } , { 0 , 0 , 5 , 9 } ,
{ 0 , 0 , 5 , 11 } , { 0 , 0 , 6 , 13 } ,
{ 0 , 0 , 6 , 16 } , { 0 , 0 , 6 , 19 } ,
{ 0 , 0 , 6 , 22 } , { 0 , 0 , 6 , 25 } ,
{ 0 , 0 , 6 , 28 } , { 0 , 0 , 6 , 31 } ,
{ 0 , 0 , 6 , 34 } , { 0 , 1 , 6 , 37 } ,
{ 0 , 1 , 6 , 41 } , { 0 , 2 , 6 , 47 } ,
{ 0 , 3 , 6 , 59 } , { 0 , 4 , 6 , 83 } ,
{ 0 , 7 , 6 , 131 } , { 0 , 9 , 6 , 515 } ,
{ 16 , 0 , 4 , 4 } , { 0 , 0 , 4 , 5 } ,
{ 32 , 0 , 5 , 6 } , { 0 , 0 , 5 , 7 } ,
{ 32 , 0 , 5 , 9 } , { 0 , 0 , 5 , 10 } ,
{ 0 , 0 , 6 , 12 } , { 0 , 0 , 6 , 15 } ,
{ 0 , 0 , 6 , 18 } , { 0 , 0 , 6 , 21 } ,
{ 0 , 0 , 6 , 24 } , { 0 , 0 , 6 , 27 } ,
{ 0 , 0 , 6 , 30 } , { 0 , 0 , 6 , 33 } ,
{ 0 , 1 , 6 , 35 } , { 0 , 1 , 6 , 39 } ,
{ 0 , 2 , 6 , 43 } , { 0 , 3 , 6 , 51 } ,
{ 0 , 4 , 6 , 67 } , { 0 , 5 , 6 , 99 } ,
{ 0 , 8 , 6 , 259 } , { 32 , 0 , 4 , 4 } ,
{ 48 , 0 , 4 , 4 } , { 16 , 0 , 4 , 5 } ,
{ 32 , 0 , 5 , 7 } , { 32 , 0 , 5 , 8 } ,
{ 32 , 0 , 5 , 10 } , { 32 , 0 , 5 , 11 } ,
{ 0 , 0 , 6 , 14 } , { 0 , 0 , 6 , 17 } ,
{ 0 , 0 , 6 , 20 } , { 0 , 0 , 6 , 23 } ,
{ 0 , 0 , 6 , 26 } , { 0 , 0 , 6 , 29 } ,
{ 0 , 0 , 6 , 32 } , { 0 , 16 , 6 , 65539 } ,
{ 0 , 15 , 6 , 32771 } , { 0 , 14 , 6 , 16387 } ,
{ 0 , 13 , 6 , 8195 } , { 0 , 12 , 6 , 4099 } ,
{ 0 , 11 , 6 , 2051 } , { 0 , 10 , 6 , 1027 } ,
} ; /* ML_defaultDTable */
static void ZSTD_buildSeqTable_rle ( ZSTD_seqSymbol * dt , U32 baseValue , U32 nbAddBits )
{
void * ptr = dt ;
ZSTD_seqSymbol_header * const DTableH = ( ZSTD_seqSymbol_header * ) ptr ;
ZSTD_seqSymbol * const cell = dt + 1 ;
DTableH - > tableLog = 0 ;
DTableH - > fastMode = 0 ;
cell - > nbBits = 0 ;
cell - > nextState = 0 ;
assert ( nbAddBits < 255 ) ;
cell - > nbAdditionalBits = ( BYTE ) nbAddBits ;
cell - > baseValue = baseValue ;
}
/* ZSTD_buildFSETable() :
2018-10-26 22:21:52 +00:00
* generate FSE decoding table for one symbol ( ll , ml or off )
* cannot fail if input is valid = >
* all inputs are presumed validated at this stage */
2018-10-25 23:28:41 +00:00
void
ZSTD_buildFSETable ( ZSTD_seqSymbol * dt ,
const short * normalizedCounter , unsigned maxSymbolValue ,
const U32 * baseValue , const U32 * nbAdditionalBits ,
unsigned tableLog )
{
ZSTD_seqSymbol * const tableDecode = dt + 1 ;
U16 symbolNext [ MaxSeq + 1 ] ;
U32 const maxSV1 = maxSymbolValue + 1 ;
U32 const tableSize = 1 < < tableLog ;
U32 highThreshold = tableSize - 1 ;
/* Sanity Checks */
assert ( maxSymbolValue < = MaxSeq ) ;
assert ( tableLog < = MaxFSELog ) ;
/* Init, lay down lowprob symbols */
{ ZSTD_seqSymbol_header DTableH ;
DTableH . tableLog = tableLog ;
DTableH . fastMode = 1 ;
{ S16 const largeLimit = ( S16 ) ( 1 < < ( tableLog - 1 ) ) ;
U32 s ;
for ( s = 0 ; s < maxSV1 ; s + + ) {
if ( normalizedCounter [ s ] = = - 1 ) {
tableDecode [ highThreshold - - ] . baseValue = s ;
symbolNext [ s ] = 1 ;
} else {
if ( normalizedCounter [ s ] > = largeLimit ) DTableH . fastMode = 0 ;
symbolNext [ s ] = normalizedCounter [ s ] ;
} } }
memcpy ( dt , & DTableH , sizeof ( DTableH ) ) ;
}
/* Spread symbols */
{ U32 const tableMask = tableSize - 1 ;
U32 const step = FSE_TABLESTEP ( tableSize ) ;
U32 s , position = 0 ;
for ( s = 0 ; s < maxSV1 ; s + + ) {
int i ;
for ( i = 0 ; i < normalizedCounter [ s ] ; i + + ) {
tableDecode [ position ] . baseValue = s ;
position = ( position + step ) & tableMask ;
while ( position > highThreshold ) position = ( position + step ) & tableMask ; /* lowprob area */
} }
assert ( position = = 0 ) ; /* position must reach all cells once, otherwise normalizedCounter is incorrect */
}
/* Build Decoding table */
{ U32 u ;
for ( u = 0 ; u < tableSize ; u + + ) {
U32 const symbol = tableDecode [ u ] . baseValue ;
U32 const nextState = symbolNext [ symbol ] + + ;
tableDecode [ u ] . nbBits = ( BYTE ) ( tableLog - BIT_highbit32 ( nextState ) ) ;
tableDecode [ u ] . nextState = ( U16 ) ( ( nextState < < tableDecode [ u ] . nbBits ) - tableSize ) ;
assert ( nbAdditionalBits [ symbol ] < 255 ) ;
tableDecode [ u ] . nbAdditionalBits = ( BYTE ) nbAdditionalBits [ symbol ] ;
tableDecode [ u ] . baseValue = baseValue [ symbol ] ;
} }
}
/*! ZSTD_buildSeqTable() :
* @ return : nb bytes read from src ,
* or an error code if it fails */
static size_t ZSTD_buildSeqTable ( ZSTD_seqSymbol * DTableSpace , const ZSTD_seqSymbol * * DTablePtr ,
fix confusion between unsigned <-> U32
as suggested in #1441.
generally U32 and unsigned are the same thing,
except when they are not ...
case : 32-bit compilation for MIPS (uint32_t == unsigned long)
A vast majority of transformation consists in transforming U32 into unsigned.
In rare cases, it's the other way around (typically for internal code, such as seeds).
Among a few issues this patches solves :
- some parameters were declared with type `unsigned` in *.h,
but with type `U32` in their implementation *.c .
- some parameters have type unsigned*,
but the caller user a pointer to U32 instead.
These fixes are useful.
However, the bulk of changes is about %u formating,
which requires unsigned type,
but generally receives U32 values instead,
often just for brevity (U32 is shorter than unsigned).
These changes are generally minor, or even annoying.
As a consequence, the amount of code changed is larger than I would expect for such a patch.
Testing is also a pain :
it requires manually modifying `mem.h`,
in order to lie about `U32`
and force it to be an `unsigned long` typically.
On a 64-bit system, this will break the equivalence unsigned == U32.
Unfortunately, it will also break a few static_assert(), controlling structure sizes.
So it also requires modifying `debug.h` to make `static_assert()` a noop.
And then reverting these changes.
So it's inconvenient, and as a consequence,
this property is currently not checked during CI tests.
Therefore, these problems can emerge again in the future.
I wonder if it is worth ensuring proper distinction of U32 != unsigned in CI tests.
It's another restriction for coding, adding more frustration during merge tests,
since most platforms don't need this distinction (hence contributor will not see it),
and while this can matter in theory, the number of platforms impacted seems minimal.
Thoughts ?
2018-12-22 00:19:44 +00:00
symbolEncodingType_e type , unsigned max , U32 maxLog ,
2018-10-25 23:28:41 +00:00
const void * src , size_t srcSize ,
const U32 * baseValue , const U32 * nbAdditionalBits ,
const ZSTD_seqSymbol * defaultTable , U32 flagRepeatTable ,
int ddictIsCold , int nbSeq )
{
switch ( type )
{
case set_rle :
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( ! srcSize , srcSize_wrong ) ;
RETURN_ERROR_IF ( ( * ( const BYTE * ) src ) > max , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
{ U32 const symbol = * ( const BYTE * ) src ;
U32 const baseline = baseValue [ symbol ] ;
U32 const nbBits = nbAdditionalBits [ symbol ] ;
ZSTD_buildSeqTable_rle ( DTableSpace , baseline , nbBits ) ;
}
* DTablePtr = DTableSpace ;
return 1 ;
case set_basic :
* DTablePtr = defaultTable ;
return 0 ;
case set_repeat :
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( ! flagRepeatTable , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
/* prefetch FSE table if used */
if ( ddictIsCold & & ( nbSeq > 24 /* heuristic */ ) ) {
const void * const pStart = * DTablePtr ;
size_t const pSize = sizeof ( ZSTD_seqSymbol ) * ( SEQSYMBOL_TABLE_SIZE ( maxLog ) ) ;
PREFETCH_AREA ( pStart , pSize ) ;
}
return 0 ;
case set_compressed :
fix confusion between unsigned <-> U32
as suggested in #1441.
generally U32 and unsigned are the same thing,
except when they are not ...
case : 32-bit compilation for MIPS (uint32_t == unsigned long)
A vast majority of transformation consists in transforming U32 into unsigned.
In rare cases, it's the other way around (typically for internal code, such as seeds).
Among a few issues this patches solves :
- some parameters were declared with type `unsigned` in *.h,
but with type `U32` in their implementation *.c .
- some parameters have type unsigned*,
but the caller user a pointer to U32 instead.
These fixes are useful.
However, the bulk of changes is about %u formating,
which requires unsigned type,
but generally receives U32 values instead,
often just for brevity (U32 is shorter than unsigned).
These changes are generally minor, or even annoying.
As a consequence, the amount of code changed is larger than I would expect for such a patch.
Testing is also a pain :
it requires manually modifying `mem.h`,
in order to lie about `U32`
and force it to be an `unsigned long` typically.
On a 64-bit system, this will break the equivalence unsigned == U32.
Unfortunately, it will also break a few static_assert(), controlling structure sizes.
So it also requires modifying `debug.h` to make `static_assert()` a noop.
And then reverting these changes.
So it's inconvenient, and as a consequence,
this property is currently not checked during CI tests.
Therefore, these problems can emerge again in the future.
I wonder if it is worth ensuring proper distinction of U32 != unsigned in CI tests.
It's another restriction for coding, adding more frustration during merge tests,
since most platforms don't need this distinction (hence contributor will not see it),
and while this can matter in theory, the number of platforms impacted seems minimal.
Thoughts ?
2018-12-22 00:19:44 +00:00
{ unsigned tableLog ;
2018-10-25 23:28:41 +00:00
S16 norm [ MaxSeq + 1 ] ;
size_t const headerSize = FSE_readNCount ( norm , & max , & tableLog , src , srcSize ) ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( FSE_isError ( headerSize ) , corruption_detected ) ;
RETURN_ERROR_IF ( tableLog > maxLog , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
ZSTD_buildFSETable ( DTableSpace , norm , max , baseValue , nbAdditionalBits , tableLog ) ;
* DTablePtr = DTableSpace ;
return headerSize ;
}
2018-12-06 01:17:11 +00:00
default :
2018-10-25 23:28:41 +00:00
assert ( 0 ) ;
2019-01-28 17:35:56 +00:00
RETURN_ERROR ( GENERIC , " impossible " ) ;
2018-10-25 23:28:41 +00:00
}
}
size_t ZSTD_decodeSeqHeaders ( ZSTD_DCtx * dctx , int * nbSeqPtr ,
const void * src , size_t srcSize )
{
const BYTE * const istart = ( const BYTE * const ) src ;
const BYTE * const iend = istart + srcSize ;
const BYTE * ip = istart ;
int nbSeq ;
DEBUGLOG ( 5 , " ZSTD_decodeSeqHeaders " ) ;
/* check */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( srcSize < MIN_SEQUENCES_SIZE , srcSize_wrong ) ;
2018-10-25 23:28:41 +00:00
/* SeqHead */
nbSeq = * ip + + ;
if ( ! nbSeq ) {
* nbSeqPtr = 0 ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( srcSize ! = 1 , srcSize_wrong ) ;
2018-10-25 23:28:41 +00:00
return 1 ;
}
if ( nbSeq > 0x7F ) {
if ( nbSeq = = 0xFF ) {
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( ip + 2 > iend , srcSize_wrong ) ;
2018-10-25 23:28:41 +00:00
nbSeq = MEM_readLE16 ( ip ) + LONGNBSEQ , ip + = 2 ;
} else {
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( ip > = iend , srcSize_wrong ) ;
2018-10-25 23:28:41 +00:00
nbSeq = ( ( nbSeq - 0x80 ) < < 8 ) + * ip + + ;
}
}
* nbSeqPtr = nbSeq ;
/* FSE table descriptors */
2019-06-26 23:43:37 +00:00
RETURN_ERROR_IF ( ip + 1 > iend , srcSize_wrong ) ; /* minimum possible size: 1 byte for symbol encoding types */
2018-10-25 23:28:41 +00:00
{ symbolEncodingType_e const LLtype = ( symbolEncodingType_e ) ( * ip > > 6 ) ;
symbolEncodingType_e const OFtype = ( symbolEncodingType_e ) ( ( * ip > > 4 ) & 3 ) ;
symbolEncodingType_e const MLtype = ( symbolEncodingType_e ) ( ( * ip > > 2 ) & 3 ) ;
ip + + ;
/* Build DTables */
{ size_t const llhSize = ZSTD_buildSeqTable ( dctx - > entropy . LLTable , & dctx - > LLTptr ,
LLtype , MaxLL , LLFSELog ,
ip , iend - ip ,
LL_base , LL_bits ,
LL_defaultDTable , dctx - > fseEntropy ,
dctx - > ddictIsCold , nbSeq ) ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( ZSTD_isError ( llhSize ) , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
ip + = llhSize ;
}
{ size_t const ofhSize = ZSTD_buildSeqTable ( dctx - > entropy . OFTable , & dctx - > OFTptr ,
OFtype , MaxOff , OffFSELog ,
ip , iend - ip ,
OF_base , OF_bits ,
OF_defaultDTable , dctx - > fseEntropy ,
dctx - > ddictIsCold , nbSeq ) ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( ZSTD_isError ( ofhSize ) , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
ip + = ofhSize ;
}
{ size_t const mlhSize = ZSTD_buildSeqTable ( dctx - > entropy . MLTable , & dctx - > MLTptr ,
MLtype , MaxML , MLFSELog ,
ip , iend - ip ,
ML_base , ML_bits ,
ML_defaultDTable , dctx - > fseEntropy ,
dctx - > ddictIsCold , nbSeq ) ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( ZSTD_isError ( mlhSize ) , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
ip + = mlhSize ;
}
}
return ip - istart ;
}
typedef struct {
size_t litLength ;
size_t matchLength ;
size_t offset ;
const BYTE * match ;
} seq_t ;
typedef struct {
size_t state ;
const ZSTD_seqSymbol * table ;
} ZSTD_fseState ;
typedef struct {
BIT_DStream_t DStream ;
ZSTD_fseState stateLL ;
ZSTD_fseState stateOffb ;
ZSTD_fseState stateML ;
size_t prevOffset [ ZSTD_REP_NUM ] ;
const BYTE * prefixStart ;
const BYTE * dictEnd ;
size_t pos ;
} seqState_t ;
/* ZSTD_execSequenceLast7():
* exceptional case : decompress a match starting within last 7 bytes of output buffer .
* requires more careful checks , to ensure there is no overflow .
* performance does not matter though .
* note : this case is supposed to be never generated " naturally " by reference encoder ,
* since in most cases it needs at least 8 bytes to look for a match .
* but it ' s allowed by the specification . */
FORCE_NOINLINE
size_t ZSTD_execSequenceLast7 ( BYTE * op ,
BYTE * const oend , seq_t sequence ,
const BYTE * * litPtr , const BYTE * const litLimit ,
const BYTE * const base , const BYTE * const vBase , const BYTE * const dictEnd )
{
BYTE * const oLitEnd = op + sequence . litLength ;
size_t const sequenceLength = sequence . litLength + sequence . matchLength ;
BYTE * const oMatchEnd = op + sequenceLength ; /* risk : address space overflow (32-bits) */
const BYTE * const iLitEnd = * litPtr + sequence . litLength ;
const BYTE * match = oLitEnd - sequence . offset ;
/* check */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( oMatchEnd > oend , dstSize_tooSmall , " last match must fit within dstBuffer " ) ;
RETURN_ERROR_IF ( iLitEnd > litLimit , corruption_detected , " try to read beyond literal buffer " ) ;
2018-10-25 23:28:41 +00:00
/* copy literals */
while ( op < oLitEnd ) * op + + = * ( * litPtr ) + + ;
/* copy Match */
if ( sequence . offset > ( size_t ) ( oLitEnd - base ) ) {
/* offset beyond prefix */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( sequence . offset > ( size_t ) ( oLitEnd - vBase ) , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
match = dictEnd - ( base - match ) ;
if ( match + sequence . matchLength < = dictEnd ) {
memmove ( oLitEnd , match , sequence . matchLength ) ;
return sequenceLength ;
}
/* span extDict & currentPrefixSegment */
{ size_t const length1 = dictEnd - match ;
memmove ( oLitEnd , match , length1 ) ;
op = oLitEnd + length1 ;
sequence . matchLength - = length1 ;
match = base ;
} }
while ( op < oMatchEnd ) * op + + = * match + + ;
return sequenceLength ;
}
HINT_INLINE
size_t ZSTD_execSequence ( BYTE * op ,
BYTE * const oend , seq_t sequence ,
const BYTE * * litPtr , const BYTE * const litLimit ,
const BYTE * const prefixStart , const BYTE * const virtualStart , const BYTE * const dictEnd )
{
BYTE * const oLitEnd = op + sequence . litLength ;
size_t const sequenceLength = sequence . litLength + sequence . matchLength ;
BYTE * const oMatchEnd = op + sequenceLength ; /* risk : address space overflow (32-bits) */
BYTE * const oend_w = oend - WILDCOPY_OVERLENGTH ;
const BYTE * const iLitEnd = * litPtr + sequence . litLength ;
const BYTE * match = oLitEnd - sequence . offset ;
/* check */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( oMatchEnd > oend , dstSize_tooSmall , " last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend " ) ;
RETURN_ERROR_IF ( iLitEnd > litLimit , corruption_detected , " over-read beyond lit buffer " ) ;
2018-10-25 23:28:41 +00:00
if ( oLitEnd > oend_w ) return ZSTD_execSequenceLast7 ( op , oend , sequence , litPtr , litLimit , prefixStart , virtualStart , dictEnd ) ;
/* copy Literals */
if ( sequence . litLength > 8 )
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
ZSTD_wildcopy_16min ( op , ( * litPtr ) , sequence . litLength , ZSTD_no_overlap ) ; /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
else
ZSTD_copy8 ( op , * litPtr ) ;
2018-10-25 23:28:41 +00:00
op = oLitEnd ;
* litPtr = iLitEnd ; /* update for next sequence */
/* copy Match */
if ( sequence . offset > ( size_t ) ( oLitEnd - prefixStart ) ) {
/* offset beyond prefix -> go into extDict */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( sequence . offset > ( size_t ) ( oLitEnd - virtualStart ) , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
match = dictEnd + ( match - prefixStart ) ;
if ( match + sequence . matchLength < = dictEnd ) {
memmove ( oLitEnd , match , sequence . matchLength ) ;
return sequenceLength ;
}
/* span extDict & currentPrefixSegment */
{ size_t const length1 = dictEnd - match ;
memmove ( oLitEnd , match , length1 ) ;
op = oLitEnd + length1 ;
sequence . matchLength - = length1 ;
match = prefixStart ;
if ( op > oend_w | | sequence . matchLength < MINMATCH ) {
U32 i ;
for ( i = 0 ; i < sequence . matchLength ; + + i ) op [ i ] = match [ i ] ;
return sequenceLength ;
}
} }
/* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
/* match within prefix */
if ( sequence . offset < 8 ) {
/* close range match, overlap */
static const U32 dec32table [ ] = { 0 , 1 , 2 , 1 , 4 , 4 , 4 , 4 } ; /* added */
static const int dec64table [ ] = { 8 , 8 , 8 , 7 , 8 , 9 , 10 , 11 } ; /* subtracted */
int const sub2 = dec64table [ sequence . offset ] ;
op [ 0 ] = match [ 0 ] ;
op [ 1 ] = match [ 1 ] ;
op [ 2 ] = match [ 2 ] ;
op [ 3 ] = match [ 3 ] ;
match + = dec32table [ sequence . offset ] ;
ZSTD_copy4 ( op + 4 , match ) ;
match - = sub2 ;
} else {
ZSTD_copy8 ( op , match ) ;
}
op + = 8 ; match + = 8 ;
if ( oMatchEnd > oend - ( 16 - MINMATCH ) ) {
if ( op < oend_w ) {
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
ZSTD_wildcopy ( op , match , oend_w - op , ZSTD_overlap_src_before_dst ) ;
2018-10-25 23:28:41 +00:00
match + = oend_w - op ;
op = oend_w ;
}
while ( op < oMatchEnd ) * op + + = * match + + ;
} else {
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
ZSTD_wildcopy ( op , match , ( ptrdiff_t ) sequence . matchLength - 8 , ZSTD_overlap_src_before_dst ) ; /* works even if matchLength < 8 */
2018-10-25 23:28:41 +00:00
}
return sequenceLength ;
}
HINT_INLINE
size_t ZSTD_execSequenceLong ( BYTE * op ,
BYTE * const oend , seq_t sequence ,
const BYTE * * litPtr , const BYTE * const litLimit ,
const BYTE * const prefixStart , const BYTE * const dictStart , const BYTE * const dictEnd )
{
BYTE * const oLitEnd = op + sequence . litLength ;
size_t const sequenceLength = sequence . litLength + sequence . matchLength ;
BYTE * const oMatchEnd = op + sequenceLength ; /* risk : address space overflow (32-bits) */
BYTE * const oend_w = oend - WILDCOPY_OVERLENGTH ;
const BYTE * const iLitEnd = * litPtr + sequence . litLength ;
const BYTE * match = sequence . match ;
/* check */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( oMatchEnd > oend , dstSize_tooSmall , " last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend " ) ;
RETURN_ERROR_IF ( iLitEnd > litLimit , corruption_detected , " over-read beyond lit buffer " ) ;
2018-10-25 23:28:41 +00:00
if ( oLitEnd > oend_w ) return ZSTD_execSequenceLast7 ( op , oend , sequence , litPtr , litLimit , prefixStart , dictStart , dictEnd ) ;
/* copy Literals */
if ( sequence . litLength > 8 )
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
ZSTD_wildcopy_16min ( op , * litPtr , sequence . litLength , ZSTD_no_overlap ) ; /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
else
ZSTD_copy8 ( op , * litPtr ) ; /* note : op <= oLitEnd <= oend_w == oend - 8 */
2018-10-25 23:28:41 +00:00
op = oLitEnd ;
* litPtr = iLitEnd ; /* update for next sequence */
/* copy Match */
if ( sequence . offset > ( size_t ) ( oLitEnd - prefixStart ) ) {
/* offset beyond prefix */
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( sequence . offset > ( size_t ) ( oLitEnd - dictStart ) , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
if ( match + sequence . matchLength < = dictEnd ) {
memmove ( oLitEnd , match , sequence . matchLength ) ;
return sequenceLength ;
}
/* span extDict & currentPrefixSegment */
{ size_t const length1 = dictEnd - match ;
memmove ( oLitEnd , match , length1 ) ;
op = oLitEnd + length1 ;
sequence . matchLength - = length1 ;
match = prefixStart ;
if ( op > oend_w | | sequence . matchLength < MINMATCH ) {
U32 i ;
for ( i = 0 ; i < sequence . matchLength ; + + i ) op [ i ] = match [ i ] ;
return sequenceLength ;
}
} }
assert ( op < = oend_w ) ;
assert ( sequence . matchLength > = MINMATCH ) ;
/* match within prefix */
if ( sequence . offset < 8 ) {
/* close range match, overlap */
static const U32 dec32table [ ] = { 0 , 1 , 2 , 1 , 4 , 4 , 4 , 4 } ; /* added */
static const int dec64table [ ] = { 8 , 8 , 8 , 7 , 8 , 9 , 10 , 11 } ; /* subtracted */
int const sub2 = dec64table [ sequence . offset ] ;
op [ 0 ] = match [ 0 ] ;
op [ 1 ] = match [ 1 ] ;
op [ 2 ] = match [ 2 ] ;
op [ 3 ] = match [ 3 ] ;
match + = dec32table [ sequence . offset ] ;
ZSTD_copy4 ( op + 4 , match ) ;
match - = sub2 ;
} else {
ZSTD_copy8 ( op , match ) ;
}
op + = 8 ; match + = 8 ;
if ( oMatchEnd > oend - ( 16 - MINMATCH ) ) {
if ( op < oend_w ) {
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
ZSTD_wildcopy ( op , match , oend_w - op , ZSTD_overlap_src_before_dst ) ;
2018-10-25 23:28:41 +00:00
match + = oend_w - op ;
op = oend_w ;
}
while ( op < oMatchEnd ) * op + + = * match + + ;
} else {
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
ZSTD_wildcopy ( op , match , ( ptrdiff_t ) sequence . matchLength - 8 , ZSTD_overlap_src_before_dst ) ; /* works even if matchLength < 8 */
2018-10-25 23:28:41 +00:00
}
return sequenceLength ;
}
static void
ZSTD_initFseState ( ZSTD_fseState * DStatePtr , BIT_DStream_t * bitD , const ZSTD_seqSymbol * dt )
{
const void * ptr = dt ;
const ZSTD_seqSymbol_header * const DTableH = ( const ZSTD_seqSymbol_header * ) ptr ;
DStatePtr - > state = BIT_readBits ( bitD , DTableH - > tableLog ) ;
DEBUGLOG ( 6 , " ZSTD_initFseState : val=%u using %u bits " ,
( U32 ) DStatePtr - > state , DTableH - > tableLog ) ;
BIT_reloadDStream ( bitD ) ;
DStatePtr - > table = dt + 1 ;
}
FORCE_INLINE_TEMPLATE void
ZSTD_updateFseState ( ZSTD_fseState * DStatePtr , BIT_DStream_t * bitD )
{
ZSTD_seqSymbol const DInfo = DStatePtr - > table [ DStatePtr - > state ] ;
U32 const nbBits = DInfo . nbBits ;
size_t const lowBits = BIT_readBits ( bitD , nbBits ) ;
DStatePtr - > state = DInfo . nextState + lowBits ;
}
/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
* offset bits . But we can only read at most ( STREAM_ACCUMULATOR_MIN_32 - 1 )
* bits before reloading . This value is the maximum number of bytes we read
2019-04-12 18:18:11 +00:00
* after reloading when we are decoding long offsets .
2018-10-25 23:28:41 +00:00
*/
# define LONG_OFFSETS_MAX_EXTRA_BITS_32 \
( ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \
? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \
: 0 )
typedef enum { ZSTD_lo_isRegularOffset , ZSTD_lo_isLongOffset = 1 } ZSTD_longOffset_e ;
2018-12-20 20:20:34 +00:00
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
2018-10-25 23:28:41 +00:00
FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence ( seqState_t * seqState , const ZSTD_longOffset_e longOffsets )
{
seq_t seq ;
U32 const llBits = seqState - > stateLL . table [ seqState - > stateLL . state ] . nbAdditionalBits ;
U32 const mlBits = seqState - > stateML . table [ seqState - > stateML . state ] . nbAdditionalBits ;
U32 const ofBits = seqState - > stateOffb . table [ seqState - > stateOffb . state ] . nbAdditionalBits ;
U32 const totalBits = llBits + mlBits + ofBits ;
U32 const llBase = seqState - > stateLL . table [ seqState - > stateLL . state ] . baseValue ;
U32 const mlBase = seqState - > stateML . table [ seqState - > stateML . state ] . baseValue ;
U32 const ofBase = seqState - > stateOffb . table [ seqState - > stateOffb . state ] . baseValue ;
/* sequence */
{ size_t offset ;
if ( ! ofBits )
offset = 0 ;
else {
ZSTD_STATIC_ASSERT ( ZSTD_lo_isLongOffset = = 1 ) ;
ZSTD_STATIC_ASSERT ( LONG_OFFSETS_MAX_EXTRA_BITS_32 = = 5 ) ;
assert ( ofBits < = MaxOff ) ;
if ( MEM_32bits ( ) & & longOffsets & & ( ofBits > = STREAM_ACCUMULATOR_MIN_32 ) ) {
U32 const extraBits = ofBits - MIN ( ofBits , 32 - seqState - > DStream . bitsConsumed ) ;
offset = ofBase + ( BIT_readBitsFast ( & seqState - > DStream , ofBits - extraBits ) < < extraBits ) ;
BIT_reloadDStream ( & seqState - > DStream ) ;
if ( extraBits ) offset + = BIT_readBitsFast ( & seqState - > DStream , extraBits ) ;
assert ( extraBits < = LONG_OFFSETS_MAX_EXTRA_BITS_32 ) ; /* to avoid another reload */
} else {
offset = ofBase + BIT_readBitsFast ( & seqState - > DStream , ofBits /*>0*/ ) ; /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
if ( MEM_32bits ( ) ) BIT_reloadDStream ( & seqState - > DStream ) ;
}
}
if ( ofBits < = 1 ) {
offset + = ( llBase = = 0 ) ;
if ( offset ) {
size_t temp = ( offset = = 3 ) ? seqState - > prevOffset [ 0 ] - 1 : seqState - > prevOffset [ offset ] ;
temp + = ! temp ; /* 0 is not valid; input is corrupted; force offset to 1 */
if ( offset ! = 1 ) seqState - > prevOffset [ 2 ] = seqState - > prevOffset [ 1 ] ;
seqState - > prevOffset [ 1 ] = seqState - > prevOffset [ 0 ] ;
seqState - > prevOffset [ 0 ] = offset = temp ;
} else { /* offset == 0 */
offset = seqState - > prevOffset [ 0 ] ;
}
} else {
seqState - > prevOffset [ 2 ] = seqState - > prevOffset [ 1 ] ;
seqState - > prevOffset [ 1 ] = seqState - > prevOffset [ 0 ] ;
seqState - > prevOffset [ 0 ] = offset ;
}
seq . offset = offset ;
}
seq . matchLength = mlBase
+ ( ( mlBits > 0 ) ? BIT_readBitsFast ( & seqState - > DStream , mlBits /*>0*/ ) : 0 ) ; /* <= 16 bits */
if ( MEM_32bits ( ) & & ( mlBits + llBits > = STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 ) )
BIT_reloadDStream ( & seqState - > DStream ) ;
if ( MEM_64bits ( ) & & ( totalBits > = STREAM_ACCUMULATOR_MIN_64 - ( LLFSELog + MLFSELog + OffFSELog ) ) )
BIT_reloadDStream ( & seqState - > DStream ) ;
/* Ensure there are enough bits to read the rest of data in 64-bit mode. */
ZSTD_STATIC_ASSERT ( 16 + LLFSELog + MLFSELog + OffFSELog < STREAM_ACCUMULATOR_MIN_64 ) ;
seq . litLength = llBase
+ ( ( llBits > 0 ) ? BIT_readBitsFast ( & seqState - > DStream , llBits /*>0*/ ) : 0 ) ; /* <= 16 bits */
if ( MEM_32bits ( ) )
BIT_reloadDStream ( & seqState - > DStream ) ;
DEBUGLOG ( 6 , " seq: litL=%u, matchL=%u, offset=%u " ,
( U32 ) seq . litLength , ( U32 ) seq . matchLength , ( U32 ) seq . offset ) ;
/* ANS state update */
ZSTD_updateFseState ( & seqState - > stateLL , & seqState - > DStream ) ; /* <= 9 bits */
ZSTD_updateFseState ( & seqState - > stateML , & seqState - > DStream ) ; /* <= 9 bits */
if ( MEM_32bits ( ) ) BIT_reloadDStream ( & seqState - > DStream ) ; /* <= 18 bits */
ZSTD_updateFseState ( & seqState - > stateOffb , & seqState - > DStream ) ; /* <= 8 bits */
return seq ;
}
FORCE_INLINE_TEMPLATE size_t
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
DONT_VECTORIZE
2018-10-25 23:28:41 +00:00
ZSTD_decompressSequences_body ( ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
{
const BYTE * ip = ( const BYTE * ) seqStart ;
const BYTE * const iend = ip + seqSize ;
BYTE * const ostart = ( BYTE * const ) dst ;
BYTE * const oend = ostart + maxDstSize ;
BYTE * op = ostart ;
const BYTE * litPtr = dctx - > litPtr ;
const BYTE * const litEnd = litPtr + dctx - > litSize ;
const BYTE * const prefixStart = ( const BYTE * ) ( dctx - > prefixStart ) ;
const BYTE * const vBase = ( const BYTE * ) ( dctx - > virtualStart ) ;
const BYTE * const dictEnd = ( const BYTE * ) ( dctx - > dictEnd ) ;
DEBUGLOG ( 5 , " ZSTD_decompressSequences_body " ) ;
/* Regen sequences */
if ( nbSeq ) {
seqState_t seqState ;
dctx - > fseEntropy = 1 ;
{ U32 i ; for ( i = 0 ; i < ZSTD_REP_NUM ; i + + ) seqState . prevOffset [ i ] = dctx - > entropy . rep [ i ] ; }
2019-01-28 22:27:29 +00:00
RETURN_ERROR_IF (
ERR_isError ( BIT_initDStream ( & seqState . DStream , ip , iend - ip ) ) ,
corruption_detected ) ;
2018-10-25 23:28:41 +00:00
ZSTD_initFseState ( & seqState . stateLL , & seqState . DStream , dctx - > LLTptr ) ;
ZSTD_initFseState ( & seqState . stateOffb , & seqState . DStream , dctx - > OFTptr ) ;
ZSTD_initFseState ( & seqState . stateML , & seqState . DStream , dctx - > MLTptr ) ;
2019-04-23 21:22:16 +00:00
ZSTD_STATIC_ASSERT (
BIT_DStream_unfinished < BIT_DStream_completed & &
BIT_DStream_endOfBuffer < BIT_DStream_completed & &
BIT_DStream_completed < BIT_DStream_overflow ) ;
2018-10-25 23:28:41 +00:00
for ( ; ( BIT_reloadDStream ( & ( seqState . DStream ) ) < = BIT_DStream_completed ) & & nbSeq ; ) {
nbSeq - - ;
{ seq_t const sequence = ZSTD_decodeSequence ( & seqState , isLongOffset ) ;
size_t const oneSeqSize = ZSTD_execSequence ( op , oend , sequence , & litPtr , litEnd , prefixStart , vBase , dictEnd ) ;
DEBUGLOG ( 6 , " regenerated sequence size : %u " , ( U32 ) oneSeqSize ) ;
if ( ZSTD_isError ( oneSeqSize ) ) return oneSeqSize ;
op + = oneSeqSize ;
} }
/* check if reached exact end */
DEBUGLOG ( 5 , " ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i " , nbSeq ) ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( nbSeq , corruption_detected ) ;
2019-04-23 21:07:36 +00:00
RETURN_ERROR_IF ( BIT_reloadDStream ( & seqState . DStream ) < BIT_DStream_completed , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
/* save reps for next block */
{ U32 i ; for ( i = 0 ; i < ZSTD_REP_NUM ; i + + ) dctx - > entropy . rep [ i ] = ( U32 ) ( seqState . prevOffset [ i ] ) ; }
}
/* last literal segment */
{ size_t const lastLLSize = litEnd - litPtr ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( lastLLSize > ( size_t ) ( oend - op ) , dstSize_tooSmall ) ;
2018-10-25 23:28:41 +00:00
memcpy ( op , litPtr , lastLLSize ) ;
op + = lastLLSize ;
}
return op - ostart ;
}
static size_t
ZSTD_decompressSequences_default ( ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
{
return ZSTD_decompressSequences_body ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
2018-12-20 20:20:34 +00:00
# endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
2018-10-25 23:28:41 +00:00
2018-12-20 20:15:07 +00:00
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
2018-10-25 23:28:41 +00:00
FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequenceLong ( seqState_t * seqState , ZSTD_longOffset_e const longOffsets )
{
seq_t seq ;
U32 const llBits = seqState - > stateLL . table [ seqState - > stateLL . state ] . nbAdditionalBits ;
U32 const mlBits = seqState - > stateML . table [ seqState - > stateML . state ] . nbAdditionalBits ;
U32 const ofBits = seqState - > stateOffb . table [ seqState - > stateOffb . state ] . nbAdditionalBits ;
U32 const totalBits = llBits + mlBits + ofBits ;
U32 const llBase = seqState - > stateLL . table [ seqState - > stateLL . state ] . baseValue ;
U32 const mlBase = seqState - > stateML . table [ seqState - > stateML . state ] . baseValue ;
U32 const ofBase = seqState - > stateOffb . table [ seqState - > stateOffb . state ] . baseValue ;
/* sequence */
{ size_t offset ;
if ( ! ofBits )
offset = 0 ;
else {
ZSTD_STATIC_ASSERT ( ZSTD_lo_isLongOffset = = 1 ) ;
ZSTD_STATIC_ASSERT ( LONG_OFFSETS_MAX_EXTRA_BITS_32 = = 5 ) ;
assert ( ofBits < = MaxOff ) ;
if ( MEM_32bits ( ) & & longOffsets ) {
U32 const extraBits = ofBits - MIN ( ofBits , STREAM_ACCUMULATOR_MIN_32 - 1 ) ;
offset = ofBase + ( BIT_readBitsFast ( & seqState - > DStream , ofBits - extraBits ) < < extraBits ) ;
if ( MEM_32bits ( ) | | extraBits ) BIT_reloadDStream ( & seqState - > DStream ) ;
if ( extraBits ) offset + = BIT_readBitsFast ( & seqState - > DStream , extraBits ) ;
} else {
offset = ofBase + BIT_readBitsFast ( & seqState - > DStream , ofBits ) ; /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
if ( MEM_32bits ( ) ) BIT_reloadDStream ( & seqState - > DStream ) ;
}
}
if ( ofBits < = 1 ) {
offset + = ( llBase = = 0 ) ;
if ( offset ) {
size_t temp = ( offset = = 3 ) ? seqState - > prevOffset [ 0 ] - 1 : seqState - > prevOffset [ offset ] ;
temp + = ! temp ; /* 0 is not valid; input is corrupted; force offset to 1 */
if ( offset ! = 1 ) seqState - > prevOffset [ 2 ] = seqState - > prevOffset [ 1 ] ;
seqState - > prevOffset [ 1 ] = seqState - > prevOffset [ 0 ] ;
seqState - > prevOffset [ 0 ] = offset = temp ;
} else {
offset = seqState - > prevOffset [ 0 ] ;
}
} else {
seqState - > prevOffset [ 2 ] = seqState - > prevOffset [ 1 ] ;
seqState - > prevOffset [ 1 ] = seqState - > prevOffset [ 0 ] ;
seqState - > prevOffset [ 0 ] = offset ;
}
seq . offset = offset ;
}
seq . matchLength = mlBase + ( ( mlBits > 0 ) ? BIT_readBitsFast ( & seqState - > DStream , mlBits ) : 0 ) ; /* <= 16 bits */
if ( MEM_32bits ( ) & & ( mlBits + llBits > = STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 ) )
BIT_reloadDStream ( & seqState - > DStream ) ;
if ( MEM_64bits ( ) & & ( totalBits > = STREAM_ACCUMULATOR_MIN_64 - ( LLFSELog + MLFSELog + OffFSELog ) ) )
BIT_reloadDStream ( & seqState - > DStream ) ;
/* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
ZSTD_STATIC_ASSERT ( 16 + LLFSELog + MLFSELog + OffFSELog < STREAM_ACCUMULATOR_MIN_64 ) ;
seq . litLength = llBase + ( ( llBits > 0 ) ? BIT_readBitsFast ( & seqState - > DStream , llBits ) : 0 ) ; /* <= 16 bits */
if ( MEM_32bits ( ) )
BIT_reloadDStream ( & seqState - > DStream ) ;
{ size_t const pos = seqState - > pos + seq . litLength ;
const BYTE * const matchBase = ( seq . offset > pos ) ? seqState - > dictEnd : seqState - > prefixStart ;
seq . match = matchBase + pos - seq . offset ; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
* No consequence though : no memory access will occur , overly large offset will be detected in ZSTD_execSequenceLong ( ) */
seqState - > pos = pos + seq . matchLength ;
}
/* ANS state update */
ZSTD_updateFseState ( & seqState - > stateLL , & seqState - > DStream ) ; /* <= 9 bits */
ZSTD_updateFseState ( & seqState - > stateML , & seqState - > DStream ) ; /* <= 9 bits */
if ( MEM_32bits ( ) ) BIT_reloadDStream ( & seqState - > DStream ) ; /* <= 18 bits */
ZSTD_updateFseState ( & seqState - > stateOffb , & seqState - > DStream ) ; /* <= 8 bits */
return seq ;
}
FORCE_INLINE_TEMPLATE size_t
ZSTD_decompressSequencesLong_body (
ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
{
const BYTE * ip = ( const BYTE * ) seqStart ;
const BYTE * const iend = ip + seqSize ;
BYTE * const ostart = ( BYTE * const ) dst ;
BYTE * const oend = ostart + maxDstSize ;
BYTE * op = ostart ;
const BYTE * litPtr = dctx - > litPtr ;
const BYTE * const litEnd = litPtr + dctx - > litSize ;
const BYTE * const prefixStart = ( const BYTE * ) ( dctx - > prefixStart ) ;
const BYTE * const dictStart = ( const BYTE * ) ( dctx - > virtualStart ) ;
const BYTE * const dictEnd = ( const BYTE * ) ( dctx - > dictEnd ) ;
/* Regen sequences */
if ( nbSeq ) {
# define STORED_SEQS 4
2018-11-08 20:36:39 +00:00
# define STORED_SEQS_MASK (STORED_SEQS-1)
2018-10-25 23:28:41 +00:00
# define ADVANCED_SEQS 4
seq_t sequences [ STORED_SEQS ] ;
int const seqAdvance = MIN ( nbSeq , ADVANCED_SEQS ) ;
seqState_t seqState ;
int seqNb ;
dctx - > fseEntropy = 1 ;
2018-11-08 20:36:39 +00:00
{ int i ; for ( i = 0 ; i < ZSTD_REP_NUM ; i + + ) seqState . prevOffset [ i ] = dctx - > entropy . rep [ i ] ; }
2018-10-25 23:28:41 +00:00
seqState . prefixStart = prefixStart ;
seqState . pos = ( size_t ) ( op - prefixStart ) ;
seqState . dictEnd = dictEnd ;
2018-11-08 20:57:34 +00:00
assert ( iend > = ip ) ;
2019-01-28 22:27:29 +00:00
RETURN_ERROR_IF (
ERR_isError ( BIT_initDStream ( & seqState . DStream , ip , iend - ip ) ) ,
corruption_detected ) ;
2018-10-25 23:28:41 +00:00
ZSTD_initFseState ( & seqState . stateLL , & seqState . DStream , dctx - > LLTptr ) ;
ZSTD_initFseState ( & seqState . stateOffb , & seqState . DStream , dctx - > OFTptr ) ;
ZSTD_initFseState ( & seqState . stateML , & seqState . DStream , dctx - > MLTptr ) ;
/* prepare in advance */
for ( seqNb = 0 ; ( BIT_reloadDStream ( & seqState . DStream ) < = BIT_DStream_completed ) & & ( seqNb < seqAdvance ) ; seqNb + + ) {
sequences [ seqNb ] = ZSTD_decodeSequenceLong ( & seqState , isLongOffset ) ;
2018-11-09 01:00:23 +00:00
PREFETCH_L1 ( sequences [ seqNb ] . match ) ; PREFETCH_L1 ( sequences [ seqNb ] . match + sequences [ seqNb ] . matchLength - 1 ) ; /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
2018-10-25 23:28:41 +00:00
}
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( seqNb < seqAdvance , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
/* decode and decompress */
for ( ; ( BIT_reloadDStream ( & ( seqState . DStream ) ) < = BIT_DStream_completed ) & & ( seqNb < nbSeq ) ; seqNb + + ) {
seq_t const sequence = ZSTD_decodeSequenceLong ( & seqState , isLongOffset ) ;
2018-11-08 20:36:39 +00:00
size_t const oneSeqSize = ZSTD_execSequenceLong ( op , oend , sequences [ ( seqNb - ADVANCED_SEQS ) & STORED_SEQS_MASK ] , & litPtr , litEnd , prefixStart , dictStart , dictEnd ) ;
2018-10-25 23:28:41 +00:00
if ( ZSTD_isError ( oneSeqSize ) ) return oneSeqSize ;
2018-11-08 20:47:46 +00:00
PREFETCH_L1 ( sequence . match ) ; PREFETCH_L1 ( sequence . match + sequence . matchLength - 1 ) ; /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
2018-11-08 20:36:39 +00:00
sequences [ seqNb & STORED_SEQS_MASK ] = sequence ;
2018-10-25 23:28:41 +00:00
op + = oneSeqSize ;
}
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( seqNb < nbSeq , corruption_detected ) ;
2018-10-25 23:28:41 +00:00
/* finish queue */
seqNb - = seqAdvance ;
for ( ; seqNb < nbSeq ; seqNb + + ) {
2018-11-08 20:36:39 +00:00
size_t const oneSeqSize = ZSTD_execSequenceLong ( op , oend , sequences [ seqNb & STORED_SEQS_MASK ] , & litPtr , litEnd , prefixStart , dictStart , dictEnd ) ;
2018-10-25 23:28:41 +00:00
if ( ZSTD_isError ( oneSeqSize ) ) return oneSeqSize ;
op + = oneSeqSize ;
}
/* save reps for next block */
{ U32 i ; for ( i = 0 ; i < ZSTD_REP_NUM ; i + + ) dctx - > entropy . rep [ i ] = ( U32 ) ( seqState . prevOffset [ i ] ) ; }
}
/* last literal segment */
{ size_t const lastLLSize = litEnd - litPtr ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( lastLLSize > ( size_t ) ( oend - op ) , dstSize_tooSmall ) ;
2018-10-25 23:28:41 +00:00
memcpy ( op , litPtr , lastLLSize ) ;
op + = lastLLSize ;
}
return op - ostart ;
}
static size_t
ZSTD_decompressSequencesLong_default ( ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
{
return ZSTD_decompressSequencesLong_body ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
2018-12-20 20:15:07 +00:00
# endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
2018-10-25 23:28:41 +00:00
# if DYNAMIC_BMI2
2018-12-20 20:20:34 +00:00
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
2018-10-25 23:28:41 +00:00
static TARGET_ATTRIBUTE ( " bmi2 " ) size_t
perf improvements for zstd decode (#1668)
* perf improvements for zstd decode
tldr: 7.5% average decode speedup on silesia corpus at compression levels 1-3 (sandy bridge)
Background: while investigating zstd perf differences between clang and gcc I noticed that even though gcc is vectorizing the loop in in wildcopy, it was not being done as well as could be done by hand. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied. The loop trip count is rarely above 1, yet long copies are common enough to make their performance important.The code in zstd_decompress.c to invoke wildcopy handles the latter well but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied: (left hand side is the good one, right is with vectorizer on)
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it's turned off\
via attribute and flag. I found that neither attribute nor command-line flag were entirely successful in turning off vectorization, which is why there were both.
silesia benchmark data - second triad of each file is with the original code:
file orig compressedratio encode decode change
1#dickens 10192446-> 4268865(2.388), 198.9MB/s 709.6MB/s
2#dickens 10192446-> 3876126(2.630), 128.7MB/s 552.5MB/s
3#dickens 10192446-> 3682956(2.767), 104.6MB/s 537MB/s
1#dickens 10192446-> 4268865(2.388), 195.4MB/s 659.5MB/s 7.60%
2#dickens 10192446-> 3876126(2.630), 127MB/s 516.3MB/s 7.01%
3#dickens 10192446-> 3682956(2.767), 105MB/s 479.5MB/s 11.99%
1#mozilla 51220480-> 20117517(2.546), 285.4MB/s 734.9MB/s
2#mozilla 51220480-> 19067018(2.686), 220.8MB/s 686.3MB/s
3#mozilla 51220480-> 18508283(2.767), 152.2MB/s 669.4MB/s
1#mozilla 51220480-> 20117517(2.546), 283.4MB/s 697.9MB/s 5.30%
2#mozilla 51220480-> 19067018(2.686), 225.9MB/s 665MB/s 3.20%
3#mozilla 51220480-> 18508283(2.767), 154.5MB/s 640.6MB/s 4.50%
1#mr 9970564-> 3840242(2.596), 262.4MB/s 899.8MB/s
2#mr 9970564-> 3600976(2.769), 181.2MB/s 717.9MB/s
3#mr 9970564-> 3563987(2.798), 116.3MB/s 620MB/s
1#mr 9970564-> 3840242(2.596), 253.2MB/s 827.3MB/s 8.76%
2#mr 9970564-> 3600976(2.769), 177.4MB/s 655.4MB/s 9.54%
3#mr 9970564-> 3563987(2.798), 111.2MB/s 564.2MB/s 9.89%
1#nci 33553445-> 2849306(11.78), 575.2MB/s , 1335.8MB/s
2#nci 33553445-> 2890166(11.61), 509.3MB/s , 1238.1MB/s
3#nci 33553445-> 2857408(11.74), 431MB/s , 1210.7MB/s
1#nci 33553445-> 2849306(11.78), 565.4MB/s , 1220.2MB/s 9.47%
2#nci 33553445-> 2890166(11.61), 508.2MB/s , 1128.4MB/s 9.72%
3#nci 33553445-> 2857408(11.74), 429.1MB/s , 1097.7MB/s 10.29%
1#ooffice 6152192-> 3590954(1.713), 231.4MB/s , 662.6MB/s
2#ooffice 6152192-> 3323931(1.851), 162.8MB/s , 592.6MB/s
3#ooffice 6152192-> 3145625(1.956), 99.9MB/s , 549.6MB/s
1#ooffice 6152192-> 3590954(1.713), 224.7MB/s , 624.2MB/s 6.15%
2#ooffice 6152192-> 3323931 (1.851), 155MB/s , 564.5MB/s 4.98%
3#ooffice 6152192-> 3145625(1.956), 101.1MB/s , 521.2MB/s 5.45%
1#osdb 10085684-> 3739042(2.697), 271.9MB/s 876.4MB/s
2#osdb 10085684-> 3493875(2.887), 208.2MB/s 857MB/s
3#osdb 10085684-> 3515831(2.869), 135.3MB/s 805.4MB/s
1#osdb 10085684-> 3739042(2.697), 257.4MB/s 793.8MB/s 10.41%
2#osdb 10085684-> 3493875(2.887), 209.7MB/s 776.1MB/s 10.42%
3#osdb 10085684-> 3515831(2.869), 130.6MB/s 727.7MB/s 10.68%
1#reymont 6627202-> 2152771(3.078), 198.9MB/s 696.2MB/s
2#reymont 6627202-> 2071140(3.200), 170MB/s 595.2MB/s
3#reymont 6627202-> 1953597(3.392), 128.5MB/s 609.7MB/s
1#reymont 6627202-> 2152771(3.078), 199.6MB/s 655.2MB/s 6.26%
2#reymont 6627202-> 2071140(3.200), 168.2MB/s 554.4MB/s 7.36%
3#reymont 6627202-> 1953597(3.392), 128.7MB/s 557.4MB/s 9.38%
1#samba 21606400-> 5510994(3.921), 338.1MB/s 1066MB/s
2#samba 21606400-> 5240208(4.123), 258.7MB/s 992.3MB/s
3#samba 21606400-> 5003358(4.318), 200.2MB/s 991.1MB/s
1#samba 21606400-> 5510994(3.921), 330.8MB/s 974MB/s 9.45%
2#samba 21606400-> 5240208(4.123), 257.9MB/s 919.4MB/s 7.93%
3#samba 21606400-> 5003358(4.318), 198.5MB/s 908.9MB/s 9.04%
1#sao 7251944-> 6256401(1.159), 194.6MB/s 602.2MB/s
2#sao 7251944-> 5808761(1.248), 128.2MB/s 532.1MB/s
3#sao 7251944-> 5556318(1.305), 73MB/s 509.4MB/s
1#sao 7251944-> 6256401(1.159), 198.7MB/s 580.7MB/s 3.70%
2#sao 7251944-> 5808761(1.248), 129.1MB/s 502.7MB/s 5.85%
3#sao 7251944-> 5556318(1.305), 74.6MB/s 493.1MB/s 3.31%
1#webster 41458703-> 13692222(3.028), 222.3MB/s 752MB/s
2#webster 41458703-> 12842646(3.228), 157.6MB/s 532.2MB/s
3#webster 41458703-> 12191964(3.400), 124MB/s 468.5MB/s
1#webster 41458703-> 13692222(3.028), 219.7MB/s 697MB/s 7.89%
2#webster 41458703-> 12842646(3.228), 153.9MB/s 495.4MB/s 7.43%
3#webster 41458703-> 12191964(3.400), 124.8MB/s 444.8MB/s 5.33%
1#xml 5345280-> 696652(7.673), 485MB/s , 1333.9MB/s
2#xml 5345280-> 681492(7.843), 405.2MB/s , 1237.5MB/s
3#xml 5345280-> 639057(8.364), 328.5MB/s , 1281.3MB/s
1#xml 5345280-> 696652(7.673), 473.1MB/s , 1232.4MB/s 8.24%
2#xml 5345280-> 681492(7.843), 398.6MB/s , 1145.9MB/s 7.99%
3#xml 5345280-> 639057(8.364), 327.1MB/s , 1175MB/s 9.05%
1#x-ray 8474240-> 6772557(1.251), 521.3MB/s 762.6MB/s
2#x-ray 8474240-> 6684531(1.268), 230.5MB/s 688.5MB/s
3#x-ray 8474240-> 6166679(1.374), 68.7MB/s 478.8MB/s
1#x-ray 8474240-> 6772557(1.251), 502.8MB/s 736.7MB/s 3.52%
2#x-ray 8474240-> 6684531(1.268), 224.4MB/s 662MB/s 4.00%
3#x-ray 8474240-> 6166679(1.374), 67.3MB/s 437.8MB/s 9.37%
7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* <Replace this line with a title. Use 1 line only, 67 chars or less>
Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
2019-07-11 22:31:07 +00:00
DONT_VECTORIZE
2018-10-25 23:28:41 +00:00
ZSTD_decompressSequences_bmi2 ( ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
{
return ZSTD_decompressSequences_body ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
2018-12-20 20:20:34 +00:00
# endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
2018-10-25 23:28:41 +00:00
2018-12-20 20:15:07 +00:00
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
2018-10-25 23:28:41 +00:00
static TARGET_ATTRIBUTE ( " bmi2 " ) size_t
ZSTD_decompressSequencesLong_bmi2 ( ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
{
return ZSTD_decompressSequencesLong_body ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
2018-12-20 20:15:07 +00:00
# endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
2018-10-25 23:28:41 +00:00
2018-12-20 20:15:07 +00:00
# endif /* DYNAMIC_BMI2 */
2018-10-25 23:28:41 +00:00
typedef size_t ( * ZSTD_decompressSequences_t ) (
2018-11-08 20:36:39 +00:00
ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset ) ;
2018-10-25 23:28:41 +00:00
2018-12-20 20:20:34 +00:00
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
2018-10-25 23:28:41 +00:00
static size_t
ZSTD_decompressSequences ( ZSTD_DCtx * dctx , void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
{
DEBUGLOG ( 5 , " ZSTD_decompressSequences " ) ;
# if DYNAMIC_BMI2
if ( dctx - > bmi2 ) {
return ZSTD_decompressSequences_bmi2 ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
# endif
return ZSTD_decompressSequences_default ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
2018-12-20 20:20:34 +00:00
# endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
2018-10-25 23:28:41 +00:00
2018-11-08 20:36:39 +00:00
2018-12-20 20:15:07 +00:00
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
2018-11-08 20:36:39 +00:00
/* ZSTD_decompressSequencesLong() :
* decompression function triggered when a minimum share of offsets is considered " long " ,
* aka out of cache .
2019-04-12 18:18:11 +00:00
* note : " long " definition seems overloaded here , sometimes meaning " wider than bitstream register " , and sometimes meaning " farther than memory cache distance " .
2018-11-08 20:36:39 +00:00
* This function will try to mitigate main memory latency through the use of prefetching */
static size_t
ZSTD_decompressSequencesLong ( ZSTD_DCtx * dctx ,
void * dst , size_t maxDstSize ,
const void * seqStart , size_t seqSize , int nbSeq ,
const ZSTD_longOffset_e isLongOffset )
2018-10-25 23:28:41 +00:00
{
DEBUGLOG ( 5 , " ZSTD_decompressSequencesLong " ) ;
# if DYNAMIC_BMI2
if ( dctx - > bmi2 ) {
return ZSTD_decompressSequencesLong_bmi2 ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
# endif
return ZSTD_decompressSequencesLong_default ( dctx , dst , maxDstSize , seqStart , seqSize , nbSeq , isLongOffset ) ;
}
2018-12-20 20:15:07 +00:00
# endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
2018-10-25 23:28:41 +00:00
2018-11-08 20:36:39 +00:00
2018-12-20 20:15:07 +00:00
# if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
! defined ( ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG )
2018-10-25 23:28:41 +00:00
/* ZSTD_getLongOffsetsShare() :
* condition : offTable must be valid
* @ return : " share " of long offsets ( arbitrarily defined as > ( 1 < < 23 ) )
* compared to maximum possible of ( 1 < < OffFSELog ) */
static unsigned
ZSTD_getLongOffsetsShare ( const ZSTD_seqSymbol * offTable )
{
const void * ptr = offTable ;
U32 const tableLog = ( ( const ZSTD_seqSymbol_header * ) ptr ) [ 0 ] . tableLog ;
const ZSTD_seqSymbol * table = offTable + 1 ;
U32 const max = 1 < < tableLog ;
U32 u , total = 0 ;
DEBUGLOG ( 5 , " ZSTD_getLongOffsetsShare: (tableLog=%u) " , tableLog ) ;
assert ( max < = ( 1 < < OffFSELog ) ) ; /* max not too large */
for ( u = 0 ; u < max ; u + + ) {
if ( table [ u ] . nbAdditionalBits > 22 ) total + = 1 ;
}
assert ( tableLog < = OffFSELog ) ;
total < < = ( OffFSELog - tableLog ) ; /* scale to OffFSELog */
return total ;
}
2018-12-20 20:15:07 +00:00
# endif
2018-10-25 23:28:41 +00:00
size_t
ZSTD_decompressBlock_internal ( ZSTD_DCtx * dctx ,
void * dst , size_t dstCapacity ,
const void * src , size_t srcSize , const int frame )
{ /* blockType == blockCompressed */
const BYTE * ip = ( const BYTE * ) src ;
/* isLongOffset must be true if there are long offsets.
* Offsets are long if they are larger than 2 ^ STREAM_ACCUMULATOR_MIN .
* We don ' t expect that to be the case in 64 - bit mode .
* In block mode , window size is not known , so we have to be conservative .
* ( note : but it could be evaluated from current - lowLimit )
*/
2018-11-08 20:36:39 +00:00
ZSTD_longOffset_e const isLongOffset = ( ZSTD_longOffset_e ) ( MEM_32bits ( ) & & ( ! frame | | ( dctx - > fParams . windowSize > ( 1ULL < < STREAM_ACCUMULATOR_MIN ) ) ) ) ;
2018-10-25 23:28:41 +00:00
DEBUGLOG ( 5 , " ZSTD_decompressBlock_internal (size : %u) " , ( U32 ) srcSize ) ;
2018-12-06 01:17:11 +00:00
RETURN_ERROR_IF ( srcSize > = ZSTD_BLOCKSIZE_MAX , srcSize_wrong ) ;
2018-10-25 23:28:41 +00:00
/* Decode literals section */
{ size_t const litCSize = ZSTD_decodeLiteralsBlock ( dctx , src , srcSize ) ;
DEBUGLOG ( 5 , " ZSTD_decodeLiteralsBlock : %u " , ( U32 ) litCSize ) ;
if ( ZSTD_isError ( litCSize ) ) return litCSize ;
ip + = litCSize ;
srcSize - = litCSize ;
}
/* Build Decoding Tables */
2018-11-16 23:02:11 +00:00
{
2018-12-04 01:36:24 +00:00
/* These macros control at build-time which decompressor implementation
* we use . If neither is defined , we do some inspection and dispatch at
* runtime .
*/
# if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
! defined ( ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG )
2018-11-16 23:02:11 +00:00
int usePrefetchDecoder = dctx - > ddictIsCold ;
# endif
2018-11-09 01:00:23 +00:00
int nbSeq ;
2018-10-25 23:28:41 +00:00
size_t const seqHSize = ZSTD_decodeSeqHeaders ( dctx , & nbSeq , ip , srcSize ) ;
if ( ZSTD_isError ( seqHSize ) ) return seqHSize ;
ip + = seqHSize ;
srcSize - = seqHSize ;
2018-12-04 01:36:24 +00:00
# if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
! defined ( ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG )
2018-11-09 01:00:23 +00:00
if ( ! usePrefetchDecoder
& & ( ! frame | | ( dctx - > fParams . windowSize > ( 1 < < 24 ) ) )
& & ( nbSeq > ADVANCED_SEQS ) ) { /* could probably use a larger nbSeq limit */
2018-10-25 23:28:41 +00:00
U32 const shareLongOffsets = ZSTD_getLongOffsetsShare ( dctx - > OFTptr ) ;
U32 const minShare = MEM_64bits ( ) ? 7 : 20 ; /* heuristic values, correspond to 2.73% and 7.81% */
2018-11-09 01:00:23 +00:00
usePrefetchDecoder = ( shareLongOffsets > = minShare ) ;
2018-10-25 23:28:41 +00:00
}
2018-11-16 23:02:11 +00:00
# endif
2018-10-25 23:28:41 +00:00
2018-11-09 01:00:23 +00:00
dctx - > ddictIsCold = 0 ;
2018-12-04 01:36:24 +00:00
# if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
! defined ( ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG )
2018-11-09 01:00:23 +00:00
if ( usePrefetchDecoder )
2018-12-04 01:36:24 +00:00
# endif
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
2018-11-09 01:00:23 +00:00
return ZSTD_decompressSequencesLong ( dctx , dst , dstCapacity , ip , srcSize , nbSeq , isLongOffset ) ;
2018-11-16 23:02:11 +00:00
# endif
2018-11-09 01:00:23 +00:00
2018-12-04 01:36:24 +00:00
# ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
2018-11-09 01:00:23 +00:00
/* else */
2018-10-25 23:28:41 +00:00
return ZSTD_decompressSequences ( dctx , dst , dstCapacity , ip , srcSize , nbSeq , isLongOffset ) ;
2018-12-04 01:36:24 +00:00
# endif
2018-10-25 23:28:41 +00:00
}
}
size_t ZSTD_decompressBlock ( ZSTD_DCtx * dctx ,
void * dst , size_t dstCapacity ,
const void * src , size_t srcSize )
{
size_t dSize ;
ZSTD_checkContinuity ( dctx , dst ) ;
dSize = ZSTD_decompressBlock_internal ( dctx , dst , dstCapacity , src , srcSize , /* frame */ 0 ) ;
dctx - > previousDstEnd = ( char * ) dst + dSize ;
return dSize ;
}