Optimize decompression and fix wildcopy overread

* Bump `WILDCOPY_OVERLENGTH` to 16 to fix the wildcopy overread.
* Optimize `ZSTD_wildcopy()` by removing unnecessary branches and
  unrolling the loop.
* Extract `ZSTD_overlapCopy8()` into its own function.
* Add `ZSTD_safecopy()` for `ZSTD_execSequenceEnd()`. It is
  optimized for single long sequences, since that is the important
  case that can end up in `ZSTD_execSequenceEnd()`. Without this
  optimization, decompressing a block with 1 long match goes
  from 5.7 GB/s to 800 MB/s.
* Refactor `ZSTD_execSequenceEnd()`.
* Increase the literal copy shortcut to 16.
* Add a shortcut for offset >= 16.
* Simplify `ZSTD_execSequence()` by pushing more cases into
  `ZSTD_execSequenceEnd()`.
* Delete `ZSTD_execSequenceLong()` since it is exactly the
  same as `ZSTD_execSequence()`.

clang-8 seeds +17.5% on silesia and +21.8% on enwik8.
gcc-9 sees +12% on silesia and +15.5% on enwik8.

TODO: More detailed measurements, and on more datasets.

Crdit to OSS-Fuzz for finding the wildcopy overread.
This commit is contained in:
Nick Terrell 2019-09-19 13:25:03 -07:00 committed by Nick Terrell
parent 0e76000dee
commit efd37a64ea
3 changed files with 179 additions and 205 deletions

View File

@ -197,8 +197,8 @@ static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); } static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; } #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
#define WILDCOPY_OVERLENGTH 8 #define WILDCOPY_OVERLENGTH 16
#define VECLEN 16 #define WILDCOPY_VECLEN 16
typedef enum { typedef enum {
ZSTD_no_overlap, ZSTD_no_overlap,
@ -207,83 +207,58 @@ typedef enum {
} ZSTD_overlap_e; } ZSTD_overlap_e;
/*! ZSTD_wildcopy() : /*! ZSTD_wildcopy() :
* custom version of memcpy(), can overwrite up to WILDCOPY_OVERLENGTH bytes (if length==0) */ * Custom version of memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0)
* @param ovtype controls the overlap detection
* - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
* - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
* The src buffer must be before the dst buffer.
*/
MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE
void ZSTD_wildcopy(void* dst, const void* src, BYTE* oend_g, ptrdiff_t length, ZSTD_overlap_e ovtype) void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype)
{ {
ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src; ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
const BYTE* ip = (const BYTE*)src; const BYTE* ip = (const BYTE*)src;
BYTE* op = (BYTE*)dst; BYTE* op = (BYTE*)dst;
BYTE* const oend = op + length; BYTE* const oend = op + length;
assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8)); assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN));
if (length < VECLEN || (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN)) { if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
do /* Handle short offset copies. */
do {
COPY8(op, ip) COPY8(op, ip)
while (op < oend); } while (op < oend);
} } else {
else { assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
if (oend < oend_g-16) { /* Separate out the first two COPY16() calls because the copy length is
/* common case */ * almost certain to be short, so the branches have different
* probabilities.
* On gcc-9 unrolling once is +1.6%, twice is +2%, thrice is +1.8%.
* On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%.
*/
COPY16(op, ip);
if (op >= oend) return;
COPY16(op, ip);
if (op >= oend) return;
do { do {
COPY16(op, ip); COPY16(op, ip);
} }
while (op < oend); while (op < oend);
} }
else {
do {
COPY8(op, ip);
}
while (op < oend);
}
}
} }
/*! ZSTD_wildcopy_16min() : /*! ZSTD_wildcopy8() :
* same semantics as ZSTD_wildcopy() except guaranteed to be able to copy 16 bytes at the start */ * The same as ZSTD_wildcopy(), but it can only overwrite 8 bytes, and works for
MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE * overlapping buffers that are at least 8 bytes apart.
void ZSTD_wildcopy_16min(void* dst, const void* src, BYTE* oend_g, ptrdiff_t length, ZSTD_overlap_e ovtype) */
{ MEM_STATIC void ZSTD_wildcopy8(void* dst, const void* src, ptrdiff_t length)
ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
const BYTE* ip = (const BYTE*)src;
BYTE* op = (BYTE*)dst;
BYTE* const oend = op + length;
assert(length >= 8);
assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8));
if (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN) {
do {
COPY8(op, ip);
}
while (op < oend);
}
else {
if (oend < oend_g-16) {
/* common case */
do {
COPY16(op, ip);
}
while (op < oend);
}
else {
do {
COPY8(op, ip);
}
while (op < oend);
}
}
}
MEM_STATIC void ZSTD_wildcopy_e(void* dst, const void* src, void* dstEnd) /* should be faster for decoding, but strangely, not verified on all platform */
{ {
const BYTE* ip = (const BYTE*)src; const BYTE* ip = (const BYTE*)src;
BYTE* op = (BYTE*)dst; BYTE* op = (BYTE*)dst;
BYTE* const oend = (BYTE*)dstEnd; BYTE* const oend = (BYTE*)op + length;
do do {
COPY8(op, ip) COPY8(op, ip)
while (op < oend); } while (op < oend);
} }

View File

@ -359,7 +359,10 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const v
/* copy Literals */ /* copy Literals */
assert(seqStorePtr->maxNbLit <= 128 KB); assert(seqStorePtr->maxNbLit <= 128 KB);
assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit); assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
ZSTD_wildcopy(seqStorePtr->lit, literals, seqStorePtr->lit + litLength + 8, (ptrdiff_t)litLength, ZSTD_no_overlap); /* We are guaranteed at least 8 bytes of literals space because of HASH_READ_SIZE and
* MINMATCH.
*/
ZSTD_wildcopy8(seqStorePtr->lit, literals, (ptrdiff_t)litLength);
seqStorePtr->lit += litLength; seqStorePtr->lit += litLength;
/* literal Length */ /* literal Length */

View File

@ -573,38 +573,118 @@ typedef struct {
size_t pos; size_t pos;
} seqState_t; } seqState_t;
/*! ZSTD_overlapCopy8() :
* Copies 8 bytes from ip to op and updates op and ip where ip <= op.
* If the offset is < 8 then the offset is spread to at least 8 bytes.
*
* Precondition: *ip <= *op
* Postcondition: *op - *op >= 8
*/
static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
assert(*ip <= *op);
if (offset < 8) {
/* close range match, overlap */
static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
int const sub2 = dec64table[offset];
(*op)[0] = (*ip)[0];
(*op)[1] = (*ip)[1];
(*op)[2] = (*ip)[2];
(*op)[3] = (*ip)[3];
*ip += dec32table[offset];
ZSTD_copy4(*op+4, *ip);
*ip -= sub2;
} else {
ZSTD_copy8(*op, *ip);
}
*ip += 8;
*op += 8;
assert(*op - *ip >= 8);
}
/* ZSTD_execSequenceLast7(): /*! ZSTD_safecopy() :
* exceptional case : decompress a match starting within last 7 bytes of output buffer. * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
* requires more careful checks, to ensure there is no overflow. * and write up to 16 bytes past oend_w (op >= oend_w is allowed).
* performance does not matter though. * This function is only called in the uncommon case where the sequence is near the end of the block. It
* note : this case is supposed to be never generated "naturally" by reference encoder, * should be fast for a single long sequence, but can be slow for several short sequences.
* since in most cases it needs at least 8 bytes to look for a match. *
* but it's allowed by the specification. */ * @param ovtype controls the overlap detection
* - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
* - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
* The src buffer must be before the dst buffer.
*/
static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
ptrdiff_t const diff = op - ip;
BYTE* const oend = op + length;
assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8)) ||
(ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
if (length < 8) {
/* Handle short lengths. */
while (op < oend) *op++ = *ip++;
return;
}
if (ovtype == ZSTD_overlap_src_before_dst) {
/* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
assert(length >= 8);
ZSTD_overlapCopy8(&op, &ip, diff);
assert(op - ip >= 8);
assert(op <= oend);
}
if (oend <= oend_w) {
/* No risk of overwrite. */
ZSTD_wildcopy(op, ip, length, ovtype);
return;
}
if (op <= oend_w) {
/* Wildcopy until we get close to the end. */
assert(oend > oend_w);
ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
ip += oend_w - op;
op = oend_w;
}
/* Handle the leftovers. */
while (op < oend) *op++ = *ip++;
}
/* ZSTD_execSequenceEnd():
* This version handles cases that are near the end of the output buffer. It requires
* more careful checks to make sure there is no overflow. By separating out these hard
* and unlikely cases, we can speed up the common cases.
*
* NOTE: This function needs to be fast for a single long sequence, but doesn't need
* to be optimized for many small sequences, since those fall into ZSTD_execSequence().
*/
FORCE_NOINLINE FORCE_NOINLINE
size_t ZSTD_execSequenceLast7(BYTE* op, size_t ZSTD_execSequenceEnd(BYTE* op,
BYTE* const oend, seq_t sequence, BYTE* const oend, seq_t sequence,
const BYTE** litPtr, const BYTE* const litLimit, const BYTE** litPtr, const BYTE* const litLimit,
const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{ {
BYTE* const oLitEnd = op + sequence.litLength; BYTE* const oLitEnd = op + sequence.litLength;
size_t const sequenceLength = sequence.litLength + sequence.matchLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength;
BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength;
const BYTE* match = oLitEnd - sequence.offset; const BYTE* match = oLitEnd - sequence.offset;
BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
/* check */ /* bounds checks */
assert(oLitEnd < oMatchEnd);
RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must fit within dstBuffer"); RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must fit within dstBuffer");
RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer"); RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer");
/* copy literals */ /* copy literals */
while (op < oLitEnd) *op++ = *(*litPtr)++; ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
op = oLitEnd;
*litPtr = iLitEnd;
/* copy Match */ /* copy Match */
if (sequence.offset > (size_t)(oLitEnd - base)) { if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
/* offset beyond prefix */ /* offset beyond prefix */
RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - vBase),corruption_detected); RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
match = dictEnd - (base-match); match = dictEnd - (prefixStart-match);
if (match + sequence.matchLength <= dictEnd) { if (match + sequence.matchLength <= dictEnd) {
memmove(oLitEnd, match, sequence.matchLength); memmove(oLitEnd, match, sequence.matchLength);
return sequenceLength; return sequenceLength;
@ -614,13 +694,12 @@ size_t ZSTD_execSequenceLast7(BYTE* op,
memmove(oLitEnd, match, length1); memmove(oLitEnd, match, length1);
op = oLitEnd + length1; op = oLitEnd + length1;
sequence.matchLength -= length1; sequence.matchLength -= length1;
match = base; match = prefixStart;
} } } }
while (op < oMatchEnd) *op++ = *match++; ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
return sequenceLength; return sequenceLength;
} }
HINT_INLINE HINT_INLINE
size_t ZSTD_execSequence(BYTE* op, size_t ZSTD_execSequence(BYTE* op,
BYTE* const oend, seq_t sequence, BYTE* const oend, seq_t sequence,
@ -634,20 +713,27 @@ size_t ZSTD_execSequence(BYTE* op,
const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength;
const BYTE* match = oLitEnd - sequence.offset; const BYTE* match = oLitEnd - sequence.offset;
/* check */ /* Errors and uncommon cases handled here. */
RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend"); assert(oLitEnd < oMatchEnd);
RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer"); if (iLitEnd > litLimit || oMatchEnd > oend_w)
if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
/* copy Literals */ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
if (sequence.litLength > 8) assert(iLitEnd <= litLimit /* Literal length is in bounds */);
ZSTD_wildcopy_16min(op, (*litPtr), oend, sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
/* Copy Literals:
* Split out litLength <= 16 since it is nearly always true. +1% on gcc-9.
*/
if (sequence.litLength <= 16)
ZSTD_copy16(op, *litPtr);
else else
ZSTD_copy8(op, *litPtr); ZSTD_wildcopy(op, (*litPtr), sequence.litLength, ZSTD_no_overlap);
op = oLitEnd; op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */ *litPtr = iLitEnd; /* update for next sequence */
/* copy Match */ /* Copy Match */
if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
/* offset beyond prefix -> go into extDict */ /* offset beyond prefix -> go into extDict */
RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected); RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
@ -662,123 +748,33 @@ size_t ZSTD_execSequence(BYTE* op,
op = oLitEnd + length1; op = oLitEnd + length1;
sequence.matchLength -= length1; sequence.matchLength -= length1;
match = prefixStart; match = prefixStart;
if (op > oend_w || sequence.matchLength < MINMATCH) {
U32 i;
for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
return sequenceLength;
}
} } } }
/* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */ /* Match within prefix of 1 or more bytes */
assert(op <= oMatchEnd);
assert(oMatchEnd <= oend_w);
assert(match >= prefixStart);
assert(sequence.matchLength >= 1);
/* match within prefix */ /* Nearly all offsets are >= 16 bytes, which means we can use wildcopy
if (sequence.offset < 8) { * without overlap checking.
/* close range match, overlap */ */
static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ if (sequence.offset >= 16) {
static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ /* Split out matchLength <= 16 since it is nearly always true. +1% on gcc-9. */
int const sub2 = dec64table[sequence.offset]; if (sequence.matchLength <= 16)
op[0] = match[0]; ZSTD_copy16(op, match);
op[1] = match[1];
op[2] = match[2];
op[3] = match[3];
match += dec32table[sequence.offset];
ZSTD_copy4(op+4, match);
match -= sub2;
} else {
ZSTD_copy8(op, match);
}
op += 8; match += 8;
if (oMatchEnd > oend-(16-MINMATCH)) {
if (op < oend_w) {
ZSTD_wildcopy(op, match, oend, oend_w - op, ZSTD_overlap_src_before_dst);
match += oend_w - op;
op = oend_w;
}
while (op < oMatchEnd) *op++ = *match++;
} else {
ZSTD_wildcopy(op, match, oend, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
}
return sequenceLength;
}
HINT_INLINE
size_t ZSTD_execSequenceLong(BYTE* op,
BYTE* const oend, seq_t sequence,
const BYTE** litPtr, const BYTE* const litLimit,
const BYTE* const prefixStart, const BYTE* const dictStart, const BYTE* const dictEnd)
{
BYTE* const oLitEnd = op + sequence.litLength;
size_t const sequenceLength = sequence.litLength + sequence.matchLength;
BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
const BYTE* const iLitEnd = *litPtr + sequence.litLength;
const BYTE* match = sequence.match;
/* check */
RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);
/* copy Literals */
if (sequence.litLength > 8)
ZSTD_wildcopy_16min(op, *litPtr, oend, sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
else else
ZSTD_copy8(op, *litPtr); /* note : op <= oLitEnd <= oend_w == oend - 8 */ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */
/* copy Match */
if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
/* offset beyond prefix */
RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - dictStart), corruption_detected);
if (match + sequence.matchLength <= dictEnd) {
memmove(oLitEnd, match, sequence.matchLength);
return sequenceLength; return sequenceLength;
} }
/* span extDict & currentPrefixSegment */ assert(sequence.offset < 16);
{ size_t const length1 = dictEnd - match;
memmove(oLitEnd, match, length1);
op = oLitEnd + length1;
sequence.matchLength -= length1;
match = prefixStart;
if (op > oend_w || sequence.matchLength < MINMATCH) {
U32 i;
for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
return sequenceLength;
}
} }
assert(op <= oend_w);
assert(sequence.matchLength >= MINMATCH);
/* match within prefix */ /* Copy 8 bytes and spread the offset to be >= 8. */
if (sequence.offset < 8) { ZSTD_overlapCopy8(&op, &match, sequence.offset);
/* close range match, overlap */
static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
int const sub2 = dec64table[sequence.offset];
op[0] = match[0];
op[1] = match[1];
op[2] = match[2];
op[3] = match[3];
match += dec32table[sequence.offset];
ZSTD_copy4(op+4, match);
match -= sub2;
} else {
ZSTD_copy8(op, match);
}
op += 8; match += 8;
if (oMatchEnd > oend-(16-MINMATCH)) { /* If the match length is > 8 bytes, then continue with the wildcopy. */
if (op < oend_w) { if (sequence.matchLength > 8) {
ZSTD_wildcopy(op, match, oend, oend_w - op, ZSTD_overlap_src_before_dst); assert(op < oMatchEnd);
match += oend_w - op; ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
op = oend_w;
}
while (op < oMatchEnd) *op++ = *match++;
} else {
ZSTD_wildcopy(op, match, oend, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
} }
return sequenceLength; return sequenceLength;
} }
@ -1098,7 +1094,7 @@ ZSTD_decompressSequencesLong_body(
/* decode and decompress */ /* decode and decompress */
for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) { for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset); seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
if (ZSTD_isError(oneSeqSize)) return oneSeqSize; if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
sequences[seqNb & STORED_SEQS_MASK] = sequence; sequences[seqNb & STORED_SEQS_MASK] = sequence;
@ -1109,7 +1105,7 @@ ZSTD_decompressSequencesLong_body(
/* finish queue */ /* finish queue */
seqNb -= seqAdvance; seqNb -= seqAdvance;
for ( ; seqNb<nbSeq ; seqNb++) { for ( ; seqNb<nbSeq ; seqNb++) {
size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
if (ZSTD_isError(oneSeqSize)) return oneSeqSize; if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
op += oneSeqSize; op += oneSeqSize;
} }