Disable prefetch-decode for 32-bit targets

This decoder variant is detrimental to performance on 32-bit x86,
likely due to register pressure.

Note that the variant is disabled for all 32-bit targets, not only x86.
It is unclear whether it would help on other 32-bit architectures,
such as ARM, MIPS or PowerPC.
This commit is contained in:
Yann Collet 2017-03-02 17:09:21 -08:00
parent 3a55d8be26
commit fe5d27062e

View File

@ -1410,13 +1410,18 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
if (srcSize >= ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(srcSize_wrong); if (srcSize >= ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(srcSize_wrong);
/* Decode literals sub-block */ /* Decode literals section */
{ size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
if (ZSTD_isError(litCSize)) return litCSize; if (ZSTD_isError(litCSize)) return litCSize;
ip += litCSize; ip += litCSize;
srcSize -= litCSize; srcSize -= litCSize;
} }
if (dctx->fParams.windowSize > (1<<23)) return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize); if (sizeof(size_t) > 4) /* do not enable prefetching on 32-bits x86, as it's performance detrimental */
/* likely because of register pressure */
/* if that's the correct cause, then 32-bits ARM should be affected differently */
/* it would be good to test this on ARM real hardware, to see if prefetch version improves speed */
if (dctx->fParams.windowSize > (1<<23))
return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize);
return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize); return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
} }