Merge pull request #95 from szabadka/master

Decoder optimizations for the ARM architecture.
szabadka 2015-05-07 17:13:06 +02:00
commit 762f9ba5a0
2 changed files with 57 additions and 6 deletions

View File

@@ -30,6 +30,11 @@ extern "C" {
#if (defined(__x86_64__) || defined(_M_X64))
/* This should be set to 1 only on little-endian machines. */
#define BROTLI_USE_64_BITS 1
#elif (defined(__arm__))
/* TODO: __arm__ is much too broad. The following flags should
only be set on ARM architectures with little-endian byte order. */
#define ARMv7
#define BROTLI_USE_64_BITS 1
#else
#define BROTLI_USE_64_BITS 0
#endif
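
The TODO above is the key caveat: plain __arm__ also matches big-endian ARM targets, where a little-endian 64-bit bit window would decode garbage. A tighter guard might look like the sketch below; it is not part of this commit, and it assumes the GCC/Clang predefined macros __ARMEL__, __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__:

#elif defined(__arm__) && \
    (defined(__ARMEL__) || \
     (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
/* Little-endian ARM only: the 64-bit bit-reading window is safe here. */
#define ARMv7
#define BROTLI_USE_64_BITS 1
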
@@ -41,10 +46,16 @@ extern "C" {
#define UNALIGNED_COPY64(dst, src) memcpy(dst, src, 8)
#define UNALIGNED_MOVE64(dst, src) memmove(dst, src, 8)
#ifdef ARMv7
/* ARM instructions can shift and negate registers before an AND operation. */
static BROTLI_INLINE uint32_t BitMask(int n) { return ~((0xffffffff) << n); }
#else
static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
};
static BROTLI_INLINE uint32_t BitMask(int n) { return kBitMask[n]; }
#endif
typedef struct {
#if (BROTLI_USE_64_BITS)
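
A note on the BitMask change above: the ARM barrel shifter lets MVN shift its operand within the same instruction, so ~(0xffffffff << n) costs a couple of data-processing instructions and no memory traffic, while kBitMask[n] costs a load. A minimal equivalence check (illustrative only, not in the commit) confirms the two variants agree over the range the decoder uses:

#include <assert.h>
#include <stdint.h>

/* kBitMask[n] holds the low-n-bits mask for n = 0..24; the computed
   form must match it on every index. */
static void CheckBitMaskEquivalence(void) {
  int n;
  for (n = 0; n <= 24; ++n) {
    uint32_t computed = ~(0xffffffffu << n);
    uint32_t table = (1u << n) - 1u;  /* same values as the kBitMask entries */
    assert(computed == table);
  }
}
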
@@ -91,7 +102,7 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br,
uint32_t val) {
#ifdef BROTLI_DECODE_DEBUG
uint32_t n_bits = val - br->bit_pos_;
-const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
printf("[BrotliReadBits] %010d %2d val: %6x\n",
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, bval);
#endif
@@ -148,7 +159,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
}
br->eos_ = 1;
/* Store 32 bytes of zero after the stream end. */
-#if (BROTLI_USE_64_BITS)
+#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
*(uint64_t*)(dst + bytes_read) = 0;
*(uint64_t*)(dst + bytes_read + 8) = 0;
*(uint64_t*)(dst + bytes_read + 16) = 0;
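
The !defined(ARMv7) exclusions here and below exist because the raw uint64_t* stores assume cheap unaligned 64-bit access, which holds on x86-64 but not on ARMv7 (and the casts are formally undefined behavior on alignment-strict targets). A portable sketch with no alignment assumptions, equivalent to the 8-byte stores above, would simply be:

/* memset from <string.h>: zero the 32-byte slack, any alignment. */
memset(dst + bytes_read, 0, 32);
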
@@ -159,7 +170,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
}
if (dst == br->buf_) {
/* Copy the head of the ringbuffer to the slack region. */
-#if (BROTLI_USE_64_BITS)
+#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
@@ -203,8 +214,15 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
BrotliBitReader* const br, int n_bits) {
uint32_t val;
#if (BROTLI_USE_64_BITS)
#if defined(ARMv7)
if ((64 - br->bit_pos_) < ((uint32_t) n_bits)) {
BrotliFillBitWindow(br);
}
val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
#else
BrotliFillBitWindow(br);
-val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
#endif /* defined (ARMv7) */
#else
/*
* The if statement gives 2-4% speed boost on Canterbury data set with
@@ -213,8 +231,8 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
if ((32 - br->bit_pos_) < ((uint32_t) n_bits)) {
BrotliFillBitWindow(br);
}
-val = (br->val_ >> br->bit_pos_) & kBitMask[n_bits];
-#endif
+val = (br->val_ >> br->bit_pos_) & BitMask(n_bits);
+#endif /* BROTLI_USE_64_BITS */
#ifdef BROTLI_DECODE_DEBUG
printf("[BrotliReadBits] %010d %2d val: %6x\n",
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, val);
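
The ARMv7 branch of BrotliReadBits mirrors the 32-bit path below it: the window is refilled only when fewer than n_bits unread bits remain, so most calls skip the refill entirely. One conditional refill always suffices, since the masks above top out at 24 bits. A usage sketch, assuming a reader that has already been initialized and fed input elsewhere:

/* br is assumed to point at a BrotliBitReader that BrotliInitBitReader
   has already set up with an input source. */
uint32_t wbits = BrotliReadBits(br, 4);   /* a small fixed-width field */
uint32_t extra = BrotliReadBits(br, 16);  /* any width up to 24 bits */
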

View File

@@ -149,6 +149,21 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
BrotliBitReader* br) {
int nbits;
BrotliFillBitWindow(br);
#ifdef ARMv7
/* Prefetching helps, since this needs to shift a 64-bit
val by a variable length. The other changes are minor. */
uint32_t val = BrotliPrefetchBits(br);
table += val & HUFFMAN_TABLE_MASK;
nbits = table->bits;
if (PREDICT_FALSE(nbits > HUFFMAN_TABLE_BITS)) {
nbits -= HUFFMAN_TABLE_BITS;
br->bit_pos_ += HUFFMAN_TABLE_BITS;
table += table->value;
table += (int)(val >> HUFFMAN_TABLE_BITS) & ((1 << nbits) - 1);
nbits = table->bits;
}
br->bit_pos_ += nbits;
#else
table += (int)(br->val_ >> br->bit_pos_) & HUFFMAN_TABLE_MASK;
if (PREDICT_FALSE(table->bits > HUFFMAN_TABLE_BITS)) {
br->bit_pos_ += HUFFMAN_TABLE_BITS;
@@ -157,6 +172,7 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
table += (int)(br->val_ >> br->bit_pos_) & ((1 << nbits) - 1);
}
br->bit_pos_ += table->bits;
#endif
return table->value;
}
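
ReadSymbol implements the usual two-level Huffman lookup: the low HUFFMAN_TABLE_BITS of the window index a root table, and codes longer than that jump to a second-level table selected via table->value. The ARMv7 path reads the window once into a local through BrotliPrefetchBits, so the expensive variable 64-bit shift happens a single time. That helper is declared in the bit reader header; presumably it just exposes the unconsumed window bits without advancing bit_pos_, along the lines of this reconstruction (an assumption, not part of the diff):

/* Assumed shape of the helper this path relies on. */
static BROTLI_INLINE uint32_t BrotliPrefetchBits(BrotliBitReader* const br) {
  return (uint32_t)(br->val_ >> br->bit_pos_);
}
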
@@ -624,6 +640,8 @@ static BROTLI_INLINE void DecodeBlockTypeWithContext(BrotliState* s,
*/
static BROTLI_INLINE void IncrementalCopyFastPath(
uint8_t* dst, const uint8_t* src, int len) {
/* TODO: On ARM, UNALIGNED_MOVE64 is compiled into a memcpy call,
but I don't have a better solution. */
if (src < dst) {
while (dst - src < 8) {
UNALIGNED_MOVE64(dst, src);
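
The doubling loop above is cut off by the diff context, but its contract is worth spelling out: an LZ77 back-reference whose distance is shorter than its length must reread bytes written earlier in the same copy (distance 1 replicates one byte). A byte-serial reference version with the same semantics, purely for illustration:

static void IncrementalCopyBytewise(uint8_t* dst, const uint8_t* src, int len) {
  while (len-- > 0) {
    *dst++ = *src++;  /* may reread a byte this same call just wrote */
  }
}
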
@@ -1488,6 +1506,7 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
s->copy_src =
&s->ringbuffer[(pos - s->distance) & s->ringbuffer_mask];
#if (defined(__x86_64__) || defined(_M_X64))
if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
s->copy_dst + s->copy_length < s->ringbuffer_end) {
@@ -1501,6 +1520,20 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
s->meta_block_remaining_len -= s->copy_length;
s->copy_length = 0;
}
#elif defined(ARMv7)
/* This version is maybe 5% faster than the version above.
UNALIGNED_COPY64 does not inline and generates memcpy calls. */
if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
s->copy_dst + s->copy_length < s->ringbuffer_end) {
if (s->copy_length <= s->distance) {
memcpy(s->copy_dst, s->copy_src, s->copy_length);
} else {
IncrementalCopyFastPath(s->copy_dst, s->copy_src, s->copy_length);
}
pos += s->copy_length;
s->meta_block_remaining_len -= s->copy_length;
s->copy_length = 0;
}
#endif
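
Note the guard shared by both fast paths: memcpy is used only when copy_length <= distance, i.e. when the source and destination ranges cannot overlap; overlapping references fall through to IncrementalCopyFastPath. A self-contained worked example of why that distinction matters (illustrative, not from the diff):

#include <stdint.h>

int main(void) {
  uint8_t ring[13] = "abc";
  uint8_t* d = ring + 3;   /* distance 3, length 9: ranges overlap */
  const uint8_t* s = ring;
  int n = 9;
  while (n-- > 0) {
    *d++ = *s++;           /* byte-serial copy replicates the pattern */
  }
  /* ring now holds "abcabcabcabc"; memcpy here would be undefined. */
  return 0;
}
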
/* Modifications to this loop should be reflected in the
BROTLI_STATE_BLOCK_POST_WRITE_2 case */