mirror of
https://github.com/google/brotli.git
synced 2024-11-21 19:20:09 +00:00
Merge pull request #95 from szabadka/master
Decoder optimizations for ARM architecture.
This commit is contained in:
commit
762f9ba5a0
@ -30,6 +30,11 @@ extern "C" {
|
||||
#if (defined(__x86_64__) || defined(_M_X64))
|
||||
/* This should be set to 1 only on little-endian machines. */
|
||||
#define BROTLI_USE_64_BITS 1
|
||||
#elif (defined(__arm__))
|
||||
/* TODO: __arm__ is much too broad. The following flags should
|
||||
only be set on ARM architectures with little-endian byte order */
|
||||
#define ARMv7
|
||||
#define BROTLI_USE_64_BITS 1
|
||||
#else
|
||||
#define BROTLI_USE_64_BITS 0
|
||||
#endif
|
||||
@ -41,10 +46,16 @@ extern "C" {
|
||||
#define UNALIGNED_COPY64(dst, src) memcpy(dst, src, 8)
|
||||
#define UNALIGNED_MOVE64(dst, src) memmove(dst, src, 8)
|
||||
|
||||
#ifdef ARMv7
|
||||
/* Arm instructions can shift and negate registers before an AND operation. */
|
||||
/* Returns a 32-bit value whose n lowest bits are set and all others clear.
   The mask is computed with a shift and a complement instead of a table
   lookup; n must be smaller than 32 for the shift to be well defined. */
static BROTLI_INLINE uint32_t BitMask(int n) {
  const uint32_t all_ones = 0xffffffff;
  return ~(all_ones << n);
}
|
||||
#else
|
||||
/* Lookup table of bit masks: entry n has its n lowest bits set (2^n - 1).
   The initializer lists 25 entries, covering n = 0 through 24. */
static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
|
||||
0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
|
||||
65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
|
||||
};
|
||||
/* Returns the mask stored in kBitMask at index n. The caller must pass an
   index that lies inside the table. */
static BROTLI_INLINE uint32_t BitMask(int n) {
  const uint32_t mask = kBitMask[n];
  return mask;
}
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if (BROTLI_USE_64_BITS)
|
||||
@ -91,7 +102,7 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br,
|
||||
uint32_t val) {
|
||||
#ifdef BROTLI_DECODE_DEBUG
|
||||
uint32_t n_bits = val - br->bit_pos_;
|
||||
const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
|
||||
const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
|
||||
printf("[BrotliReadBits] %010d %2d val: %6x\n",
|
||||
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, bval);
|
||||
#endif
|
||||
@ -148,7 +159,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
|
||||
}
|
||||
br->eos_ = 1;
|
||||
/* Store 32 bytes of zero after the stream end. */
|
||||
#if (BROTLI_USE_64_BITS)
|
||||
#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
|
||||
*(uint64_t*)(dst + bytes_read) = 0;
|
||||
*(uint64_t*)(dst + bytes_read + 8) = 0;
|
||||
*(uint64_t*)(dst + bytes_read + 16) = 0;
|
||||
@ -159,7 +170,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
|
||||
}
|
||||
if (dst == br->buf_) {
|
||||
/* Copy the head of the ringbuffer to the slack region. */
|
||||
#if (BROTLI_USE_64_BITS)
|
||||
#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
|
||||
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
|
||||
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
|
||||
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
|
||||
@ -203,8 +214,15 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
|
||||
BrotliBitReader* const br, int n_bits) {
|
||||
uint32_t val;
|
||||
#if (BROTLI_USE_64_BITS)
|
||||
#if defined(ARMv7)
|
||||
if ((64 - br->bit_pos_) < ((uint32_t) n_bits)) {
|
||||
BrotliFillBitWindow(br);
|
||||
}
|
||||
val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
|
||||
#else
|
||||
BrotliFillBitWindow(br);
|
||||
val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
|
||||
val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
|
||||
#endif /* defined (ARMv7) */
|
||||
#else
|
||||
/*
|
||||
* The if statement gives 2-4% speed boost on Canterbury data set with
|
||||
@ -213,8 +231,8 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
|
||||
if ((32 - br->bit_pos_) < ((uint32_t) n_bits)) {
|
||||
BrotliFillBitWindow(br);
|
||||
}
|
||||
val = (br->val_ >> br->bit_pos_) & kBitMask[n_bits];
|
||||
#endif
|
||||
val = (br->val_ >> br->bit_pos_) & Bitmask(n_bits);
|
||||
#endif /* BROTLI_USE_64_BITS */
|
||||
#ifdef BROTLI_DECODE_DEBUG
|
||||
printf("[BrotliReadBits] %010d %2d val: %6x\n",
|
||||
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, val);
|
||||
|
33
dec/decode.c
33
dec/decode.c
@ -149,6 +149,21 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
|
||||
BrotliBitReader* br) {
|
||||
int nbits;
|
||||
BrotliFillBitWindow(br);
|
||||
#ifdef ARMv7
|
||||
/* Prefetching helps, since this needs to shift a 64 bit
|
||||
val by a variable length. The other changes are minor. */
|
||||
uint32_t val = BrotliPrefetchBits(br);
|
||||
table += val & HUFFMAN_TABLE_MASK;
|
||||
nbits = table->bits;
|
||||
if (PREDICT_FALSE(nbits > HUFFMAN_TABLE_BITS)) {
|
||||
nbits -= HUFFMAN_TABLE_BITS;
|
||||
br->bit_pos_ += HUFFMAN_TABLE_BITS;
|
||||
table += table->value;
|
||||
table += (int)(val >> HUFFMAN_TABLE_BITS) & ((1 << nbits) - 1);
|
||||
nbits = table->bits;
|
||||
}
|
||||
br->bit_pos_ += nbits;
|
||||
#else
|
||||
table += (int)(br->val_ >> br->bit_pos_) & HUFFMAN_TABLE_MASK;
|
||||
if (PREDICT_FALSE(table->bits > HUFFMAN_TABLE_BITS)) {
|
||||
br->bit_pos_ += HUFFMAN_TABLE_BITS;
|
||||
@ -157,6 +172,7 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
|
||||
table += (int)(br->val_ >> br->bit_pos_) & ((1 << nbits) - 1);
|
||||
}
|
||||
br->bit_pos_ += table->bits;
|
||||
#endif
|
||||
return table->value;
|
||||
}
|
||||
|
||||
@ -624,6 +640,8 @@ static BROTLI_INLINE void DecodeBlockTypeWithContext(BrotliState* s,
|
||||
*/
|
||||
static BROTLI_INLINE void IncrementalCopyFastPath(
|
||||
uint8_t* dst, const uint8_t* src, int len) {
|
||||
/* TODO: On an ARM UNALIGNED_MOVE64 is compiled into a memcpy.
|
||||
But I don't have a better solution. */
|
||||
if (src < dst) {
|
||||
while (dst - src < 8) {
|
||||
UNALIGNED_MOVE64(dst, src);
|
||||
@ -1488,6 +1506,7 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
|
||||
s->copy_src =
|
||||
&s->ringbuffer[(pos - s->distance) & s->ringbuffer_mask];
|
||||
|
||||
|
||||
#if (defined(__x86_64__) || defined(_M_X64))
|
||||
if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
|
||||
s->copy_dst + s->copy_length < s->ringbuffer_end) {
|
||||
@ -1501,6 +1520,20 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
|
||||
s->meta_block_remaining_len -= s->copy_length;
|
||||
s->copy_length = 0;
|
||||
}
|
||||
#elif defined(ARMv7)
|
||||
/* This version is maybe 5% faster than the version above.
|
||||
UNALIGNED_COPY64 does not inline and generates memcpy calls. */
|
||||
if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
|
||||
s->copy_dst + s->copy_length < s->ringbuffer_end) {
|
||||
if (s->copy_length <= s->distance) {
|
||||
memcpy(s->copy_dst, s->copy_src, s->copy_length);
|
||||
} else {
|
||||
IncrementalCopyFastPath(s->copy_dst, s->copy_src, s->copy_length);
|
||||
}
|
||||
pos += s->copy_length;
|
||||
s->meta_block_remaining_len -= s->copy_length;
|
||||
s->copy_length = 0;
|
||||
}
|
||||
#endif
|
||||
/* Modifications to this loop should be reflected in
|
||||
BROTLI_STATE_BLOCK_POST_WRITE_2 case */
|
||||
|
Loading…
Reference in New Issue
Block a user