mirror of
https://github.com/google/brotli.git
synced 2025-01-06 06:50:06 +00:00
Decoder optimizations for ARM architecture.
This commit is contained in:
parent
500c85acae
commit
5f39d607ef
@ -30,6 +30,11 @@ extern "C" {
|
|||||||
#if (defined(__x86_64__) || defined(_M_X64))
|
#if (defined(__x86_64__) || defined(_M_X64))
|
||||||
/* This should be set to 1 only on little-endian machines. */
|
/* This should be set to 1 only on little-endian machines. */
|
||||||
#define BROTLI_USE_64_BITS 1
|
#define BROTLI_USE_64_BITS 1
|
||||||
|
#elif (defined(__arm__))
|
||||||
|
/* TODO: __arm__ is much too broad. The following flags should
|
||||||
|
only be set on ARM architectures with little-endian byte order */
|
||||||
|
#define ARMv7
|
||||||
|
#define BROTLI_USE_64_BITS 1
|
||||||
#else
|
#else
|
||||||
#define BROTLI_USE_64_BITS 0
|
#define BROTLI_USE_64_BITS 0
|
||||||
#endif
|
#endif
|
||||||
@ -41,10 +46,16 @@ extern "C" {
|
|||||||
#define UNALIGNED_COPY64(dst, src) memcpy(dst, src, 8)
|
#define UNALIGNED_COPY64(dst, src) memcpy(dst, src, 8)
|
||||||
#define UNALIGNED_MOVE64(dst, src) memmove(dst, src, 8)
|
#define UNALIGNED_MOVE64(dst, src) memmove(dst, src, 8)
|
||||||
|
|
||||||
|
#ifdef ARMv7
|
||||||
|
/* Arm instructions can shift and negate registers before an AND operation. */
|
||||||
|
/* Returns a mask with the n lowest bits set, built from a shift and a
   bitwise NOT rather than a table lookup. n must be in [0, 31]. */
static BROTLI_INLINE uint32_t BitMask(int n) {
  const uint32_t all_ones = 0xffffffff;
  return ~(all_ones << n);
}
|
||||||
|
#else
|
||||||
static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
|
static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
|
||||||
0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
|
0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
|
||||||
65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
|
65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
|
||||||
};
|
};
|
||||||
|
/* Table-driven variant: returns the kBitMask entry for n, i.e. a mask
   with the n lowest bits set. n must be a valid index into kBitMask. */
static BROTLI_INLINE uint32_t BitMask(int n) {
  const uint32_t mask = kBitMask[n];
  return mask;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
#if (BROTLI_USE_64_BITS)
|
#if (BROTLI_USE_64_BITS)
|
||||||
@ -91,7 +102,7 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br,
|
|||||||
uint32_t val) {
|
uint32_t val) {
|
||||||
#ifdef BROTLI_DECODE_DEBUG
|
#ifdef BROTLI_DECODE_DEBUG
|
||||||
uint32_t n_bits = val - br->bit_pos_;
|
uint32_t n_bits = val - br->bit_pos_;
|
||||||
const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
|
const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
|
||||||
printf("[BrotliReadBits] %010d %2d val: %6x\n",
|
printf("[BrotliReadBits] %010d %2d val: %6x\n",
|
||||||
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, bval);
|
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, bval);
|
||||||
#endif
|
#endif
|
||||||
@ -148,7 +159,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
|
|||||||
}
|
}
|
||||||
br->eos_ = 1;
|
br->eos_ = 1;
|
||||||
/* Store 32 bytes of zero after the stream end. */
|
/* Store 32 bytes of zero after the stream end. */
|
||||||
#if (BROTLI_USE_64_BITS)
|
#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
|
||||||
*(uint64_t*)(dst + bytes_read) = 0;
|
*(uint64_t*)(dst + bytes_read) = 0;
|
||||||
*(uint64_t*)(dst + bytes_read + 8) = 0;
|
*(uint64_t*)(dst + bytes_read + 8) = 0;
|
||||||
*(uint64_t*)(dst + bytes_read + 16) = 0;
|
*(uint64_t*)(dst + bytes_read + 16) = 0;
|
||||||
@ -159,7 +170,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
|
|||||||
}
|
}
|
||||||
if (dst == br->buf_) {
|
if (dst == br->buf_) {
|
||||||
/* Copy the head of the ringbuffer to the slack region. */
|
/* Copy the head of the ringbuffer to the slack region. */
|
||||||
#if (BROTLI_USE_64_BITS)
|
#if (BROTLI_USE_64_BITS) && !defined(ARMv7)
|
||||||
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
|
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
|
||||||
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
|
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
|
||||||
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
|
UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
|
||||||
@ -203,8 +214,15 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
|
|||||||
BrotliBitReader* const br, int n_bits) {
|
BrotliBitReader* const br, int n_bits) {
|
||||||
uint32_t val;
|
uint32_t val;
|
||||||
#if (BROTLI_USE_64_BITS)
|
#if (BROTLI_USE_64_BITS)
|
||||||
|
#if defined(ARMv7)
|
||||||
|
if ((64 - br->bit_pos_) < ((uint32_t) n_bits)) {
|
||||||
|
BrotliFillBitWindow(br);
|
||||||
|
}
|
||||||
|
val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
|
||||||
|
#else
|
||||||
BrotliFillBitWindow(br);
|
BrotliFillBitWindow(br);
|
||||||
val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
|
val = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits);
|
||||||
|
#endif /* defined (ARMv7) */
|
||||||
#else
|
#else
|
||||||
/*
|
/*
|
||||||
* The if statement gives 2-4% speed boost on Canterbury data set with
|
* The if statement gives 2-4% speed boost on Canterbury data set with
|
||||||
@ -213,8 +231,8 @@ static BROTLI_INLINE uint32_t BrotliReadBits(
|
|||||||
if ((32 - br->bit_pos_) < ((uint32_t) n_bits)) {
|
if ((32 - br->bit_pos_) < ((uint32_t) n_bits)) {
|
||||||
BrotliFillBitWindow(br);
|
BrotliFillBitWindow(br);
|
||||||
}
|
}
|
||||||
val = (br->val_ >> br->bit_pos_) & kBitMask[n_bits];
|
/* Mask off everything above the requested n_bits. */
val = (br->val_ >> br->bit_pos_) & BitMask(n_bits);
|
||||||
#endif
|
#endif /* BROTLI_USE_64_BITS */
|
||||||
#ifdef BROTLI_DECODE_DEBUG
|
#ifdef BROTLI_DECODE_DEBUG
|
||||||
printf("[BrotliReadBits] %010d %2d val: %6x\n",
|
printf("[BrotliReadBits] %010d %2d val: %6x\n",
|
||||||
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, val);
|
(br->pos_ << 3) + br->bit_pos_ - 64, n_bits, val);
|
||||||
|
33
dec/decode.c
33
dec/decode.c
@ -149,6 +149,21 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
|
|||||||
BrotliBitReader* br) {
|
BrotliBitReader* br) {
|
||||||
int nbits;
|
int nbits;
|
||||||
BrotliFillBitWindow(br);
|
BrotliFillBitWindow(br);
|
||||||
|
#ifdef ARMv7
|
||||||
|
/* Prefetching helps, since this needs to shift a 64 bit
|
||||||
|
val by a variable length. The other changes are minor. */
|
||||||
|
uint32_t val = BrotliPrefetchBits(br);
|
||||||
|
table += val & HUFFMAN_TABLE_MASK;
|
||||||
|
nbits = table->bits;
|
||||||
|
if (PREDICT_FALSE(nbits > HUFFMAN_TABLE_BITS)) {
|
||||||
|
nbits -= HUFFMAN_TABLE_BITS;
|
||||||
|
br->bit_pos_ += HUFFMAN_TABLE_BITS;
|
||||||
|
table += table->value;
|
||||||
|
table += (int)(val >> HUFFMAN_TABLE_BITS) & ((1 << nbits) - 1);
|
||||||
|
nbits = table->bits;
|
||||||
|
}
|
||||||
|
br->bit_pos_ += nbits;
|
||||||
|
#else
|
||||||
table += (int)(br->val_ >> br->bit_pos_) & HUFFMAN_TABLE_MASK;
|
table += (int)(br->val_ >> br->bit_pos_) & HUFFMAN_TABLE_MASK;
|
||||||
if (PREDICT_FALSE(table->bits > HUFFMAN_TABLE_BITS)) {
|
if (PREDICT_FALSE(table->bits > HUFFMAN_TABLE_BITS)) {
|
||||||
br->bit_pos_ += HUFFMAN_TABLE_BITS;
|
br->bit_pos_ += HUFFMAN_TABLE_BITS;
|
||||||
@ -157,6 +172,7 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
|
|||||||
table += (int)(br->val_ >> br->bit_pos_) & ((1 << nbits) - 1);
|
table += (int)(br->val_ >> br->bit_pos_) & ((1 << nbits) - 1);
|
||||||
}
|
}
|
||||||
br->bit_pos_ += table->bits;
|
br->bit_pos_ += table->bits;
|
||||||
|
#endif
|
||||||
return table->value;
|
return table->value;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -624,6 +640,8 @@ static BROTLI_INLINE void DecodeBlockTypeWithContext(BrotliState* s,
|
|||||||
*/
|
*/
|
||||||
static BROTLI_INLINE void IncrementalCopyFastPath(
|
static BROTLI_INLINE void IncrementalCopyFastPath(
|
||||||
uint8_t* dst, const uint8_t* src, int len) {
|
uint8_t* dst, const uint8_t* src, int len) {
|
||||||
|
/* TODO: On an ARM UNALIGNED_MOVE64 is compiled into a memcpy.
|
||||||
|
But I don't have a better solution. */
|
||||||
if (src < dst) {
|
if (src < dst) {
|
||||||
while (dst - src < 8) {
|
while (dst - src < 8) {
|
||||||
UNALIGNED_MOVE64(dst, src);
|
UNALIGNED_MOVE64(dst, src);
|
||||||
@ -1488,6 +1506,7 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
|
|||||||
s->copy_src =
|
s->copy_src =
|
||||||
&s->ringbuffer[(pos - s->distance) & s->ringbuffer_mask];
|
&s->ringbuffer[(pos - s->distance) & s->ringbuffer_mask];
|
||||||
|
|
||||||
|
|
||||||
#if (defined(__x86_64__) || defined(_M_X64))
|
#if (defined(__x86_64__) || defined(_M_X64))
|
||||||
if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
|
if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
|
||||||
s->copy_dst + s->copy_length < s->ringbuffer_end) {
|
s->copy_dst + s->copy_length < s->ringbuffer_end) {
|
||||||
@ -1501,6 +1520,20 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output,
|
|||||||
s->meta_block_remaining_len -= s->copy_length;
|
s->meta_block_remaining_len -= s->copy_length;
|
||||||
s->copy_length = 0;
|
s->copy_length = 0;
|
||||||
}
|
}
|
||||||
|
#elif defined(ARMv7)
|
||||||
|
/* This version is maybe 5% faster than the version above.
|
||||||
|
UNALIGNED_COPY64 does not inline and generates memcpy calls. */
|
||||||
|
if (s->copy_src + s->copy_length <= s->ringbuffer_end &&
|
||||||
|
s->copy_dst + s->copy_length < s->ringbuffer_end) {
|
||||||
|
if (s->copy_length <= s->distance) {
|
||||||
|
memcpy(s->copy_dst, s->copy_src, s->copy_length);
|
||||||
|
} else {
|
||||||
|
IncrementalCopyFastPath(s->copy_dst, s->copy_src, s->copy_length);
|
||||||
|
}
|
||||||
|
pos += s->copy_length;
|
||||||
|
s->meta_block_remaining_len -= s->copy_length;
|
||||||
|
s->copy_length = 0;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
/* Modifications to this loop should be reflected in
|
/* Modifications to this loop should be reflected in
|
||||||
BROTLI_STATE_BLOCK_POST_WRITE_2 case */
|
BROTLI_STATE_BLOCK_POST_WRITE_2 case */
|
||||||
|
Loading…
Reference in New Issue
Block a user