decode: faster huffman code loading on 32-bit Arm (#703)

* platform: add macro for using the 'aligned' attribute

* decode: add accessor macros for HuffmanCode fields

Adds a constructor function for building HuffmanCode values
so they can be accessed quickly on different architectures.

Also adds macros for marking a HuffmanCode table pointer
that can be accessed quickly (BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD),
adjusting the index into that table (BROTLI_HC_ADJUST_TABLE_INDEX),
and getting the .bits or .value fields out of the table at the
current index (BROTLI_HC_GET_BITS/VALUE).

For example, assuming |table| contains a HuffmanCode pointer:

  BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(table);
  BROTLI_HC_ADJUST_TABLE_INDEX(table, index_into_table);
  *bits = BROTLI_HC_GET_BITS(table);
  *value = BROTLI_HC_GET_VALUE(table);
  BROTLI_HC_ADJUST_TABLE_INDEX(table, offset);
  *bits2 = BROTLI_HC_GET_BITS(table);
  *value2 = BROTLI_HC_GET_VALUE(table);

All uses of the HuffmanCode have been updated appropriately.

* decode: add alternative accessors for HuffmanCode on Arm AArch32
This commit is contained in:
Stephen Kyle 2018-09-27 12:15:46 +01:00 committed by Eugene Kliuchnikov
parent 67f059eaf5
commit 9402ac5c08
4 changed files with 139 additions and 84 deletions

View File

@ -180,6 +180,12 @@ OR:
#define BROTLI_UNUSED_FUNCTION static BROTLI_INLINE
#endif
#if BROTLI_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
#define BROTLI_ALIGNED(N) __attribute__((aligned(N)))
#else
#define BROTLI_ALIGNED(N)
#endif
#if (defined(__ARM_ARCH) && (__ARM_ARCH == 7)) || \
(defined(M_ARM) && (M_ARM == 7))
#define BROTLI_TARGET_ARMV7
@ -343,7 +349,7 @@ static BROTLI_INLINE void BrotliUnalignedWrite64(void* p, uint64_t v) {
/* If __attribute__(aligned) is available, use that. Otherwise, memcpy. */
#if BROTLI_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
typedef __attribute__((aligned(1))) uint64_t brotli_unaligned_uint64_t;
typedef BROTLI_ALIGNED(1) uint64_t brotli_unaligned_uint64_t;
static BROTLI_INLINE uint64_t BrotliUnalignedRead64(const void* p) {
return (uint64_t) ((brotli_unaligned_uint64_t*) p)[0];

View File

@ -347,15 +347,17 @@ static BrotliDecoderErrorCode BROTLI_NOINLINE DecodeMetaBlockLength(
static BROTLI_INLINE uint32_t DecodeSymbol(uint32_t bits,
const HuffmanCode* table,
BrotliBitReader* br) {
table += bits & HUFFMAN_TABLE_MASK;
if (table->bits > HUFFMAN_TABLE_BITS) {
uint32_t nbits = table->bits - HUFFMAN_TABLE_BITS;
BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(table);
BROTLI_HC_ADJUST_TABLE_INDEX(table, bits & HUFFMAN_TABLE_MASK);
if (BROTLI_HC_FAST_LOAD_BITS(table) > HUFFMAN_TABLE_BITS) {
uint32_t nbits = BROTLI_HC_FAST_LOAD_BITS(table) - HUFFMAN_TABLE_BITS;
BrotliDropBits(br, HUFFMAN_TABLE_BITS);
table += table->value;
table += (bits >> HUFFMAN_TABLE_BITS) & BitMask(nbits);
BROTLI_HC_ADJUST_TABLE_INDEX(table,
BROTLI_HC_FAST_LOAD_VALUE(table) +
((bits >> HUFFMAN_TABLE_BITS) & BitMask(nbits)));
}
BrotliDropBits(br, table->bits);
return table->value;
BrotliDropBits(br, BROTLI_HC_FAST_LOAD_BITS(table));
return BROTLI_HC_FAST_LOAD_VALUE(table);
}
/* Reads and decodes the next Huffman code from bit-stream.
@ -371,19 +373,20 @@ static BROTLI_NOINLINE BROTLI_BOOL SafeDecodeSymbol(
const HuffmanCode* table, BrotliBitReader* br, uint32_t* result) {
uint32_t val;
uint32_t available_bits = BrotliGetAvailableBits(br);
BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(table);
if (available_bits == 0) {
if (table->bits == 0) {
*result = table->value;
if (BROTLI_HC_FAST_LOAD_BITS(table) == 0) {
*result = BROTLI_HC_FAST_LOAD_VALUE(table);
return BROTLI_TRUE;
}
return BROTLI_FALSE; /* No valid bits at all. */
}
val = (uint32_t)BrotliGetBitsUnmasked(br);
table += val & HUFFMAN_TABLE_MASK;
if (table->bits <= HUFFMAN_TABLE_BITS) {
if (table->bits <= available_bits) {
BrotliDropBits(br, table->bits);
*result = table->value;
BROTLI_HC_ADJUST_TABLE_INDEX(table, val & HUFFMAN_TABLE_MASK);
if (BROTLI_HC_FAST_LOAD_BITS(table) <= HUFFMAN_TABLE_BITS) {
if (BROTLI_HC_FAST_LOAD_BITS(table) <= available_bits) {
BrotliDropBits(br, BROTLI_HC_FAST_LOAD_BITS(table));
*result = BROTLI_HC_FAST_LOAD_VALUE(table);
return BROTLI_TRUE;
} else {
return BROTLI_FALSE; /* Not enough bits for the first level. */
@ -394,15 +397,15 @@ static BROTLI_NOINLINE BROTLI_BOOL SafeDecodeSymbol(
}
/* Speculatively drop HUFFMAN_TABLE_BITS. */
val = (val & BitMask(table->bits)) >> HUFFMAN_TABLE_BITS;
val = (val & BitMask(BROTLI_HC_FAST_LOAD_BITS(table))) >> HUFFMAN_TABLE_BITS;
available_bits -= HUFFMAN_TABLE_BITS;
table += table->value + val;
if (available_bits < table->bits) {
BROTLI_HC_ADJUST_TABLE_INDEX(table, BROTLI_HC_FAST_LOAD_VALUE(table) + val);
if (available_bits < BROTLI_HC_FAST_LOAD_BITS(table)) {
return BROTLI_FALSE; /* Not enough bits for the second level. */
}
BrotliDropBits(br, HUFFMAN_TABLE_BITS + table->bits);
*result = table->value;
BrotliDropBits(br, HUFFMAN_TABLE_BITS + BROTLI_HC_FAST_LOAD_BITS(table));
*result = BROTLI_HC_FAST_LOAD_VALUE(table);
return BROTLI_TRUE;
}
@ -425,9 +428,10 @@ static BROTLI_INLINE void PreloadSymbol(int safe,
if (safe) {
return;
}
table += BrotliGetBits(br, HUFFMAN_TABLE_BITS);
*bits = table->bits;
*value = table->value;
BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(table);
BROTLI_HC_ADJUST_TABLE_INDEX(table, BrotliGetBits(br, HUFFMAN_TABLE_BITS));
*bits = BROTLI_HC_FAST_LOAD_BITS(table);
*value = BROTLI_HC_FAST_LOAD_VALUE(table);
}
/* Decodes the next Huffman code using data prepared by PreloadSymbol.
@ -441,10 +445,11 @@ static BROTLI_INLINE uint32_t ReadPreloadedSymbol(const HuffmanCode* table,
uint32_t val = BrotliGet16BitsUnmasked(br);
const HuffmanCode* ext = table + (val & HUFFMAN_TABLE_MASK) + *value;
uint32_t mask = BitMask((*bits - HUFFMAN_TABLE_BITS));
BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(ext);
BrotliDropBits(br, HUFFMAN_TABLE_BITS);
ext += (val >> HUFFMAN_TABLE_BITS) & mask;
BrotliDropBits(br, ext->bits);
result = ext->value;
BROTLI_HC_ADJUST_TABLE_INDEX(ext, (val >> HUFFMAN_TABLE_BITS) & mask);
BrotliDropBits(br, BROTLI_HC_FAST_LOAD_BITS(ext));
result = BROTLI_HC_FAST_LOAD_VALUE(ext);
} else {
BrotliDropBits(br, *bits);
}
@ -597,6 +602,7 @@ static BrotliDecoderErrorCode ReadSymbolCodeLengths(
while (symbol < alphabet_size && space > 0) {
const HuffmanCode* p = s->table;
uint32_t code_len;
BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(p);
if (!BrotliCheckInputAmount(br, BROTLI_SHORT_FILL_BIT_WINDOW_READ)) {
s->symbol = symbol;
s->repeat = repeat;
@ -606,10 +612,10 @@ static BrotliDecoderErrorCode ReadSymbolCodeLengths(
return BROTLI_DECODER_NEEDS_MORE_INPUT;
}
BrotliFillBitWindow16(br);
p += BrotliGetBitsUnmasked(br) &
BitMask(BROTLI_HUFFMAN_MAX_CODE_LENGTH_CODE_LENGTH);
BrotliDropBits(br, p->bits); /* Use 1..5 bits. */
code_len = p->value; /* code_len == 0..17 */
BROTLI_HC_ADJUST_TABLE_INDEX(p, BrotliGetBitsUnmasked(br) &
BitMask(BROTLI_HUFFMAN_MAX_CODE_LENGTH_CODE_LENGTH));
BrotliDropBits(br, BROTLI_HC_FAST_LOAD_BITS(p)); /* Use 1..5 bits. */
code_len = BROTLI_HC_FAST_LOAD_VALUE(p); /* code_len == 0..17 */
if (code_len < BROTLI_REPEAT_PREVIOUS_CODE_LENGTH) {
ProcessSingleCodeLength(code_len, &symbol, &repeat, &space,
&prev_code_len, symbol_lists, code_length_histo, next_symbol);
@ -637,31 +643,34 @@ static BrotliDecoderErrorCode SafeReadSymbolCodeLengths(
uint32_t code_len;
uint32_t available_bits;
uint32_t bits = 0;
BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(p);
if (get_byte && !BrotliPullByte(br)) return BROTLI_DECODER_NEEDS_MORE_INPUT;
get_byte = BROTLI_FALSE;
available_bits = BrotliGetAvailableBits(br);
if (available_bits != 0) {
bits = (uint32_t)BrotliGetBitsUnmasked(br);
}
p += bits & BitMask(BROTLI_HUFFMAN_MAX_CODE_LENGTH_CODE_LENGTH);
if (p->bits > available_bits) {
BROTLI_HC_ADJUST_TABLE_INDEX(p,
bits & BitMask(BROTLI_HUFFMAN_MAX_CODE_LENGTH_CODE_LENGTH));
if (BROTLI_HC_FAST_LOAD_BITS(p) > available_bits) {
get_byte = BROTLI_TRUE;
continue;
}
code_len = p->value; /* code_len == 0..17 */
code_len = BROTLI_HC_FAST_LOAD_VALUE(p); /* code_len == 0..17 */
if (code_len < BROTLI_REPEAT_PREVIOUS_CODE_LENGTH) {
BrotliDropBits(br, p->bits);
BrotliDropBits(br, BROTLI_HC_FAST_LOAD_BITS(p));
ProcessSingleCodeLength(code_len, &s->symbol, &s->repeat, &s->space,
&s->prev_code_len, s->symbol_lists, s->code_length_histo,
s->next_symbol);
} else { /* code_len == 16..17, extra_bits == 2..3 */
uint32_t extra_bits = code_len - 14U;
uint32_t repeat_delta = (bits >> p->bits) & BitMask(extra_bits);
if (available_bits < p->bits + extra_bits) {
uint32_t repeat_delta = (bits >> BROTLI_HC_FAST_LOAD_BITS(p)) &
BitMask(extra_bits);
if (available_bits < BROTLI_HC_FAST_LOAD_BITS(p) + extra_bits) {
get_byte = BROTLI_TRUE;
continue;
}
BrotliDropBits(br, p->bits + extra_bits);
BrotliDropBits(br, BROTLI_HC_FAST_LOAD_BITS(p) + extra_bits);
ProcessRepeatedCodeLength(code_len, repeat_delta, alphabet_size,
&s->symbol, &s->repeat, &s->space, &s->prev_code_len,
&s->repeat_code_len, s->symbol_lists, s->code_length_histo,

View File

@ -142,8 +142,7 @@ void BrotliBuildCodeLengthsHuffmanTable(HuffmanCode* table,
/* Special case: all symbols but one have 0 code length. */
if (offset[0] == 0) {
code.bits = 0;
code.value = (uint16_t)sorted[0];
code = ConstructHuffmanCode(0, (uint16_t)sorted[0]);
for (key = 0; key < (brotli_reg_t)table_size; ++key) {
table[key] = code;
}
@ -157,9 +156,9 @@ void BrotliBuildCodeLengthsHuffmanTable(HuffmanCode* table,
bits = 1;
step = 2;
do {
code.bits = (uint8_t)bits;
for (bits_count = count[bits]; bits_count != 0; --bits_count) {
code.value = (uint16_t)sorted[symbol++];
code = ConstructHuffmanCode((uint8_t)bits,
(uint16_t)sorted[symbol++]);
ReplicateValue(&table[BrotliReverseBits(key)], step, table_size, code);
key += key_step;
}
@ -211,11 +210,10 @@ uint32_t BrotliBuildHuffmanTable(HuffmanCode* root_table,
bits = 1;
step = 2;
do {
code.bits = (uint8_t)bits;
symbol = bits - (BROTLI_HUFFMAN_MAX_CODE_LENGTH + 1);
for (bits_count = count[bits]; bits_count != 0; --bits_count) {
symbol = symbol_lists[symbol];
code.value = (uint16_t)symbol;
code = ConstructHuffmanCode((uint8_t)bits, (uint16_t)symbol);
ReplicateValue(&table[BrotliReverseBits(key)], step, table_size, code);
key += key_step;
}
@ -244,14 +242,14 @@ uint32_t BrotliBuildHuffmanTable(HuffmanCode* root_table,
total_size += table_size;
sub_key = BrotliReverseBits(key);
key += key_step;
root_table[sub_key].bits = (uint8_t)(table_bits + root_bits);
root_table[sub_key].value =
(uint16_t)(((size_t)(table - root_table)) - sub_key);
root_table[sub_key] = ConstructHuffmanCode(
(uint8_t)(table_bits + root_bits),
(uint16_t)(((size_t)(table - root_table)) - sub_key));
sub_key = 0;
}
code.bits = (uint8_t)(len - root_bits);
symbol = symbol_lists[symbol];
code.value = (uint16_t)symbol;
code = ConstructHuffmanCode((uint8_t)(len - root_bits),
(uint16_t)symbol);
ReplicateValue(
&table[BrotliReverseBits(sub_key)], step, table_size, code);
sub_key += sub_key_step;
@ -270,35 +268,28 @@ uint32_t BrotliBuildSimpleHuffmanTable(HuffmanCode* table,
const uint32_t goal_size = 1U << root_bits;
switch (num_symbols) {
case 0:
table[0].bits = 0;
table[0].value = val[0];
table[0] = ConstructHuffmanCode(0, val[0]);
break;
case 1:
table[0].bits = 1;
table[1].bits = 1;
if (val[1] > val[0]) {
table[0].value = val[0];
table[1].value = val[1];
table[0] = ConstructHuffmanCode(1, val[0]);
table[1] = ConstructHuffmanCode(1, val[1]);
} else {
table[0].value = val[1];
table[1].value = val[0];
table[0] = ConstructHuffmanCode(1, val[1]);
table[1] = ConstructHuffmanCode(1, val[0]);
}
table_size = 2;
break;
case 2:
table[0].bits = 1;
table[0].value = val[0];
table[2].bits = 1;
table[2].value = val[0];
table[0] = ConstructHuffmanCode(1, val[0]);
table[2] = ConstructHuffmanCode(1, val[0]);
if (val[2] > val[1]) {
table[1].value = val[1];
table[3].value = val[2];
table[1] = ConstructHuffmanCode(2, val[1]);
table[3] = ConstructHuffmanCode(2, val[2]);
} else {
table[1].value = val[2];
table[3].value = val[1];
table[1] = ConstructHuffmanCode(2, val[2]);
table[3] = ConstructHuffmanCode(2, val[1]);
}
table[1].bits = 2;
table[3].bits = 2;
table_size = 4;
break;
case 3: {
@ -312,33 +303,27 @@ uint32_t BrotliBuildSimpleHuffmanTable(HuffmanCode* table,
}
}
}
for (i = 0; i < 4; ++i) {
table[i].bits = 2;
}
table[0].value = val[0];
table[2].value = val[1];
table[1].value = val[2];
table[3].value = val[3];
table[0] = ConstructHuffmanCode(2, val[0]);
table[2] = ConstructHuffmanCode(2, val[1]);
table[1] = ConstructHuffmanCode(2, val[2]);
table[3] = ConstructHuffmanCode(2, val[3]);
table_size = 4;
break;
}
case 4: {
int i;
if (val[3] < val[2]) {
uint16_t t = val[3];
val[3] = val[2];
val[2] = t;
}
for (i = 0; i < 7; ++i) {
table[i].value = val[0];
table[i].bits = (uint8_t)(1 + (i & 1));
}
table[1].value = val[1];
table[3].value = val[2];
table[5].value = val[1];
table[7].value = val[3];
table[3].bits = 3;
table[7].bits = 3;
table[0] = ConstructHuffmanCode(1, val[0]);
table[1] = ConstructHuffmanCode(2, val[1]);
table[2] = ConstructHuffmanCode(1, val[0]);
table[3] = ConstructHuffmanCode(3, val[2]);
table[4] = ConstructHuffmanCode(1, val[0]);
table[5] = ConstructHuffmanCode(2, val[1]);
table[6] = ConstructHuffmanCode(1, val[0]);
table[7] = ConstructHuffmanCode(3, val[3]);
table_size = 8;
break;
}

View File

@ -33,11 +33,66 @@ static const uint16_t kMaxHuffmanTableSize[] = {
#define BROTLI_HUFFMAN_MAX_CODE_LENGTH_CODE_LENGTH 5
#if ((defined(BROTLI_TARGET_ARMV7) || defined(BROTLI_TARGET_ARMV8_32)) && \
BROTLI_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0))
#define BROTLI_HUFFMAN_CODE_FAST_LOAD
#endif
#if !defined(BROTLI_HUFFMAN_CODE_FAST_LOAD)
/* Do not create this struct directly - use the ConstructHuffmanCode
* constructor below! */
typedef struct {
uint8_t bits; /* number of bits used for this symbol */
uint16_t value; /* symbol value or table offset */
} HuffmanCode;
static BROTLI_INLINE HuffmanCode ConstructHuffmanCode(const uint8_t bits,
const uint16_t value) {
HuffmanCode h;
h.bits = bits;
h.value = value;
return h;
}
/* Please use the following macros to optimize HuffmanCode accesses in hot
* paths.
*
* For example, assuming |table| contains a HuffmanCode pointer:
*
* BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(table);
* BROTLI_HC_ADJUST_TABLE_INDEX(table, index_into_table);
* *bits = BROTLI_HC_GET_BITS(table);
* *value = BROTLI_HC_GET_VALUE(table);
* BROTLI_HC_ADJUST_TABLE_INDEX(table, offset);
* *bits2 = BROTLI_HC_GET_BITS(table);
* *value2 = BROTLI_HC_GET_VALUE(table);
*
*/
#define BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(H)
#define BROTLI_HC_ADJUST_TABLE_INDEX(H, V) H += (V)
/* These must be given a HuffmanCode pointer! */
#define BROTLI_HC_FAST_LOAD_BITS(H) (H->bits)
#define BROTLI_HC_FAST_LOAD_VALUE(H) (H->value)
#else /* BROTLI_HUFFMAN_CODE_FAST_LOAD */
typedef BROTLI_ALIGNED(4) uint32_t HuffmanCode;
static BROTLI_INLINE HuffmanCode ConstructHuffmanCode(const uint8_t bits,
const uint16_t value) {
return ((value & 0xFFFF) << 16) | (bits & 0xFF);
}
#define BROTLI_HC_MARK_TABLE_FOR_FAST_LOAD(H) uint32_t __fastload_##H = (*H)
#define BROTLI_HC_ADJUST_TABLE_INDEX(H, V) H += (V); __fastload_##H = (*H)
/* These must be given a HuffmanCode pointer! */
#define BROTLI_HC_FAST_LOAD_BITS(H) ((__fastload_##H) & 0xFF)
#define BROTLI_HC_FAST_LOAD_VALUE(H) ((__fastload_##H) >> 16)
#endif /* BROTLI_HUFFMAN_CODE_FAST_LOAD */
/* Builds Huffman lookup table assuming code lengths are in symbol order. */
BROTLI_INTERNAL void BrotliBuildCodeLengthsHuffmanTable(HuffmanCode* root_table,
const uint8_t* const code_lengths, uint16_t* count);