Updates to Brotli compression format, decoder and encoder

This commit contains a batch of changes that were made to the Brotli
compression algorithm in the last month. Most important changes:

   * Format change: don't push distances representing static dictionary words to the distance cache.
   * Fix decoder invalid memory access bug caused by building a non-complete Huffman tree.
   * Add a mode parameter to the encoder interface.
   * Use different hashers for text and font mode.
   * Add a heuristic to the hasher for skipping non-compressible data.
   * Exhaustive search of static dictionary during backward reference search.
This commit is contained in:
Zoltan Szabadka 2014-03-20 14:32:35 +01:00
parent cddab4adef
commit e7650080a8
14 changed files with 546 additions and 280 deletions

View File

@ -589,7 +589,9 @@ Abstract
in the sequence of distances) is not pushed to the ring-buffer of in the sequence of distances) is not pushed to the ring-buffer of
last distances, in other words, the expression "(second, third, last distances, in other words, the expression "(second, third,
fourth) last distance" means the (second, third, fourth) last fourth) last distance" means the (second, third, fourth) last
distance that was not represented by a 0 distance code. distance that was not represented by a 0 distance code. Similarly,
distances that represent static dictionary words (see Section 8) are
not pushed to the ring-buffer of last distances.
The next NDIRECT distance codes, from 16 to 15 + NDIRECT, represent The next NDIRECT distance codes, from 16 to 15 + NDIRECT, represent
distances from 1 to NDIRECT. Neither the distance short codes, nor distances from 1 to NDIRECT. Neither the distance short codes, nor

View File

@ -247,12 +247,23 @@ static int ReadHuffmanCode(int alphabet_size,
code_lengths[symbols[0]] = 1; code_lengths[symbols[0]] = 1;
switch (num_symbols) { switch (num_symbols) {
case 1: case 1:
break;
case 3: case 3:
ok = ((symbols[0] != symbols[1]) &&
(symbols[0] != symbols[2]) &&
(symbols[1] != symbols[2]));
break; break;
case 2: case 2:
ok = (symbols[0] != symbols[1]);
code_lengths[symbols[1]] = 1; code_lengths[symbols[1]] = 1;
break; break;
case 4: case 4:
ok = ((symbols[0] != symbols[1]) &&
(symbols[0] != symbols[2]) &&
(symbols[0] != symbols[3]) &&
(symbols[1] != symbols[2]) &&
(symbols[1] != symbols[3]) &&
(symbols[2] != symbols[3]));
if (BrotliReadBits(br, 1)) { if (BrotliReadBits(br, 1)) {
code_lengths[symbols[2]] = 3; code_lengths[symbols[2]] = 3;
code_lengths[symbols[3]] = 3; code_lengths[symbols[3]] = 3;
@ -266,6 +277,7 @@ static int ReadHuffmanCode(int alphabet_size,
int i; int i;
uint8_t code_length_code_lengths[CODE_LENGTH_CODES] = { 0 }; uint8_t code_length_code_lengths[CODE_LENGTH_CODES] = { 0 };
int space = 32; int space = 32;
int num_codes = 0;
/* Static Huffman code for the code length code lengths */ /* Static Huffman code for the code length code lengths */
static const HuffmanCode huff[16] = { static const HuffmanCode huff[16] = {
{2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1}, {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1},
@ -283,9 +295,11 @@ static int ReadHuffmanCode(int alphabet_size,
BROTLI_LOG_ARRAY_INDEX(code_length_code_lengths, code_len_idx); BROTLI_LOG_ARRAY_INDEX(code_length_code_lengths, code_len_idx);
if (v != 0) { if (v != 0) {
space -= (32 >> v); space -= (32 >> v);
++num_codes;
} }
} }
ok = ReadHuffmanCodeLengths(code_length_code_lengths, ok = (num_codes == 1 || space == 0) &&
ReadHuffmanCodeLengths(code_length_code_lengths,
alphabet_size, code_lengths, br); alphabet_size, code_lengths, br);
} }
if (ok) { if (ok) {
@ -961,10 +975,6 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
ok = 0; ok = 0;
goto End; goto End;
} }
if (distance_code > 0) {
dist_rb[dist_rb_idx & 3] = distance;
++dist_rb_idx;
}
BROTLI_LOG_UINT(distance); BROTLI_LOG_UINT(distance);
if (pos < max_backward_distance && if (pos < max_backward_distance &&
@ -1016,6 +1026,11 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
goto End; goto End;
} }
} else { } else {
if (distance_code > 0) {
dist_rb[dist_rb_idx & 3] = distance;
++dist_rb_idx;
}
if (copy_length > meta_block_remaining_len) { if (copy_length > meta_block_remaining_len) {
printf("Invalid backward reference. pos: %d distance: %d " printf("Invalid backward reference. pos: %d distance: %d "
"len: %d bytes left: %d\n", pos, distance, copy_length, "len: %d bytes left: %d\n", pos, distance, copy_length,

View File

@ -23,6 +23,7 @@
namespace brotli { namespace brotli {
template<typename Hasher>
void CreateBackwardReferences(size_t num_bytes, void CreateBackwardReferences(size_t num_bytes,
size_t position, size_t position,
const uint8_t* ringbuffer, const uint8_t* ringbuffer,
@ -38,6 +39,9 @@ void CreateBackwardReferences(size_t num_bytes,
const int i_diff = position - i; const int i_diff = position - i;
const size_t i_end = i + num_bytes; const size_t i_end = i + num_bytes;
const int random_heuristics_window_size = 512;
int apply_random_heuristics = i + random_heuristics_window_size;
double average_cost = 0.0; double average_cost = 0.0;
for (int k = position; k < position + num_bytes; ++k) { for (int k = position; k < position + num_bytes; ++k) {
average_cost += literal_cost[k & ringbuffer_mask]; average_cost += literal_cost[k & ringbuffer_mask];
@ -65,7 +69,8 @@ void CreateBackwardReferences(size_t num_bytes,
bool match_found = hasher->FindLongestMatch( bool match_found = hasher->FindLongestMatch(
ringbuffer, literal_cost, ringbuffer_mask, ringbuffer, literal_cost, ringbuffer_mask,
i + i_diff, i_end - i, max_distance, i + i_diff, i_end - i, max_distance,
&best_len, &best_len_code, &best_dist, &best_score, &in_dictionary); &best_len, &best_len_code, &best_dist, &best_score,
&in_dictionary);
bool best_in_dictionary = in_dictionary; bool best_in_dictionary = in_dictionary;
if (match_found) { if (match_found) {
if (match_found_M1 && best_score_M1 > best_score) { if (match_found_M1 && best_score_M1 > best_score) {
@ -138,16 +143,20 @@ void CreateBackwardReferences(size_t num_bytes,
} }
} }
} }
apply_random_heuristics =
i + 2 * best_len + random_heuristics_window_size;
Command cmd; Command cmd;
cmd.insert_length_ = insert_length; cmd.insert_length_ = insert_length;
cmd.copy_length_ = best_len; cmd.copy_length_ = best_len;
cmd.copy_length_code_ = best_len_code; cmd.copy_length_code_ = best_len_code;
cmd.copy_distance_ = best_dist; cmd.copy_distance_ = best_dist;
commands->push_back(cmd); commands->push_back(cmd);
hasher->set_last_distance(best_dist);
insert_length = 0; insert_length = 0;
++i; ++i;
if (best_dist <= std::min(i + i_diff, max_backward_limit)) {
hasher->set_last_distance(best_dist);
}
// Copy all copied literals to the hasher, except the last one. // Copy all copied literals to the hasher, except the last one.
// We cannot store the last one yet, otherwise we couldn't find // We cannot store the last one yet, otherwise we couldn't find
// the possible M1 match. // the possible M1 match.
@ -158,7 +167,8 @@ void CreateBackwardReferences(size_t num_bytes,
++i; ++i;
} }
// Prepare M1 match. // Prepare M1 match.
if (best_len >= 4 && i + 20 < i_end && !best_in_dictionary) { if (hasher->HasStaticDictionary() &&
best_len >= 4 && i + 20 < i_end && !best_in_dictionary) {
max_distance = std::min(i + i_diff, max_backward_limit); max_distance = std::min(i + i_diff, max_backward_limit);
match_found_M1 = hasher->FindLongestMatch( match_found_M1 = hasher->FindLongestMatch(
ringbuffer, literal_cost, ringbuffer_mask, ringbuffer, literal_cost, ringbuffer_mask,
@ -185,6 +195,32 @@ void CreateBackwardReferences(size_t num_bytes,
++insert_length; ++insert_length;
hasher->Store(ringbuffer + i, i + i_diff); hasher->Store(ringbuffer + i, i + i_diff);
++i; ++i;
// If we have not seen matches for a long time, we can skip some
// match lookups. Unsuccessful match lookups are very very expensive
// and this kind of a heuristic speeds up compression quite
// a lot.
if (i > apply_random_heuristics) {
// Going through uncompressible data, jump.
if (i > apply_random_heuristics + 4 * random_heuristics_window_size) {
// It has been quite a long time since we saw a copy, so we assume
// that this data is not compressible, and store hashes less
// often. Hashes of non-compressible data are also less likely to
// turn out to be useful in the future, so we store fewer of
// them in order not to flood the hash table and push out entries
// for good compressible data.
int i_jump = std::min(i + 16, i_end - 4);
for (; i < i_jump; i += 4) {
hasher->Store(ringbuffer + i, i + i_diff);
insert_length += 4;
}
} else {
int i_jump = std::min(i + 8, i_end - 2);
for (; i < i_jump; i += 2) {
hasher->Store(ringbuffer + i, i + i_diff);
insert_length += 2;
}
}
}
} }
} }
insert_length += (i_end - i); insert_length += (i_end - i);
@ -198,4 +234,34 @@ void CreateBackwardReferences(size_t num_bytes,
} }
} }
void CreateBackwardReferences(size_t num_bytes,
size_t position,
const uint8_t* ringbuffer,
const float* literal_cost,
size_t ringbuffer_mask,
const size_t max_backward_limit,
Hashers* hashers,
Hashers::Type hash_type,
std::vector<Command>* commands) {
switch (hash_type) {
case Hashers::HASH_15_8_4:
CreateBackwardReferences(
num_bytes, position, ringbuffer, literal_cost,
ringbuffer_mask, max_backward_limit,
hashers->hash_15_8_4.get(),
commands);
break;
case Hashers::HASH_15_8_2:
CreateBackwardReferences(
num_bytes, position, ringbuffer, literal_cost,
ringbuffer_mask, max_backward_limit,
hashers->hash_15_8_2.get(),
commands);
break;
default:
break;
}
}
} // namespace brotli } // namespace brotli

View File

@ -31,7 +31,8 @@ void CreateBackwardReferences(size_t num_bytes,
const float* literal_cost, const float* literal_cost,
size_t ringbuffer_mask, size_t ringbuffer_mask,
const size_t max_backward_limit, const size_t max_backward_limit,
Hasher* hasher, Hashers* hashers,
Hashers::Type hash_type,
std::vector<Command>* commands); std::vector<Command>* commands);
} // namespace brotli } // namespace brotli

View File

@ -46,6 +46,7 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {
int max_depth = 1; int max_depth = 1;
int histogram[kCodeLengthCodes] = { 0 }; int histogram[kCodeLengthCodes] = { 0 };
int tail_start = 0; int tail_start = 0;
int prev_value = 8;
// compute histogram of compacted huffman tree // compute histogram of compacted huffman tree
for (int i = 0; i < length;) { for (int i = 0; i < length;) {
const int value = depth[i]; const int value = depth[i];
@ -57,29 +58,32 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {
++reps; ++reps;
} }
i += reps; i += reps;
if (i == length && value == 0)
break;
if (value == 0) { if (value == 0) {
if (reps < 3) { if (reps < 3) {
histogram[0] += reps; histogram[0] += reps;
} else { } else {
reps -= 3; reps -= 2;
while (reps >= 0) { while (reps > 0) {
++histogram[17]; ++histogram[17];
reps >>= 3; reps >>= 3;
--reps;
} }
} }
} else { } else {
tail_start = i; tail_start = i;
if (value != prev_value) {
++histogram[value]; ++histogram[value];
--reps; --reps;
}
prev_value = value;
if (reps < 3) { if (reps < 3) {
histogram[value] += reps; histogram[value] += reps;
} else { } else {
reps -= 3; reps -= 2;
while (reps >= 0) { while (reps > 0) {
++histogram[16]; ++histogram[16];
reps >>= 2; reps >>= 2;
--reps;
} }
} }
} }
@ -93,7 +97,7 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {
cost[17] += 3; cost[17] += 3;
int tree_size = 0; int tree_size = 0;
int bits = 6 + 2 * max_depth; // huffman tree of huffman tree cost int bits = 18 + 2 * max_depth; // huffman tree of huffman tree cost
for (int i = 0; i < kCodeLengthCodes; ++i) { for (int i = 0; i < kCodeLengthCodes; ++i) {
bits += histogram[i] * cost[i]; // huffman tree bit cost bits += histogram[i] * cost[i]; // huffman tree bit cost
tree_size += histogram[i]; tree_size += histogram[i];

View File

@ -348,6 +348,7 @@ void SplitBlock(const std::vector<Command>& cmds,
&insert_and_copy_codes, &insert_and_copy_codes,
&distance_prefixes); &distance_prefixes);
SplitByteVector<HistogramLiteral>( SplitByteVector<HistogramLiteral>(
literals, literals,
kSymbolsPerLiteralHistogram, kMaxLiteralHistograms, kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,

View File

@ -24,9 +24,13 @@ namespace brotli {
// Command holds a sequence of literals and a backward reference copy. // Command holds a sequence of literals and a backward reference copy.
class Command { class Command {
public: public:
// distance_code_ is initialized to 17 because that is the distance code
// of a backward distance of 1; this way the last insert-only command
// won't use the last-distance short code. Accordingly, distance_prefix_
// is set to 16.
Command() : insert_length_(0), copy_length_(0), copy_length_code_(0), Command() : insert_length_(0), copy_length_(0), copy_length_code_(0),
copy_distance_(0), distance_code_(0), copy_distance_(0), distance_code_(17),
distance_prefix_(0), command_prefix_(0), distance_prefix_(16), command_prefix_(0),
distance_extra_bits_(0), distance_extra_bits_value_(0) {} distance_extra_bits_(0), distance_extra_bits_value_(0) {}
uint32_t insert_length_; uint32_t insert_length_;

View File

@ -168,15 +168,6 @@ void EncodeMetaBlockLength(size_t meta_block_size,
} }
} }
template<int kSize>
void EntropyEncode(int val, const EntropyCode<kSize>& code,
int* storage_ix, uint8_t* storage) {
if (code.count_ <= 1) {
return;
};
WriteBits(code.depth_[val], code.bits_[val], storage_ix, storage);
}
void StoreHuffmanTreeOfHuffmanTreeToBitMask( void StoreHuffmanTreeOfHuffmanTreeToBitMask(
const uint8_t* code_length_bitdepth, const uint8_t* code_length_bitdepth,
int* storage_ix, uint8_t* storage) { int* storage_ix, uint8_t* storage) {
@ -225,7 +216,9 @@ void StoreHuffmanTreeToBitMask(
for (int i = 0; i < huffman_tree_size; ++i) { for (int i = 0; i < huffman_tree_size; ++i) {
const int ix = huffman_tree[i]; const int ix = huffman_tree[i];
const int extra_bits = huffman_tree_extra_bits[i]; const int extra_bits = huffman_tree_extra_bits[i];
EntropyEncode(ix, entropy, storage_ix, storage); if (entropy.count_ > 1) {
WriteBits(entropy.depth_[ix], entropy.bits_[ix], storage_ix, storage);
}
switch (ix) { switch (ix) {
case 16: case 16:
WriteBits(2, extra_bits, storage_ix, storage); WriteBits(2, extra_bits, storage_ix, storage);
@ -240,8 +233,7 @@ void StoreHuffmanTreeToBitMask(
template<int kSize> template<int kSize>
void StoreHuffmanCodeSimple( void StoreHuffmanCodeSimple(
const EntropyCode<kSize>& code, int alphabet_size, const EntropyCode<kSize>& code, int alphabet_size,
int max_bits, int max_bits, int* storage_ix, uint8_t* storage) {
int* storage_ix, uint8_t* storage) {
const uint8_t *depth = &code.depth_[0]; const uint8_t *depth = &code.depth_[0];
int symbols[4]; int symbols[4];
// Quadratic sort. // Quadratic sort.
@ -304,37 +296,66 @@ void StoreHuffmanCodeComplex(
storage_ix, storage); storage_ix, storage);
} }
template<int kSize> template<int kSize>
void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size, void BuildAndStoreEntropyCode(const Histogram<kSize>& histogram,
const int tree_limit,
const int alphabet_size,
EntropyCode<kSize>* code,
int* storage_ix, uint8_t* storage) { int* storage_ix, uint8_t* storage) {
memset(code->depth_, 0, sizeof(code->depth_));
memset(code->bits_, 0, sizeof(code->bits_));
memset(code->symbols_, 0, sizeof(code->symbols_));
code->count_ = 0;
int max_bits_counter = alphabet_size - 1; int max_bits_counter = alphabet_size - 1;
int max_bits = 0; int max_bits = 0;
while (max_bits_counter) { while (max_bits_counter) {
max_bits_counter >>= 1; max_bits_counter >>= 1;
++max_bits; ++max_bits;
} }
if (code.count_ == 0) {
// Emit a minimal tree for empty cases. for (size_t i = 0; i < alphabet_size; i++) {
// bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0 if (histogram.data_[i] > 0) {
WriteBits(4 + max_bits, 0x1, storage_ix, storage); if (code->count_ < 4) code->symbols_[code->count_] = i;
} else if (code.count_ <= 4) { ++code->count_;
StoreHuffmanCodeSimple( }
code, alphabet_size, max_bits, }
storage_ix, storage);
if (code->count_ <= 1) {
WriteBits(2, 1, storage_ix, storage);
WriteBits(2, 0, storage_ix, storage);
WriteBits(max_bits, code->symbols_[0], storage_ix, storage);
return;
}
if (alphabet_size >= 50 && code->count_ >= 16) {
std::vector<int> counts(alphabet_size);
memcpy(&counts[0], histogram.data_, sizeof(counts[0]) * alphabet_size);
OptimizeHuffmanCountsForRle(alphabet_size, &counts[0]);
CreateHuffmanTree(&counts[0], alphabet_size, tree_limit, code->depth_);
} else { } else {
StoreHuffmanCodeComplex( CreateHuffmanTree(histogram.data_, alphabet_size, tree_limit, code->depth_);
code, alphabet_size, }
storage_ix, storage); ConvertBitDepthsToSymbols(code->depth_, alphabet_size, code->bits_);
if (code->count_ <= 4) {
StoreHuffmanCodeSimple(*code, alphabet_size, max_bits, storage_ix, storage);
} else {
StoreHuffmanCodeComplex(*code, alphabet_size, storage_ix, storage);
} }
} }
template<int kSize> template<int kSize>
void StoreHuffmanCodes(const std::vector<EntropyCode<kSize> >& codes, void BuildAndStoreEntropyCodes(
const std::vector<Histogram<kSize> >& histograms,
int alphabet_size, int alphabet_size,
std::vector<EntropyCode<kSize> >* entropy_codes,
int* storage_ix, uint8_t* storage) { int* storage_ix, uint8_t* storage) {
for (int i = 0; i < codes.size(); ++i) { entropy_codes->resize(histograms.size());
StoreHuffmanCode(codes[i], alphabet_size, storage_ix, storage); for (int i = 0; i < histograms.size(); ++i) {
BuildAndStoreEntropyCode(histograms[i], 15, alphabet_size,
&(*entropy_codes)[i],
storage_ix, storage);
} }
} }
@ -342,7 +363,7 @@ void EncodeCommand(const Command& cmd,
const EntropyCodeCommand& entropy, const EntropyCodeCommand& entropy,
int* storage_ix, uint8_t* storage) { int* storage_ix, uint8_t* storage) {
int code = cmd.command_prefix_; int code = cmd.command_prefix_;
EntropyEncode(code, entropy, storage_ix, storage); WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
if (code >= 128) { if (code >= 128) {
code -= 128; code -= 128;
} }
@ -364,13 +385,15 @@ void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy,
int code = cmd.distance_prefix_; int code = cmd.distance_prefix_;
int extra_bits = cmd.distance_extra_bits_; int extra_bits = cmd.distance_extra_bits_;
uint64_t extra_bits_val = cmd.distance_extra_bits_value_; uint64_t extra_bits_val = cmd.distance_extra_bits_value_;
EntropyEncode(code, entropy, storage_ix, storage); WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
if (extra_bits > 0) { if (extra_bits > 0) {
WriteBits(extra_bits, extra_bits_val, storage_ix, storage); WriteBits(extra_bits, extra_bits_val, storage_ix, storage);
} }
} }
void ComputeDistanceShortCodes(std::vector<Command>* cmds, void ComputeDistanceShortCodes(std::vector<Command>* cmds,
size_t pos,
const size_t max_backward,
int* dist_ringbuffer, int* dist_ringbuffer,
size_t* ringbuffer_idx) { size_t* ringbuffer_idx) {
static const int kIndexOffset[16] = { static const int kIndexOffset[16] = {
@ -380,10 +403,13 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds,
0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
}; };
for (int i = 0; i < cmds->size(); ++i) { for (int i = 0; i < cmds->size(); ++i) {
pos += (*cmds)[i].insert_length_;
size_t max_distance = std::min(pos, max_backward);
int cur_dist = (*cmds)[i].copy_distance_; int cur_dist = (*cmds)[i].copy_distance_;
if (cur_dist == 0) break;
int dist_code = cur_dist + 16; int dist_code = cur_dist + 16;
int limits[16] = { 0, 4, 10, 11, if (cur_dist <= max_distance) {
if (cur_dist == 0) break;
int limits[16] = { 0, 0, 0, 0,
6, 6, 11, 11, 6, 6, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11,
12, 12, 12, 12 }; 12, 12, 12, 12 };
@ -405,6 +431,13 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds,
dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist; dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist;
++(*ringbuffer_idx); ++(*ringbuffer_idx);
} }
pos += (*cmds)[i].copy_length_;
} else {
int word_idx = cur_dist - max_distance - 1;
const std::string word =
GetTransformedDictionaryWord((*cmds)[i].copy_length_code_, word_idx);
pos += word.size();
}
(*cmds)[i].distance_code_ = dist_code; (*cmds)[i].distance_code_ = dist_code;
} }
} }
@ -558,18 +591,20 @@ void EncodeContextMap(const std::vector<int>& context_map,
for (int i = 0; i < rle_symbols.size(); ++i) { for (int i = 0; i < rle_symbols.size(); ++i) {
symbol_histogram.Add(rle_symbols[i]); symbol_histogram.Add(rle_symbols[i]);
} }
EntropyCodeContextMap symbol_code;
BuildEntropyCode(symbol_histogram, 15, num_clusters + max_run_length_prefix,
&symbol_code);
bool use_rle = max_run_length_prefix > 0; bool use_rle = max_run_length_prefix > 0;
WriteBits(1, use_rle, storage_ix, storage); WriteBits(1, use_rle, storage_ix, storage);
if (use_rle) { if (use_rle) {
WriteBits(4, max_run_length_prefix - 1, storage_ix, storage); WriteBits(4, max_run_length_prefix - 1, storage_ix, storage);
} }
StoreHuffmanCode(symbol_code, num_clusters + max_run_length_prefix, EntropyCodeContextMap symbol_code;
BuildAndStoreEntropyCode(symbol_histogram, 15,
num_clusters + max_run_length_prefix,
&symbol_code,
storage_ix, storage); storage_ix, storage);
for (int i = 0; i < rle_symbols.size(); ++i) { for (int i = 0; i < rle_symbols.size(); ++i) {
EntropyEncode(rle_symbols[i], symbol_code, storage_ix, storage); WriteBits(symbol_code.depth_[rle_symbols[i]],
symbol_code.bits_[rle_symbols[i]],
storage_ix, storage);
if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) { if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) {
WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage); WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage);
} }
@ -577,16 +612,6 @@ void EncodeContextMap(const std::vector<int>& context_map,
WriteBits(1, 1, storage_ix, storage); // use move-to-front WriteBits(1, 1, storage_ix, storage); // use move-to-front
} }
template<int kSize>
void BuildEntropyCodes(const std::vector<Histogram<kSize> >& histograms,
int alphabet_size,
std::vector<EntropyCode<kSize> >* entropy_codes) {
entropy_codes->resize(histograms.size());
for (int i = 0; i < histograms.size(); ++i) {
BuildEntropyCode(histograms[i], 15, alphabet_size, &(*entropy_codes)[i]);
}
}
struct BlockSplitCode { struct BlockSplitCode {
EntropyCodeBlockType block_type_code; EntropyCodeBlockType block_type_code;
EntropyCodeBlockLength block_len_code; EntropyCodeBlockLength block_len_code;
@ -598,8 +623,8 @@ void EncodeBlockLength(const EntropyCodeBlockLength& entropy,
int len_code = BlockLengthPrefix(length); int len_code = BlockLengthPrefix(length);
int extra_bits = BlockLengthExtraBits(len_code); int extra_bits = BlockLengthExtraBits(len_code);
int extra_bits_value = length - BlockLengthOffset(len_code); int extra_bits_value = length - BlockLengthOffset(len_code);
EntropyEncode(len_code, entropy, storage_ix, storage); WriteBits(entropy.depth_[len_code], entropy.bits_[len_code],
storage_ix, storage);
if (extra_bits > 0) { if (extra_bits > 0) {
WriteBits(extra_bits, extra_bits_value, storage_ix, storage); WriteBits(extra_bits, extra_bits_value, storage_ix, storage);
} }
@ -632,25 +657,24 @@ void BuildAndEncodeBlockSplitCode(const BlockSplit& split,
BlockSplitCode* code, BlockSplitCode* code,
int* storage_ix, uint8_t* storage) { int* storage_ix, uint8_t* storage) {
EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage); EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage);
if (split.num_types_ == 1) { if (split.num_types_ == 1) {
return; return;
} }
HistogramBlockType type_histo; HistogramBlockType type_histo;
for (int i = 0; i < split.type_codes_.size(); ++i) { for (int i = 1; i < split.type_codes_.size(); ++i) {
type_histo.Add(split.type_codes_[i]); type_histo.Add(split.type_codes_[i]);
} }
BuildEntropyCode(type_histo, 15, split.num_types_ + 2,
&code->block_type_code);
HistogramBlockLength length_histo; HistogramBlockLength length_histo;
for (int i = 0; i < split.lengths_.size(); ++i) { for (int i = 0; i < split.lengths_.size(); ++i) {
length_histo.Add(BlockLengthPrefix(split.lengths_[i])); length_histo.Add(BlockLengthPrefix(split.lengths_[i]));
} }
BuildEntropyCode(length_histo, 15, kNumBlockLenPrefixes, BuildAndStoreEntropyCode(type_histo, 15, split.num_types_ + 2,
&code->block_len_code); &code->block_type_code,
StoreHuffmanCode(code->block_type_code, split.num_types_ + 2,
storage_ix, storage); storage_ix, storage);
StoreHuffmanCode(code->block_len_code, kNumBlockLenPrefixes, BuildAndStoreEntropyCode(length_histo, 15, kNumBlockLenPrefixes,
&code->block_len_code,
storage_ix, storage); storage_ix, storage);
EncodeBlockLength(code->block_len_code, split.lengths_[0], EncodeBlockLength(code->block_len_code, split.lengths_[0],
storage_ix, storage); storage_ix, storage);
@ -664,7 +688,9 @@ void MoveAndEncode(const BlockSplitCode& code,
it->type_ = it->split_.types_[it->idx_]; it->type_ = it->split_.types_[it->idx_];
it->length_ = it->split_.lengths_[it->idx_]; it->length_ = it->split_.lengths_[it->idx_];
int type_code = it->split_.type_codes_[it->idx_]; int type_code = it->split_.type_codes_[it->idx_];
EntropyEncode(type_code, code.block_type_code, storage_ix, storage); WriteBits(code.block_type_code.depth_[type_code],
code.block_type_code.bits_[type_code],
storage_ix, storage);
EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage); EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage);
} }
--it->length_; --it->length_;
@ -773,10 +799,8 @@ void StoreMetaBlock(const MetaBlock& mb,
int* storage_ix, uint8_t* storage) { int* storage_ix, uint8_t* storage) {
size_t length = MetaBlockLength(mb.cmds); size_t length = MetaBlockLength(mb.cmds);
const size_t end_pos = *pos + length; const size_t end_pos = *pos + length;
EncodeMetaBlockLength(length, EncodeMetaBlockLength(length, is_last, false, storage_ix, storage);
is_last,
false,
storage_ix, storage);
if (length == 0) { if (length == 0) {
return; return;
} }
@ -792,26 +816,27 @@ void StoreMetaBlock(const MetaBlock& mb,
WriteBits(2, mb.params.distance_postfix_bits, storage_ix, storage); WriteBits(2, mb.params.distance_postfix_bits, storage_ix, storage);
WriteBits(4, WriteBits(4,
mb.params.num_direct_distance_codes >> mb.params.num_direct_distance_codes >>
mb.params.distance_postfix_bits, storage_ix, storage); mb.params.distance_postfix_bits,
storage_ix, storage);
int num_distance_codes = int num_distance_codes =
kNumDistanceShortCodes + mb.params.num_direct_distance_codes + kNumDistanceShortCodes + mb.params.num_direct_distance_codes +
(48 << mb.params.distance_postfix_bits); (48 << mb.params.distance_postfix_bits);
for (int i = 0; i < mb.literal_split.num_types_; ++i) { for (int i = 0; i < mb.literal_split.num_types_; ++i) {
WriteBits(2, mb.literal_context_modes[i], storage_ix, storage); WriteBits(2, mb.literal_context_modes[i], storage_ix, storage);
} }
EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(), storage_ix, storage); EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(),
EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(), storage_ix, storage); storage_ix, storage);
EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(),
storage_ix, storage);
std::vector<EntropyCodeLiteral> literal_codes; std::vector<EntropyCodeLiteral> literal_codes;
std::vector<EntropyCodeCommand> command_codes; std::vector<EntropyCodeCommand> command_codes;
std::vector<EntropyCodeDistance> distance_codes; std::vector<EntropyCodeDistance> distance_codes;
BuildEntropyCodes(mb.literal_histograms, 256, &literal_codes); BuildAndStoreEntropyCodes(mb.literal_histograms, 256, &literal_codes,
BuildEntropyCodes(mb.command_histograms, kNumCommandPrefixes, storage_ix, storage);
&command_codes); BuildAndStoreEntropyCodes(mb.command_histograms, kNumCommandPrefixes,
BuildEntropyCodes(mb.distance_histograms, num_distance_codes, &command_codes, storage_ix, storage);
&distance_codes); BuildAndStoreEntropyCodes(mb.distance_histograms, num_distance_codes,
StoreHuffmanCodes(literal_codes, 256, storage_ix, storage); &distance_codes, storage_ix, storage);
StoreHuffmanCodes(command_codes, kNumCommandPrefixes, storage_ix, storage);
StoreHuffmanCodes(distance_codes, num_distance_codes, storage_ix, storage);
BlockSplitIterator literal_it(mb.literal_split); BlockSplitIterator literal_it(mb.literal_split);
BlockSplitIterator command_it(mb.command_split); BlockSplitIterator command_it(mb.command_split);
BlockSplitIterator distance_it(mb.distance_split); BlockSplitIterator distance_it(mb.distance_split);
@ -828,8 +853,10 @@ void StoreMetaBlock(const MetaBlock& mb,
Context(prev_byte, prev_byte2, Context(prev_byte, prev_byte2,
mb.literal_context_modes[literal_it.type_])); mb.literal_context_modes[literal_it.type_]));
histogram_idx = mb.literal_context_map[context]; histogram_idx = mb.literal_context_map[context];
EntropyEncode(ringbuffer[*pos & mask], int literal = ringbuffer[*pos & mask];
literal_codes[histogram_idx], storage_ix, storage); WriteBits(literal_codes[histogram_idx].depth_[literal],
literal_codes[histogram_idx].bits_[literal],
storage_ix, storage);
++(*pos); ++(*pos);
} }
if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) { if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) {
@ -845,9 +872,10 @@ void StoreMetaBlock(const MetaBlock& mb,
} }
} }
BrotliCompressor::BrotliCompressor() BrotliCompressor::BrotliCompressor(BrotliParams params)
: window_bits_(kWindowBits), : params_(params),
hasher_(new Hasher), window_bits_(kWindowBits),
hashers_(new Hashers()),
dist_ringbuffer_idx_(0), dist_ringbuffer_idx_(0),
input_pos_(0), input_pos_(0),
ringbuffer_(kRingBufferBits, kMetaBlockSizeBits), ringbuffer_(kRingBufferBits, kMetaBlockSizeBits),
@ -859,28 +887,41 @@ BrotliCompressor::BrotliCompressor()
dist_ringbuffer_[2] = 11; dist_ringbuffer_[2] = 11;
dist_ringbuffer_[3] = 4; dist_ringbuffer_[3] = 4;
storage_[0] = 0; storage_[0] = 0;
switch (params.mode) {
case BrotliParams::MODE_TEXT: hash_type_ = Hashers::HASH_15_8_4; break;
case BrotliParams::MODE_FONT: hash_type_ = Hashers::HASH_15_8_2; break;
default: break;
}
hashers_->Init(hash_type_);
if (params.mode == BrotliParams::MODE_TEXT) {
StoreDictionaryWordHashes(); StoreDictionaryWordHashes();
}
} }
BrotliCompressor::~BrotliCompressor() { BrotliCompressor::~BrotliCompressor() {
delete hasher_;
delete[] storage_; delete[] storage_;
} }
StaticDictionary *BrotliCompressor::static_dictionary_ = NULL;
void BrotliCompressor::StoreDictionaryWordHashes() { void BrotliCompressor::StoreDictionaryWordHashes() {
for (int t = kNumTransforms - 1; t >= 0; --t) { const int num_transforms = kNumTransforms;
if (static_dictionary_ == NULL) {
static_dictionary_ = new StaticDictionary;
for (int t = num_transforms - 1; t >= 0; --t) {
for (int i = kMaxDictionaryWordLength; i >= 3; --i) { for (int i = kMaxDictionaryWordLength; i >= 3; --i) {
const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i]; const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i];
for (int j = num_words - 1; j >= 0; --j) { for (int j = num_words - 1; j >= 0; --j) {
int word_id = t * num_words + j; int word_id = t * num_words + j;
std::string word = GetTransformedDictionaryWord(i, word_id); std::string word = GetTransformedDictionaryWord(i, word_id);
if (word.size() >= 3) { if (word.size() >= 3) {
hasher_->Store(reinterpret_cast<const uint8_t*>(&word[0]), static_dictionary_->Insert(word, i, word_id);
(-1) * ((i << 20) + word_id + 1));
} }
} }
} }
} }
}
hashers_->SetStaticDictionary(static_dictionary_);
} }
void BrotliCompressor::WriteStreamHeader() { void BrotliCompressor::WriteStreamHeader() {
@ -908,25 +949,30 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size,
input_size, kMinUTF8Ratio); input_size, kMinUTF8Ratio);
if (utf8_mode) { if (utf8_mode) {
EstimateBitCostsForLiteralsUTF8(input_pos_, input_size, EstimateBitCostsForLiteralsUTF8(input_pos_, input_size,
kRingBufferMask, ringbuffer_.start(), kRingBufferMask, kRingBufferMask,
&literal_cost_[0]); ringbuffer_.start(), &literal_cost_[0]);
} else { } else {
EstimateBitCostsForLiterals(input_pos_, input_size, EstimateBitCostsForLiterals(input_pos_, input_size,
kRingBufferMask, ringbuffer_.start(), kRingBufferMask, kRingBufferMask,
&literal_cost_[0]); ringbuffer_.start(), &literal_cost_[0]);
} }
CreateBackwardReferences(input_size, input_pos_, CreateBackwardReferences(
input_size, input_pos_,
ringbuffer_.start(), ringbuffer_.start(),
&literal_cost_[0], &literal_cost_[0],
kRingBufferMask, kMaxBackwardDistance, kRingBufferMask, kMaxBackwardDistance,
hasher_, hashers_.get(),
hash_type_,
&commands); &commands);
ComputeDistanceShortCodes(&commands, dist_ringbuffer_, ComputeDistanceShortCodes(&commands, input_pos_, kMaxBackwardDistance,
dist_ringbuffer_,
&dist_ringbuffer_idx_); &dist_ringbuffer_idx_);
} }
EncodingParams params; EncodingParams params;
params.num_direct_distance_codes = 12; params.num_direct_distance_codes =
params.distance_postfix_bits = 1; params_.mode == BrotliParams::MODE_FONT ? 12 : 0;
params.distance_postfix_bits =
params_.mode == BrotliParams::MODE_FONT ? 1 : 0;
params.literal_context_mode = CONTEXT_SIGNED; params.literal_context_mode = CONTEXT_SIGNED;
const int storage_ix0 = storage_ix_; const int storage_ix0 = storage_ix_;
MetaBlock mb; MetaBlock mb;
@ -935,6 +981,7 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size,
StoreMetaBlock(mb, is_last, ringbuffer_.start(), kRingBufferMask, StoreMetaBlock(mb, is_last, ringbuffer_.start(), kRingBufferMask,
&input_pos_, &storage_ix_, storage_); &input_pos_, &storage_ix_, storage_);
size_t output_size = is_last ? ((storage_ix_ + 7) >> 3) : (storage_ix_ >> 3); size_t output_size = is_last ? ((storage_ix_ + 7) >> 3) : (storage_ix_ >> 3);
output_size -= (storage_ix0 >> 3);
if (input_size + 4 < output_size) { if (input_size + 4 < output_size) {
storage_ix_ = storage_ix0; storage_ix_ = storage_ix0;
storage_[storage_ix_ >> 3] &= (1 << (storage_ix_ & 7)) - 1; storage_[storage_ix_ >> 3] &= (1 << (storage_ix_ & 7)) - 1;
@ -968,7 +1015,8 @@ void BrotliCompressor::FinishStream(
} }
int BrotliCompressBuffer(size_t input_size, int BrotliCompressBuffer(BrotliParams params,
size_t input_size,
const uint8_t* input_buffer, const uint8_t* input_buffer,
size_t* encoded_size, size_t* encoded_size,
uint8_t* encoded_buffer) { uint8_t* encoded_buffer) {
@ -978,7 +1026,7 @@ int BrotliCompressBuffer(size_t input_size,
return 1; return 1;
} }
BrotliCompressor compressor; BrotliCompressor compressor(params);
compressor.WriteStreamHeader(); compressor.WriteStreamHeader();
const int max_block_size = 1 << kMetaBlockSizeBits; const int max_block_size = 1 << kMetaBlockSizeBits;

View File

@ -23,12 +23,23 @@
#include <vector> #include <vector>
#include "./hash.h" #include "./hash.h"
#include "./ringbuffer.h" #include "./ringbuffer.h"
#include "./static_dict.h"
namespace brotli { namespace brotli {
// Encoder configuration that is handed to BrotliCompressor and
// BrotliCompressBuffer.
struct BrotliParams {
  // Kind of input the encoder should tune itself for.
  enum Mode {
    MODE_TEXT = 0,
    MODE_FONT = 1,
  };

  // A default-constructed parameter set selects text mode.
  BrotliParams() : mode(MODE_TEXT) {}

  // Selected compression mode.
  Mode mode;
};
class BrotliCompressor { class BrotliCompressor {
public: public:
BrotliCompressor(); explicit BrotliCompressor(BrotliParams params);
~BrotliCompressor(); ~BrotliCompressor();
// Writes the stream header into the internal output buffer. // Writes the stream header into the internal output buffer.
@ -53,8 +64,10 @@ class BrotliCompressor {
// Initializes the hasher with the hashes of dictionary words. // Initializes the hasher with the hashes of dictionary words.
void StoreDictionaryWordHashes(); void StoreDictionaryWordHashes();
BrotliParams params_;
int window_bits_; int window_bits_;
Hasher* hasher_; std::unique_ptr<Hashers> hashers_;
Hashers::Type hash_type_;
int dist_ringbuffer_[4]; int dist_ringbuffer_[4];
size_t dist_ringbuffer_idx_; size_t dist_ringbuffer_idx_;
size_t input_pos_; size_t input_pos_;
@ -62,12 +75,14 @@ class BrotliCompressor {
std::vector<float> literal_cost_; std::vector<float> literal_cost_;
int storage_ix_; int storage_ix_;
uint8_t* storage_; uint8_t* storage_;
static StaticDictionary *static_dictionary_;
}; };
// Compresses the data in input_buffer into encoded_buffer, and sets // Compresses the data in input_buffer into encoded_buffer, and sets
// *encoded_size to the compressed length. // *encoded_size to the compressed length.
// Returns 0 if there was an error and 1 otherwise. // Returns 0 if there was an error and 1 otherwise.
int BrotliCompressBuffer(size_t input_size, int BrotliCompressBuffer(BrotliParams params,
size_t input_size,
const uint8_t* input_buffer, const uint8_t* input_buffer,
size_t* encoded_size, size_t* encoded_size,
uint8_t* encoded_buffer); uint8_t* encoded_buffer);

View File

@ -24,12 +24,14 @@
#include <sys/types.h> #include <sys/types.h>
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <memory>
#include <string> #include <string>
#include "./transform.h" #include "./transform.h"
#include "./fast_log.h" #include "./fast_log.h"
#include "./find_match_length.h" #include "./find_match_length.h"
#include "./port.h" #include "./port.h"
#include "./static_dict.h"
namespace brotli { namespace brotli {
@ -41,11 +43,21 @@ namespace brotli {
// * The number has been tuned heuristically against compression benchmarks. // * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd; static const uint32_t kHashMul32 = 0x1e35a7bd;
inline uint32_t Hash3Bytes(const uint8_t *data, const int bits) { template<int kShiftBits, int kMinLength>
inline uint32_t Hash(const uint8_t *data) {
if (kMinLength <= 3) {
// If kMinLength is 2 or 3, we hash the first 3 bytes of data.
uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32; uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
// The higher bits contain more mixture from the multiplication, // The higher bits contain more mixture from the multiplication,
// so we take our results from there. // so we take our results from there.
return h >> (32 - bits); return h >> (32 - kShiftBits);
} else {
// If kMinLength is at least 4, we hash the first 4 bytes of data.
uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
// The higher bits contain more mixture from the multiplication,
// so we take our results from there.
return h >> (32 - kShiftBits);
}
} }
// Usually, we always choose the longest backward reference. This function // Usually, we always choose the longest backward reference. This function
@ -67,32 +79,35 @@ inline double BackwardReferenceScore(double average_cost,
double start_cost3, double start_cost3,
double start_cost2, double start_cost2,
int copy_length, int copy_length,
int backward_reference_offset, int backward_reference_offset) {
int last_distance1,
int last_distance2,
int last_distance3,
int last_distance4) {
double retval = 0; double retval = 0;
switch (copy_length) { switch (copy_length) {
case 2: retval = start_cost2; break; case 2: retval = start_cost2; break;
case 3: retval = start_cost3; break; case 3: retval = start_cost3; break;
default: retval = start_cost4 + (copy_length - 4) * average_cost; break; default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
} }
int diff_last1 = abs(backward_reference_offset - last_distance1);
int diff_last2 = abs(backward_reference_offset - last_distance2);
if (diff_last1 == 0) {
retval += 0.6;
} else if (diff_last1 < 4) {
retval -= 0.9 + 0.03 * diff_last1;
} else if (diff_last2 < 4) {
retval -= 0.95 + 0.1 * diff_last2;
} else if (backward_reference_offset == last_distance3) {
retval -= 1.17;
} else if (backward_reference_offset == last_distance4) {
retval -= 1.27;
} else {
retval -= 1.20 * Log2Floor(backward_reference_offset); retval -= 1.20 * Log2Floor(backward_reference_offset);
return retval;
}
inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
double start_cost4,
double start_cost3,
double start_cost2,
int copy_length,
int distance_short_code) {
double retval = 0;
switch (copy_length) {
case 2: retval = start_cost2; break;
case 3: retval = start_cost3; break;
default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
} }
static const double kDistanceShortCodeBitCost[16] = {
-0.6, 0.95, 1.17, 1.27,
0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
1.05, 1.05, 1.15, 1.15, 1.25, 1.25
};
retval -= kDistanceShortCodeBitCost[distance_short_code];
return retval; return retval;
} }
@ -102,7 +117,7 @@ inline double BackwardReferenceScore(double average_cost,
// This is a hash map of fixed size (kBucketSize) to a ring buffer of // This is a hash map of fixed size (kBucketSize) to a ring buffer of
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize // fixed size (kBlockSize). The ring buffer contains the last kBlockSize
// index positions of the given hash key in the compressed data. // index positions of the given hash key in the compressed data.
template <int kBucketBits, int kBlockBits> template <int kBucketBits, int kBlockBits, int kMinLength>
class HashLongestMatch { class HashLongestMatch {
public: public:
HashLongestMatch() HashLongestMatch()
@ -111,17 +126,24 @@ class HashLongestMatch {
last_distance3_(15), last_distance3_(15),
last_distance4_(16), last_distance4_(16),
insert_length_(0), insert_length_(0),
average_cost_(5.4) { average_cost_(5.4),
static_dict_(NULL) {
Reset(); Reset();
} }
void Reset() { void Reset() {
std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0); std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
} }
void SetStaticDictionary(const StaticDictionary *dict) {
static_dict_ = dict;
}
bool HasStaticDictionary() const {
return static_dict_ != NULL;
}
// Look at 3 bytes at data. // Look at 3 bytes at data.
// Compute a hash from these, and store the value of ix at that position. // Compute a hash from these, and store the value of ix at that position.
inline void Store(const uint8_t *data, const int ix) { inline void Store(const uint8_t *data, const int ix) {
const uint32_t key = Hash3Bytes(data, kBucketBits); const uint32_t key = Hash<kBucketBits, kMinLength>(data);
const int minor_ix = num_[key] & kBlockMask; const int minor_ix = num_[key] & kBlockMask;
buckets_[key][minor_ix] = ix; buckets_[key][minor_ix] = ix;
++num_[key]; ++num_[key];
@ -218,19 +240,17 @@ class HashLongestMatch {
const size_t len = const size_t len =
FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
max_length); max_length);
if (len >= 3 || (len == 2 && i < 2)) { if (len >= std::max(kMinLength, 3) ||
(kMinLength == 2 && len == 2 && i < 2)) {
// Comparing for >= 2 does not change the semantics, but just saves for // Comparing for >= 2 does not change the semantics, but just saves for
// a few unnecessary binary logarithms in backward reference score, // a few unnecessary binary logarithms in backward reference score,
// since we are not interested in such short matches. // since we are not interested in such short matches.
const double score = BackwardReferenceScore(average_cost_, const double score = BackwardReferenceScoreUsingLastDistance(
average_cost_,
start_cost4, start_cost4,
start_cost3, start_cost3,
start_cost2, start_cost2,
len, backward, len, i);
last_distance1_,
last_distance2_,
last_distance3_,
last_distance4_);
if (best_score < score) { if (best_score < score) {
best_score = score; best_score = score;
best_len = len; best_len = len;
@ -244,12 +264,9 @@ class HashLongestMatch {
} }
} }
} }
const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits); if (kMinLength == 2) {
const int * __restrict const bucket = &buckets_[key][0];
const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
int stop = int(cur_ix) - 64; int stop = int(cur_ix) - 64;
if (stop < 0) { stop = 0; } if (stop < 0) { stop = 0; }
start_cost2 -= 1.0; start_cost2 -= 1.0;
for (int i = cur_ix - 1; i > stop; --i) { for (int i = cur_ix - 1; i > stop; --i) {
size_t prev_ix = i; size_t prev_ix = i;
@ -275,45 +292,13 @@ class HashLongestMatch {
match_found = true; match_found = true;
} }
} }
}
const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]);
const int * __restrict const bucket = &buckets_[key][0];
const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
for (int i = num_[key] - 1; i >= down; --i) { for (int i = num_[key] - 1; i >= down; --i) {
int prev_ix = bucket[i & kBlockMask]; int prev_ix = bucket[i & kBlockMask];
if (prev_ix < 0) { if (prev_ix >= 0) {
prev_ix *= -1;
prev_ix -= 1;
int copy_len_code = prev_ix >> 20;
int word_id = prev_ix & 0xfffff;
std::string word = GetTransformedDictionaryWord(copy_len_code, word_id);
int len = word.size();
const size_t backward = max_backward + word_id + 1;
bool word_matched = (len >= 3 && len <= max_length);
for (int k = 0; k < len && word_matched; ++k) {
if ((uint8_t)(word[k]) != data[cur_ix_masked + k]) {
word_matched = false;
}
}
if (word_matched) {
const double score = BackwardReferenceScore(average_cost_,
start_cost4,
start_cost3,
start_cost2,
len, backward,
last_distance1_,
last_distance2_,
last_distance3_,
last_distance4_);
if (best_score < score) {
best_score = score;
best_len = len;
best_ix = backward;
*best_len_out = best_len;
*best_len_code_out = copy_len_code;
*best_distance_out = best_ix;
*best_score_out = best_score;
match_found = true;
*in_dictionary = true;
}
}
} else {
const size_t backward = cur_ix - prev_ix; const size_t backward = cur_ix - prev_ix;
if (PREDICT_FALSE(backward > max_backward)) { if (PREDICT_FALSE(backward > max_backward)) {
break; break;
@ -327,7 +312,7 @@ class HashLongestMatch {
const size_t len = const size_t len =
FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
max_length); max_length);
if (len >= 3) { if (len >= std::max(kMinLength, 3)) {
// Comparing for >= 3 does not change the semantics, but just saves // Comparing for >= 3 does not change the semantics, but just saves
// for a few unnecessary binary logarithms in backward reference // for a few unnecessary binary logarithms in backward reference
// score, since we are not interested in such short matches. // score, since we are not interested in such short matches.
@ -335,11 +320,7 @@ class HashLongestMatch {
start_cost4, start_cost4,
start_cost3, start_cost3,
start_cost2, start_cost2,
len, backward, len, backward);
last_distance1_,
last_distance2_,
last_distance3_,
last_distance4_);
if (best_score < score) { if (best_score < score) {
best_score = score; best_score = score;
best_len = len; best_len = len;
@ -354,6 +335,36 @@ class HashLongestMatch {
} }
} }
} }
if (static_dict_ != NULL) {
// We decide based on first 4 bytes how many bytes to test for.
int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
int maxlen = static_dict_->GetLength(prefix);
for (int len = std::min<size_t>(maxlen, max_length);
len > best_len && len >= 4; --len) {
std::string snippet((const char *)&data[cur_ix_masked], len);
int copy_len_code;
int word_id;
if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
const size_t backward = max_backward + word_id + 1;
const double score = BackwardReferenceScore(average_cost_,
start_cost4,
start_cost3,
start_cost2,
len, backward);
if (best_score < score) {
best_score = score;
best_len = len;
best_ix = backward;
*best_len_out = best_len;
*best_len_code_out = copy_len_code;
*best_distance_out = best_ix;
*best_score_out = best_score;
match_found = true;
*in_dictionary = true;
}
}
}
}
return match_found; return match_found;
} }
@ -399,9 +410,37 @@ class HashLongestMatch {
int insert_length_; int insert_length_;
double average_cost_; double average_cost_;
const StaticDictionary *static_dict_;
}; };
typedef HashLongestMatch<13, 11> Hasher; struct Hashers {
enum Type {
HASH_15_8_4 = 0,
HASH_15_8_2 = 1,
};
void Init(Type type) {
switch (type) {
case HASH_15_8_4:
hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>());
break;
case HASH_15_8_2:
hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>());
break;
default:
break;
}
}
void SetStaticDictionary(const StaticDictionary *dict) {
if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict);
if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict);
}
std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4;
std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2;
};
} // namespace brotli } // namespace brotli

View File

@ -59,7 +59,8 @@ static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
} }
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
const uint8_t *data, float *cost) { size_t cost_mask, const uint8_t *data,
float *cost) {
// max_utf8 is 0 (normal ascii single byte modeling), // max_utf8 is 0 (normal ascii single byte modeling),
// 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling). // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
@ -110,18 +111,20 @@ void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
if (histo == 0) { if (histo == 0) {
histo = 1; histo = 1;
} }
cost[masked_pos] = log2(static_cast<double>(in_window_utf8[utf8_pos]) float lit_cost = log2(static_cast<double>(in_window_utf8[utf8_pos])
/ histo); / histo);
cost[masked_pos] += 0.02905; lit_cost += 0.02905;
if (cost[masked_pos] < 1.0) { if (lit_cost < 1.0) {
cost[masked_pos] *= 0.5; lit_cost *= 0.5;
cost[masked_pos] += 0.5; lit_cost += 0.5;
} }
cost[(pos + i) & cost_mask] = lit_cost;
} }
} }
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
const uint8_t *data, float *cost) { size_t cost_mask, const uint8_t *data,
float *cost) {
int histogram[256] = { 0 }; int histogram[256] = { 0 };
int window_half = 2000; int window_half = 2000;
int in_window = std::min(static_cast<size_t>(window_half), len); int in_window = std::min(static_cast<size_t>(window_half), len);
@ -143,17 +146,17 @@ void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
++histogram[data[(pos + i + window_half) & mask]]; ++histogram[data[(pos + i + window_half) & mask]];
++in_window; ++in_window;
} }
int masked_pos = (pos + i) & mask; int histo = histogram[data[(pos + i) & mask]];
int histo = histogram[data[masked_pos]];
if (histo == 0) { if (histo == 0) {
histo = 1; histo = 1;
} }
cost[masked_pos] = log2(static_cast<double>(in_window) / histo); float lit_cost = log2(static_cast<double>(in_window) / histo);
cost[masked_pos] += 0.029; lit_cost += 0.029;
if (cost[masked_pos] < 1.0) { if (lit_cost < 1.0) {
cost[masked_pos] *= 0.5; lit_cost *= 0.5;
cost[masked_pos] += 0.5; lit_cost += 0.5;
} }
cost[(pos + i) & cost_mask] = lit_cost;
} }
} }

View File

@ -26,11 +26,11 @@ namespace brotli {
// ringbuffer (data, mask) will take entropy coded and writes these estimates // ringbuffer (data, mask) will take entropy coded and writes these estimates
// to the ringbuffer (cost, mask). // to the ringbuffer (cost, mask).
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
const uint8_t *data, size_t cost_mask, const uint8_t *data,
float *cost); float *cost);
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
const uint8_t *data, size_t cost_mask, const uint8_t *data,
float *cost); float *cost);
} // namespace brotli } // namespace brotli

View File

@ -90,7 +90,7 @@ int CopyLengthPrefix(int length) {
int CommandPrefix(int insert_length, int copy_length) { int CommandPrefix(int insert_length, int copy_length) {
if (copy_length == 0) { if (copy_length == 0) {
copy_length = 3; copy_length = 4;
} }
int insert_prefix = InsertLengthPrefix(insert_length); int insert_prefix = InsertLengthPrefix(insert_length);
int copy_prefix = CopyLengthPrefix(copy_length); int copy_prefix = CopyLengthPrefix(copy_length);

68
enc/static_dict.h Normal file
View File

@ -0,0 +1,68 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Class to model the static dictionary.
#ifndef BROTLI_ENC_STATIC_DICT_H_
#define BROTLI_ENC_STATIC_DICT_H_
#include <algorithm>
#include <unordered_map>
#include <string>
namespace brotli {
// Lookup structure over dictionary words.
//
// Two indexes are maintained:
//   * map_        : word -> packed (dist, len) reference, see Insert().
//   * prefix_map_ : key built from the first (up to) four bytes of a word ->
//                   size of the longest inserted word that starts with those
//                   bytes. It lets a caller bound how many bytes are worth
//                   testing before doing exact lookups with Get().
class StaticDictionary {
 public:
  StaticDictionary() {}

  // Registers the word |str| as reachable with copy length code |len| and
  // word id |dist|. The pair is packed as (dist << 6) + len, so |len| has to
  // fit into 6 bits. If |str| is already present, the entry with the smaller
  // packed value is kept.
  void Insert(const std::string &str, int len, int dist) {
    const int ix = (dist << kLengthBits) + len;
    // Single lookup: either creates the entry or hands back the existing one.
    auto slot = map_.emplace(str, ix);
    if (!slot.second) {
      if (ix >= slot.first->second) {
        return;  // The stored reference is at least as good.
      }
      slot.first->second = ix;
    }
    // Build the prefix key with the first byte in the lowest 8 bits, i.e. the
    // value a 32-bit little-endian load of the word's first bytes yields.
    // The bytes must be widened as unsigned: plain char may be signed, and
    // sign-extending a byte >= 0x80 would corrupt the key, so words starting
    // with non-ASCII bytes could never be found through GetLength().
    unsigned int prefix = 0;
    for (size_t i = 0; i < 4 && i < str.size(); ++i) {
      prefix |= static_cast<unsigned int>(static_cast<unsigned char>(str[i]))
                << (8 * i);
    }
    const int word_size = static_cast<int>(str.size());
    int &longest = prefix_map_[static_cast<int>(prefix)];
    if (longest < word_size) {
      longest = word_size;
    }
  }

  // Returns the size of the longest inserted word whose prefix key equals
  // |v| (see Insert() for the key layout), or 0 if there is no such word.
  int GetLength(int v) const {
    const auto it = prefix_map_.find(v);
    return it == prefix_map_.end() ? 0 : it->second;
  }

  // Exact lookup of |str|. On success stores the copy length code in |*len|
  // and the word id in |*dist| and returns true; otherwise returns false and
  // leaves both outputs untouched.
  bool Get(const std::string &str, int *len, int *dist) const {
    const auto it = map_.find(str);
    if (it == map_.end()) {
      return false;
    }
    const int packed = it->second;
    *len = packed & ((1 << kLengthBits) - 1);
    *dist = packed >> kLengthBits;
    return true;
  }

 private:
  // Number of low bits of a packed reference that hold the length code.
  static const int kLengthBits = 6;

  std::unordered_map<std::string, int> map_;
  std::unordered_map<int, int> prefix_map_;
};
} // namespace brotli
#endif // BROTLI_ENC_STATIC_DICT_H_