Use a static context map with two buckets for UTF8 data.

Enabled for quality >= 4, and only if no obvious
UTF8 violations are detected.
For each block, we gather two separate histograms, one
for continuation bytes and one for ASCII or lead bytes.
Zoltan Szabadka 2015-05-07 17:23:07 +02:00
parent 762f9ba5a0
commit 945b0d025f
3 changed files with 331 additions and 5 deletions
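
As a quick reference for the heuristic introduced below (an editor's sketch, not part of the commit), these are the three UTF8 byte classes that both the detector and the static context map distinguish:

// Editor's sketch, not part of the commit. Byte classes per RFC 3629, as the
// code below classifies them: ASCII is 0xxxxxxx, continuation is 10xxxxxx,
// and lead is 11xxxxxx.
static inline bool IsAscii(uint8_t b)        { return b < 128; }            // 0x00..0x7f
static inline bool IsContinuation(uint8_t b) { return (b & 0xc0) == 0x80; } // 0x80..0xbf
static inline bool IsLead(uint8_t b)         { return b >= 192; }           // 0xc0..0xff

In valid UTF8 a continuation byte can only follow a lead or continuation byte, which is exactly the invariant the violation check in DecideOverLiteralContextModeling tests.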

enc/encode.cc

@@ -351,6 +351,53 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
return WriteMetaBlockInternal(is_last, utf8_mode, out_size, output);
}
void DecideOverLiteralContextModeling(const uint8_t* input,
size_t start_pos,
size_t length,
size_t mask,
int quality,
int* literal_context_mode,
int* num_literal_contexts,
const int** literal_context_map) {
if (quality <= 3 || length < 64) {
return;
}
// Simple heuristic to guess whether the data is UTF8. The goal is to
// recognize non-UTF8 data quickly by searching for the following obvious
// violations: a continuation byte following an ASCII byte, or an ASCII or
// lead byte following a lead byte. If we find such a violation, we decide
// that the data is not UTF8. To keep the analysis of UTF8 data fast, we
// examine only 64-byte strides at 4kB intervals; if no violations are
// found, we assume the whole data is UTF8.
const size_t end_pos = start_pos + length;
for (; start_pos + 64 < end_pos; start_pos += 4096) {
const size_t stride_end_pos = start_pos + 64;
uint8_t prev = input[start_pos & mask];
for (size_t pos = start_pos + 1; pos < stride_end_pos; ++pos) {
const uint8_t literal = input[pos & mask];
if ((prev < 128 && (literal & 0xc0) == 0x80) ||
(prev >= 192 && (literal & 0xc0) != 0x80)) {
return;
}
prev = literal;
}
}
*literal_context_mode = CONTEXT_UTF8;
// If the data is UTF8, this static context map distinguishes between ASCII
// or lead bytes and continuation bytes: the UTF8 context value based on the
// last two bytes is 2 or 3 if and only if the next byte is a continuation
// byte (see table in context.h).
static const int kStaticContextMap[64] = {
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const int kNumLiteralContexts = 2;
*num_literal_contexts = kNumLiteralContexts;
*literal_context_map = kStaticContextMap;
}
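
A minimal usage sketch (editor's illustration; data, data_len and mask are placeholder variables): the function only overwrites its out-parameters when the sampled strides look like valid UTF8, so the caller can test literal_context_map against NULL, exactly as WriteMetaBlockInternal does below. Note also that the scan is cheap: it reads at most 64 bytes out of every 4096, about 1.6% of the input.

// Editor's sketch, not part of the commit.
int literal_context_mode = CONTEXT_SIGNED;   // default for non-UTF8 data
int num_literal_contexts = 1;
const int* literal_context_map = NULL;
DecideOverLiteralContextModeling(data, /* start_pos = */ 0, data_len, mask,
                                 /* quality = */ 9,
                                 &literal_context_mode,
                                 &num_literal_contexts,
                                 &literal_context_map);
if (literal_context_map != NULL) {
  // Looks like UTF8: literal_context_mode == CONTEXT_UTF8 and there are
  // two context buckets (ASCII/lead bytes vs. continuation bytes).
}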
bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
const bool utf8_mode,
size_t* out_size,
@@ -406,8 +453,6 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
num_direct_distance_codes,
distance_postfix_bits);
}
int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
MetaBlockSplit mb;
if (params_.quality == 1) {
if (!StoreMetaBlockTrivial(data, last_flush_pos_, bytes, mask, is_last,
commands_.get(), num_commands_,
@@ -416,10 +461,29 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
return false;
}
} else {
MetaBlockSplit mb;
int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
if (params_.greedy_block_split) {
int num_literal_contexts = 1;
const int* literal_context_map = NULL;
DecideOverLiteralContextModeling(data, last_flush_pos_, bytes, mask,
params_.quality,
&literal_context_mode,
&num_literal_contexts,
&literal_context_map);
if (literal_context_map == NULL) {
BuildMetaBlockGreedy(data, last_flush_pos_, mask,
commands_.get(), num_commands_,
&mb);
} else {
BuildMetaBlockGreedyWithContexts(data, last_flush_pos_, mask,
prev_byte_, prev_byte2_,
literal_context_mode,
num_literal_contexts,
literal_context_map,
commands_.get(), num_commands_,
&mb);
}
} else {
BuildMetaBlock(data, last_flush_pos_, mask,
prev_byte_, prev_byte2_,

enc/metablock.cc

@@ -18,6 +18,7 @@
#include "./metablock.h"
#include "./block_splitter.h"
#include "./context.h"
#include "./cluster.h"
#include "./histogram.h"
@@ -297,6 +298,249 @@ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
dist_blocks.FinishBlock(/* is_final = */ true);
}
// Greedy block splitter for one block category (literal, command or distance).
// Gathers histograms for all context buckets.
template<typename HistogramType>
class ContextBlockSplitter {
public:
ContextBlockSplitter(int alphabet_size,
int num_contexts,
int min_block_size,
double split_threshold,
int num_symbols,
BlockSplit* split,
std::vector<HistogramType>* histograms)
: alphabet_size_(alphabet_size),
num_contexts_(num_contexts),
max_block_types_(kMaxBlockTypes / num_contexts),
min_block_size_(min_block_size),
split_threshold_(split_threshold),
num_blocks_(0),
split_(split),
histograms_(histograms),
target_block_size_(min_block_size),
block_size_(0),
curr_histogram_ix_(0),
last_entropy_(2 * num_contexts),
merge_last_count_(0) {
int max_num_blocks = num_symbols / min_block_size + 1;
// We have to allocate one more histogram set than the maximum number of
// block types, to hold the histograms of the still-unfinished current
// block when the meta-block needs more block types than the maximum.
int max_num_types = std::min(max_num_blocks, max_block_types_ + 1);
split_->lengths.resize(max_num_blocks);
split_->types.resize(max_num_blocks);
histograms_->resize(max_num_types * num_contexts);
last_histogram_ix_[0] = last_histogram_ix_[1] = 0;
}
// Adds the next symbol to the current block type and context. When the
// current block reaches the target size, decides on merging the block.
void AddSymbol(int symbol, int context) {
(*histograms_)[curr_histogram_ix_ + context].Add(symbol);
++block_size_;
if (block_size_ == target_block_size_) {
FinishBlock(/* is_final = */ false);
}
}
// Does one of three things:
// (1) emits the current block with a new block type;
// (2) emits the current block with the type of the second last block;
// (3) merges the current block with the last block.
void FinishBlock(bool is_final) {
if (block_size_ < min_block_size_) {
block_size_ = min_block_size_;
}
if (num_blocks_ == 0) {
// Create first block.
split_->lengths[0] = block_size_;
split_->types[0] = 0;
for (int i = 0; i < num_contexts_; ++i) {
last_entropy_[i] =
BitsEntropy(&(*histograms_)[i].data_[0], alphabet_size_);
last_entropy_[num_contexts_ + i] = last_entropy_[i];
}
++num_blocks_;
++split_->num_types;
curr_histogram_ix_ += num_contexts_;
block_size_ = 0;
} else if (block_size_ > 0) {
// Try merging the set of histograms for the current block type with the
// respective set of histograms for the last and second last block types.
// Decide on the split based on the total reduction of entropy across
// all contexts.
std::vector<double> entropy(num_contexts_);
std::vector<HistogramType> combined_histo(2 * num_contexts_);
std::vector<double> combined_entropy(2 * num_contexts_);
double diff[2] = { 0.0 };
for (int i = 0; i < num_contexts_; ++i) {
int curr_histo_ix = curr_histogram_ix_ + i;
entropy[i] = BitsEntropy(&(*histograms_)[curr_histo_ix].data_[0],
alphabet_size_);
for (int j = 0; j < 2; ++j) {
int jx = j * num_contexts_ + i;
int last_histogram_ix = last_histogram_ix_[j] + i;
combined_histo[jx] = (*histograms_)[curr_histo_ix];
combined_histo[jx].AddHistogram((*histograms_)[last_histogram_ix]);
combined_entropy[jx] = BitsEntropy(
&combined_histo[jx].data_[0], alphabet_size_);
diff[j] += combined_entropy[jx] - entropy[i] - last_entropy_[jx];
}
}
if (split_->num_types < max_block_types_ &&
diff[0] > split_threshold_ &&
diff[1] > split_threshold_) {
// Create new block.
split_->lengths[num_blocks_] = block_size_;
split_->types[num_blocks_] = split_->num_types;
last_histogram_ix_[1] = last_histogram_ix_[0];
last_histogram_ix_[0] = split_->num_types * num_contexts_;
for (int i = 0; i < num_contexts_; ++i) {
last_entropy_[num_contexts_ + i] = last_entropy_[i];
last_entropy_[i] = entropy[i];
}
++num_blocks_;
++split_->num_types;
curr_histogram_ix_ += num_contexts_;
block_size_ = 0;
merge_last_count_ = 0;
target_block_size_ = min_block_size_;
} else if (diff[1] < diff[0] - 20.0) {
// Combine this block with second last block.
split_->lengths[num_blocks_] = block_size_;
split_->types[num_blocks_] = split_->types[num_blocks_ - 2];
std::swap(last_histogram_ix_[0], last_histogram_ix_[1]);
for (int i = 0; i < num_contexts_; ++i) {
(*histograms_)[last_histogram_ix_[0] + i] =
combined_histo[num_contexts_ + i];
last_entropy_[num_contexts_ + i] = last_entropy_[i];
last_entropy_[i] = combined_entropy[num_contexts_ + i];
(*histograms_)[curr_histogram_ix_ + i].Clear();
}
++num_blocks_;
block_size_ = 0;
merge_last_count_ = 0;
target_block_size_ = min_block_size_;
} else {
// Combine this block with last block.
split_->lengths[num_blocks_ - 1] += block_size_;
for (int i = 0; i < num_contexts_; ++i) {
(*histograms_)[last_histogram_ix_[0] + i] = combined_histo[i];
last_entropy_[i] = combined_entropy[i];
if (split_->num_types == 1) {
last_entropy_[num_contexts_ + i] = last_entropy_[i];
}
(*histograms_)[curr_histogram_ix_ + i].Clear();
}
block_size_ = 0;
if (++merge_last_count_ > 1) {
target_block_size_ += min_block_size_;
}
}
}
if (is_final) {
(*histograms_).resize(split_->num_types * num_contexts_);
split_->types.resize(num_blocks_);
split_->lengths.resize(num_blocks_);
}
}
private:
static const int kMaxBlockTypes = 256;
// Alphabet size of particular block category.
const int alphabet_size_;
const int num_contexts_;
const int max_block_types_;
// We collect at least this many symbols for each block.
const int min_block_size_;
// We merge histograms A and B if
// entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
// where A is the current histogram and B is the histogram of the last or the
// second last block type.
const double split_threshold_;
int num_blocks_;
BlockSplit* split_; // not owned
std::vector<HistogramType>* histograms_; // not owned
// The number of symbols that we want to collect before deciding whether to
// merge the block with a previous one or to emit a new block.
int target_block_size_;
// The number of symbols in the current histogram.
int block_size_;
// Offset of the current histogram.
int curr_histogram_ix_;
// Offset of the histograms of the previous two block types.
int last_histogram_ix_[2];
// Entropy of the previous two block types.
std::vector<double> last_entropy_;
// The number of times we merged the current block with the last one.
int merge_last_count_;
};
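
To make the split_threshold_ rule concrete, here is the per-bucket merge cost from FinishBlock written out as a standalone helper (editor's sketch built on the BitsEntropy and HistogramLiteral helpers this file already uses; not part of the commit):

// Editor's sketch, not part of the commit: the entropy cost of merging two
// literal histograms. FinishBlock sums this quantity over all context
// buckets; a new block type is created only when the sum exceeds
// split_threshold_ for both the last and the second-last block type.
static double MergeEntropyCost(const HistogramLiteral& a,
                               const HistogramLiteral& b) {
  HistogramLiteral combined = a;
  combined.AddHistogram(b);
  return BitsEntropy(&combined.data_[0], 256)
      - BitsEntropy(&a.data_[0], 256)
      - BitsEntropy(&b.data_[0], 256);
}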
void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
size_t pos,
size_t mask,
uint8_t prev_byte,
uint8_t prev_byte2,
int literal_context_mode,
int num_contexts,
const int* static_context_map,
const Command *commands,
size_t n_commands,
MetaBlockSplit* mb) {
int num_literals = 0;
for (int i = 0; i < n_commands; ++i) {
num_literals += commands[i].insert_len_;
}
ContextBlockSplitter<HistogramLiteral> lit_blocks(
256, num_contexts, 512, 400.0, num_literals,
&mb->literal_split, &mb->literal_histograms);
BlockSplitter<HistogramCommand> cmd_blocks(
kNumCommandPrefixes, 1024, 500.0, n_commands,
&mb->command_split, &mb->command_histograms);
BlockSplitter<HistogramDistance> dist_blocks(
64, 512, 100.0, n_commands,
&mb->distance_split, &mb->distance_histograms);
for (int i = 0; i < n_commands; ++i) {
const Command cmd = commands[i];
cmd_blocks.AddSymbol(cmd.cmd_prefix_);
for (int j = 0; j < cmd.insert_len_; ++j) {
int context = Context(prev_byte, prev_byte2, literal_context_mode);
uint8_t literal = ringbuffer[pos & mask];
lit_blocks.AddSymbol(literal, static_context_map[context]);
prev_byte2 = prev_byte;
prev_byte = literal;
++pos;
}
pos += cmd.copy_len_;
if (cmd.copy_len_ > 0) {
prev_byte2 = ringbuffer[(pos - 2) & mask];
prev_byte = ringbuffer[(pos - 1) & mask];
if (cmd.cmd_prefix_ >= 128) {
dist_blocks.AddSymbol(cmd.dist_prefix_);
}
}
}
lit_blocks.FinishBlock(/* is_final = */ true);
cmd_blocks.FinishBlock(/* is_final = */ true);
dist_blocks.FinishBlock(/* is_final = */ true);
mb->literal_context_map.resize(
mb->literal_split.num_types << kLiteralContextBits);
for (int i = 0; i < mb->literal_split.num_types; ++i) {
for (int j = 0; j < (1 << kLiteralContextBits); ++j) {
mb->literal_context_map[(i << kLiteralContextBits) + j] =
i * num_contexts + static_context_map[j];
}
}
}
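
A worked example of the final expansion loop (editor's illustration): with num_contexts == 2 and the static map from enc/encode.cc, a meta-block that ends up with three literal block types gets a 3 * 64 entry literal_context_map. For block type 1 and UTF8 context 2 (one of the two contexts that signal "the next byte is a continuation byte"), the entry is 1 * num_contexts + static_context_map[2] = 1 * 2 + 1 = 3, i.e. the continuation-byte histogram of block type 1.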
void OptimizeHistograms(int num_direct_distance_codes,
int distance_postfix_bits,
MetaBlockSplit* mb) {

enc/metablock.h

@@ -44,6 +44,7 @@ struct MetaBlockSplit {
std::vector<HistogramDistance> distance_histograms;
};
// Uses the slow shortest-path block splitter and does context clustering.
void BuildMetaBlock(const uint8_t* ringbuffer,
const size_t pos,
const size_t mask,
@@ -55,6 +56,8 @@ void BuildMetaBlock(const uint8_t* ringbuffer,
bool enable_context_modeling,
MetaBlockSplit* mb);
// Uses a fast greedy block splitter that tries to merge the current block
// with the last or the second-last block, and does no context modeling.
void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
size_t pos,
size_t mask,
@@ -62,6 +65,21 @@ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
size_t n_commands,
MetaBlockSplit* mb);
// Uses a fast greedy block splitter that tries to merge the current block
// with the last or the second-last block, and uses a static context
// clustering that is the same for all block types.
void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
size_t pos,
size_t mask,
uint8_t prev_byte,
uint8_t prev_byte2,
int literal_context_mode,
int num_contexts,
const int* static_context_map,
const Command *commands,
size_t n_commands,
MetaBlockSplit* mb);
void OptimizeHistograms(int num_direct_distance_codes,
int distance_postfix_bits,
MetaBlockSplit* mb);