Speedups to brotli quality 11.

  * Cluster at most 64 histograms at a time in the first
    round of clustering (sketched below).

  * Use a faster histogram cost estimation function.

  * Don't compute log2(total) multiple times in the
    block splitter.
Zoltan Szabadka 2015-06-12 15:29:06 +02:00
parent af09ee7344
commit 667f70adcb
4 changed files with 72 additions and 136 deletions
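
As a rough illustration of the first bullet (not part of the actual diff), here is a minimal standalone C++ sketch, assuming a made-up Histogram struct and a CombineWithin() stand-in for the real greedy HistogramCombine(): the first clustering round only ever looks at batches of at most 64 input histograms, so the quadratic pairwise search stays bounded per batch.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Histogram {
  std::vector<int> data_;
  int total_count_ = 0;
};

// Stand-in for the real greedy combiner; here it only reports the batch it
// would merge down to at most max_histograms clusters.
static void CombineWithin(std::vector<Histogram>* histograms,
                          size_t begin, size_t count, size_t max_histograms) {
  (void)histograms;
  std::printf("combine histograms [%zu, %zu) down to <= %zu clusters\n",
              begin, begin + count, max_histograms);
}

int main() {
  std::vector<Histogram> in(200);          // e.g. one histogram per block
  const size_t max_input_histograms = 64;  // batch size from this commit
  const size_t max_histograms = 16;        // target clusters per batch
  for (size_t i = 0; i < in.size(); i += max_input_histograms) {
    const size_t num_to_combine =
        std::min(in.size() - i, max_input_histograms);
    CombineWithin(&in, i, num_to_combine, max_histograms);
  }
  // A second round (not shown) then collapses the surviving representatives
  // across batches, as in the existing "Collapse similar histograms" step.
  return 0;
}

With n input histograms this keeps the first round at roughly n * 64 pairwise comparisons instead of n * n.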

View File

@@ -49,94 +49,13 @@ static inline double BitsEntropy(const int *population, int size) {
return retval;
}
static const int kHuffmanExtraBits[kCodeLengthCodes] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3,
};
static inline int HuffmanTreeBitCost(const int* counts, const uint8_t* depth) {
int nbits = 0;
for (int i = 0; i < kCodeLengthCodes; ++i) {
nbits += counts[i] * (depth[i] + kHuffmanExtraBits[i]);
}
return nbits;
}
static inline int HuffmanTreeBitCost(
const Histogram<kCodeLengthCodes>& histogram,
const EntropyCode<kCodeLengthCodes>& entropy) {
return HuffmanTreeBitCost(&histogram.data_[0], &entropy.depth_[0]);
}
static inline int HuffmanBitCost(const uint8_t* depth, int length) {
int max_depth = 1;
int histogram[kCodeLengthCodes] = { 0 };
int tail_start = 0;
int prev_value = 8;
// compute histogram of compacted huffman tree
for (int i = 0; i < length;) {
const int value = depth[i];
if (value > max_depth) {
max_depth = value;
}
int reps = 1;
for (int k = i + 1; k < length && depth[k] == value; ++k) {
++reps;
}
i += reps;
if (i == length && value == 0)
break;
if (value == 0) {
if (reps < 3) {
histogram[0] += reps;
} else {
reps -= 2;
while (reps > 0) {
++histogram[17];
reps >>= 3;
}
}
} else {
tail_start = i;
if (value != prev_value) {
++histogram[value];
--reps;
}
prev_value = value;
if (reps < 3) {
histogram[value] += reps;
} else {
reps -= 2;
while (reps > 0) {
++histogram[16];
reps >>= 2;
}
}
}
}
// create huffman tree of huffman tree
uint8_t cost[kCodeLengthCodes] = { 0 };
CreateHuffmanTree(histogram, kCodeLengthCodes, 7, cost);
// account for rle extra bits
cost[16] += 2;
cost[17] += 3;
int tree_size = 0;
int bits = 18 + 2 * max_depth; // huffman tree of huffman tree cost
for (int i = 0; i < kCodeLengthCodes; ++i) {
bits += histogram[i] * cost[i]; // huffman tree bit cost
tree_size += histogram[i];
}
return bits;
}
template<int kSize>
double PopulationCost(const Histogram<kSize>& histogram) {
if (histogram.total_count_ == 0) {
return 12;
}
int count = 0;
for (int i = 0; i < kSize && count < 5; ++i) {
for (int i = 0; i < kSize; ++i) {
if (histogram.data_[i] > 0) {
++count;
}
@@ -147,19 +66,66 @@ double PopulationCost(const Histogram<kSize>& histogram) {
if (count == 2) {
return 20 + histogram.total_count_;
}
double bits = 0;
uint8_t depth[kSize] = { 0 };
CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
int bits = 0;
for (int i = 0; i < kSize; ++i) {
bits += histogram.data_[i] * depth[i];
if (count <= 4) {
// For very low symbol count we build the Huffman tree.
CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
for (int i = 0; i < kSize; ++i) {
bits += histogram.data_[i] * depth[i];
}
return count == 3 ? bits + 28 : bits + 37;
}
if (count == 3) {
bits += 28;
} else if (count == 4) {
bits += 37;
} else {
bits += HuffmanBitCost(depth, kSize);
// In this loop we compute the entropy of the histogram and simultaneously
// build a simplified histogram of the code length codes where we use the
// zero repeat code 17, but we don't use the non-zero repeat code 16.
int max_depth = 1;
int depth_histo[kCodeLengthCodes] = { 0 };
const double log2total = FastLog2(histogram.total_count_);
for (int i = 0; i < kSize;) {
if (histogram.data_[i] > 0) {
// Compute -log2(P(symbol)) = -log2(count(symbol)/total_count) =
// = log2(total_count) - log2(count(symbol))
double log2p = log2total - FastLog2(histogram.data_[i]);
// Approximate the bit depth by round(-log2(P(symbol)))
int depth = static_cast<int>(log2p + 0.5);
bits += histogram.data_[i] * log2p;
if (depth > max_depth) {
max_depth = depth;
}
++depth_histo[depth];
++i;
} else {
// Compute the run length of zeros and add the appropriate number of 0 and
// 17 code length codes to the code length code histogram.
int reps = 1;
for (int k = i + 1; k < kSize && histogram.data_[k] == 0; ++k) {
++reps;
}
i += reps;
if (i == kSize) {
// Don't add any cost for the last zero run, since these are encoded
// only implicitly.
break;
}
if (reps < 3) {
depth_histo[0] += reps;
} else {
reps -= 2;
while (reps > 0) {
++depth_histo[17];
// Add the 3 extra bits for the 17 code length code.
bits += 3;
reps >>= 3;
}
}
}
}
// Add the estimated encoding cost of the code length code histogram.
bits += 18 + 2 * max_depth;
// Add the entropy of the code length code histogram.
bits += BitsEntropy(depth_histo, kCodeLengthCodes);
return bits;
}
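
As a rough, self-contained illustration of the estimate above (not the exact function from the diff; EstimateHistogramBits is a made-up name and std::log2 stands in for FastLog2): the payload is costed by Shannon entropy with log2(total) computed once, and the cost of transmitting the code lengths is approximated with a crude constant instead of the depth_histo/BitsEntropy term.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static double EstimateHistogramBits(const std::vector<int>& histogram) {
  int total = 0;
  for (int c : histogram) total += c;
  if (total == 0) return 12.0;                // empty histogram: fixed cost
  const double log2total = std::log2(total);  // computed once, reused below
  double bits = 0.0;
  int max_depth = 1;
  for (int count : histogram) {
    if (count == 0) continue;
    // -log2(P(symbol)) = log2(total_count) - log2(count(symbol))
    const double log2p = log2total - std::log2(count);
    bits += count * log2p;                    // entropy lower bound for the data
    max_depth = std::max(max_depth, static_cast<int>(log2p + 0.5));
  }
  // Crude stand-in for the header cost of the code length codes.
  bits += 18 + 2 * max_depth;
  return bits;
}

int main() {
  const std::vector<int> histogram = {10, 20, 30, 0, 0, 0, 40};
  std::printf("estimated cost: %.1f bits\n", EstimateHistogramBits(histogram));
  return 0;
}

For example, a symbol seen 40 times out of 100 contributes 40 * (log2(100) - log2(40)), i.e. about 40 * 1.32 = 53 bits, to the payload estimate, without ever building a Huffman tree.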

View File

@@ -150,8 +150,8 @@ void RefineEntropyCodes(const DataType* data, size_t length,
}
}
inline static float BitCost(int total, int count) {
return count == 0 ? FastLog2(total) + 2 : FastLog2(total) - FastLog2(count);
inline static float BitCost(int count) {
return count == 0 ? -2 : FastLog2(count);
}
template<typename DataType, int kSize>
@@ -168,10 +168,12 @@ void FindBlocks(const DataType* data, const size_t length,
int vecsize = vec.size();
double* insert_cost = new double[kSize * vecsize];
memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
for (int i = 0; i < kSize; ++i) {
for (int j = 0; j < vecsize; ++j) {
insert_cost[j] = FastLog2(vec[j].total_count_);
}
for (int i = kSize - 1; i >= 0; --i) {
for (int j = 0; j < vecsize; ++j) {
insert_cost[i * vecsize + j] =
BitCost(vec[j].total_count_, vec[j].data_[i]);
insert_cost[i * vecsize + j] = insert_cost[j] - BitCost(vec[j].data_[i]);
}
}
double *cost = new double[vecsize];
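
A sketch of the log2(total) hoisting under assumed types (Histo and InsertCosts are made-up names for this illustration, and std::log2 stands in for FastLog2): since the per-symbol insert cost is log2(total) - log2(count), log2(total) only needs to be computed once per histogram instead of once per (symbol, histogram) pair. The real code above goes one step further and caches it in row 0 of insert_cost[], filling the rows in reverse so the cached value is overwritten last.

#include <cmath>
#include <cstdio>
#include <vector>

struct Histo {
  std::vector<int> data_;
  int total_count_ = 0;
};

// insert_cost[i * vec.size() + j]: bits to code symbol i with histogram j,
// i.e. log2(total_j) - log2(count_ij), with a 2-bit penalty for symbols the
// histogram has never seen (matching the -2 branch of the new BitCost()).
static std::vector<double> InsertCosts(const std::vector<Histo>& vec,
                                       size_t num_symbols) {
  std::vector<double> insert_cost(num_symbols * vec.size());
  for (size_t j = 0; j < vec.size(); ++j) {
    const double log2total = std::log2(vec[j].total_count_);  // once per histogram
    for (size_t i = 0; i < num_symbols; ++i) {
      const int count = vec[j].data_[i];
      insert_cost[i * vec.size() + j] =
          count == 0 ? log2total + 2 : log2total - std::log2(count);
    }
  }
  return insert_cost;
}

int main() {
  std::vector<Histo> vec(1);
  vec[0].data_ = {3, 0, 1};
  vec[0].total_count_ = 4;
  const std::vector<double> costs = InsertCosts(vec, 3);
  std::printf("cost of symbol 0 under histogram 0: %.2f bits\n", costs[0]);
  return 0;
}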

View File

@@ -393,37 +393,6 @@ void RunLengthCodeZeros(const std::vector<int>& v_in,
}
}
// Returns a maximum zero-run-length-prefix value such that run-length coding
// zeros in v with this maximum prefix value and then encoding the resulting
// histogram and entropy-coding v produces the least amount of bits.
int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
int min_cost = std::numeric_limits<int>::max();
int best_max_prefix = 0;
for (int max_prefix = 0; max_prefix <= 16; ++max_prefix) {
std::vector<int> rle_symbols;
std::vector<int> extra_bits;
int max_run_length_prefix = max_prefix;
RunLengthCodeZeros(v, &max_run_length_prefix, &rle_symbols, &extra_bits);
if (max_run_length_prefix < max_prefix) break;
HistogramContextMap histogram;
for (int i = 0; i < rle_symbols.size(); ++i) {
histogram.Add(rle_symbols[i]);
}
int bit_cost = PopulationCost(histogram);
if (max_prefix > 0) {
bit_cost += 4;
}
for (int i = 1; i <= max_prefix; ++i) {
bit_cost += histogram.data_[i] * i; // extra bits
}
if (bit_cost < min_cost) {
min_cost = bit_cost;
best_max_prefix = max_prefix;
}
}
return best_max_prefix;
}
void EncodeContextMap(const std::vector<int>& context_map,
int num_clusters,
int* storage_ix, uint8_t* storage) {
@@ -436,7 +405,7 @@ void EncodeContextMap(const std::vector<int>& context_map,
std::vector<int> transformed_symbols = MoveToFrontTransform(context_map);
std::vector<int> rle_symbols;
std::vector<int> extra_bits;
int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols);
int max_run_length_prefix = 6;
RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
&rle_symbols, &extra_bits);
HistogramContextMap symbol_histogram;

View File

@@ -279,13 +279,12 @@ void ClusterHistograms(const std::vector<HistogramType>& in,
(*histogram_symbols)[i] = i;
}
// Collapse similar histograms within a block type.
if (num_contexts > 1) {
for (int i = 0; i < num_blocks; ++i) {
HistogramCombine(&(*out)[0], &cluster_size[0],
&(*histogram_symbols)[i * num_contexts], num_contexts,
max_histograms);
}
const int max_input_histograms = 64;
for (int i = 0; i < in_size; i += max_input_histograms) {
int num_to_combine = std::min(in_size - i, max_input_histograms);
HistogramCombine(&(*out)[0], &cluster_size[0],
&(*histogram_symbols)[i], num_to_combine,
max_histograms);
}
// Collapse similar histograms.