Reduce memory usage of brotli encoder at quality 10 and 11.

2024-11-25 13:00:06 +00:00 · 2016-03-15 10:50:16 +01:00 · 2016-03-15 10:50:16 +01:00 · b820c39bd9
commit b820c39bd9
parent cfba2db7b3
23 changed files with 1658 additions and 967 deletions
--- a/enc/backward_references.cc
+++ b/enc/backward_references.cc
@ -21,8 +21,6 @@ namespace brotli {
 // The maximum length for which the zopflification uses distinct distances.
 static const uint16_t kMaxZopfliLen = 325;

-static const double kInfinity = std::numeric_limits<double>::infinity();
-
 // Histogram based cost model for zopflification.
 class ZopfliCostModel {
 public:
@ -42,7 +40,7 @@ class ZopfliCostModel {
    size_t pos = position - last_insert_len;
    for (size_t i = 0; i < num_commands; i++) {
      size_t inslength = commands[i].insert_len_;
-      size_t copylength = commands[i].copy_len_;
+      size_t copylength = commands[i].copy_len();
      size_t distcode = commands[i].dist_prefix_;
      size_t cmdcode = commands[i].cmd_prefix_;

@ -56,7 +54,7 @@ class ZopfliCostModel {
      pos += inslength + copylength;
    }

-    std::vector<double> cost_literal;
+    std::vector<float> cost_literal;
    Set(histogram_literal, &cost_literal);
    Set(histogram_cmd, &cost_cmd_);
    Set(histogram_dist, &cost_dist_);
@ -77,26 +75,25 @@ class ZopfliCostModel {
                           size_t position,
                           const uint8_t* ringbuffer,
                           size_t ringbuffer_mask) {
-    std::vector<float> literal_cost(num_bytes + 1);
+    literal_costs_.resize(num_bytes + 2);
    EstimateBitCostsForLiterals(position, num_bytes, ringbuffer_mask,
-                                ringbuffer, &literal_cost[0]);
-    literal_costs_.resize(num_bytes + 1);
+                                ringbuffer, &literal_costs_[1]);
    literal_costs_[0] = 0.0;
    for (size_t i = 0; i < num_bytes; ++i) {
-      literal_costs_[i + 1] = literal_costs_[i] + literal_cost[i];
+      literal_costs_[i + 1] += literal_costs_[i];
    }
    cost_cmd_.resize(kNumCommandPrefixes);
    cost_dist_.resize(kNumDistancePrefixes);
    for (uint32_t i = 0; i < kNumCommandPrefixes; ++i) {
-      cost_cmd_[i] = FastLog2(11 + i);
+      cost_cmd_[i] = static_cast<float>(FastLog2(11 + i));
    }
    for (uint32_t i = 0; i < kNumDistancePrefixes; ++i) {
-      cost_dist_[i] = FastLog2(20 + i);
+      cost_dist_[i] = static_cast<float>(FastLog2(20 + i));
    }
-    min_cost_cmd_ = FastLog2(11);
+    min_cost_cmd_ = static_cast<float>(FastLog2(11));
  }

-  double GetCommandCost(
+  float GetCommandCost(
      size_t dist_code, size_t length_code, size_t insert_length) const {
    uint16_t inscode = GetInsertLengthCode(insert_length);
    uint16_t copycode = GetCopyLengthCode(length_code);
@ -106,29 +103,29 @@ class ZopfliCostModel {
    PrefixEncodeCopyDistance(dist_code, 0, 0, &dist_symbol, &distextra);
    uint32_t distnumextra = distextra >> 24;

-    double result =  static_cast<double>(
-        kInsExtra[inscode] + kCopyExtra[copycode] + distnumextra);
+    float result = static_cast<float>(
+        GetInsertExtra(inscode) + GetCopyExtra(copycode) + distnumextra);
    result += cost_cmd_[cmdcode];
    if (cmdcode >= 128) result += cost_dist_[dist_symbol];
    return result;
  }

-  double GetLiteralCosts(size_t from, size_t to) const {
+  float GetLiteralCosts(size_t from, size_t to) const {
    return literal_costs_[to] - literal_costs_[from];
  }

-  double GetMinCostCmd(void) const {
+  float GetMinCostCmd(void) const {
    return min_cost_cmd_;
  }

 private:
-  void Set(const std::vector<uint32_t>& histogram, std::vector<double>* cost) {
+  void Set(const std::vector<uint32_t>& histogram, std::vector<float>* cost) {
    cost->resize(histogram.size());
    size_t sum = 0;
    for (size_t i = 0; i < histogram.size(); i++) {
      sum += histogram[i];
    }
-    double log2sum = FastLog2(sum);
+    float log2sum = static_cast<float>(FastLog2(sum));
    for (size_t i = 0; i < histogram.size(); i++) {
      if (histogram[i] == 0) {
        (*cost)[i] = log2sum + 2;
@ -136,33 +133,20 @@ class ZopfliCostModel {
      }

      // Shannon bits for this symbol.
-      (*cost)[i] = log2sum - FastLog2(histogram[i]);
+      (*cost)[i] = log2sum - static_cast<float>(FastLog2(histogram[i]));

      // Cannot be coded with less than 1 bit
      if ((*cost)[i] < 1) (*cost)[i] = 1;
    }
  }

-  std::vector<double> cost_cmd_;  // The insert and copy length symbols.
-  std::vector<double> cost_dist_;
+  std::vector<float> cost_cmd_;  // The insert and copy length symbols.
+  std::vector<float> cost_dist_;
  // Cumulative costs of literals per position in the stream.
-  std::vector<double> literal_costs_;
-  double min_cost_cmd_;
+  std::vector<float> literal_costs_;
+  float min_cost_cmd_;
 };

-inline void SetDistanceCache(size_t distance,
-                             size_t distance_code,
-                             size_t max_distance,
-                             const int* dist_cache,
-                             int* result_dist_cache) {
-  if (distance <= max_distance && distance_code > 0) {
-    result_dist_cache[0] = static_cast<int>(distance);
-    memcpy(&result_dist_cache[1], dist_cache, 3 * sizeof(dist_cache[0]));
-  } else {
-    memcpy(result_dist_cache, dist_cache, 4 * sizeof(dist_cache[0]));
-  }
-}
-
 inline size_t ComputeDistanceCode(size_t distance,
                                  size_t max_distance,
                                  int quality,
@ -194,47 +178,28 @@ inline size_t ComputeDistanceCode(size_t distance,
  return distance + 15;
 }

-struct ZopfliNode {
-  ZopfliNode() : length(1),
-                 distance(0),
-                 distance_code(0),
-                 length_code(0),
-                 insert_length(0),
-                 cost(kInfinity) {}
-
-  // best length to get up to this byte (not including this byte itself)
-  uint32_t length;
-  // distance associated with the length
-  uint32_t distance;
-  uint32_t distance_code;
-  int distance_cache[4];
-  // length code associated with the length - usually the same as length,
-  // except in case of length-changing dictionary transformation.
-  uint32_t length_code;
-  // number of literal inserts before this copy
-  uint32_t insert_length;
-  // smallest cost to get to this byte from the beginning, as found so far
-  double cost;
-};
-
+// REQUIRES: len >= 2, start_pos <= pos
+// REQUIRES: cost < kInfinity, nodes[start_pos].cost < kInfinity
+// Maintains the "ZopfliNode array invariant".
 inline void UpdateZopfliNode(ZopfliNode* nodes, size_t pos, size_t start_pos,
                             size_t len, size_t len_code, size_t dist,
-                             size_t dist_code, size_t max_dist,
-                             const int* dist_cache, double cost) {
+                             size_t short_code, float cost) {
  ZopfliNode& next = nodes[pos + len];
-  next.length = static_cast<uint32_t>(len);
-  next.length_code = static_cast<uint32_t>(len_code);
-  next.distance = static_cast<uint32_t>(dist);
-  next.distance_code = static_cast<uint32_t>(dist_code);
+  next.length = static_cast<uint32_t>(len | ((len + 9u - len_code) << 24));
+  next.distance = static_cast<uint32_t>(dist | (short_code << 25));
  next.insert_length = static_cast<uint32_t>(pos - start_pos);
  next.cost = cost;
-  SetDistanceCache(dist, dist_code, max_dist, dist_cache,
-                   &next.distance_cache[0]);
 }

 // Maintains the smallest 2^k cost difference together with their positions
 class StartPosQueue {
 public:
+  struct PosData {
+    size_t pos;
+    int distance_cache[4];
+    float costdiff;
+  };
+
  explicit StartPosQueue(int bits)
      : mask_((1u << bits) - 1), q_(1 << bits), idx_(0) {}

@ -242,21 +207,15 @@ class StartPosQueue {
    idx_ = 0;
  }

-  void Push(size_t pos, double costdiff) {
-    if (costdiff == kInfinity) {
-      // We can't start a command from an unreachable start position.
-      // E.g. position 1 in a stream is always unreachable, because all commands
-      // have a copy of at least length 2.
-      return;
-    }
-    size_t offset = -idx_ & mask_;
+  void Push(const StartPosQueue::PosData& posdata) {
+    size_t offset = ~idx_ & mask_;
    ++idx_;
    size_t len = size();
-    q_[offset] = std::make_pair(pos, costdiff);
+    q_[offset] = posdata;
    /* Restore the sorted order. In the list of |len| items at most |len - 1|
       adjacent element comparisons / swaps are required. */
    for (size_t i = 1; i < len; ++i) {
-      if (q_[offset & mask_].second > q_[(offset + 1) & mask_].second) {
+      if (q_[offset & mask_].costdiff > q_[(offset + 1) & mask_].costdiff) {
        std::swap(q_[offset & mask_], q_[(offset + 1) & mask_]);
      }
      ++offset;
@ -265,32 +224,32 @@ class StartPosQueue {

  size_t size(void) const { return std::min(idx_, mask_ + 1); }

-  size_t GetStartPos(size_t k) const {
-    return q_[(k + 1 - idx_) & mask_].first;
+  const StartPosQueue::PosData& GetStartPosData(size_t k) const {
+    return q_[(k - idx_) & mask_];
  }

 private:
  const size_t mask_;
-  std::vector<std::pair<size_t, double> > q_;
+  std::vector<PosData> q_;
  size_t idx_;
 };

 // Returns the minimum possible copy length that can improve the cost of any
 // future position.
 size_t ComputeMinimumCopyLength(const StartPosQueue& queue,
-                                const std::vector<ZopfliNode>& nodes,
+                                const ZopfliNode* nodes,
                                const ZopfliCostModel& model,
-                                size_t pos,
-                                double min_cost_cmd) {
+                                const size_t num_bytes,
+                                const size_t pos) {
  // Compute the minimum possible cost of reaching any future position.
-  const size_t start0 = queue.GetStartPos(0);
-  double min_cost = (nodes[start0].cost +
-                     model.GetLiteralCosts(start0, pos) +
-                     min_cost_cmd);
+  const size_t start0 = queue.GetStartPosData(0).pos;
+  float min_cost = (nodes[start0].cost +
+                    model.GetLiteralCosts(start0, pos) +
+                    model.GetMinCostCmd());
  size_t len = 2;
  size_t next_len_bucket = 4;
  size_t next_len_offset = 10;
-  while (pos + len < nodes.size() && nodes[pos + len].cost <= min_cost) {
+  while (pos + len <= num_bytes && nodes[pos + len].cost <= min_cost) {
    // We already reached (pos + len) with no more cost than the minimum
    // possible cost of reaching anything from this pos, so there is no point in
    // looking for lengths <= len.
@ -298,7 +257,7 @@ size_t ComputeMinimumCopyLength(const StartPosQueue& queue,
    if (len == next_len_offset) {
      // We reached the next copy length code bucket, so we add one more
      // extra bit to the minimum cost.
-      min_cost += 1.0;
+      min_cost += static_cast<float>(1.0);
      next_len_offset += next_len_bucket;
      next_len_bucket *= 2;
    }
@ -306,164 +265,194 @@ size_t ComputeMinimumCopyLength(const StartPosQueue& queue,
  return len;
 }

-void ZopfliIterate(size_t num_bytes,
-                   size_t position,
-                   const uint8_t* ringbuffer,
-                   size_t ringbuffer_mask,
-                   const size_t max_backward_limit,
-                   const ZopfliCostModel& model,
-                   const std::vector<uint32_t>& num_matches,
-                   const std::vector<BackwardMatch>& matches,
-                   int* dist_cache,
-                   size_t* last_insert_len,
-                   Command* commands,
-                   size_t* num_commands,
-                   size_t* num_literals) {
-  const Command * const orig_commands = commands;
-
-  std::vector<ZopfliNode> nodes(num_bytes + 1);
-  nodes[0].length = 0;
-  nodes[0].cost = 0;
-  memcpy(nodes[0].distance_cache, dist_cache, 4 * sizeof(dist_cache[0]));
-
-  StartPosQueue queue(3);
-  const double min_cost_cmd = model.GetMinCostCmd();
-
-  size_t cur_match_pos = 0;
-  for (size_t i = 0; i + 3 < num_bytes; i++) {
-    size_t cur_ix = position + i;
-    size_t cur_ix_masked = cur_ix & ringbuffer_mask;
-    size_t max_distance = std::min(cur_ix, max_backward_limit);
-    size_t max_length = num_bytes - i;
-
-    queue.Push(i, nodes[i].cost - model.GetLiteralCosts(0, i));
-
-    const size_t min_len = ComputeMinimumCopyLength(queue, nodes, model,
-                                                    i, min_cost_cmd);
-
-    // Go over the command starting positions in order of increasing cost
-    // difference.
-    for (size_t k = 0; k < 5 && k < queue.size(); ++k) {
-      const size_t start = queue.GetStartPos(k);
-      const double start_costdiff =
-          nodes[start].cost - model.GetLiteralCosts(0, start);
-      const int* dist_cache2 = &nodes[start].distance_cache[0];
-
-      // Look for last distance matches using the distance cache from this
-      // starting position.
-      size_t best_len = min_len - 1;
-      for (size_t j = 0; j < kNumDistanceShortCodes; ++j) {
-        const size_t idx = kDistanceCacheIndex[j];
-        const size_t backward =
-            static_cast<size_t>(dist_cache2[idx] + kDistanceCacheOffset[j]);
-        size_t prev_ix = cur_ix - backward;
-        if (prev_ix >= cur_ix) {
-          continue;
-        }
-        if (PREDICT_FALSE(backward > max_distance)) {
-          continue;
-        }
-        prev_ix &= ringbuffer_mask;
-
-        if (cur_ix_masked + best_len > ringbuffer_mask ||
-            prev_ix + best_len > ringbuffer_mask ||
-            ringbuffer[cur_ix_masked + best_len] !=
-            ringbuffer[prev_ix + best_len]) {
-          continue;
-        }
-        const size_t len =
-            FindMatchLengthWithLimit(&ringbuffer[prev_ix],
-                                     &ringbuffer[cur_ix_masked],
-                                     max_length);
-        for (size_t l = best_len + 1; l <= len; ++l) {
-          const size_t inslen = i - start;
-          double cmd_cost = model.GetCommandCost(j, l, inslen);
-          double cost = start_costdiff + cmd_cost + model.GetLiteralCosts(0, i);
-          if (cost < nodes[i + l].cost) {
-            UpdateZopfliNode(&nodes[0], i, start, l, l, backward, j,
-                             max_distance, dist_cache2, cost);
-          }
-          best_len = l;
-        }
-      }
-
-      // At higher iterations look only for new last distance matches, since
-      // looking only for new command start positions with the same distances
-      // does not help much.
-      if (k >= 2) continue;
-
-      // Loop through all possible copy lengths at this position.
-      size_t len = min_len;
-      for (size_t j = 0; j < num_matches[i]; ++j) {
-        BackwardMatch match = matches[cur_match_pos + j];
-        size_t dist = match.distance;
-        bool is_dictionary_match = dist > max_distance;
-        // We already tried all possible last distance matches, so we can use
-        // normal distance code here.
-        size_t dist_code = dist + 15;
-        // Try all copy lengths up until the maximum copy length corresponding
-        // to this distance. If the distance refers to the static dictionary, or
-        // the maximum length is long enough, try only one maximum length.
-        size_t max_len = match.length();
-        if (len < max_len && (is_dictionary_match || max_len > kMaxZopfliLen)) {
-          len = max_len;
-        }
-        for (; len <= max_len; ++len) {
-          size_t len_code = is_dictionary_match ? match.length_code() : len;
-          const size_t inslen = i - start;
-          double cmd_cost = model.GetCommandCost(dist_code, len_code, inslen);
-          double cost = start_costdiff + cmd_cost + model.GetLiteralCosts(0, i);
-          if (cost < nodes[i + len].cost) {
-            UpdateZopfliNode(&nodes[0], i, start, len, len_code, dist,
-                             dist_code, max_distance, dist_cache2, cost);
-          }
-        }
-      }
+// Fills in dist_cache[0..3] with the last four distances (as defined by
+// Section 4. of the Spec) that would be used at (block_start + pos) if we
+// used the shortest path of commands from block_start, computed from
+// nodes[0..pos]. The last four distances at block_start are in
+// starting_dist_cache[0..3].
+// REQUIRES: nodes[pos].cost < kInfinity
+// REQUIRES: nodes[0..pos] satisfies the "ZopfliNode array invariant".
+void ComputeDistanceCache(const size_t block_start,
+                          const size_t pos,
+                          const size_t max_backward,
+                          const int* starting_dist_cache,
+                          const ZopfliNode* nodes,
+                          int* dist_cache) {
+  int idx = 0;
+  size_t p = pos;
+  // Because of prerequisite, does at most (pos + 1) / 2 iterations.
+  while (idx < 4 && p > 0) {
+    const size_t clen = nodes[p].copy_length();
+    const size_t ilen = nodes[p].insert_length;
+    const size_t dist = nodes[p].copy_distance();
+    // Since block_start + p is the end position of the command, the copy part
+    // starts from block_start + p - clen. Distances that are greater than this
+    // or greater than max_backward are static dictionary references, and do
+    // not update the last distances. Also distance code 0 (last distance)
+    // does not update the last distances.
+    if (dist + clen <= block_start + p && dist <= max_backward &&
+        nodes[p].distance_code() > 0) {
+      dist_cache[idx++] = static_cast<int>(dist);
    }
+    // Because of prerequisite, p >= clen + ilen >= 2.
+    p -= clen + ilen;
+  }
+  for (; idx < 4; ++idx) {
+    dist_cache[idx] = *starting_dist_cache++;
+  }
+}

-    cur_match_pos += num_matches[i];
+void UpdateNodes(const size_t num_bytes,
+                 const size_t block_start,
+                 const size_t pos,
+                 const uint8_t* ringbuffer,
+                 const size_t ringbuffer_mask,
+                 const size_t max_backward_limit,
+                 const int* starting_dist_cache,
+                 const size_t num_matches,
+                 const BackwardMatch* matches,
+                 const ZopfliCostModel* model,
+                 StartPosQueue* queue,
+                 ZopfliNode* nodes) {
+  size_t cur_ix = block_start + pos;
+  size_t cur_ix_masked = cur_ix & ringbuffer_mask;
+  size_t max_distance = std::min(cur_ix, max_backward_limit);

-    // The zopflification can be too slow in case of very long lengths, so in
-    // such case skip it all, it does not cost a lot of compression ratio.
-    if (num_matches[i] == 1 &&
-        matches[cur_match_pos - 1].length() > kMaxZopfliLen) {
-      i += matches[cur_match_pos - 1].length() - 1;
-      queue.Clear();
-    }
+  if (nodes[pos].cost <= model->GetLiteralCosts(0, pos)) {
+    StartPosQueue::PosData posdata;
+    posdata.pos = pos;
+    posdata.costdiff = nodes[pos].cost - model->GetLiteralCosts(0, pos);
+    ComputeDistanceCache(block_start, pos, max_backward_limit,
+                         starting_dist_cache, nodes, posdata.distance_cache);
+    queue->Push(posdata);
  }

-  std::vector<uint32_t> backwards;
+  const size_t min_len = ComputeMinimumCopyLength(
+      *queue, nodes, *model, num_bytes, pos);
+
+  // Go over the command starting positions in order of increasing cost
+  // difference.
+  for (size_t k = 0; k < 5 && k < queue->size(); ++k) {
+    const StartPosQueue::PosData& posdata = queue->GetStartPosData(k);
+    const size_t start = posdata.pos;
+    const float start_costdiff = posdata.costdiff;
+
+    // Look for last distance matches using the distance cache from this
+    // starting position.
+    size_t best_len = min_len - 1;
+    for (size_t j = 0; j < kNumDistanceShortCodes; ++j) {
+      const size_t idx = kDistanceCacheIndex[j];
+      const size_t backward = static_cast<size_t>(posdata.distance_cache[idx] +
+                                                  kDistanceCacheOffset[j]);
+      size_t prev_ix = cur_ix - backward;
+      if (prev_ix >= cur_ix) {
+        continue;
+      }
+      if (PREDICT_FALSE(backward > max_distance)) {
+        continue;
+      }
+      prev_ix &= ringbuffer_mask;
+
+      if (cur_ix_masked + best_len > ringbuffer_mask ||
+          prev_ix + best_len > ringbuffer_mask ||
+          ringbuffer[cur_ix_masked + best_len] !=
+          ringbuffer[prev_ix + best_len]) {
+        continue;
+      }
+      const size_t len =
+          FindMatchLengthWithLimit(&ringbuffer[prev_ix],
+                                   &ringbuffer[cur_ix_masked],
+                                   num_bytes - pos);
+      for (size_t l = best_len + 1; l <= len; ++l) {
+        const size_t inslen = pos - start;
+        float cmd_cost = model->GetCommandCost(j, l, inslen);
+        float cost = start_costdiff + cmd_cost + model->GetLiteralCosts(0, pos);
+        if (cost < nodes[pos + l].cost) {
+          UpdateZopfliNode(&nodes[0], pos, start, l, l, backward, j + 1, cost);
+        }
+        best_len = l;
+      }
+    }
+
+    // At higher iterations look only for new last distance matches, since
+    // looking only for new command start positions with the same distances
+    // does not help much.
+    if (k >= 2) continue;
+
+    // Loop through all possible copy lengths at this position.
+    size_t len = min_len;
+    for (size_t j = 0; j < num_matches; ++j) {
+      BackwardMatch match = matches[j];
+      size_t dist = match.distance;
+      bool is_dictionary_match = dist > max_distance;
+      // We already tried all possible last distance matches, so we can use
+      // normal distance code here.
+      size_t dist_code = dist + 15;
+      // Try all copy lengths up until the maximum copy length corresponding
+      // to this distance. If the distance refers to the static dictionary, or
+      // the maximum length is long enough, try only one maximum length.
+      size_t max_len = match.length();
+      if (len < max_len && (is_dictionary_match || max_len > kMaxZopfliLen)) {
+        len = max_len;
+      }
+      for (; len <= max_len; ++len) {
+        size_t len_code = is_dictionary_match ? match.length_code() : len;
+        const size_t inslen = pos - start;
+        float cmd_cost = model->GetCommandCost(dist_code, len_code, inslen);
+        float cost = start_costdiff + cmd_cost + model->GetLiteralCosts(0, pos);
+        if (cost < nodes[pos + len].cost) {
+          UpdateZopfliNode(&nodes[0], pos, start, len, len_code, dist, 0, cost);
+        }
+      }
+    }
+  }
+}
+
+void ComputeShortestPathFromNodes(size_t num_bytes,
+                                  const ZopfliNode* nodes,
+                                  std::vector<uint32_t>* path) {
+  std::vector<uint32_t> backwards(num_bytes / 2 + 1);
  size_t index = num_bytes;
  while (nodes[index].cost == kInfinity) --index;
+  size_t num_commands = 0;
  while (index != 0) {
-    size_t len = nodes[index].length + nodes[index].insert_length;
-    backwards.push_back(static_cast<uint32_t>(len));
+    size_t len = nodes[index].command_length();
+    backwards[num_commands++] = static_cast<uint32_t>(len);
    index -= len;
  }
-
-  std::vector<uint32_t> path;
-  for (size_t i = backwards.size(); i > 0; i--) {
-    path.push_back(backwards[i - 1]);
+  path->resize(num_commands);
+  for (size_t i = num_commands, j = 0; i > 0; --i, ++j) {
+    (*path)[j] = backwards[i - 1];
  }
+}

+void ZopfliCreateCommands(const size_t num_bytes,
+                          const size_t block_start,
+                          const size_t max_backward_limit,
+                          const std::vector<uint32_t>& path,
+                          const ZopfliNode* nodes,
+                          int* dist_cache,
+                          size_t* last_insert_len,
+                          Command* commands,
+                          size_t* num_literals) {
  size_t pos = 0;
  for (size_t i = 0; i < path.size(); i++) {
    const ZopfliNode& next = nodes[pos + path[i]];
-    size_t copy_length = next.length;
+    size_t copy_length = next.copy_length();
    size_t insert_length = next.insert_length;
    pos += insert_length;
    if (i == 0) {
      insert_length += *last_insert_len;
      *last_insert_len = 0;
    }
-    size_t distance = next.distance;
-    size_t len_code = next.length_code;
-    size_t max_distance = std::min(position + pos, max_backward_limit);
+    size_t distance = next.copy_distance();
+    size_t len_code = next.length_code();
+    size_t max_distance = std::min(block_start + pos, max_backward_limit);
    bool is_dictionary = (distance > max_distance);
-    size_t dist_code = next.distance_code;
+    size_t dist_code = next.distance_code();

    Command cmd(insert_length, copy_length, len_code, dist_code);
-    *commands++ = cmd;
+    commands[i] = cmd;

    if (!is_dictionary && dist_code > 0) {
      dist_cache[3] = dist_cache[2];
@ -473,11 +462,85 @@ void ZopfliIterate(size_t num_bytes,
    }

    *num_literals += insert_length;
-    insert_length = 0;
    pos += copy_length;
  }
  *last_insert_len += num_bytes - pos;
-  *num_commands += static_cast<size_t>(commands - orig_commands);
+}
+
+void ZopfliIterate(size_t num_bytes,
+                   size_t position,
+                   const uint8_t* ringbuffer,
+                   size_t ringbuffer_mask,
+                   const size_t max_backward_limit,
+                   const int* dist_cache,
+                   const ZopfliCostModel& model,
+                   const std::vector<uint32_t>& num_matches,
+                   const std::vector<BackwardMatch>& matches,
+                   ZopfliNode* nodes,
+                   std::vector<uint32_t>* path) {
+  nodes[0].length = 0;
+  nodes[0].cost = 0;
+  StartPosQueue queue(3);
+  size_t cur_match_pos = 0;
+  for (size_t i = 0; i + 3 < num_bytes; i++) {
+    UpdateNodes(num_bytes, position, i, ringbuffer, ringbuffer_mask,
+                max_backward_limit, dist_cache, num_matches[i],
+                &matches[cur_match_pos], &model, &queue, &nodes[0]);
+    cur_match_pos += num_matches[i];
+    // The zopflification can be too slow in case of very long lengths, so in
+    // such case skip it all, it does not cost a lot of compression ratio.
+    if (num_matches[i] == 1 &&
+        matches[cur_match_pos - 1].length() > kMaxZopfliLen) {
+      i += matches[cur_match_pos - 1].length() - 1;
+      queue.Clear();
+    }
+  }
+  ComputeShortestPathFromNodes(num_bytes, &nodes[0], path);
+}
+
+
+void ZopfliComputeShortestPath(size_t num_bytes,
+                               size_t position,
+                               const uint8_t* ringbuffer,
+                               size_t ringbuffer_mask,
+                               const size_t max_backward_limit,
+                               const int* dist_cache,
+                               Hashers::H10* hasher,
+                               ZopfliNode* nodes,
+                               std::vector<uint32_t>* path) {
+  nodes[0].length = 0;
+  nodes[0].cost = 0;
+  ZopfliCostModel* model = new ZopfliCostModel;
+  model->SetFromLiteralCosts(num_bytes, position,
+                             ringbuffer, ringbuffer_mask);
+  StartPosQueue queue(3);
+  BackwardMatch matches[Hashers::H10::kMaxNumMatches];
+  for (size_t i = 0; i + 3 < num_bytes; i++) {
+    const size_t max_distance = std::min(position + i, max_backward_limit);
+    size_t num_matches = hasher->FindAllMatches(
+        ringbuffer, ringbuffer_mask, position + i, num_bytes - i, max_distance,
+        matches);
+    if (num_matches > 0 &&
+        matches[num_matches - 1].length() > kMaxZopfliLen) {
+      matches[0] = matches[num_matches - 1];
+      num_matches = 1;
+    }
+    UpdateNodes(num_bytes, position, i, ringbuffer, ringbuffer_mask,
+                max_backward_limit, dist_cache, num_matches, matches,
+                model, &queue, nodes);
+    if (num_matches == 1 && matches[0].length() > kMaxZopfliLen) {
+      for (size_t j = 1; j < matches[0].length() && i + 4 < num_bytes; ++j) {
+        ++i;
+        if (matches[0].length() - j < 64 &&
+            num_bytes - i >= kMaxTreeCompLength) {
+          hasher->Store(ringbuffer, ringbuffer_mask, position + i);
+        }
+      }
+      queue.Clear();
+    }
+  }
+  delete model;
+  ComputeShortestPathFromNodes(num_bytes, nodes, path);
 }

 template<typename Hasher>
@ -527,7 +590,7 @@ void CreateBackwardReferences(size_t num_bytes,
  size_t apply_random_heuristics = i + random_heuristics_window_size;

  // Minimum score to accept a backward reference.
-  const int kMinScore = 4.0;
+  const double kMinScore = 4.0;

  while (i + Hasher::kHashTypeLength - 1 < i_end) {
    size_t max_length = i_end - i;
@ -649,16 +712,23 @@ void CreateBackwardReferences(size_t num_bytes,
  if (zopflify) {
    Hashers::H10* hasher = hashers->hash_h10;
    hasher->Init(lgwin, position, num_bytes, is_last);
-    if (num_bytes >= 3 && position >= kMaxTreeCompLength) {
-      // Store the last `kMaxTreeCompLength - 1` positions in the hasher.
-      // These could not be calculated before, since they require knowledge
-      // of both the previous and the current block.
-      for (size_t i = position - kMaxTreeCompLength + 1; i < position; ++i) {
-        hasher->Store(ringbuffer, ringbuffer_mask, i, num_bytes + position - i);
-      }
-    }
+    hasher->StitchToPreviousBlock(num_bytes, position,
+                                  ringbuffer, ringbuffer_mask);
    // Set maximum distance, see section 9.1. of the spec.
    const size_t max_backward_limit = (1 << lgwin) - 16;
+    if (quality == 10) {
+      std::vector<ZopfliNode> nodes(num_bytes + 1);
+      std::vector<uint32_t> path;
+      ZopfliComputeShortestPath(num_bytes, position,
+                                ringbuffer, ringbuffer_mask,
+                                max_backward_limit, dist_cache, hasher,
+                                &nodes[0], &path);
+      ZopfliCreateCommands(num_bytes, position, max_backward_limit, path,
+                           &nodes[0], dist_cache, last_insert_len, commands,
+                           num_literals);
+      *num_commands += path.size();
+      return;
+    }
    std::vector<uint32_t> num_matches(num_bytes);
    std::vector<BackwardMatch> matches(4 * num_bytes);
    size_t cur_match_pos = 0;
@ -686,9 +756,8 @@ void CreateBackwardReferences(size_t num_bytes,
          num_matches[i] = 1;
          for (size_t j = 1; j < match_len; ++j) {
            ++i;
-            if (match_len - j < 64) {
-              hasher->Store(ringbuffer, ringbuffer_mask, position + i,
-                            num_bytes - i);
+            if (match_len - j < 64 && num_bytes - i >= kMaxTreeCompLength) {
+              hasher->Store(ringbuffer, ringbuffer_mask, position + i);
            }
            num_matches[i] = 0;
          }
@ -719,9 +788,15 @@ void CreateBackwardReferences(size_t num_bytes,
      *num_literals = orig_num_literals;
      *last_insert_len = orig_last_insert_len;
      memcpy(dist_cache, orig_dist_cache, 4 * sizeof(dist_cache[0]));
+      std::vector<ZopfliNode> nodes(num_bytes + 1);
+      std::vector<uint32_t> path;
      ZopfliIterate(num_bytes, position, ringbuffer, ringbuffer_mask,
-                    max_backward_limit, model, num_matches, matches, dist_cache,
-                    last_insert_len, commands, num_commands, num_literals);
+                    max_backward_limit, dist_cache, model, num_matches, matches,
+                    &nodes[0], &path);
+      ZopfliCreateCommands(num_bytes, position, max_backward_limit, path,
+                           &nodes[0], dist_cache, last_insert_len, commands,
+                           num_literals);
+      *num_commands += path.size();
    }
    return;
  }
--- a/enc/backward_references.h
+++ b/enc/backward_references.h
@ -9,6 +9,8 @@
 #ifndef BROTLI_ENC_BACKWARD_REFERENCES_H_
 #define BROTLI_ENC_BACKWARD_REFERENCES_H_

+#include <vector>
+
 #include "./hash.h"
 #include "./command.h"
 #include "./types.h"
@ -34,6 +36,81 @@ void CreateBackwardReferences(size_t num_bytes,
                              size_t* num_commands,
                              size_t* num_literals);

+static const float kInfinity = std::numeric_limits<float>::infinity();
+
+struct ZopfliNode {
+  ZopfliNode(void) : length(1),
+                     distance(0),
+                     insert_length(0),
+                     cost(kInfinity) {}
+
+  inline uint32_t copy_length() const {
+    return length & 0xffffff;
+  }
+
+  inline uint32_t length_code() const {
+    const uint32_t modifier = length >> 24;
+    return copy_length() + 9u - modifier;
+  }
+
+  inline uint32_t copy_distance() const {
+    return distance & 0x1ffffff;
+  }
+
+  inline uint32_t distance_code() const {
+    const uint32_t short_code = distance >> 25;
+    return short_code == 0 ? copy_distance() + 15 : short_code - 1;
+  }
+
+  inline uint32_t command_length() const {
+    return copy_length() + insert_length;
+  }
+
+  // best length to get up to this byte (not including this byte itself)
+  // highest 8 bits are used to reconstruct the length code
+  uint32_t length;
+  // distance associated with the length
+  // highest 7 bits contain distance short code + 1 (or zero if no short code)
+  uint32_t distance;
+  // number of literal inserts before this copy
+  uint32_t insert_length;
+  // smallest cost to get to this byte from the beginning, as found so far
+  float cost;
+};
+
+// Computes the shortest path of commands from position to at most
+// position + num_bytes.
+//
+// On return, path->size() is the number of commands found and path[i] is the
+// length of the ith command (copy length plus insert length).
+// Note that the sum of the lengths of all commands can be less than num_bytes.
+//
+// On return, the nodes[0..num_bytes] array will have the following
+// "ZopfliNode array invariant":
+// For each i in [1..num_bytes], if nodes[i].cost < kInfinity, then
+//   (1) nodes[i].copy_length() >= 2
+//   (2) nodes[i].command_length() <= i and
+//   (3) nodes[i - nodes[i].command_length()].cost < kInfinity
+void ZopfliComputeShortestPath(size_t num_bytes,
+                               size_t position,
+                               const uint8_t* ringbuffer,
+                               size_t ringbuffer_mask,
+                               const size_t max_backward_limit,
+                               const int* dist_cache,
+                               Hashers::H10* hasher,
+                               ZopfliNode* nodes,
+                               std::vector<uint32_t>* path);
+
+void ZopfliCreateCommands(const size_t num_bytes,
+                          const size_t block_start,
+                          const size_t max_backward_limit,
+                          const std::vector<uint32_t>& path,
+                          const ZopfliNode* nodes,
+                          int* dist_cache,
+                          size_t* last_insert_len,
+                          Command* commands,
+                          size_t* num_literals);
+
 }  // namespace brotli

 #endif  // BROTLI_ENC_BACKWARD_REFERENCES_H_
--- a/enc/bit_cost.h
+++ b/enc/bit_cost.h
@ -48,38 +48,62 @@ static inline double BitsEntropy(const uint32_t *population, size_t size) {
  return retval;
 }

-
 template<int kSize>
 double PopulationCost(const Histogram<kSize>& histogram) {
+  static const double kOneSymbolHistogramCost = 12;
+  static const double kTwoSymbolHistogramCost = 20;
+  static const double kThreeSymbolHistogramCost = 28;
+  static const double kFourSymbolHistogramCost = 37;
  if (histogram.total_count_ == 0) {
-    return 12;
+    return kOneSymbolHistogramCost;
  }
  int count = 0;
+  int s[5];
  for (int i = 0; i < kSize; ++i) {
    if (histogram.data_[i] > 0) {
+      s[count] = i;
      ++count;
+      if (count > 4) break;
    }
  }
  if (count == 1) {
-    return 12;
+    return kOneSymbolHistogramCost;
  }
  if (count == 2) {
-    return static_cast<double>(20 + histogram.total_count_);
+    return (kTwoSymbolHistogramCost +
+            static_cast<double>(histogram.total_count_));
  }
-  double bits = 0;
-  uint8_t depth_array[kSize] = { 0 };
-  if (count <= 4) {
-    // For very low symbol count we build the Huffman tree.
-    CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth_array);
-    for (int i = 0; i < kSize; ++i) {
-      bits += histogram.data_[i] * depth_array[i];
+  if (count == 3) {
+    const uint32_t histo0 = histogram.data_[s[0]];
+    const uint32_t histo1 = histogram.data_[s[1]];
+    const uint32_t histo2 = histogram.data_[s[2]];
+    const uint32_t histomax = std::max(histo0, std::max(histo1, histo2));
+    return (kThreeSymbolHistogramCost +
+            2 * (histo0 + histo1 + histo2) - histomax);
+  }
+  if (count == 4) {
+    uint32_t histo[4];
+    for (int i = 0; i < 4; ++i) {
+      histo[i] = histogram.data_[s[i]];
    }
-    return count == 3 ? bits + 28 : bits + 37;
+    // Sort the four counts in descending order.
+    for (int i = 0; i < 4; ++i) {
+      for (int j = i + 1; j < 4; ++j) {
+        if (histo[j] > histo[i]) {
+          std::swap(histo[j], histo[i]);
+        }
+      }
+    }
+    const uint32_t h23 = histo[2] + histo[3];
+    const uint32_t histomax = std::max(h23, histo[0]);
+    return (kFourSymbolHistogramCost +
+            3 * h23 + 2 * (histo[0] + histo[1]) - histomax);
  }

  // In this loop we compute the entropy of the histogram and simultaneously
  // build a simplified histogram of the code length codes where we use the
  // zero repeat code 17, but we don't use the non-zero repeat code 16.
+  double bits = 0;
  size_t max_depth = 1;
  uint32_t depth_histo[kCodeLengthCodes] = { 0 };
  const double log2total = FastLog2(histogram.total_count_);
--- a/enc/block_splitter.cc
+++ b/enc/block_splitter.cc
@ -13,7 +13,7 @@

 #include <algorithm>
 #include <cstring>
-#include <map>
+#include <vector>

 #include "./cluster.h"
 #include "./command.h"
@ -70,20 +70,7 @@ void CopyLiteralsToByteArray(const Command* cmds,
      memcpy(&(*literals)[pos], data + from_pos, insert_len);
      pos += insert_len;
    }
-    from_pos = (from_pos + insert_len + cmds[i].copy_len_) & mask;
-  }
-}
-
-void CopyCommandsToByteArray(const Command* cmds,
-                             const size_t num_commands,
-                             std::vector<uint16_t>* insert_and_copy_codes,
-                             std::vector<uint16_t>* distance_prefixes) {
-  for (size_t i = 0; i < num_commands; ++i) {
-    const Command& cmd = cmds[i];
-    insert_and_copy_codes->push_back(cmd.cmd_prefix_);
-    if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
-      distance_prefixes->push_back(cmd.dist_prefix_);
-    }
+    from_pos = (from_pos + insert_len + cmds[i].copy_len()) & mask;
  }
 }

@ -97,27 +84,23 @@ inline static unsigned int MyRand(unsigned int* seed) {

 template<typename HistogramType, typename DataType>
 void InitialEntropyCodes(const DataType* data, size_t length,
-                         size_t literals_per_histogram,
-                         size_t max_histograms,
                         size_t stride,
-                         std::vector<HistogramType>* vec) {
-  size_t total_histograms = length / literals_per_histogram + 1;
-  if (total_histograms > max_histograms) {
-    total_histograms = max_histograms;
+                         size_t num_histograms,
+                         HistogramType* histograms) {
+  for (size_t i = 0; i < num_histograms; ++i) {
+    histograms[i].Clear();
  }
  unsigned int seed = 7;
-  size_t block_length = length / total_histograms;
-  for (size_t i = 0; i < total_histograms; ++i) {
-    size_t pos = length * i / total_histograms;
+  size_t block_length = length / num_histograms;
+  for (size_t i = 0; i < num_histograms; ++i) {
+    size_t pos = length * i / num_histograms;
    if (i != 0) {
      pos += MyRand(&seed) % block_length;
    }
    if (pos + stride >= length) {
      pos = length - stride - 1;
    }
-    HistogramType histo;
-    histo.Add(data + pos, stride);
-    vec->push_back(histo);
+    histograms[i].Add(data + pos, stride);
  }
 }

@ -140,16 +123,17 @@ void RandomSample(unsigned int* seed,
 template<typename HistogramType, typename DataType>
 void RefineEntropyCodes(const DataType* data, size_t length,
                        size_t stride,
-                        std::vector<HistogramType>* vec) {
+                        size_t num_histograms,
+                        HistogramType* histograms) {
  size_t iters =
      kIterMulForRefining * length / stride + kMinItersForRefining;
  unsigned int seed = 7;
-  iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
+  iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms;
  for (size_t iter = 0; iter < iters; ++iter) {
    HistogramType sample;
    RandomSample(&seed, data, length, stride, &sample);
-    size_t ix = iter % vec->size();
-    (*vec)[ix].AddHistogram(sample);
+    size_t ix = iter % num_histograms;
+    histograms[ix].AddHistogram(sample);
  }
 }

@ -157,34 +141,40 @@ inline static double BitCost(size_t count) {
  return count == 0 ? -2.0 : FastLog2(count);
 }

+// Assigns a block id from the range [0, num_histograms) to each data element
+// in data[0..length) and fills in block_id[0..length) with the assigned values.
+// Returns the number of blocks, i.e. one plus the number of block switches.
 template<typename DataType, int kSize>
-void FindBlocks(const DataType* data, const size_t length,
-                const double block_switch_bitcost,
-                const std::vector<Histogram<kSize> > &vec,
-                uint8_t *block_id) {
-  if (vec.size() <= 1) {
+size_t FindBlocks(const DataType* data, const size_t length,
+                  const double block_switch_bitcost,
+                  const size_t num_histograms,
+                  const Histogram<kSize>* histograms,
+                  double* insert_cost,
+                  double* cost,
+                  uint8_t* switch_signal,
+                  uint8_t *block_id) {
+  if (num_histograms <= 1) {
    for (size_t i = 0; i < length; ++i) {
      block_id[i] = 0;
    }
-    return;
+    return 1;
  }
-  size_t vecsize = vec.size();
-  assert(vecsize <= 256);
-  double* insert_cost = new double[kSize * vecsize];
-  memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
-  for (size_t j = 0; j < vecsize; ++j) {
-    insert_cost[j] = FastLog2(static_cast<uint32_t>(vec[j].total_count_));
+  const size_t bitmaplen = (num_histograms + 7) >> 3;
+  assert(num_histograms <= 256);
+  memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * num_histograms);
+  for (size_t j = 0; j < num_histograms; ++j) {
+    insert_cost[j] = FastLog2(static_cast<uint32_t>(
+        histograms[j].total_count_));
  }
  for (size_t i = kSize; i != 0;) {
    --i;
-    for (size_t j = 0; j < vecsize; ++j) {
-      insert_cost[i * vecsize + j] = insert_cost[j] - BitCost(vec[j].data_[i]);
+    for (size_t j = 0; j < num_histograms; ++j) {
+      insert_cost[i * num_histograms + j] =
+          insert_cost[j] - BitCost(histograms[j].data_[i]);
    }
  }
-  double *cost = new double[vecsize];
-  memset(cost, 0, sizeof(cost[0]) * vecsize);
-  bool* switch_signal = new bool[length * vecsize];
-  memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
+  memset(cost, 0, sizeof(cost[0]) * num_histograms);
+  memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmaplen);
  // After each iteration of this loop, cost[k] will contain the difference
  // between the minimum cost of arriving at the current byte position using
  // entropy code k, and the minimum cost of arriving at the current byte
@ -192,10 +182,10 @@ void FindBlocks(const DataType* data, const size_t length,
  // reaches block switch cost, it means that when we trace back from the last
  // position, we need to switch here.
  for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
-    size_t ix = byte_ix * vecsize;
-    size_t insert_cost_ix = data[byte_ix] * vecsize;
+    size_t ix = byte_ix * bitmaplen;
+    size_t insert_cost_ix = data[byte_ix] * num_histograms;
    double min_cost = 1e99;
-    for (size_t k = 0; k < vecsize; ++k) {
+    for (size_t k = 0; k < num_histograms; ++k) {
      // We are coding the symbol in data[byte_ix] with entropy code k.
      cost[k] += insert_cost[insert_cost_ix + k];
      if (cost[k] < min_cost) {
@ -208,110 +198,200 @@ void FindBlocks(const DataType* data, const size_t length,
    if (byte_ix < 2000) {
      block_switch_cost *= 0.77 + 0.07 * static_cast<double>(byte_ix) / 2000;
    }
-    for (size_t k = 0; k < vecsize; ++k) {
+    for (size_t k = 0; k < num_histograms; ++k) {
      cost[k] -= min_cost;
      if (cost[k] >= block_switch_cost) {
        cost[k] = block_switch_cost;
-        switch_signal[ix + k] = true;
+        const uint8_t mask = static_cast<uint8_t>(1u << (k & 7));
+        assert((k >> 3) < bitmaplen);
+        switch_signal[ix + (k >> 3)] |= mask;
      }
    }
  }
  // Now trace back from the last position and switch at the marked places.
  size_t byte_ix = length - 1;
-  size_t ix = byte_ix * vecsize;
+  size_t ix = byte_ix * bitmaplen;
  uint8_t cur_id = block_id[byte_ix];
+  size_t num_blocks = 1;
  while (byte_ix > 0) {
    --byte_ix;
-    ix -= vecsize;
-    if (switch_signal[ix + cur_id]) {
-      cur_id = block_id[byte_ix];
+    ix -= bitmaplen;
+    const uint8_t mask = static_cast<uint8_t>(1u << (cur_id & 7));
+    assert((static_cast<size_t>(cur_id) >> 3) < bitmaplen);
+    if (switch_signal[ix + (cur_id >> 3)] & mask) {
+      if (cur_id != block_id[byte_ix]) {
+        cur_id = block_id[byte_ix];
+        ++num_blocks;
+      }
    }
    block_id[byte_ix] = cur_id;
  }
-  delete[] insert_cost;
-  delete[] cost;
-  delete[] switch_signal;
+  return num_blocks;
 }

-size_t RemapBlockIds(uint8_t* block_ids, const size_t length) {
-  std::map<uint8_t, uint8_t> new_id;
-  size_t next_id = 0;
+size_t RemapBlockIds(uint8_t* block_ids, const size_t length,
+                     uint16_t* new_id, const size_t num_histograms) {
+  static const uint16_t kInvalidId = 256;
+  for (size_t i = 0; i < num_histograms; ++i) {
+    new_id[i] = kInvalidId;
+  }
+  uint16_t next_id = 0;
  for (size_t i = 0; i < length; ++i) {
-    if (new_id.find(block_ids[i]) == new_id.end()) {
-      new_id[block_ids[i]] = static_cast<uint8_t>(next_id);
-      ++next_id;
+    assert(block_ids[i] < num_histograms);
+    if (new_id[block_ids[i]] == kInvalidId) {
+      new_id[block_ids[i]] = next_id++;
    }
  }
  for (size_t i = 0; i < length; ++i) {
-    block_ids[i] = new_id[block_ids[i]];
+    block_ids[i] = static_cast<uint8_t>(new_id[block_ids[i]]);
+    assert(block_ids[i] < num_histograms);
  }
+  assert(next_id <= num_histograms);
  return next_id;
 }

 template<typename HistogramType, typename DataType>
 void BuildBlockHistograms(const DataType* data, const size_t length,
-                          uint8_t* block_ids,
-                          std::vector<HistogramType>* histograms) {
-  size_t num_types = RemapBlockIds(block_ids, length);
-  assert(num_types <= 256);
-  histograms->clear();
-  histograms->resize(num_types);
+                          const uint8_t* block_ids,
+                          const size_t num_histograms,
+                          HistogramType* histograms) {
+  for (size_t i = 0; i < num_histograms; ++i) {
+    histograms[i].Clear();
+  }
  for (size_t i = 0; i < length; ++i) {
-    (*histograms)[block_ids[i]].Add(data[i]);
+    histograms[block_ids[i]].Add(data[i]);
  }
 }

 template<typename HistogramType, typename DataType>
 void ClusterBlocks(const DataType* data, const size_t length,
-                   uint8_t* block_ids) {
-  std::vector<HistogramType> histograms;
-  std::vector<uint32_t> block_index(length);
-  uint32_t cur_idx = 0;
-  HistogramType cur_histogram;
-  for (size_t i = 0; i < length; ++i) {
-    bool block_boundary = (i + 1 == length || block_ids[i] != block_ids[i + 1]);
-    block_index[i] = cur_idx;
-    cur_histogram.Add(data[i]);
-    if (block_boundary) {
-      histograms.push_back(cur_histogram);
-      cur_histogram.Clear();
-      ++cur_idx;
-    }
-  }
-  std::vector<HistogramType> clustered_histograms;
-  std::vector<uint32_t> histogram_symbols;
-  // Block ids need to fit in one byte.
+                   const size_t num_blocks,
+                   uint8_t* block_ids,
+                   BlockSplit* split) {
  static const size_t kMaxNumberOfBlockTypes = 256;
-  ClusterHistograms(histograms, 1, histograms.size(),
-                    kMaxNumberOfBlockTypes,
-                    &clustered_histograms,
-                    &histogram_symbols);
-  for (size_t i = 0; i < length; ++i) {
-    block_ids[i] = static_cast<uint8_t>(histogram_symbols[block_index[i]]);
-  }
-}
+  static const size_t kHistogramsPerBatch = 64;
+  static const size_t kClustersPerBatch = 16;
+  std::vector<uint32_t> histogram_symbols(num_blocks);
+  std::vector<uint32_t> block_lengths(num_blocks);

-void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
-  uint8_t cur_id = block_ids[0];
-  uint8_t max_type = cur_id;
-  uint32_t cur_length = 1;
-  for (size_t i = 1; i < block_ids.size(); ++i) {
-    uint8_t next_id = block_ids[i];
-    if (next_id != cur_id) {
-      split->types.push_back(cur_id);
-      split->lengths.push_back(cur_length);
-      max_type = std::max(max_type, next_id);
-      cur_id = next_id;
-      cur_length = 0;
+  size_t block_idx = 0;
+  for (size_t i = 0; i < length; ++i) {
+    assert(block_idx < num_blocks);
+    ++block_lengths[block_idx];
+    if (i + 1 == length || block_ids[i] != block_ids[i + 1]) {
+      ++block_idx;
    }
-    ++cur_length;
  }
-  split->types.push_back(cur_id);
-  split->lengths.push_back(cur_length);
+  assert(block_idx == num_blocks);
+
+  const size_t expected_num_clusters =
+      kClustersPerBatch *
+      (num_blocks + kHistogramsPerBatch - 1) / kHistogramsPerBatch;
+  std::vector<HistogramType> all_histograms;
+  std::vector<uint32_t> cluster_size;
+  all_histograms.reserve(expected_num_clusters);
+  cluster_size.reserve(expected_num_clusters);
+  size_t num_clusters = 0;
+  std::vector<HistogramType> histograms(
+      std::min(num_blocks, kHistogramsPerBatch));
+  size_t max_num_pairs = kHistogramsPerBatch * kHistogramsPerBatch / 2;
+  std::vector<HistogramPair> pairs(max_num_pairs + 1);
+  size_t pos = 0;
+  for (size_t i = 0; i < num_blocks; i += kHistogramsPerBatch) {
+    const size_t num_to_combine = std::min(num_blocks - i, kHistogramsPerBatch);
+    uint32_t sizes[kHistogramsPerBatch];
+    uint32_t clusters[kHistogramsPerBatch];
+    uint32_t symbols[kHistogramsPerBatch];
+    uint32_t remap[kHistogramsPerBatch];
+    for (size_t j = 0; j < num_to_combine; ++j) {
+      histograms[j].Clear();
+      for (size_t k = 0; k < block_lengths[i + j]; ++k) {
+        histograms[j].Add(data[pos++]);
+      }
+      histograms[j].bit_cost_ = PopulationCost(histograms[j]);
+      symbols[j] = clusters[j] = static_cast<uint32_t>(j);
+      sizes[j] = 1;
+    }
+    size_t num_new_clusters = HistogramCombine(
+        &histograms[0], sizes, symbols, clusters, &pairs[0], num_to_combine,
+        num_to_combine, kHistogramsPerBatch, max_num_pairs);
+    for (size_t j = 0; j < num_new_clusters; ++j) {
+      all_histograms.push_back(histograms[clusters[j]]);
+      cluster_size.push_back(sizes[clusters[j]]);
+      remap[clusters[j]] = static_cast<uint32_t>(j);
+    }
+    for (size_t j = 0; j < num_to_combine; ++j) {
+      histogram_symbols[i + j] =
+          static_cast<uint32_t>(num_clusters) + remap[symbols[j]];
+    }
+    num_clusters += num_new_clusters;
+    assert(num_clusters == cluster_size.size());
+    assert(num_clusters == all_histograms.size());
+  }
+
+  max_num_pairs =
+      std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
+  pairs.resize(max_num_pairs + 1);
+
+  std::vector<uint32_t> clusters(num_clusters);
+  for (size_t i = 0; i < num_clusters; ++i) {
+    clusters[i] = static_cast<uint32_t>(i);
+  }
+  size_t num_final_clusters =
+      HistogramCombine(&all_histograms[0], &cluster_size[0],
+                       &histogram_symbols[0],
+                       &clusters[0], &pairs[0], num_clusters,
+                       num_blocks, kMaxNumberOfBlockTypes, max_num_pairs);
+
+  static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
+  std::vector<uint32_t> new_index(num_clusters, kInvalidIndex);
+  uint32_t next_index = 0;
+  pos = 0;
+  for (size_t i = 0; i < num_blocks; ++i) {
+    HistogramType histo;
+    for (size_t j = 0; j < block_lengths[i]; ++j) {
+      histo.Add(data[pos++]);
+    }
+    uint32_t best_out =
+        i == 0 ? histogram_symbols[0] : histogram_symbols[i - 1];
+    double best_bits = HistogramBitCostDistance(
+        histo, all_histograms[best_out]);
+    for (size_t j = 0; j < num_final_clusters; ++j) {
+      const double cur_bits = HistogramBitCostDistance(
+          histo, all_histograms[clusters[j]]);
+      if (cur_bits < best_bits) {
+        best_bits = cur_bits;
+        best_out = clusters[j];
+      }
+    }
+    histogram_symbols[i] = best_out;
+    if (new_index[best_out] == kInvalidIndex) {
+      new_index[best_out] = next_index++;
+    }
+  }
+  uint8_t max_type = 0;
+  uint32_t cur_length = 0;
+  block_idx = 0;
+  split->types.resize(num_blocks);
+  split->lengths.resize(num_blocks);
+  for (size_t i = 0; i < num_blocks; ++i) {
+    cur_length += block_lengths[i];
+    if (i + 1 == num_blocks ||
+        histogram_symbols[i] != histogram_symbols[i + 1]) {
+      const uint8_t id = static_cast<uint8_t>(new_index[histogram_symbols[i]]);
+      split->types[block_idx] = id;
+      split->lengths[block_idx] = cur_length;
+      max_type = std::max(max_type, id);
+      cur_length = 0;
+      ++block_idx;
+    }
+  }
+  split->types.resize(block_idx);
+  split->lengths.resize(block_idx);
  split->num_types = static_cast<size_t>(max_type) + 1;
 }

-template<typename HistogramType, typename DataType>
+template<int kSize, typename DataType>
 void SplitByteVector(const std::vector<DataType>& data,
                     const size_t literals_per_histogram,
                     const size_t max_histograms,
@ -327,27 +407,44 @@ void SplitByteVector(const std::vector<DataType>& data,
    split->lengths.push_back(static_cast<uint32_t>(data.size()));
    return;
  }
-  std::vector<HistogramType> histograms;
+  size_t num_histograms = data.size() / literals_per_histogram + 1;
+  if (num_histograms > max_histograms) {
+    num_histograms = max_histograms;
+  }
+  Histogram<kSize>* histograms = new Histogram<kSize>[num_histograms];
  // Find good entropy codes.
  InitialEntropyCodes(&data[0], data.size(),
-                      literals_per_histogram,
-                      max_histograms,
                      sampling_stride_length,
-                      &histograms);
+                      num_histograms, histograms);
  RefineEntropyCodes(&data[0], data.size(),
                     sampling_stride_length,
-                     &histograms);
+                     num_histograms, histograms);
  // Find a good path through literals with the good entropy codes.
  std::vector<uint8_t> block_ids(data.size());
+  size_t num_blocks;
+  const size_t bitmaplen = (num_histograms + 7) >> 3;
+  double* insert_cost = new double[kSize * num_histograms];
+  double *cost = new double[num_histograms];
+  uint8_t* switch_signal = new uint8_t[data.size() * bitmaplen];
+  uint16_t* new_id = new uint16_t[num_histograms];
  for (size_t i = 0; i < 10; ++i) {
-    FindBlocks(&data[0], data.size(),
-               block_switch_cost,
-               histograms,
-               &block_ids[0]);
-    BuildBlockHistograms(&data[0], data.size(), &block_ids[0], &histograms);
+    num_blocks = FindBlocks(&data[0], data.size(),
+                            block_switch_cost,
+                            num_histograms, histograms,
+                            insert_cost, cost, switch_signal,
+                            &block_ids[0]);
+    num_histograms = RemapBlockIds(&block_ids[0], data.size(),
+                                   new_id, num_histograms);
+    BuildBlockHistograms(&data[0], data.size(), &block_ids[0],
+                         num_histograms, histograms);
  }
-  ClusterBlocks<HistogramType>(&data[0], data.size(), &block_ids[0]);
-  BuildBlockSplit(block_ids, split);
+  delete[] insert_cost;
+  delete[] cost;
+  delete[] switch_signal;
+  delete[] new_id;
+  delete[] histograms;
+  ClusterBlocks<Histogram<kSize> >(&data[0], data.size(), num_blocks,
+                                   &block_ids[0], split);
 }

 void SplitBlock(const Command* cmds,
@ -358,32 +455,51 @@ void SplitBlock(const Command* cmds,
                BlockSplit* literal_split,
                BlockSplit* insert_and_copy_split,
                BlockSplit* dist_split) {
-  // Create a continuous array of literals.
-  std::vector<uint8_t> literals;
-  CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
+  {
+    // Create a continuous array of literals.
+    std::vector<uint8_t> literals;
+    CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
+    // Create the block split on the array of literals.
+    // Literal histograms have alphabet size 256.
+    SplitByteVector<256>(
+        literals,
+        kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
+        kLiteralStrideLength, kLiteralBlockSwitchCost,
+        literal_split);
+  }

-  // Compute prefix codes for commands.
-  std::vector<uint16_t> insert_and_copy_codes;
-  std::vector<uint16_t> distance_prefixes;
-  CopyCommandsToByteArray(cmds, num_commands,
-                          &insert_and_copy_codes,
-                          &distance_prefixes);
+  {
+    // Compute prefix codes for commands.
+    std::vector<uint16_t> insert_and_copy_codes(num_commands);
+    for (size_t i = 0; i < num_commands; ++i) {
+      insert_and_copy_codes[i] = cmds[i].cmd_prefix_;
+    }
+    // Create the block split on the array of command prefixes.
+    SplitByteVector<kNumCommandPrefixes>(
+        insert_and_copy_codes,
+        kSymbolsPerCommandHistogram, kMaxCommandHistograms,
+        kCommandStrideLength, kCommandBlockSwitchCost,
+        insert_and_copy_split);
+  }

-  SplitByteVector<HistogramLiteral>(
-      literals,
-      kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
-      kLiteralStrideLength, kLiteralBlockSwitchCost,
-      literal_split);
-  SplitByteVector<HistogramCommand>(
-      insert_and_copy_codes,
-      kSymbolsPerCommandHistogram, kMaxCommandHistograms,
-      kCommandStrideLength, kCommandBlockSwitchCost,
-      insert_and_copy_split);
-  SplitByteVector<HistogramDistance>(
-      distance_prefixes,
-      kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
-      kCommandStrideLength, kDistanceBlockSwitchCost,
-      dist_split);
+  {
+    // Create a continuous array of distance prefixes.
+    std::vector<uint16_t> distance_prefixes(num_commands);
+    size_t pos = 0;
+    for (size_t i = 0; i < num_commands; ++i) {
+      const Command& cmd = cmds[i];
+      if (cmd.copy_len() && cmd.cmd_prefix_ >= 128) {
+        distance_prefixes[pos++] = cmd.dist_prefix_;
+      }
+    }
+    distance_prefixes.resize(pos);
+    // Create the block split on the array of distance prefixes.
+    SplitByteVector<kNumDistancePrefixes>(
+        distance_prefixes,
+        kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
+        kCommandStrideLength, kDistanceBlockSwitchCost,
+        dist_split);
+  }
 }

 }  // namespace brotli
--- a/enc/brotli_bit_stream.cc
+++ b/enc/brotli_bit_stream.cc
@ -28,6 +28,12 @@ namespace brotli {

 namespace {

+static const size_t kMaxHuffmanTreeSize = 2 * kNumCommandPrefixes + 1;
+// Context map alphabet has 256 context id symbols plus max 16 rle symbols.
+static const size_t kContextMapAlphabetSize = 256 + 16;
+// Block type alphabet has 256 block id symbols plus 2 special symbols.
+static const size_t kBlockTypeAlphabetSize = 256 + 2;
+
 // nibblesbits represents the 2 bits to encode MNIBBLES (0-3)
 // REQUIRES: length > 0
 // REQUIRES: length <= (1 << 24)
@ -45,6 +51,18 @@ void EncodeMlen(size_t length, uint64_t* bits,
  *bits = length;
 }

+static inline void StoreCommandExtra(
+    const Command& cmd, size_t* storage_ix, uint8_t* storage) {
+  uint32_t copylen_code = cmd.copy_len_code();
+  uint16_t inscode = GetInsertLengthCode(cmd.insert_len_);
+  uint16_t copycode = GetCopyLengthCode(copylen_code);
+  uint32_t insnumextra = GetInsertExtra(inscode);
+  uint64_t insextraval = cmd.insert_len_ - GetInsertBase(inscode);
+  uint64_t copyextraval = copylen_code - GetCopyBase(copycode);
+  uint64_t bits = (copyextraval << insnumextra) | insextraval;
+  WriteBits(insnumextra + GetCopyExtra(copycode), bits, storage_ix, storage);
+}
+
 }  // namespace

 void StoreVarLenUint8(size_t n, size_t* storage_ix, uint8_t* storage) {
@ -148,13 +166,14 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
 }

 void StoreHuffmanTreeToBitMask(
-    const std::vector<uint8_t> &huffman_tree,
-    const std::vector<uint8_t> &huffman_tree_extra_bits,
-    const uint8_t *code_length_bitdepth,
-    const std::vector<uint16_t> &code_length_bitdepth_symbols,
+    const size_t huffman_tree_size,
+    const uint8_t* huffman_tree,
+    const uint8_t* huffman_tree_extra_bits,
+    const uint8_t* code_length_bitdepth,
+    const uint16_t* code_length_bitdepth_symbols,
    size_t * __restrict storage_ix,
    uint8_t * __restrict storage) {
-  for (size_t i = 0; i < huffman_tree.size(); ++i) {
+  for (size_t i = 0; i < huffman_tree_size; ++i) {
    size_t ix = huffman_tree[i];
    WriteBits(code_length_bitdepth[ix], code_length_bitdepth_symbols[ix],
              storage_ix, storage);
@ -208,18 +227,21 @@ void StoreSimpleHuffmanTree(const uint8_t* depths,
 // num = alphabet size
 // depths = symbol depths
 void StoreHuffmanTree(const uint8_t* depths, size_t num,
+                      HuffmanTree* tree,
                      size_t *storage_ix, uint8_t *storage) {
  // Write the Huffman tree into the brotli-representation.
-  std::vector<uint8_t> huffman_tree;
-  std::vector<uint8_t> huffman_tree_extra_bits;
-  // TODO: Consider allocating these from stack.
-  huffman_tree.reserve(256);
-  huffman_tree_extra_bits.reserve(256);
-  WriteHuffmanTree(depths, num, &huffman_tree, &huffman_tree_extra_bits);
+  // The command alphabet is the largest, so this allocation will fit all
+  // alphabets.
+  assert(num <= kNumCommandPrefixes);
+  uint8_t huffman_tree[kNumCommandPrefixes];
+  uint8_t huffman_tree_extra_bits[kNumCommandPrefixes];
+  size_t huffman_tree_size = 0;
+  WriteHuffmanTree(depths, num, &huffman_tree_size, huffman_tree,
+                   huffman_tree_extra_bits);

  // Calculate the statistics of the Huffman tree in brotli-representation.
  uint32_t huffman_tree_histogram[kCodeLengthCodes] = { 0 };
-  for (size_t i = 0; i < huffman_tree.size(); ++i) {
+  for (size_t i = 0; i < huffman_tree_size; ++i) {
    ++huffman_tree_histogram[huffman_tree[i]];
  }

@ -239,11 +261,10 @@ void StoreHuffmanTree(const uint8_t* depths, size_t num,

  // Calculate another Huffman tree to use for compressing the
  // earlier Huffman tree.
-  // TODO: Consider allocating these from stack.
  uint8_t code_length_bitdepth[kCodeLengthCodes] = { 0 };
-  std::vector<uint16_t> code_length_bitdepth_symbols(kCodeLengthCodes);
+  uint16_t code_length_bitdepth_symbols[kCodeLengthCodes] = { 0 };
  CreateHuffmanTree(&huffman_tree_histogram[0], kCodeLengthCodes,
-                    5, &code_length_bitdepth[0]);
+                    5, tree, &code_length_bitdepth[0]);
  ConvertBitDepthsToSymbols(code_length_bitdepth, kCodeLengthCodes,
                            &code_length_bitdepth_symbols[0]);

@ -256,16 +277,17 @@ void StoreHuffmanTree(const uint8_t* depths, size_t num,
  }

  // Store the real huffman tree now.
-  StoreHuffmanTreeToBitMask(huffman_tree,
+  StoreHuffmanTreeToBitMask(huffman_tree_size,
+                            huffman_tree,
                            huffman_tree_extra_bits,
                            &code_length_bitdepth[0],
                            code_length_bitdepth_symbols,
                            storage_ix, storage);
 }

-
 void BuildAndStoreHuffmanTree(const uint32_t *histogram,
                              const size_t length,
+                              HuffmanTree* tree,
                              uint8_t* depth,
                              uint16_t* bits,
                              size_t* storage_ix,
@ -296,16 +318,21 @@ void BuildAndStoreHuffmanTree(const uint32_t *histogram,
    return;
  }

-  CreateHuffmanTree(histogram, length, 15, depth);
+  CreateHuffmanTree(histogram, length, 15, tree, depth);
  ConvertBitDepthsToSymbols(depth, length, bits);

  if (count <= 4) {
    StoreSimpleHuffmanTree(depth, s4, count, max_bits, storage_ix, storage);
  } else {
-    StoreHuffmanTree(depth, length, storage_ix, storage);
+    StoreHuffmanTree(depth, length, tree, storage_ix, storage);
  }
 }

+static inline bool SortHuffmanTree(const HuffmanTree& v0,
+                                   const HuffmanTree& v1) {
+  return v0.total_count_ < v1.total_count_;
+}
+
 void BuildAndStoreHuffmanTreeFast(const uint32_t *histogram,
                                  const size_t histogram_total,
                                  const size_t max_bits,
@ -467,52 +494,58 @@ void BuildAndStoreHuffmanTreeFast(const uint32_t *histogram,
  }
 }

-size_t IndexOf(const std::vector<uint32_t>& v, uint32_t value) {
+size_t IndexOf(const uint8_t* v, size_t v_size, uint8_t value) {
  size_t i = 0;
-  for (; i < v.size(); ++i) {
+  for (; i < v_size; ++i) {
    if (v[i] == value) return i;
  }
  return i;
 }

-void MoveToFront(std::vector<uint32_t>* v, size_t index) {
-  uint32_t value = (*v)[index];
+void MoveToFront(uint8_t* v, size_t index) {
+  uint8_t value = v[index];
  for (size_t i = index; i != 0; --i) {
-    (*v)[i] = (*v)[i - 1];
+    v[i] = v[i - 1];
  }
-  (*v)[0] = value;
+  v[0] = value;
 }

-std::vector<uint32_t> MoveToFrontTransform(const std::vector<uint32_t>& v) {
-  if (v.empty()) return v;
-  uint32_t max_value = *std::max_element(v.begin(), v.end());
-  std::vector<uint32_t> mtf(max_value + 1);
-  for (uint32_t i = 0; i <= max_value; ++i) mtf[i] = i;
-  std::vector<uint32_t> result(v.size());
-  for (size_t i = 0; i < v.size(); ++i) {
-    size_t index = IndexOf(mtf, v[i]);
-    assert(index < mtf.size());
-    result[i] = static_cast<uint32_t>(index);
-    MoveToFront(&mtf, index);
+void MoveToFrontTransform(const uint32_t* __restrict v_in,
+                          const size_t v_size,
+                          uint32_t* v_out) {
+  if (v_size == 0) {
+    return;
+  }
+  uint32_t max_value = *std::max_element(v_in, v_in + v_size);
+  assert(max_value < 256u);
+  uint8_t mtf[256];
+  size_t mtf_size = max_value + 1;
+  for (uint32_t i = 0; i <= max_value; ++i) {
+    mtf[i] = static_cast<uint8_t>(i);
+  }
+  for (size_t i = 0; i < v_size; ++i) {
+    size_t index = IndexOf(mtf, mtf_size, static_cast<uint8_t>(v_in[i]));
+    assert(index < mtf_size);
+    v_out[i] = static_cast<uint32_t>(index);
+    MoveToFront(mtf, index);
  }
-  return result;
 }

-// Finds runs of zeros in v_in and replaces them with a prefix code of the run
-// length plus extra bits in *v_out and *extra_bits. Non-zero values in v_in are
-// shifted by *max_length_prefix. Will not create prefix codes bigger than the
-// initial value of *max_run_length_prefix. The prefix code of run length L is
-// simply Log2Floor(L) and the number of extra bits is the same as the prefix
-// code.
-void RunLengthCodeZeros(const std::vector<uint32_t>& v_in,
-                        uint32_t* max_run_length_prefix,
-                        std::vector<uint32_t>* v_out,
-                        std::vector<uint32_t>* extra_bits) {
+// Finds runs of zeros in v[0..in_size) and replaces them with a prefix code of
+// the run length plus extra bits (the lower 9 bits are the prefix code and the
+// rest are the extra bits). Non-zero values in v[] are shifted by
+// *max_run_length_prefix. Will not create prefix codes bigger than the initial
+// value of *max_run_length_prefix. The prefix code of run length L is simply
+// Log2Floor(L) and the number of extra bits is the same as the prefix code.
+void RunLengthCodeZeros(const size_t in_size,
+                        uint32_t* __restrict v,
+                        size_t* __restrict out_size,
+                        uint32_t* __restrict max_run_length_prefix) {
  uint32_t max_reps = 0;
-  for (size_t i = 0; i < v_in.size();) {
-    for (; i < v_in.size() && v_in[i] != 0; ++i) ;
+  for (size_t i = 0; i < in_size;) {
+    for (; i < in_size && v[i] != 0; ++i) ;
    uint32_t reps = 0;
-    for (; i < v_in.size() && v_in[i] == 0; ++i) {
+    for (; i < in_size && v[i] == 0; ++i) {
      ++reps;
    }
    max_reps = std::max(reps, max_reps);
@ -520,27 +553,31 @@ void RunLengthCodeZeros(const std::vector<uint32_t>& v_in,
  uint32_t max_prefix = max_reps > 0 ? Log2FloorNonZero(max_reps) : 0;
  max_prefix = std::min(max_prefix, *max_run_length_prefix);
  *max_run_length_prefix = max_prefix;
-  for (size_t i = 0; i < v_in.size();) {
-    if (v_in[i] != 0) {
-      v_out->push_back(v_in[i] + *max_run_length_prefix);
-      extra_bits->push_back(0);
+  *out_size = 0;
+  for (size_t i = 0; i < in_size;) {
+    assert(*out_size <= i);
+    if (v[i] != 0) {
+      v[*out_size] = v[i] + *max_run_length_prefix;
      ++i;
+      ++(*out_size);
    } else {
      uint32_t reps = 1;
-      for (size_t k = i + 1; k < v_in.size() && v_in[k] == 0; ++k) {
+      for (size_t k = i + 1; k < in_size && v[k] == 0; ++k) {
        ++reps;
      }
      i += reps;
      while (reps != 0) {
        if (reps < (2u << max_prefix)) {
          uint32_t run_length_prefix = Log2FloorNonZero(reps);
-          v_out->push_back(run_length_prefix);
-          extra_bits->push_back(reps - (1u << run_length_prefix));
+          const uint32_t extra_bits = reps - (1u << run_length_prefix);
+          v[*out_size] = run_length_prefix + (extra_bits << 9);
+          ++(*out_size);
          break;
        } else {
-          v_out->push_back(max_prefix);
-          extra_bits->push_back((1u << max_prefix) - 1u);
+          const uint32_t extra_bits = (1u << max_prefix) - 1u;
+          v[*out_size] = max_prefix + (extra_bits << 9);
          reps -= (2u << max_prefix) - 1u;
+          ++(*out_size);
        }
      }
    }
@ -549,6 +586,7 @@ void RunLengthCodeZeros(const std::vector<uint32_t>& v_in,

 void EncodeContextMap(const std::vector<uint32_t>& context_map,
                      size_t num_clusters,
+                      HuffmanTree* tree,
                      size_t* storage_ix, uint8_t* storage) {
  StoreVarLenUint8(num_clusters - 1, storage_ix, storage);

@ -556,37 +594,40 @@ void EncodeContextMap(const std::vector<uint32_t>& context_map,
    return;
  }

-  std::vector<uint32_t> transformed_symbols = MoveToFrontTransform(context_map);
-  std::vector<uint32_t> rle_symbols;
-  std::vector<uint32_t> extra_bits;
+  uint32_t* rle_symbols = new uint32_t[context_map.size()];
+  MoveToFrontTransform(&context_map[0], context_map.size(), rle_symbols);
  uint32_t max_run_length_prefix = 6;
-  RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
-                     &rle_symbols, &extra_bits);
-  HistogramContextMap symbol_histogram;
-  for (size_t i = 0; i < rle_symbols.size(); ++i) {
-    symbol_histogram.Add(rle_symbols[i]);
+  size_t num_rle_symbols = 0;
+  RunLengthCodeZeros(context_map.size(), rle_symbols,
+                     &num_rle_symbols, &max_run_length_prefix);
+  uint32_t histogram[kContextMapAlphabetSize];
+  memset(histogram, 0, sizeof(histogram));
+  static const int kSymbolBits = 9;
+  static const uint32_t kSymbolMask = (1u << kSymbolBits) - 1u;
+  for (size_t i = 0; i < num_rle_symbols; ++i) {
+    ++histogram[rle_symbols[i] & kSymbolMask];
  }
  bool use_rle = max_run_length_prefix > 0;
  WriteBits(1, use_rle, storage_ix, storage);
  if (use_rle) {
    WriteBits(4, max_run_length_prefix - 1, storage_ix, storage);
  }
-  EntropyCodeContextMap symbol_code;
-  memset(symbol_code.depth_, 0, sizeof(symbol_code.depth_));
-  memset(symbol_code.bits_, 0, sizeof(symbol_code.bits_));
-  BuildAndStoreHuffmanTree(symbol_histogram.data_,
-                           num_clusters + max_run_length_prefix,
-                           symbol_code.depth_, symbol_code.bits_,
-                           storage_ix, storage);
-  for (size_t i = 0; i < rle_symbols.size(); ++i) {
-    WriteBits(symbol_code.depth_[rle_symbols[i]],
-              symbol_code.bits_[rle_symbols[i]],
-              storage_ix, storage);
-    if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) {
-      WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage);
+  uint8_t depths[kContextMapAlphabetSize];
+  uint16_t bits[kContextMapAlphabetSize];
+  memset(depths, 0, sizeof(depths));
+  memset(bits, 0, sizeof(bits));
+  BuildAndStoreHuffmanTree(histogram, num_clusters + max_run_length_prefix,
+                           tree, depths, bits, storage_ix, storage);
+  for (size_t i = 0; i < num_rle_symbols; ++i) {
+    const uint32_t rle_symbol = rle_symbols[i] & kSymbolMask;
+    const uint32_t extra_bits_val = rle_symbols[i] >> kSymbolBits;
+    WriteBits(depths[rle_symbol], bits[rle_symbol], storage_ix, storage);
+    if (rle_symbol > 0 && rle_symbol <= max_run_length_prefix) {
+      WriteBits(rle_symbol, extra_bits_val, storage_ix, storage);
    }
  }
  WriteBits(1, 1, storage_ix, storage);  // use move-to-front
+  delete[] rle_symbols;
 }

 void StoreBlockSwitch(const BlockSplitCode& code,
@ -608,12 +649,15 @@ void StoreBlockSwitch(const BlockSplitCode& code,
 void BuildAndStoreBlockSplitCode(const std::vector<uint8_t>& types,
                                 const std::vector<uint32_t>& lengths,
                                 const size_t num_types,
+                                 HuffmanTree* tree,
                                 BlockSplitCode* code,
                                 size_t* storage_ix,
                                 uint8_t* storage) {
  const size_t num_blocks = types.size();
-  std::vector<uint32_t> type_histo(num_types + 2);
-  std::vector<uint32_t> length_histo(26);
+  uint32_t type_histo[kBlockTypeAlphabetSize];
+  uint32_t length_histo[kNumBlockLenPrefixes];
+  memset(type_histo, 0, (num_types + 2) * sizeof(type_histo[0]));
+  memset(length_histo, 0, sizeof(length_histo));
  size_t last_type = 1;
  size_t second_last_type = 0;
  code->type_code.resize(num_blocks);
@ -622,8 +666,8 @@ void BuildAndStoreBlockSplitCode(const std::vector<uint8_t>& types,
  code->length_extra.resize(num_blocks);
  code->type_depths.resize(num_types + 2);
  code->type_bits.resize(num_types + 2);
-  code->length_depths.resize(26);
-  code->length_bits.resize(26);
+  memset(code->length_depths, 0, sizeof(code->length_depths));
+  memset(code->length_bits, 0, sizeof(code->length_bits));
  for (size_t i = 0; i < num_blocks; ++i) {
    size_t type = types[i];
    size_t type_code = (type == last_type + 1 ? 1 :
@ -641,10 +685,10 @@ void BuildAndStoreBlockSplitCode(const std::vector<uint8_t>& types,
  }
  StoreVarLenUint8(num_types - 1, storage_ix, storage);
  if (num_types > 1) {
-    BuildAndStoreHuffmanTree(&type_histo[0], num_types + 2,
+    BuildAndStoreHuffmanTree(&type_histo[0], num_types + 2, tree,
                             &code->type_depths[0], &code->type_bits[0],
                             storage_ix, storage);
-    BuildAndStoreHuffmanTree(&length_histo[0], 26,
+    BuildAndStoreHuffmanTree(&length_histo[0], kNumBlockLenPrefixes, tree,
                             &code->length_depths[0], &code->length_bits[0],
                             storage_ix, storage);
    StoreBlockSwitch(*code, 0, storage_ix, storage);
@ -653,6 +697,7 @@ void BuildAndStoreBlockSplitCode(const std::vector<uint8_t>& types,

 void StoreTrivialContextMap(size_t num_types,
                            size_t context_bits,
+                            HuffmanTree* tree,
                            size_t* storage_ix,
                            uint8_t* storage) {
  StoreVarLenUint8(num_types - 1, storage_ix, storage);
@ -660,9 +705,12 @@ void StoreTrivialContextMap(size_t num_types,
    size_t repeat_code = context_bits - 1u;
    size_t repeat_bits = (1u << repeat_code) - 1u;
    size_t alphabet_size = num_types + repeat_code;
-    std::vector<uint32_t> histogram(alphabet_size);
-    std::vector<uint8_t> depths(alphabet_size);
-    std::vector<uint16_t> bits(alphabet_size);
+    uint32_t histogram[kContextMapAlphabetSize];
+    uint8_t depths[kContextMapAlphabetSize];
+    uint16_t bits[kContextMapAlphabetSize];
+    memset(histogram, 0, alphabet_size * sizeof(histogram[0]));
+    memset(depths, 0, alphabet_size * sizeof(depths[0]));
+    memset(bits, 0, alphabet_size * sizeof(bits[0]));
    // Write RLEMAX.
    WriteBits(1, 1, storage_ix, storage);
    WriteBits(4, repeat_code - 1, storage_ix, storage);
@ -671,7 +719,7 @@ void StoreTrivialContextMap(size_t num_types,
    for (size_t i = context_bits; i < alphabet_size; ++i) {
      histogram[i] = 1;
    }
-    BuildAndStoreHuffmanTree(&histogram[0], alphabet_size,
+    BuildAndStoreHuffmanTree(&histogram[0], alphabet_size, tree,
                             &depths[0], &bits[0],
                             storage_ix, storage);
    for (size_t i = 0; i < num_types; ++i) {
@ -702,11 +750,12 @@ class BlockEncoder {

  // Creates entropy codes of block lengths and block types and stores them
  // to the bit stream.
-  void BuildAndStoreBlockSwitchEntropyCodes(size_t* storage_ix,
+  void BuildAndStoreBlockSwitchEntropyCodes(HuffmanTree* tree,
+                                            size_t* storage_ix,
                                            uint8_t* storage) {
    BuildAndStoreBlockSplitCode(
        block_types_, block_lengths_, num_block_types_,
-        &block_split_code_, storage_ix, storage);
+        tree, &block_split_code_, storage_ix, storage);
  }

  // Creates entropy codes for all block types and stores them to the bit
@ -714,12 +763,14 @@ class BlockEncoder {
  template<int kSize>
  void BuildAndStoreEntropyCodes(
      const std::vector<Histogram<kSize> >& histograms,
+      HuffmanTree* tree,
      size_t* storage_ix, uint8_t* storage) {
    depths_.resize(histograms.size() * alphabet_size_);
    bits_.resize(histograms.size() * alphabet_size_);
    for (size_t i = 0; i < histograms.size(); ++i) {
      size_t ix = i * alphabet_size_;
      BuildAndStoreHuffmanTree(&histograms[i].data_[0], alphabet_size_,
+                               tree,
                               &depths_[ix], &bits_[ix],
                               storage_ix, storage);
    }
@ -798,6 +849,8 @@ void StoreMetaBlock(const uint8_t* input,
      kNumDistanceShortCodes + num_direct_distance_codes +
      (48u << distance_postfix_bits);

+  HuffmanTree* tree = static_cast<HuffmanTree*>(
+      malloc(kMaxHuffmanTreeSize * sizeof(HuffmanTree)));
  BlockEncoder literal_enc(256,
                           mb.literal_split.num_types,
                           mb.literal_split.types,
@ -811,9 +864,9 @@ void StoreMetaBlock(const uint8_t* input,
                            mb.distance_split.types,
                            mb.distance_split.lengths);

-  literal_enc.BuildAndStoreBlockSwitchEntropyCodes(storage_ix, storage);
-  command_enc.BuildAndStoreBlockSwitchEntropyCodes(storage_ix, storage);
-  distance_enc.BuildAndStoreBlockSwitchEntropyCodes(storage_ix, storage);
+  literal_enc.BuildAndStoreBlockSwitchEntropyCodes(tree, storage_ix, storage);
+  command_enc.BuildAndStoreBlockSwitchEntropyCodes(tree, storage_ix, storage);
+  distance_enc.BuildAndStoreBlockSwitchEntropyCodes(tree, storage_ix, storage);

  WriteBits(2, distance_postfix_bits, storage_ix, storage);
  WriteBits(4, num_direct_distance_codes >> distance_postfix_bits,
@ -824,37 +877,36 @@ void StoreMetaBlock(const uint8_t* input,

  size_t num_literal_histograms = mb.literal_histograms.size();
  if (mb.literal_context_map.empty()) {
-    StoreTrivialContextMap(num_literal_histograms, kLiteralContextBits,
+    StoreTrivialContextMap(num_literal_histograms, kLiteralContextBits, tree,
                           storage_ix, storage);
  } else {
-    EncodeContextMap(mb.literal_context_map, num_literal_histograms,
+    EncodeContextMap(mb.literal_context_map, num_literal_histograms, tree,
                     storage_ix, storage);
  }

  size_t num_dist_histograms = mb.distance_histograms.size();
  if (mb.distance_context_map.empty()) {
-    StoreTrivialContextMap(num_dist_histograms, kDistanceContextBits,
+    StoreTrivialContextMap(num_dist_histograms, kDistanceContextBits, tree,
                           storage_ix, storage);
  } else {
-    EncodeContextMap(mb.distance_context_map, num_dist_histograms,
+    EncodeContextMap(mb.distance_context_map, num_dist_histograms, tree,
                     storage_ix, storage);
  }

-  literal_enc.BuildAndStoreEntropyCodes(mb.literal_histograms,
+  literal_enc.BuildAndStoreEntropyCodes(mb.literal_histograms, tree,
                                        storage_ix, storage);
-  command_enc.BuildAndStoreEntropyCodes(mb.command_histograms,
+  command_enc.BuildAndStoreEntropyCodes(mb.command_histograms, tree,
                                        storage_ix, storage);
-  distance_enc.BuildAndStoreEntropyCodes(mb.distance_histograms,
+  distance_enc.BuildAndStoreEntropyCodes(mb.distance_histograms, tree,
                                         storage_ix, storage);
+  free(tree);

  size_t pos = start_pos;
  for (size_t i = 0; i < n_commands; ++i) {
    const Command cmd = commands[i];
    size_t cmd_code = cmd.cmd_prefix_;
-    uint32_t lennumextra = static_cast<uint32_t>(cmd.cmd_extra_ >> 48);
-    uint64_t lenextra = cmd.cmd_extra_ & 0xffffffffffffUL;
    command_enc.StoreSymbol(cmd_code, storage_ix, storage);
-    WriteBits(lennumextra, lenextra, storage_ix, storage);
+    StoreCommandExtra(cmd, storage_ix, storage);
    if (mb.literal_context_map.empty()) {
      for (size_t j = cmd.insert_len_; j != 0; --j) {
        literal_enc.StoreSymbol(input[pos & mask], storage_ix, storage);
@ -871,8 +923,8 @@ void StoreMetaBlock(const uint8_t* input,
        ++pos;
      }
    }
-    pos += cmd.copy_len_;
-    if (cmd.copy_len_ > 0) {
+    pos += cmd.copy_len();
+    if (cmd.copy_len()) {
      prev_byte2 = input[(pos - 2) & mask];
      prev_byte = input[(pos - 1) & mask];
      if (cmd.cmd_prefix_ >= 128) {
@ -911,8 +963,8 @@ void BuildHistograms(const uint8_t* input,
      lit_histo->Add(input[pos & mask]);
      ++pos;
    }
-    pos += cmd.copy_len_;
-    if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
+    pos += cmd.copy_len();
+    if (cmd.copy_len() && cmd.cmd_prefix_ >= 128) {
      dist_histo->Add(cmd.dist_prefix_);
    }
  }
@ -935,17 +987,15 @@ void StoreDataWithHuffmanCodes(const uint8_t* input,
  for (size_t i = 0; i < n_commands; ++i) {
    const Command cmd = commands[i];
    const size_t cmd_code = cmd.cmd_prefix_;
-    const uint32_t lennumextra = static_cast<uint32_t>(cmd.cmd_extra_ >> 48);
-    const uint64_t lenextra = cmd.cmd_extra_ & 0xffffffffffffUL;
    WriteBits(cmd_depth[cmd_code], cmd_bits[cmd_code], storage_ix, storage);
-    WriteBits(lennumextra, lenextra, storage_ix, storage);
+    StoreCommandExtra(cmd, storage_ix, storage);
    for (size_t j = cmd.insert_len_; j != 0; --j) {
      const uint8_t literal = input[pos & mask];
      WriteBits(lit_depth[literal], lit_bits[literal], storage_ix, storage);
      ++pos;
    }
-    pos += cmd.copy_len_;
-    if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
+    pos += cmd.copy_len();
+    if (cmd.copy_len() && cmd.cmd_prefix_ >= 128) {
      const size_t dist_code = cmd.dist_prefix_;
      const uint32_t distnumextra = cmd.dist_extra_ >> 24;
      const uint32_t distextra = cmd.dist_extra_ & 0xffffff;
@ -983,15 +1033,18 @@ void StoreMetaBlockTrivial(const uint8_t* input,
  std::vector<uint8_t> dist_depth(64);
  std::vector<uint16_t> dist_bits(64);

-  BuildAndStoreHuffmanTree(&lit_histo.data_[0], 256,
+  HuffmanTree* tree = static_cast<HuffmanTree*>(
+      malloc(kMaxHuffmanTreeSize * sizeof(HuffmanTree)));
+  BuildAndStoreHuffmanTree(&lit_histo.data_[0], 256, tree,
                           &lit_depth[0], &lit_bits[0],
                           storage_ix, storage);
-  BuildAndStoreHuffmanTree(&cmd_histo.data_[0], kNumCommandPrefixes,
+  BuildAndStoreHuffmanTree(&cmd_histo.data_[0], kNumCommandPrefixes, tree,
                           &cmd_depth[0], &cmd_bits[0],
                           storage_ix, storage);
-  BuildAndStoreHuffmanTree(&dist_histo.data_[0], 64,
+  BuildAndStoreHuffmanTree(&dist_histo.data_[0], 64, tree,
                           &dist_depth[0], &dist_bits[0],
                           storage_ix, storage);
+  free(tree);
  StoreDataWithHuffmanCodes(input, start_pos, mask, commands,
                            n_commands, &lit_depth[0], &lit_bits[0],
                            &cmd_depth[0], &cmd_bits[0],
@ -1026,7 +1079,7 @@ void StoreMetaBlockFast(const uint8_t* input,
        ++pos;
      }
      num_literals += cmd.insert_len_;
-      pos += cmd.copy_len_;
+      pos += cmd.copy_len();
    }
    uint8_t lit_depth[256] = { 0 };
    uint16_t lit_bits[256] = { 0 };
--- a/enc/brotli_bit_stream.h
+++ b/enc/brotli_bit_stream.h
@ -48,6 +48,7 @@ void StoreUncompressedMetaBlockHeader(size_t length,
 // Stores a context map where the histogram type is always the block type.
 void StoreTrivialContextMap(size_t num_types,
                            size_t context_bits,
+                            HuffmanTree* tree,
                            size_t* storage_ix,
                            uint8_t* storage);

@ -57,13 +58,14 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    size_t *storage_ix,
    uint8_t *storage);

-void StoreHuffmanTree(const uint8_t* depths, size_t num,
+void StoreHuffmanTree(const uint8_t* depths, size_t num, HuffmanTree* tree,
                      size_t *storage_ix, uint8_t *storage);

 // Builds a Huffman tree from histogram[0:length] into depth[0:length] and
 // bits[0:length] and stores the encoded tree to the bit stream.
 void BuildAndStoreHuffmanTree(const uint32_t *histogram,
                              const size_t length,
+                              HuffmanTree* tree,
                              uint8_t* depth,
                              uint16_t* bits,
                              size_t* storage_ix,
@ -81,6 +83,7 @@ void BuildAndStoreHuffmanTreeFast(const uint32_t *histogram,
 // histogram ids is given by num_clusters.
 void EncodeContextMap(const std::vector<uint32_t>& context_map,
                      size_t num_clusters,
+                      HuffmanTree* tree,
                      size_t* storage_ix, uint8_t* storage);

 // Data structure that stores everything that is needed to encode each block
@ -92,8 +95,8 @@ struct BlockSplitCode {
  std::vector<uint32_t> length_extra;
  std::vector<uint8_t> type_depths;
  std::vector<uint16_t> type_bits;
-  std::vector<uint8_t> length_depths;
-  std::vector<uint16_t> length_bits;
+  uint8_t length_depths[kNumBlockLenPrefixes];
+  uint16_t length_bits[kNumBlockLenPrefixes];
 };

 // Builds a BlockSplitCode data structure from the block split given by the
--- a/enc/cluster.h
+++ b/enc/cluster.h
@ -11,7 +11,6 @@

 #include <math.h>
 #include <algorithm>
-#include <map>
 #include <utility>
 #include <vector>

@ -52,7 +51,9 @@ template<typename HistogramType>
 void CompareAndPushToQueue(const HistogramType* out,
                           const uint32_t* cluster_size,
                           uint32_t idx1, uint32_t idx2,
-                           std::vector<HistogramPair>* pairs) {
+                           size_t max_num_pairs,
+                           HistogramPair* pairs,
+                           size_t* num_pairs) {
  if (idx1 == idx2) {
    return;
  }
@ -76,8 +77,8 @@ void CompareAndPushToQueue(const HistogramType* out,
    p.cost_combo = out[idx1].bit_cost_;
    store_pair = true;
  } else {
-    double threshold = pairs->empty() ? 1e99 :
-        std::max(0.0, (*pairs)[0].cost_diff);
+    double threshold = *num_pairs == 0 ? 1e99 :
+        std::max(0.0, pairs[0].cost_diff);
    HistogramType combo = out[idx1];
    combo.AddHistogram(out[idx2]);
    double cost_combo = PopulationCost(combo);
@ -88,42 +89,44 @@ void CompareAndPushToQueue(const HistogramType* out,
  }
  if (store_pair) {
    p.cost_diff += p.cost_combo;
-    if (!pairs->empty() && (pairs->front() < p)) {
+    if (*num_pairs > 0 && pairs[0] < p) {
      // Replace the top of the queue if needed.
-      pairs->push_back(pairs->front());
-      pairs->front() = p;
-    } else {
-      pairs->push_back(p);
+      if (*num_pairs < max_num_pairs) {
+        pairs[*num_pairs] = pairs[0];
+        ++(*num_pairs);
+      }
+      pairs[0] = p;
+    } else if (*num_pairs < max_num_pairs) {
+      pairs[*num_pairs] = p;
+      ++(*num_pairs);
    }
  }
 }

 template<typename HistogramType>
-void HistogramCombine(HistogramType* out,
-                      uint32_t* cluster_size,
-                      uint32_t* symbols,
-                      size_t symbols_size,
-                      size_t max_clusters) {
+size_t HistogramCombine(HistogramType* out,
+                        uint32_t* cluster_size,
+                        uint32_t* symbols,
+                        uint32_t* clusters,
+                        HistogramPair* pairs,
+                        size_t num_clusters,
+                        size_t symbols_size,
+                        size_t max_clusters,
+                        size_t max_num_pairs) {
  double cost_diff_threshold = 0.0;
  size_t min_cluster_size = 1;

-  // Uniquify the list of symbols.
-  std::vector<uint32_t> clusters(symbols, symbols + symbols_size);
-  std::sort(clusters.begin(), clusters.end());
-  std::vector<uint32_t>::iterator last =
-      std::unique(clusters.begin(), clusters.end());
-  clusters.resize(static_cast<size_t>(last - clusters.begin()));
-
-  // We maintain a heap of histogram pairs, ordered by the bit cost reduction.
-  std::vector<HistogramPair> pairs;
-  for (size_t idx1 = 0; idx1 < clusters.size(); ++idx1) {
-    for (size_t idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) {
+  // We maintain a vector of histogram pairs, with the property that the pair
+  // with the maximum bit cost reduction is the first.
+  size_t num_pairs = 0;
+  for (size_t idx1 = 0; idx1 < num_clusters; ++idx1) {
+    for (size_t idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
      CompareAndPushToQueue(out, cluster_size, clusters[idx1], clusters[idx2],
-                            &pairs);
+                            max_num_pairs, &pairs[0], &num_pairs);
    }
  }

-  while (clusters.size() > min_cluster_size) {
+  while (num_clusters > min_cluster_size) {
    if (pairs[0].cost_diff >= cost_diff_threshold) {
      cost_diff_threshold = 1e99;
      min_cluster_size = max_clusters;
@ -140,40 +143,42 @@ void HistogramCombine(HistogramType* out,
        symbols[i] = best_idx1;
      }
    }
-    for (std::vector<uint32_t>::iterator cluster = clusters.begin();
-         cluster != clusters.end(); ++cluster) {
-      if (*cluster >= best_idx2) {
-        clusters.erase(cluster);
+    for (size_t i = 0; i < num_clusters; ++i) {
+      if (clusters[i] == best_idx2) {
+        memmove(&clusters[i], &clusters[i + 1],
+                (num_clusters - i - 1) * sizeof(clusters[0]));
        break;
      }
    }
-
+    --num_clusters;
    // Remove pairs intersecting the just combined best pair.
    size_t copy_to_idx = 0;
-    for (size_t i = 0; i < pairs.size(); ++i) {
+    for (size_t i = 0; i < num_pairs; ++i) {
      HistogramPair& p = pairs[i];
      if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
          p.idx1 == best_idx2 || p.idx2 == best_idx2) {
        // Remove invalid pair from the queue.
        continue;
      }
-      if (pairs.front() < p) {
+      if (pairs[0] < p) {
        // Replace the top of the queue if needed.
-        HistogramPair front = pairs.front();
-        pairs.front() = p;
+        HistogramPair front = pairs[0];
+        pairs[0] = p;
        pairs[copy_to_idx] = front;
      } else {
        pairs[copy_to_idx] = p;
      }
      ++copy_to_idx;
    }
-    pairs.resize(copy_to_idx);
+    num_pairs = copy_to_idx;

-    // Push new pairs formed with the combined histogram to the heap.
+    // Push new pairs formed with the combined histogram to the queue.
-    for (size_t i = 0; i < clusters.size(); ++i) {
-      CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i], &pairs);
+    for (size_t i = 0; i < num_clusters; ++i) {
+      CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i],
+                            max_num_pairs, &pairs[0], &num_pairs);
    }
  }
+  return num_clusters;
 }

 // -----------------------------------------------------------------------------
@ -192,61 +197,69 @@ double HistogramBitCostDistance(const HistogramType& histogram,
 }

 // Find the best 'out' histogram for each of the 'in' histograms.
+// When called, clusters[0..num_clusters) contains the unique values from
+// symbols[0..in_size), but this property is not preserved in this function.
 // Note: we assume that out[]->bit_cost_ is already up-to-date.
 template<typename HistogramType>
 void HistogramRemap(const HistogramType* in, size_t in_size,
+                    const uint32_t* clusters, size_t num_clusters,
                    HistogramType* out, uint32_t* symbols) {
-  // Uniquify the list of symbols.
-  std::vector<uint32_t> all_symbols(symbols, symbols + in_size);
-  std::sort(all_symbols.begin(), all_symbols.end());
-  std::vector<uint32_t>::iterator last =
-      std::unique(all_symbols.begin(), all_symbols.end());
-  all_symbols.resize(static_cast<size_t>(last - all_symbols.begin()));
-
  for (size_t i = 0; i < in_size; ++i) {
    uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
    double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
-    for (std::vector<uint32_t>::const_iterator k = all_symbols.begin();
-         k != all_symbols.end(); ++k) {
-      const double cur_bits = HistogramBitCostDistance(in[i], out[*k]);
+    for (size_t j = 0; j < num_clusters; ++j) {
+      const double cur_bits = HistogramBitCostDistance(in[i], out[clusters[j]]);
      if (cur_bits < best_bits) {
        best_bits = cur_bits;
-        best_out = *k;
+        best_out = clusters[j];
      }
    }
    symbols[i] = best_out;
  }

-
  // Recompute each out based on raw and symbols.
-  for (std::vector<uint32_t>::const_iterator k = all_symbols.begin();
-       k != all_symbols.end(); ++k) {
-    out[*k].Clear();
+  for (size_t j = 0; j < num_clusters; ++j) {
+    out[clusters[j]].Clear();
  }
  for (size_t i = 0; i < in_size; ++i) {
    out[symbols[i]].AddHistogram(in[i]);
  }
 }

-// Reorder histograms in *out so that the new symbols in *symbols come in
-// increasing order.
+// Reorders elements of the out[0..length) array and changes values in
+// symbols[0..length) array in the following way:
+//   * when called, symbols[] contains indexes into out[], and has N unique
+//     values (possibly N < length)
+//   * on return, symbols'[i] = f(symbols[i]) and
+//                out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
+//     where f is a bijection between the range of symbols[] and [0..N), and
+//     the first occurrences of values in symbols'[i] come in consecutive
+//     increasing order.
+// Returns N, the number of unique values in symbols[].
 template<typename HistogramType>
-void HistogramReindex(std::vector<HistogramType>* out,
-                      std::vector<uint32_t>* symbols) {
-  std::vector<HistogramType> tmp(*out);
-  std::map<uint32_t, uint32_t> new_index;
+size_t HistogramReindex(HistogramType* out, uint32_t* symbols, size_t length) {
+  static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
+  std::vector<uint32_t> new_index(length, kInvalidIndex);
  uint32_t next_index = 0;
-  for (size_t i = 0; i < symbols->size(); ++i) {
-    if (new_index.find((*symbols)[i]) == new_index.end()) {
-      new_index[(*symbols)[i]] = next_index;
-      (*out)[next_index] = tmp[(*symbols)[i]];
+  for (size_t i = 0; i < length; ++i) {
+    if (new_index[symbols[i]] == kInvalidIndex) {
+      new_index[symbols[i]] = next_index;
      ++next_index;
    }
  }
-  out->resize(next_index);
-  for (size_t i = 0; i < symbols->size(); ++i) {
-    (*symbols)[i] = new_index[(*symbols)[i]];
+  std::vector<HistogramType> tmp(next_index);
+  next_index = 0;
+  for (size_t i = 0; i < length; ++i) {
+    if (new_index[symbols[i]] == next_index) {
+      tmp[next_index] = out[symbols[i]];
+      ++next_index;
+    }
+    symbols[i] = new_index[symbols[i]];
  }
+  for (size_t i = 0; i < next_index; ++i) {
+    out[i] = tmp[i];
+  }
+  return next_index;
 }

 // Clusters similar histograms in 'in' together, the selected histograms are
@ -261,6 +274,8 @@ void ClusterHistograms(const std::vector<HistogramType>& in,
  const size_t in_size = num_contexts * num_blocks;
  assert(in_size == in.size());
  std::vector<uint32_t> cluster_size(in_size, 1);
+  std::vector<uint32_t> clusters(in_size);
+  size_t num_clusters = 0;
  out->resize(in_size);
  histogram_symbols->resize(in_size);
  for (size_t i = 0; i < in_size; ++i) {
@ -269,29 +284,47 @@ void ClusterHistograms(const std::vector<HistogramType>& in,
    (*histogram_symbols)[i] = static_cast<uint32_t>(i);
  }

-
  const size_t max_input_histograms = 64;
+  // For the first pass of clustering, we allow all pairs.
+  size_t max_num_pairs = max_input_histograms * max_input_histograms / 2;
+  std::vector<HistogramPair> pairs(max_num_pairs + 1);
+
  for (size_t i = 0; i < in_size; i += max_input_histograms) {
    size_t num_to_combine = std::min(in_size - i, max_input_histograms);
-    HistogramCombine(&(*out)[0], &cluster_size[0],
-                     &(*histogram_symbols)[i], num_to_combine,
-                     max_histograms);
+    for (size_t j = 0; j < num_to_combine; ++j) {
+      clusters[num_clusters + j] = static_cast<uint32_t>(i + j);
+    }
+    size_t num_new_clusters =
+        HistogramCombine(&(*out)[0], &cluster_size[0],
+                         &(*histogram_symbols)[i],
+                         &clusters[num_clusters], &pairs[0],
+                         num_to_combine, num_to_combine,
+                         max_histograms, max_num_pairs);
+    num_clusters += num_new_clusters;
  }

+  // For the second pass, we limit the total number of histogram pairs.
+  // After this limit is reached, we only keep searching for the best pair.
+  max_num_pairs =
+      std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
+  pairs.resize(max_num_pairs + 1);
+
  // Collapse similar histograms.
-  HistogramCombine(&(*out)[0], &cluster_size[0],
-                   &(*histogram_symbols)[0], in_size,
-                   max_histograms);
+  num_clusters = HistogramCombine(&(*out)[0], &cluster_size[0],
+                                  &(*histogram_symbols)[0], &clusters[0],
+                                  &pairs[0], num_clusters, in_size,
+                                  max_histograms, max_num_pairs);

  // Find the optimal map from original histograms to the final ones.
-  HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]);
+  HistogramRemap(&in[0], in_size, &clusters[0], num_clusters,
+                 &(*out)[0], &(*histogram_symbols)[0]);

  // Convert the context map to a canonical form.
-  HistogramReindex(out, histogram_symbols);
-
+  size_t num_histograms =
+      HistogramReindex(&(*out)[0], &(*histogram_symbols)[0], in_size);
+  out->resize(num_histograms);
 }

-
 }  // namespace brotli

 #endif  // BROTLI_ENC_CLUSTER_H_
--- a/enc/command.h
+++ b/enc/command.h
@ -73,35 +73,47 @@ static inline uint16_t CombineLengthCodes(

 static inline void GetLengthCode(size_t insertlen, size_t copylen,
                                 bool use_last_distance,
-                                 uint16_t* code, uint64_t* extra) {
+                                 uint16_t* code) {  // out: result of CombineLengthCodes for both lengths
  uint16_t inscode = GetInsertLengthCode(insertlen);
  uint16_t copycode = GetCopyLengthCode(copylen);
-  uint64_t insnumextra = kInsExtra[inscode];
-  uint64_t numextra = insnumextra + kCopyExtra[copycode];
-  uint64_t insextraval = insertlen - kInsBase[inscode];
-  uint64_t copyextraval = copylen - kCopyBase[copycode];
  *code = CombineLengthCodes(inscode, copycode, use_last_distance);
-  *extra = (numextra << 48) | (copyextraval << insnumextra) | insextraval;
+}  // the extra-bits value is no longer computed here
+
+static inline uint32_t GetInsertBase(uint16_t inscode) {
+  return kInsBase[inscode];  // base that is subtracted from the insert length to get its extra-bits value
+}
+
+static inline uint32_t GetInsertExtra(uint16_t inscode) {
+  return kInsExtra[inscode];  // number of extra bits that follow insert length code inscode
+}
+
+static inline uint32_t GetCopyBase(uint16_t copycode) {
+  return kCopyBase[copycode];  // base that is subtracted from the copy length to get its extra-bits value
+}
+
+static inline uint32_t GetCopyExtra(uint16_t copycode) {
+  return kCopyExtra[copycode];  // number of extra bits that follow copy length code copycode
 }

 struct Command {
  // distance_code is e.g. 0 for same-as-last short code, or 16 for offset 1.
  Command(size_t insertlen, size_t copylen, size_t copylen_code,
          size_t distance_code)
-      : insert_len_(static_cast<uint32_t>(insertlen))
-      , copy_len_(static_cast<uint32_t>(copylen)) {
+      : insert_len_(static_cast<uint32_t>(insertlen)) {
+    copy_len_ = static_cast<uint32_t>(  // low 24 bits: copylen; high 8 bits: copylen_code ^ copylen
+        copylen | ((copylen_code ^ copylen) << 24));  // assumes copylen < 2^24 and the XOR fits in 8 bits - TODO confirm
    // The distance prefix and extra bits are stored in this Command as if
    // npostfix and ndirect were 0, they are only recomputed later after the
    // clustering if needed.
    PrefixEncodeCopyDistance(distance_code, 0, 0, &dist_prefix_, &dist_extra_);
    GetLengthCode(insertlen, copylen_code, dist_prefix_ == 0,
-                  &cmd_prefix_, &cmd_extra_);
+                  &cmd_prefix_);  // the prefix is derived from copylen_code, not copylen
  }

  explicit Command(size_t insertlen)
      : insert_len_(static_cast<uint32_t>(insertlen))
-      , copy_len_(0), dist_extra_(0), dist_prefix_(16) {
-    GetLengthCode(insertlen, 4, dist_prefix_ == 0, &cmd_prefix_, &cmd_extra_);
+      , copy_len_(4 << 24), dist_extra_(0), dist_prefix_(16) {  // copy_len() == 0, copy_len_code() == 4
+    GetLengthCode(insertlen, 4, dist_prefix_ == 0, &cmd_prefix_);  // same copy length code 4 as packed above
  }

  uint32_t DistanceCode(void) const {
@ -123,9 +135,17 @@ struct Command {
    return 3;
  }

+  inline uint32_t copy_len(void) const {
+    return copy_len_ & 0xFFFFFF;  // the low 24 bits of copy_len_ hold the copy length
+  }
+
+  inline uint32_t copy_len_code(void) const {
+    return (copy_len_ & 0xFFFFFF) ^ (copy_len_ >> 24);  // high 8 bits hold length XOR code, so XOR again to get the code
+  }
+
  uint32_t insert_len_;
+  /* Low 24 bits: copy_len; high 8 bits: copy_len XOR copy_len_code. */
  uint32_t copy_len_;
-  uint64_t cmd_extra_;
  uint32_t dist_extra_;
  uint16_t cmd_prefix_;
  uint16_t dist_prefix_;
--- a/enc/compress_fragment.cc
+++ b/enc/compress_fragment.cc
@ -105,8 +105,11 @@ void BuildAndStoreLiteralPrefixCode(const uint8_t* input,
 void BuildAndStoreCommandPrefixCode(const uint32_t histogram[128],
                                    uint8_t depth[128], uint16_t bits[128],
                                    size_t* storage_ix, uint8_t* storage) {
-  CreateHuffmanTree(histogram, 64, 15, depth);
-  CreateHuffmanTree(&histogram[64], 64, 14, &depth[64]);
+  // Tree size for building a tree over 64 symbols is 2 * 64 + 1.
+  static const size_t kTreeSize = 129;
+  HuffmanTree tree[kTreeSize];
+  CreateHuffmanTree(histogram, 64, 15, tree, depth);
+  CreateHuffmanTree(&histogram[64], 64, 14, tree, &depth[64]);
  // We have to jump through a few hoopes here in order to compute
  // the command bits because the symbols are in a different order than in
  // the full alphabet. This looks complicated, but having the symbols
@ -141,9 +144,9 @@ void BuildAndStoreCommandPrefixCode(const uint32_t histogram[128],
      cmd_depth[256 + 8 * i] = depth[48 + i];
      cmd_depth[448 + 8 * i] = depth[56 + i];
    }
-    StoreHuffmanTree(cmd_depth, 704, storage_ix, storage);
+    StoreHuffmanTree(cmd_depth, 704, tree, storage_ix, storage);
  }
-  StoreHuffmanTree(&depth[64], 64, storage_ix, storage);
+  StoreHuffmanTree(&depth[64], 64, tree, storage_ix, storage);
 }

 // REQUIRES: insertlen < 6210
@ -452,7 +455,8 @@ void BrotliCompressFragmentFast(const uint8_t* input, size_t input_size,
  assert(table_size <= (1u << 31));
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
  const size_t shift = 64u - Log2FloorNonZero(table_size);
-  assert(static_cast<size_t>(0xffffffffffffffffU >> shift) == table_size - 1);
+  assert(table_size - 1 == static_cast<size_t>(
+      MAKE_UINT64_T(0xFFFFFFFF, 0xFFFFFF) >> shift));
  const uint8_t* ip_end = input + block_size;

  int last_distance = -1;
--- a/enc/compress_fragment_two_pass.cc
+++ b/enc/compress_fragment_two_pass.cc
@ -57,8 +57,11 @@ static void BuildAndStoreCommandPrefixCode(
    const uint32_t histogram[128],
    uint8_t depth[128], uint16_t bits[128],
    size_t* storage_ix, uint8_t* storage) {
-  CreateHuffmanTree(histogram, 64, 15, depth);
-  CreateHuffmanTree(&histogram[64], 64, 14, &depth[64]);
+  // Tree size for building a tree over 64 symbols is 2 * 64 + 1.
+  static const size_t kTreeSize = 129;
+  HuffmanTree tree[kTreeSize];
+  CreateHuffmanTree(histogram, 64, 15, tree, depth);
+  CreateHuffmanTree(&histogram[64], 64, 14, tree, &depth[64]);
  // We have to jump through a few hoopes here in order to compute
  // the command bits because the symbols are in a different order than in
  // the full alphabet. This looks complicated, but having the symbols
@ -93,9 +96,9 @@ static void BuildAndStoreCommandPrefixCode(
      cmd_depth[256 + 8 * i] = depth[8 + i];
      cmd_depth[448 + 8 * i] = depth[16 + i];
    }
-    StoreHuffmanTree(cmd_depth, 704, storage_ix, storage);
+    StoreHuffmanTree(cmd_depth, 704, tree, storage_ix, storage);
  }
-  StoreHuffmanTree(&depth[64], 64, storage_ix, storage);
+  StoreHuffmanTree(&depth[64], 64, tree, storage_ix, storage);
 }

 inline void EmitInsertLen(uint32_t insertlen, uint32_t** commands) {
@ -227,7 +230,8 @@ void CreateCommands(const uint8_t* input, size_t block_size, size_t input_size,
  assert(table_size <= (1u << 31));
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
  const size_t shift = 64u - Log2FloorNonZero(table_size);
-  assert(static_cast<size_t>(0xffffffffffffffffU >> shift) == table_size - 1);
+  assert(table_size - 1 == static_cast<size_t>(
+      MAKE_UINT64_T(0xFFFFFFFF, 0xFFFFFF) >> shift));
  const uint8_t* ip_end = input + block_size;
  // "next_emit" is a pointer to the first byte that is not covered by a
  // previous copy. Bytes between "next_emit" and the start of the next copy or
--- a/enc/encode.cc
+++ b/enc/encode.cc
@ -38,7 +38,7 @@ static const int kMinQualityForContextModeling = 5;
 static const int kMinQualityForOptimizeHistograms = 4;
 // For quality 2 there is no block splitting, so we buffer at most this much
 // literals and commands.
-static const int kMaxNumDelayedSymbols = 0x2fff;
+static const size_t kMaxNumDelayedSymbols = 0x2fff;

 #define COPY_ARRAY(dst, src) memcpy(dst, src, sizeof(src));

@ -51,7 +51,7 @@ void RecomputeDistancePrefixes(Command* cmds,
  }
  for (size_t i = 0; i < num_commands; ++i) {
    Command* cmd = &cmds[i];
-    if (cmd->copy_len_ > 0 && cmd->cmd_prefix_ >= 128) {
+    if (cmd->copy_len() && cmd->cmd_prefix_ >= 128) {
      PrefixEncodeCopyDistance(cmd->DistanceCode(),
                               num_direct_distance_codes,
                               distance_postfix_bits,
@ -180,6 +180,250 @@ void InitCommandPrefixCodes(uint8_t cmd_depths[128],
  *cmd_code_numbits = kDefaultCommandCodeNumBits;
 }

+// Decides on the literal context map based on how well the UTF8 prefix
+// of the previous byte predicts the prefix of the next byte. The
+// prediction ability is calculated as shannon entropy. Here we need
+// shannon entropy instead of 'BitsEntropy' since the prefix will be
+// encoded with the remaining 6 bits of the following byte, and
+// BitsEntropy will assume that symbol to be stored alone using Huffman
+// coding.
+void ChooseContextMap(int quality,
+                      uint32_t* bigram_histo,
+                      size_t* num_literal_contexts,
+                      const uint32_t** literal_context_map) {
+  uint32_t monogram_histo[3] = { 0 };  // bigram_histo summed over its three groups, by position i % 3
+  uint32_t two_prefix_histo[6] = { 0 };  // groups 0 and 2 merged in [0..3), group 1 kept in [3..6)
+  size_t total = 0;
+  for (size_t i = 0; i < 9; ++i) {  // bigram_histo is read as 3 groups of 3 counts
+    total += bigram_histo[i];
+    monogram_histo[i % 3] += bigram_histo[i];
+    size_t j = i;
+    if (j >= 6) {
+      j -= 6;
+    }
+    two_prefix_histo[j] += bigram_histo[i];
+  }
+  size_t dummy;
+  double entropy1 = ShannonEntropy(monogram_histo, 3, &dummy);  // one histogram for everything
+  double entropy2 = (ShannonEntropy(two_prefix_histo, 3, &dummy) +  // two histograms
+                     ShannonEntropy(two_prefix_histo + 3, 3, &dummy));
+  double entropy3 = 0;  // three histograms, one per group of bigram_histo
+  for (size_t k = 0; k < 3; ++k) {
+    entropy3 += ShannonEntropy(bigram_histo + 3 * k, 3, &dummy);
+  }
+
+  assert(total != 0);
+  double scale = 1.0 / static_cast<double>(total);  // turns the three totals into per-sample values
+  entropy1 *= scale;
+  entropy2 *= scale;
+  entropy3 *= scale;
+
+  static const uint32_t kStaticContextMapContinuation[64] = {
+    1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+  static const uint32_t kStaticContextMapSimpleUTF8[64] = {
+    0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+  if (quality < 7) {
+    // 3 context models is a bit slower, don't use it at lower qualities.
+    entropy3 = entropy1 * 10;
+  }
+  // If expected savings by symbol are less than 0.2 bits, skip the
+  // context modeling -- in exchange for faster decoding speed.
+  if (entropy1 - entropy2 < 0.2 &&
+      entropy1 - entropy3 < 0.2) {
+    *num_literal_contexts = 1;  // *literal_context_map is left unchanged in this branch
+  } else if (entropy2 - entropy3 < 0.02) {
+    *num_literal_contexts = 2;
+    *literal_context_map = kStaticContextMapSimpleUTF8;
+  } else {
+    *num_literal_contexts = 3;
+    *literal_context_map = kStaticContextMapContinuation;
+  }
+}
+
+void DecideOverLiteralContextModeling(const uint8_t* input,
+                                      size_t start_pos,
+                                      size_t length,
+                                      size_t mask,
+                                      int quality,
+                                      ContextType* literal_context_mode,
+                                      size_t* num_literal_contexts,
+                                      const uint32_t** literal_context_map) {
+  if (quality < kMinQualityForContextModeling || length < 64) {
+    return;  // all three output parameters are left untouched
+  }
+  // Gather bigram data of the UTF8 byte prefixes. To make the analysis of
+  // UTF8 data faster we only examine 64 byte long strides at every 4kB
+  // intervals.
+  const size_t end_pos = start_pos + length;
+  uint32_t bigram_prefix_histo[9] = { 0 };  // indexed by 3 * class(previous byte) + class(current byte)
+  for (; start_pos + 64 <= end_pos; start_pos += 4096) {
+      static const int lut[4] = { 0, 0, 1, 2 };  // top two bits of a byte -> class 0, 0, 1 or 2
+    const size_t stride_end_pos = start_pos + 64;
+    int prev = lut[input[start_pos & mask] >> 6] * 3;  // input is always indexed through pos & mask
+    for (size_t pos = start_pos + 1; pos < stride_end_pos; ++pos) {
+      const uint8_t literal = input[pos & mask];
+      ++bigram_prefix_histo[prev + lut[literal >> 6]];
+      prev = lut[literal >> 6] * 3;
+    }
+  }
+  *literal_context_mode = CONTEXT_UTF8;
+  ChooseContextMap(quality, &bigram_prefix_histo[0], num_literal_contexts,  // sets the remaining two outputs
+                   literal_context_map);
+}
+
+bool ShouldCompress(const uint8_t* data,  // returns false only when sampled literals look incompressible
+                    const size_t mask,
+                    const uint64_t last_flush_pos,
+                    const size_t bytes,
+                    const size_t num_literals,
+                    const size_t num_commands) {
+  if (num_commands < (bytes >> 8) + 2) {  // fewer than bytes / 256 + 2 commands
+    if (num_literals > 0.99 * static_cast<double>(bytes)) {  // and more than 99% of the bytes are literals
+      uint32_t literal_histo[256] = { 0 };
+      static const uint32_t kSampleRate = 13;  // every 13th byte is sampled
+      static const double kMinEntropy = 7.92;  // presumably bits per sampled byte - depends on BitsEntropy units
+      const double bit_cost_threshold =
+          static_cast<double>(bytes) * kMinEntropy / kSampleRate;
+      size_t t = (bytes + kSampleRate - 1) / kSampleRate;  // number of samples, rounded up
+      uint32_t pos = static_cast<uint32_t>(last_flush_pos);  // NOTE(review): truncates to 32 bits; confirm no caller passes a mask wider than 32 bits
+      for (size_t i = 0; i < t; i++) {
+        ++literal_histo[data[pos & mask]];
+        pos += kSampleRate;
+      }
+      if (BitsEntropy(literal_histo, 256) > bit_cost_threshold) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void WriteMetaBlockInternal(const uint8_t* data,  // writes one meta-block for bytes input bytes into storage
+                            const size_t mask,
+                            const uint64_t last_flush_pos,
+                            const size_t bytes,
+                            const bool is_last,
+                            const int quality,
+                            const bool font_mode,
+                            const uint8_t prev_byte,
+                            const uint8_t prev_byte2,
+                            const size_t num_literals,
+                            const size_t num_commands,
+                            Command* commands,
+                            const int* saved_dist_cache,  // copied back into dist_cache on the uncompressed paths
+                            int* dist_cache,
+                            size_t* storage_ix,  // in/out: current bit position in storage
+                            uint8_t* storage) {
+  if (bytes == 0) {  // NOTE(review): is_last is not consulted here; presumably only the last block is empty - confirm
+    // Write the ISLAST and ISEMPTY bits.
+    WriteBits(2, 3, storage_ix, storage);
+    *storage_ix = (*storage_ix + 7u) & ~7u;  // round the bit position up to a multiple of 8
+    return;
+  }
+
+  if (!ShouldCompress(data, mask, last_flush_pos, bytes,
+                      num_literals, num_commands)) {
+    // Restore the distance cache, as its last update by
+    // CreateBackwardReferences is now unused.
+    memcpy(dist_cache, saved_dist_cache, 4 * sizeof(dist_cache[0]));
+    StoreUncompressedMetaBlock(is_last, data,
+                               WrapPosition(last_flush_pos), mask, bytes,
+                               storage_ix, storage);
+    return;
+  }
+
+  const uint8_t last_byte = storage[0];  // saved so the output can be rewound in the fallback at the end
+  const uint8_t last_byte_bits = static_cast<uint8_t>(*storage_ix & 0xff);
+  uint32_t num_direct_distance_codes = 0;
+  uint32_t distance_postfix_bits = 0;
+  if (quality > 9 && font_mode) {  // only here are non-zero distance parameters used
+    num_direct_distance_codes = 12;
+    distance_postfix_bits = 1;
+    RecomputeDistancePrefixes(commands,
+                              num_commands,
+                              num_direct_distance_codes,
+                              distance_postfix_bits);
+  }
+  if (quality == 2) {
+    StoreMetaBlockFast(data, WrapPosition(last_flush_pos),
+                       bytes, mask, is_last,
+                       commands, num_commands,
+                       storage_ix, storage);
+  } else if (quality < kMinQualityForBlockSplit) {
+    StoreMetaBlockTrivial(data, WrapPosition(last_flush_pos),
+                          bytes, mask, is_last,
+                          commands, num_commands,
+                          storage_ix, storage);
+  } else {
+    MetaBlockSplit mb;
+    ContextType literal_context_mode = CONTEXT_UTF8;
+    if (quality <= 9) {
+      size_t num_literal_contexts = 1;
+      const uint32_t* literal_context_map = NULL;  // stays NULL unless a static context map is chosen
+      DecideOverLiteralContextModeling(data, WrapPosition(last_flush_pos),
+                                       bytes, mask,
+                                       quality,
+                                       &literal_context_mode,
+                                       &num_literal_contexts,
+                                       &literal_context_map);
+      if (literal_context_map == NULL) {
+        BuildMetaBlockGreedy(data, WrapPosition(last_flush_pos), mask,
+                             commands, num_commands, &mb);
+      } else {
+        BuildMetaBlockGreedyWithContexts(data, WrapPosition(last_flush_pos),
+                                         mask,
+                                         prev_byte, prev_byte2,
+                                         literal_context_mode,
+                                         num_literal_contexts,
+                                         literal_context_map,
+                                         commands, num_commands,
+                                         &mb);
+      }
+    } else {
+      if (!IsMostlyUTF8(data, WrapPosition(last_flush_pos), mask, bytes,
+                        kMinUTF8Ratio)) {
+        literal_context_mode = CONTEXT_SIGNED;
+      }
+      BuildMetaBlock(data, WrapPosition(last_flush_pos), mask,
+                     prev_byte, prev_byte2,
+                     commands, num_commands,
+                     literal_context_mode,
+                     &mb);
+    }
+    if (quality >= kMinQualityForOptimizeHistograms) {
+      OptimizeHistograms(num_direct_distance_codes,
+                         distance_postfix_bits,
+                         &mb);
+    }
+    StoreMetaBlock(data, WrapPosition(last_flush_pos), bytes, mask,
+                   prev_byte, prev_byte2,
+                   is_last,
+                   num_direct_distance_codes,
+                   distance_postfix_bits,
+                   literal_context_mode,
+                   commands, num_commands,
+                   mb,
+                   storage_ix, storage);
+  }
+  if (bytes + 4 < (*storage_ix >> 3)) {  // the output byte count exceeds the input size plus 4
+    // Restore the distance cache and last byte.
+    memcpy(dist_cache, saved_dist_cache, 4 * sizeof(dist_cache[0]));
+    storage[0] = last_byte;
+    *storage_ix = last_byte_bits;
+    StoreUncompressedMetaBlock(is_last, data,
+                               WrapPosition(last_flush_pos), mask,
+                               bytes, storage_ix, storage);
+  }
+}
+
 BrotliCompressor::BrotliCompressor(BrotliParams params)
    : params_(params),
      hashers_(new Hashers()),
@ -211,7 +455,7 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
  } else if (params_.lgblock == 0) {
    params_.lgblock = 16;
    if (params_.quality >= 9 && params_.lgwin > params_.lgblock) {
-      params_.lgblock = std::min(21, params_.lgwin);
+      params_.lgblock = std::min(18, params_.lgwin);
    }
  } else {
    params_.lgblock = std::min(kMaxInputBlockBits,
@ -403,9 +647,13 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
                           &num_literals_);

  size_t max_length = std::min<size_t>(mask + 1, 1u << kMaxInputBlockBits);
+  const size_t max_literals = max_length / 8;
+  const size_t max_commands = max_length / 8;
  if (!is_last && !force_flush &&
      (params_.quality >= kMinQualityForBlockSplit ||
       (num_literals_ + num_commands_ < kMaxNumDelayedSymbols)) &&
+      num_literals_ < max_literals &&
+      num_commands_ < max_commands &&
      input_pos_ + input_block_size() <= last_flush_pos_ + max_length) {
    // Merge with next input block. Everything will happen later.
    last_processed_pos_ = input_pos_;
@ -421,253 +669,36 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
    last_insert_len_ = 0;
  }

-  WriteMetaBlockInternal(is_last, out_size, output);
-  return true;
-}
-
-// Decide about the context map based on the ability of the prediction
-// ability of the previous byte UTF8-prefix on the next byte. The
-// prediction ability is calculated as shannon entropy. Here we need
-// shannon entropy instead of 'BitsEntropy' since the prefix will be
-// encoded with the remaining 6 bits of the following byte, and
-// BitsEntropy will assume that symbol to be stored alone using Huffman
-// coding.
-void ChooseContextMap(int quality,
-                      uint32_t* bigram_histo,
-                      size_t* num_literal_contexts,
-                      const uint32_t** literal_context_map) {
-  uint32_t monogram_histo[3] = { 0 };
-  uint32_t two_prefix_histo[6] = { 0 };
-  size_t total = 0;
-  for (size_t i = 0; i < 9; ++i) {
-    total += bigram_histo[i];
-    monogram_histo[i % 3] += bigram_histo[i];
-    size_t j = i;
-    if (j >= 6) {
-      j -= 6;
-    }
-    two_prefix_histo[j] += bigram_histo[i];
-  }
-  size_t dummy;
-  double entropy1 = ShannonEntropy(monogram_histo, 3, &dummy);
-  double entropy2 = (ShannonEntropy(two_prefix_histo, 3, &dummy) +
-                     ShannonEntropy(two_prefix_histo + 3, 3, &dummy));
-  double entropy3 = 0;
-  for (size_t k = 0; k < 3; ++k) {
-    entropy3 += ShannonEntropy(bigram_histo + 3 * k, 3, &dummy);
-  }
-
-  assert(total != 0);
-  double scale = 1.0 / static_cast<double>(total);
-  entropy1 *= scale;
-  entropy2 *= scale;
-  entropy3 *= scale;
-
-  static const uint32_t kStaticContextMapContinuation[64] = {
-    1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  };
-  static const uint32_t kStaticContextMapSimpleUTF8[64] = {
-    0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  };
-  if (quality < 7) {
-    // 3 context models is a bit slower, don't use it at lower qualities.
-    entropy3 = entropy1 * 10;
-  }
-  // If expected savings by symbol are less than 0.2 bits, skip the
-  // context modeling -- in exchange for faster decoding speed.
-  if (entropy1 - entropy2 < 0.2 &&
-      entropy1 - entropy3 < 0.2) {
-    *num_literal_contexts = 1;
-  } else if (entropy2 - entropy3 < 0.02) {
-    *num_literal_contexts = 2;
-    *literal_context_map = kStaticContextMapSimpleUTF8;
-  } else {
-    *num_literal_contexts = 3;
-    *literal_context_map = kStaticContextMapContinuation;
-  }
-}
-
-void DecideOverLiteralContextModeling(const uint8_t* input,
-                                      size_t start_pos,
-                                      size_t length,
-                                      size_t mask,
-                                      int quality,
-                                      ContextType* literal_context_mode,
-                                      size_t* num_literal_contexts,
-                                      const uint32_t** literal_context_map) {
-  if (quality < kMinQualityForContextModeling || length < 64) {
-    return;
-  }
-  // Gather bigram data of the UTF8 byte prefixes. To make the analysis of
-  // UTF8 data faster we only examine 64 byte long strides at every 4kB
-  // intervals.
-  const size_t end_pos = start_pos + length;
-  uint32_t bigram_prefix_histo[9] = { 0 };
-  for (; start_pos + 64 <= end_pos; start_pos += 4096) {
-      static const int lut[4] = { 0, 0, 1, 2 };
-    const size_t stride_end_pos = start_pos + 64;
-    int prev = lut[input[start_pos & mask] >> 6] * 3;
-    for (size_t pos = start_pos + 1; pos < stride_end_pos; ++pos) {
-      const uint8_t literal = input[pos & mask];
-      ++bigram_prefix_histo[prev + lut[literal >> 6]];
-      prev = lut[literal >> 6] * 3;
-    }
-  }
-  *literal_context_mode = CONTEXT_UTF8;
-  ChooseContextMap(quality, &bigram_prefix_histo[0], num_literal_contexts,
-                   literal_context_map);
-}
-
-void BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
-                                              size_t* out_size,
-                                              uint8_t** output) {
  if (!is_last && input_pos_ == last_flush_pos_) {
    // We have no new input data and we don't have to finish the stream, so
    // nothing to do.
    *out_size = 0;
-    return;
+    return true;
  }
  assert(input_pos_ >= last_flush_pos_);
  assert(input_pos_ > last_flush_pos_ || is_last);
  assert(input_pos_ - last_flush_pos_ <= 1u << 24);
-  const uint32_t bytes = static_cast<uint32_t>(input_pos_ - last_flush_pos_);
-  const uint8_t* data = ringbuffer_->start();
-  const uint32_t mask = ringbuffer_->mask();
-  const size_t max_out_size = 2 * bytes + 500;
+  const uint32_t metablock_size =
+      static_cast<uint32_t>(input_pos_ - last_flush_pos_);
+  const size_t max_out_size = 2 * metablock_size + 500;
  uint8_t* storage = GetBrotliStorage(max_out_size);
  storage[0] = last_byte_;
  size_t storage_ix = last_byte_bits_;
-
-  bool uncompressed = false;
-  if (num_commands_ < (bytes >> 8) + 2) {
-    if (num_literals_ > 0.99 * static_cast<double>(bytes)) {
-      uint32_t literal_histo[256] = { 0 };
-      static const uint32_t kSampleRate = 13;
-      static const double kMinEntropy = 7.92;
-      const double bit_cost_threshold =
-          static_cast<double>(bytes) * kMinEntropy / kSampleRate;
-      size_t t = (bytes + kSampleRate - 1) / kSampleRate;
-      uint32_t pos = static_cast<uint32_t>(last_flush_pos_);
-      for (size_t i = 0; i < t; i++) {
-        ++literal_histo[data[pos & mask]];
-        pos += kSampleRate;
-      }
-      if (BitsEntropy(literal_histo, 256) > bit_cost_threshold) {
-        uncompressed = true;
-      }
-    }
-  }
-
-  if (bytes == 0) {
-    // Write the ISLAST and ISEMPTY bits.
-    WriteBits(2, 3, &storage_ix, &storage[0]);
-    storage_ix = (storage_ix + 7u) & ~7u;
-  } else if (uncompressed) {
-    // Restore the distance cache, as its last update by
-    // CreateBackwardReferences is now unused.
-    memcpy(dist_cache_, saved_dist_cache_, sizeof(dist_cache_));
-    StoreUncompressedMetaBlock(is_last, data,
-                               WrapPosition(last_flush_pos_), mask, bytes,
-                               &storage_ix,
-                               &storage[0]);
-  } else {
-    uint32_t num_direct_distance_codes = 0;
-    uint32_t distance_postfix_bits = 0;
-    if (params_.quality > 9 && params_.mode == BrotliParams::MODE_FONT) {
-      num_direct_distance_codes = 12;
-      distance_postfix_bits = 1;
-      RecomputeDistancePrefixes(commands_,
-                                num_commands_,
-                                num_direct_distance_codes,
-                                distance_postfix_bits);
-    }
-    if (params_.quality == 2) {
-      StoreMetaBlockFast(data, WrapPosition(last_flush_pos_),
-                         bytes, mask, is_last,
-                         commands_, num_commands_,
-                         &storage_ix,
-                         &storage[0]);
-    } else if (params_.quality < kMinQualityForBlockSplit) {
-      StoreMetaBlockTrivial(data, WrapPosition(last_flush_pos_),
-                            bytes, mask, is_last,
-                            commands_, num_commands_,
-                            &storage_ix,
-                            &storage[0]);
-    } else {
-      MetaBlockSplit mb;
-      ContextType literal_context_mode = CONTEXT_UTF8;
-      if (params_.quality <= 9) {
-        size_t num_literal_contexts = 1;
-        const uint32_t* literal_context_map = NULL;
-        DecideOverLiteralContextModeling(data, WrapPosition(last_flush_pos_),
-                                         bytes, mask,
-                                         params_.quality,
-                                         &literal_context_mode,
-                                         &num_literal_contexts,
-                                         &literal_context_map);
-        if (literal_context_map == NULL) {
-          BuildMetaBlockGreedy(data, WrapPosition(last_flush_pos_), mask,
-                               commands_, num_commands_,
-                               &mb);
-        } else {
-          BuildMetaBlockGreedyWithContexts(data, WrapPosition(last_flush_pos_),
-                                           mask,
-                                           prev_byte_, prev_byte2_,
-                                           literal_context_mode,
-                                           num_literal_contexts,
-                                           literal_context_map,
-                                           commands_, num_commands_,
-                                           &mb);
-        }
-      } else {
-        if (!IsMostlyUTF8(
-            data, WrapPosition(last_flush_pos_), mask, bytes, kMinUTF8Ratio)) {
-          literal_context_mode = CONTEXT_SIGNED;
-        }
-        BuildMetaBlock(data, WrapPosition(last_flush_pos_), mask,
-                       prev_byte_, prev_byte2_,
-                       commands_, num_commands_,
-                       literal_context_mode,
-                       &mb);
-      }
-      if (params_.quality >= kMinQualityForOptimizeHistograms) {
-        OptimizeHistograms(num_direct_distance_codes,
-                           distance_postfix_bits,
-                           &mb);
-      }
-      StoreMetaBlock(data, WrapPosition(last_flush_pos_), bytes, mask,
-                     prev_byte_, prev_byte2_,
-                     is_last,
-                     num_direct_distance_codes,
-                     distance_postfix_bits,
-                     literal_context_mode,
-                     commands_, num_commands_,
-                     mb,
-                     &storage_ix,
-                     &storage[0]);
-    }
-    if (bytes + 4 < (storage_ix >> 3)) {
-      // Restore the distance cache and last byte.
-      memcpy(dist_cache_, saved_dist_cache_, sizeof(dist_cache_));
-      storage[0] = last_byte_;
-      storage_ix = last_byte_bits_;
-      StoreUncompressedMetaBlock(is_last, data,
-                                 WrapPosition(last_flush_pos_), mask,
-                                 bytes, &storage_ix, &storage[0]);
-    }
-  }
+  bool font_mode = params_.mode == BrotliParams::MODE_FONT;
+  WriteMetaBlockInternal(
+      data, mask, last_flush_pos_, metablock_size, is_last, params_.quality,
+      font_mode, prev_byte_, prev_byte2_, num_literals_, num_commands_,
+      commands_, saved_dist_cache_, dist_cache_, &storage_ix, storage);
  last_byte_ = storage[storage_ix >> 3];
  last_byte_bits_ = storage_ix & 7u;
  last_flush_pos_ = input_pos_;
  last_processed_pos_ = input_pos_;
-  prev_byte_ = data[(static_cast<uint32_t>(last_flush_pos_) - 1) & mask];
-  prev_byte2_ = data[(static_cast<uint32_t>(last_flush_pos_) - 2) & mask];
+  if (last_flush_pos_ > 0) {
+    prev_byte_ = data[(static_cast<uint32_t>(last_flush_pos_) - 1) & mask];
+  }
+  if (last_flush_pos_ > 1) {
+    prev_byte2_ = data[(static_cast<uint32_t>(last_flush_pos_) - 2) & mask];
+  }
  num_commands_ = 0;
  num_literals_ = 0;
  // Save the state of the distance cache in case we need to restore it for
@ -675,6 +706,7 @@ void BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
  memcpy(saved_dist_cache_, dist_cache_, sizeof(dist_cache_));
  *output = &storage[0];
  *out_size = storage_ix >> 3;
+  return true;
 }

 bool BrotliCompressor::WriteMetaBlock(const size_t input_size,
@ -739,6 +771,177 @@ bool BrotliCompressor::FinishStream(
  return WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer);
 }

+// One-shot compression of input_buffer[0, input_size) using the H10 hasher
+// together with ZopfliComputeShortestPath / ZopfliCreateCommands. The input
+// is processed block by block and emitted one meta-block at a time.
+//
+// lgwin is the window size exponent written to the stream header.
+// On entry *encoded_size is the capacity of encoded_buffer. On return it is
+// the number of output bytes generated, which is larger than the capacity
+// when the output did not fit.
+// Returns 1 on success and 0 if the output did not fit in encoded_buffer.
+int BrotliCompressBufferQuality10(int lgwin,
+                                  size_t input_size,
+                                  const uint8_t* input_buffer,
+                                  size_t* encoded_size,
+                                  uint8_t* encoded_buffer) {
+  // The input is addressed directly (not through a ring buffer), so the
+  // "mask" passed to the ring-buffer based helpers is as wide as possible.
+  const size_t mask = std::numeric_limits<size_t>::max() >> 1;
+  assert(input_size <= mask + 1);
+  const size_t max_backward_limit = (1 << lgwin) - 16;
+  int dist_cache[4] = { 4, 11, 15, 16 };
+  int saved_dist_cache[4] = { 4, 11, 15, 16 };
+  int ok = 1;
+  const size_t max_out_size = *encoded_size;
+  size_t total_out_size = 0;
+  // The last, possibly partially filled, output byte is carried over from
+  // one meta-block to the next in last_byte / last_byte_bits.
+  uint8_t last_byte;
+  uint8_t last_byte_bits;
+  EncodeWindowBits(lgwin, &last_byte, &last_byte_bits);
+
+  Hashers::H10* hasher = new Hashers::H10;
+  const size_t hasher_eff_size = std::min(input_size, max_backward_limit + 16);
+  hasher->Init(lgwin, 0, hasher_eff_size, true);
+
+  const int lgblock = std::min(18, lgwin);
+  const int lgmetablock = std::min(24, lgwin + 1);
+  const size_t max_block_size = static_cast<size_t>(1) << lgblock;
+  const size_t max_metablock_size = static_cast<size_t>(1) << lgmetablock;
+  const size_t max_literals_per_metablock = max_metablock_size / 8;
+  const size_t max_commands_per_metablock = max_metablock_size / 8;
+  size_t metablock_start = 0;
+  uint8_t prev_byte = 0;
+  uint8_t prev_byte2 = 0;
+  while (ok && metablock_start < input_size) {
+    const size_t metablock_end =
+        std::min(input_size, metablock_start + max_metablock_size);
+    const size_t expected_num_commands =
+        (metablock_end - metablock_start) / 12 + 16;
+    Command* commands = 0;
+    size_t num_commands = 0;
+    size_t last_insert_len = 0;
+    size_t num_literals = 0;
+    size_t metablock_size = 0;
+    size_t cmd_alloc_size = 0;
+
+    for (size_t block_start = metablock_start; block_start < metablock_end; ) {
+      size_t block_size = std::min(metablock_end - block_start, max_block_size);
+      ZopfliNode* nodes = new ZopfliNode[block_size + 1];
+      std::vector<uint32_t> path;
+      hasher->StitchToPreviousBlock(block_size, block_start,
+                                    input_buffer, mask);
+      ZopfliComputeShortestPath(block_size, block_start, input_buffer, mask,
+                                max_backward_limit, dist_cache,
+                                hasher, nodes, &path);
+      // We allocate a command buffer in the first iteration of this loop that
+      // will be likely big enough for the whole metablock, so that for most
+      // inputs we will not have to reallocate in later iterations. We do the
+      // allocation here and not before the loop, because if the input is small,
+      // this will be allocated after the zopfli cost model is freed, so this
+      // will not increase peak memory usage.
+      // TODO: If the first allocation is too small, increase command
+      // buffer size exponentially.
+      // The "+ 1" leaves room for the trailing insert-only command that may
+      // be appended after this loop.
+      size_t new_cmd_alloc_size = std::max(expected_num_commands,
+                                           num_commands + path.size() + 1);
+      if (cmd_alloc_size != new_cmd_alloc_size) {
+        cmd_alloc_size = new_cmd_alloc_size;
+        commands = static_cast<Command*>(
+            realloc(commands, cmd_alloc_size * sizeof(Command)));
+      }
+      ZopfliCreateCommands(block_size, block_start, max_backward_limit, path,
+                           &nodes[0], dist_cache, &last_insert_len,
+                           &commands[num_commands], &num_literals);
+      num_commands += path.size();
+      block_start += block_size;
+      metablock_size += block_size;
+      delete[] nodes;
+      if (num_literals > max_literals_per_metablock ||
+          num_commands > max_commands_per_metablock) {
+        break;
+      }
+    }
+
+    if (last_insert_len > 0) {
+      Command cmd(last_insert_len);
+      commands[num_commands++] = cmd;
+      num_literals += last_insert_len;
+    }
+
+    const bool is_last = (metablock_start + metablock_size == input_size);
+    uint8_t* storage = NULL;
+    size_t storage_ix = last_byte_bits;
+
+    if (metablock_size == 0) {
+      // Write the ISLAST and ISEMPTY bits.
+      storage = new uint8_t[16];
+      storage[0] = last_byte;
+      WriteBits(2, 3, &storage_ix, storage);
+      storage_ix = (storage_ix + 7u) & ~7u;
+    } else if (!ShouldCompress(input_buffer, mask, metablock_start,
+                               metablock_size, num_literals, num_commands)) {
+      // Restore the distance cache, as its last update by
+      // CreateBackwardReferences is now unused.
+      memcpy(dist_cache, saved_dist_cache, 4 * sizeof(dist_cache[0]));
+      storage = new uint8_t[metablock_size + 16];
+      storage[0] = last_byte;
+      StoreUncompressedMetaBlock(is_last, input_buffer,
+                                 metablock_start, mask, metablock_size,
+                                 &storage_ix, storage);
+    } else {
+      uint32_t num_direct_distance_codes = 0;
+      uint32_t distance_postfix_bits = 0;
+      MetaBlockSplit mb;
+      ContextType literal_context_mode = CONTEXT_UTF8;
+      if (!IsMostlyUTF8(
+              input_buffer, metablock_start, mask, metablock_size,
+              kMinUTF8Ratio)) {
+        literal_context_mode = CONTEXT_SIGNED;
+      }
+      BuildMetaBlock(input_buffer, metablock_start, mask,
+                     prev_byte, prev_byte2,
+                     commands, num_commands,
+                     literal_context_mode,
+                     &mb);
+      OptimizeHistograms(num_direct_distance_codes,
+                         distance_postfix_bits,
+                         &mb);
+      const size_t max_out_metablock_size = 2 * metablock_size + 500;
+      storage = new uint8_t[max_out_metablock_size];
+      storage[0] = last_byte;
+      StoreMetaBlock(input_buffer, metablock_start, metablock_size, mask,
+                     prev_byte, prev_byte2,
+                     is_last,
+                     num_direct_distance_codes,
+                     distance_postfix_bits,
+                     literal_context_mode,
+                     commands, num_commands,
+                     mb,
+                     &storage_ix, storage);
+      // If the compressed form came out larger than the raw bytes, discard
+      // it and emit the meta-block uncompressed instead.
+      if (metablock_size + 4 < (storage_ix >> 3)) {
+        // Restore the distance cache and last byte.
+        memcpy(dist_cache, saved_dist_cache, 4 * sizeof(dist_cache[0]));
+        storage[0] = last_byte;
+        storage_ix = last_byte_bits;
+        StoreUncompressedMetaBlock(is_last, input_buffer,
+                                   metablock_start, mask,
+                                   metablock_size, &storage_ix, storage);
+      }
+    }
+    last_byte = storage[storage_ix >> 3];
+    last_byte_bits = storage_ix & 7u;
+    metablock_start += metablock_size;
+    // Only read context bytes that exist: with a one-byte input,
+    // metablock_start - 2 would wrap around and index far outside the input.
+    if (metablock_start > 0) {
+      prev_byte = input_buffer[metablock_start - 1];
+    }
+    if (metablock_start > 1) {
+      prev_byte2 = input_buffer[metablock_start - 2];
+    }
+    // Save the state of the distance cache in case we need to restore it for
+    // emitting an uncompressed block.
+    memcpy(saved_dist_cache, dist_cache, 4 * sizeof(dist_cache[0]));
+
+    const size_t out_size = storage_ix >> 3;
+    total_out_size += out_size;
+    if (total_out_size <= max_out_size) {
+      memcpy(encoded_buffer, storage, out_size);
+      encoded_buffer += out_size;
+    } else {
+      // Output buffer too small: stop after this meta-block and report
+      // failure.
+      ok = 0;
+    }
+    delete[] storage;
+    free(commands);
+  }
+
+  *encoded_size = total_out_size;
+  delete hasher;
+  return ok;
+}
+
 int BrotliCompressBuffer(BrotliParams params,
                         size_t input_size,
                         const uint8_t* input_buffer,
@ -748,6 +951,18 @@ int BrotliCompressBuffer(BrotliParams params,
    // Output buffer needs at least one byte.
    return 0;
  }
+  if (input_size == 0) {
+    // Handle the special case of empty input.
+    *encoded_size = 1;
+    *encoded_buffer = 6;
+    return 1;
+  }
+  if (params.quality == 10) {
+    // TODO(user) Implement this direct path for all quality levels.
+    const int lgwin = std::min(24, std::max(16, params.lgwin));
+    return BrotliCompressBufferQuality10(lgwin, input_size, input_buffer,
+                                         encoded_size, encoded_buffer);
+  }
  BrotliMemIn in(input_buffer, input_size);
  BrotliMemOut out(encoded_buffer, *encoded_size);
  if (!BrotliCompress(params, &in, &out)) {
--- a/enc/encode.h
+++ b/enc/encode.h
@ -140,10 +140,6 @@ class BrotliCompressor {
  int* GetHashTable(int quality,
                    size_t input_size, size_t* table_size);

-  void WriteMetaBlockInternal(const bool is_last,
-                              size_t* out_size,
-                              uint8_t** output);
-
  BrotliParams params_;
  Hashers* hashers_;
  int hash_type_;
--- a/enc/encode_parallel.cc
+++ b/enc/encode_parallel.cc
@ -40,7 +40,7 @@ void RecomputeDistancePrefixes(Command* cmds, size_t num_commands,
  }
  for (size_t i = 0; i < num_commands; ++i) {
    Command* cmd = &cmds[i];
-    if (cmd->copy_len_ > 0 && cmd->cmd_prefix_ >= 128) {
+    if (cmd->copy_len() && cmd->cmd_prefix_ >= 128) {
      PrefixEncodeCopyDistance(cmd->DistanceCode(),
                               num_direct_distance_codes,
                               distance_postfix_bits,
--- a/enc/entropy_encode.cc
+++ b/enc/entropy_encode.cc
@ -10,7 +10,6 @@

 #include <algorithm>
 #include <limits>
-#include <vector>
 #include <cstdlib>

 #include "./histogram.h"
@ -32,6 +31,15 @@ void SetDepth(const HuffmanTree &p,
  }
 }

+// Ordering predicate for the root nodes: the node with the smaller
+// total_count_ comes first; among equal counts, the node with the larger
+// index_right_or_value_ comes first.
+static inline bool SortHuffmanTree(const HuffmanTree& v0,
+                                   const HuffmanTree& v1) {
+  const bool same_count = (v0.total_count_ == v1.total_count_);
+  return same_count ? v0.index_right_or_value_ > v1.index_right_or_value_
+                    : v0.total_count_ < v1.total_count_;
+}
+
 // This function will create a Huffman tree.
 //
 // The catch here is that the tree cannot be arbitrarily deep.
@ -50,30 +58,28 @@ void SetDepth(const HuffmanTree &p,
 void CreateHuffmanTree(const uint32_t *data,
                       const size_t length,
                       const int tree_limit,
+                       HuffmanTree* tree,
                       uint8_t *depth) {
  // For block sizes below 64 kB, we never need to do a second iteration
  // of this loop. Probably all of our block sizes will be smaller than
  // that, so this loop is mostly of academic interest. If we actually
  // would need this, we would be better off with the Katajainen algorithm.
  for (uint32_t count_limit = 1; ; count_limit *= 2) {
-    std::vector<HuffmanTree> tree;
-    tree.reserve(2 * length + 1);
-
+    size_t n = 0;
    for (size_t i = length; i != 0;) {
      --i;
      if (data[i]) {
        const uint32_t count = std::max(data[i], count_limit);
-        tree.push_back(HuffmanTree(count, -1, static_cast<int16_t>(i)));
+        tree[n++] = HuffmanTree(count, -1, static_cast<int16_t>(i));
      }
    }

-    const size_t n = tree.size();
    if (n == 1) {
      depth[tree[0].index_right_or_value_] = 1;      // Only one element.
      break;
    }

-    std::stable_sort(tree.begin(), tree.end(), SortHuffmanTree);
+    std::sort(tree, tree + n, SortHuffmanTree);

    // The nodes are:
    // [0, n): the sorted leaf nodes that we start with.
@ -83,8 +89,8 @@ void CreateHuffmanTree(const uint32_t *data,
    // [2n]: we add a sentinel at the end as well.
    // There will be (2n+1) elements at the end.
    const HuffmanTree sentinel(std::numeric_limits<uint32_t>::max(), -1, -1);
-    tree.push_back(sentinel);
-    tree.push_back(sentinel);
+    tree[n] = sentinel;
+    tree[n + 1] = sentinel;

    size_t i = 0;      // Points to the next leaf node.
    size_t j = n + 1;  // Points to the next non-leaf node.
@ -106,16 +112,15 @@ void CreateHuffmanTree(const uint32_t *data,
      }

      // The sentinel node becomes the parent node.
-      size_t j_end = tree.size() - 1;
+      size_t j_end = 2 * n - k;
      tree[j_end].total_count_ =
          tree[left].total_count_ + tree[right].total_count_;
      tree[j_end].index_left_ = static_cast<int16_t>(left);
      tree[j_end].index_right_or_value_ = static_cast<int16_t>(right);

      // Add back the last sentinel node.
-      tree.push_back(sentinel);
+      tree[j_end + 1] = sentinel;
    }
-    assert(tree.size() == 2 * n + 1);
    SetDepth(tree[2 * n - 1], &tree[0], depth, 0);

    // We need to pack the Huffman tree in tree_limit bits.
@ -127,12 +132,12 @@ void CreateHuffmanTree(const uint32_t *data,
  }
 }

-void Reverse(std::vector<uint8_t>* v, size_t start, size_t end) {
+void Reverse(uint8_t* v, size_t start, size_t end) {
  --end;
  while (start < end) {
-    uint8_t tmp = (*v)[start];
-    (*v)[start] = (*v)[end];
-    (*v)[end] = tmp;
+    uint8_t tmp = v[start];
+    v[start] = v[end];
+    v[end] = tmp;
    ++start;
    --end;
  }
@ -142,79 +147,88 @@ void WriteHuffmanTreeRepetitions(
    const uint8_t previous_value,
    const uint8_t value,
    size_t repetitions,
-    std::vector<uint8_t> *tree,
-    std::vector<uint8_t> *extra_bits_data) {
+    size_t* tree_size,
+    uint8_t* tree,
+    uint8_t* extra_bits_data) {
  assert(repetitions > 0);
  if (previous_value != value) {
-    tree->push_back(value);
-    extra_bits_data->push_back(0);
+    tree[*tree_size] = value;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
    --repetitions;
  }
  if (repetitions == 7) {
-    tree->push_back(value);
-    extra_bits_data->push_back(0);
+    tree[*tree_size] = value;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
    --repetitions;
  }
  if (repetitions < 3) {
    for (size_t i = 0; i < repetitions; ++i) {
-      tree->push_back(value);
-      extra_bits_data->push_back(0);
+      tree[*tree_size] = value;
+      extra_bits_data[*tree_size] = 0;
+      ++(*tree_size);
    }
  } else {
    repetitions -= 3;
-    size_t start = tree->size();
+    size_t start = *tree_size;
    while (true) {
-      tree->push_back(16);
-      extra_bits_data->push_back(repetitions & 0x3);
+      tree[*tree_size] = 16;
+      extra_bits_data[*tree_size] = repetitions & 0x3;
+      ++(*tree_size);
      repetitions >>= 2;
      if (repetitions == 0) {
        break;
      }
      --repetitions;
    }
-    Reverse(tree, start, tree->size());
-    Reverse(extra_bits_data, start, tree->size());
+    Reverse(tree, start, *tree_size);
+    Reverse(extra_bits_data, start, *tree_size);
  }
 }

 void WriteHuffmanTreeRepetitionsZeros(
    size_t repetitions,
-    std::vector<uint8_t> *tree,
-    std::vector<uint8_t> *extra_bits_data) {
+    size_t* tree_size,
+    uint8_t* tree,
+    uint8_t* extra_bits_data) {
  if (repetitions == 11) {
-    tree->push_back(0);
-    extra_bits_data->push_back(0);
+    tree[*tree_size] = 0;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
    --repetitions;
  }
  if (repetitions < 3) {
    for (size_t i = 0; i < repetitions; ++i) {
-      tree->push_back(0);
-      extra_bits_data->push_back(0);
+      tree[*tree_size] = 0;
+      extra_bits_data[*tree_size] = 0;
+      ++(*tree_size);
    }
  } else {
    repetitions -= 3;
-    size_t start = tree->size();
+    size_t start = *tree_size;
    while (true) {
-      tree->push_back(17);
-      extra_bits_data->push_back(repetitions & 0x7);
+      tree[*tree_size] = 17;
+      extra_bits_data[*tree_size] = repetitions & 0x7;
+      ++(*tree_size);
      repetitions >>= 3;
      if (repetitions == 0) {
        break;
      }
      --repetitions;
    }
-    Reverse(tree, start, tree->size());
-    Reverse(extra_bits_data, start, tree->size());
+    Reverse(tree, start, *tree_size);
+    Reverse(extra_bits_data, start, *tree_size);
  }
 }

-bool OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts) {
+void OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts,
+                                 uint8_t* good_for_rle) {
  size_t nonzero_count = 0;
  size_t stride;
  size_t limit;
  size_t sum;
  const size_t streak_limit = 1240;
-  uint8_t* good_for_rle;
  // Let's make the Huffman code more compatible with rle encoding.
  size_t i;
  for (i = 0; i < length; i++) {
@ -223,13 +237,13 @@ bool OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts) {
    }
  }
  if (nonzero_count < 16) {
-    return 1;
+    return;
  }
  while (length != 0 && counts[length - 1] == 0) {
    --length;
  }
  if (length == 0) {
-    return 1;  // All zeros.
+    return;  // All zeros.
  }
  // Now counts[0..length - 1] does not have trailing zeros.
  {
@ -245,7 +259,7 @@ bool OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts) {
    }
    if (nonzeros < 5) {
      // Small histogram will model it well.
-      return 1;
+      return;
    }
    size_t zeros = length - nonzeros;
    if (smallest_nonzero < 4) {
@ -258,15 +272,12 @@ bool OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts) {
      }
    }
    if (nonzeros < 28) {
-      return 1;
+      return;
    }
  }
  // 2) Let's mark all population counts that already can be encoded
  // with an rle code.
-  good_for_rle = (uint8_t*)calloc(length, 1);
-  if (good_for_rle == NULL) {
-    return 0;
-  }
+  memset(good_for_rle, 0, length);
  {
    // Let's not spoil any of the existing good rle codes.
    // Mark any seq of 0's that is longer as 5 as a good_for_rle.
@ -340,8 +351,6 @@ bool OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts) {
      }
    }
  }
-  free(good_for_rle);
-  return 1;
 }

 static void DecideOverRleUse(const uint8_t* depth, const size_t length,
@ -373,8 +382,9 @@ static void DecideOverRleUse(const uint8_t* depth, const size_t length,

 void WriteHuffmanTree(const uint8_t* depth,
                      size_t length,
-                      std::vector<uint8_t> *tree,
-                      std::vector<uint8_t> *extra_bits_data) {
+                      size_t* tree_size,
+                      uint8_t* tree,
+                      uint8_t* extra_bits_data) {
  uint8_t previous_value = 8;

  // Throw away trailing zeros.
@ -408,10 +418,11 @@ void WriteHuffmanTree(const uint8_t* depth,
      }
    }
    if (value == 0) {
-      WriteHuffmanTreeRepetitionsZeros(reps, tree, extra_bits_data);
+      WriteHuffmanTreeRepetitionsZeros(reps, tree_size, tree, extra_bits_data);
    } else {
      WriteHuffmanTreeRepetitions(previous_value,
-                                  value, reps, tree, extra_bits_data);
+                                  value, reps, tree_size,
+                                  tree, extra_bits_data);
      previous_value = value;
    }
    i += reps;
--- a/enc/entropy_encode.h
+++ b/enc/entropy_encode.h
@ -10,7 +10,6 @@
 #define BROTLI_ENC_ENTROPY_ENCODE_H_

 #include <string.h>
-#include <vector>
 #include "./histogram.h"
 #include "./prefix.h"
 #include "./types.h"
@ -19,6 +18,7 @@ namespace brotli {

 // A node of a Huffman tree.
 struct HuffmanTree {
+  HuffmanTree() {}
  HuffmanTree(uint32_t count, int16_t left, int16_t right)
      : total_count_(count),
        index_left_(left),
@ -29,11 +29,6 @@ struct HuffmanTree {
  int16_t index_right_or_value_;
 };

-// Sort the root nodes, least popular first.
-inline bool SortHuffmanTree(const HuffmanTree &v0, const HuffmanTree &v1) {
-  return v0.total_count_ < v1.total_count_;
-}
-
 void SetDepth(const HuffmanTree &p, HuffmanTree *pool,
              uint8_t *depth, uint8_t level);

@ -45,10 +40,14 @@ void SetDepth(const HuffmanTree &p, HuffmanTree *pool,
 // The depth contains the tree, i.e., how many bits are used for
 // the symbol.
 //
+// The actual Huffman tree is constructed in the tree[] array, which has to
+// be at least 2 * length + 1 long.
+//
 // See http://en.wikipedia.org/wiki/Huffman_coding
 void CreateHuffmanTree(const uint32_t *data,
                       const size_t length,
                       const int tree_limit,
+                       HuffmanTree* tree,
                       uint8_t *depth);

 // Change the population counts in a way that the consequent
@ -57,15 +56,18 @@ void CreateHuffmanTree(const uint32_t *data,
 //
 // length contains the size of the histogram.
 // counts contains the population counts.
-bool OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts);
+// good_for_rle is a buffer of at least length size
+void OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts,
+                                 uint8_t* good_for_rle);

 // Write a Huffman tree from bit depths into the bitstream representation
 // of a Huffman tree. The generated Huffman tree is to be compressed once
 // more using a Huffman tree
 void WriteHuffmanTree(const uint8_t* depth,
                      size_t num,
-                      std::vector<uint8_t> *tree,
-                      std::vector<uint8_t> *extra_bits_data);
+                      size_t* tree_size,
+                      uint8_t* tree,
+                      uint8_t* extra_bits_data);

 // Get the actual bit values for a tree of bit depths.
 void ConvertBitDepthsToSymbols(const uint8_t *depth,
--- a/enc/entropy_encode_static.h
+++ b/enc/entropy_encode_static.h
@ -78,7 +78,7 @@ static const uint32_t kCodeLengthBits[18] = {
 };

 inline void StoreStaticCodeLengthCode(size_t* storage_ix, uint8_t* storage) {
-  WriteBits(40, 0x000000ff55555554U, storage_ix, storage);
+  WriteBits(40, MAKE_UINT64_T(0xff, 0x55555554), storage_ix, storage);
 }

 static const uint64_t kZeroRepsBits[704] = {
--- a/enc/hash.h
+++ b/enc/hash.h
@ -14,7 +14,6 @@
 #include <algorithm>
 #include <cstring>
 #include <limits>
-#include <vector>

 #include "./dictionary_hash.h"
 #include "./fast_log.h"
@ -278,7 +277,7 @@ class HashLongestMatchQuickly {
          if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
            const size_t transform_id = kCutoffTransforms[len - matchlen];
            const size_t word_id =
-                transform_id * (1 << kBrotliDictionarySizeBitsByLength[len]) +
+                transform_id * (1u << kBrotliDictionarySizeBitsByLength[len]) +
                dist;
            const size_t backward = max_backward + word_id + 1;
            const double score = BackwardReferenceScore(matchlen, backward);
@ -574,8 +573,10 @@ class HashLongestMatch {
    }
    buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix);
    ++num_[key];
-    std::vector<uint32_t> dict_matches(kMaxDictionaryMatchLen + 1,
-                                       kInvalidMatch);
+    uint32_t dict_matches[kMaxDictionaryMatchLen + 1];
+    for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) {
+      dict_matches[i] = kInvalidMatch;
+    }
    size_t minlen = std::max<size_t>(4, best_len + 1);
    if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
                                       &dict_matches[0])) {
@ -706,8 +707,10 @@ class HashToBinaryTree {
      matches = StoreAndFindMatches(data, cur_ix, ring_buffer_mask,
                                    max_length, &best_len, matches);
    }
-    std::vector<uint32_t> dict_matches(kMaxDictionaryMatchLen + 1,
-                                       kInvalidMatch);
+    uint32_t dict_matches[kMaxDictionaryMatchLen + 1];
+    for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) {
+      dict_matches[i] = kInvalidMatch;
+    }
    size_t minlen = std::max<size_t>(4, best_len + 1);
    if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
                                       &dict_matches[0])) {
@ -725,15 +728,34 @@ class HashToBinaryTree {

  // Stores the hash of the next 4 bytes and re-roots the binary tree at the
  // current sequence, without returning any matches.
+  // REQUIRES: cur_ix + kMaxTreeCompLength <= end-of-current-block
  void Store(const uint8_t* data,
             const size_t ring_buffer_mask,
-             const size_t cur_ix,
-             const size_t max_length) {
+             const size_t cur_ix) {
    size_t best_len = 0;
-    StoreAndFindMatches(data, cur_ix, ring_buffer_mask, max_length,
+    StoreAndFindMatches(data, cur_ix, ring_buffer_mask, kMaxTreeCompLength,
                        &best_len, NULL);
  }

+  // Stores the positions just before `position` (the start of a new block of
+  // `num_bytes` bytes) that were left out while the previous block was
+  // processed. Does nothing when the new block has fewer than 3 bytes or
+  // when `position` is smaller than kMaxTreeCompLength.
+  void StitchToPreviousBlock(size_t num_bytes,
+                             size_t position,
+                             const uint8_t* ringbuffer,
+                             size_t ringbuffer_mask) {
+    if (num_bytes < 3 || position < kMaxTreeCompLength) {
+      return;
+    }
+    // The last `kMaxTreeCompLength - 1` positions of the previous block could
+    // not be stored before, since they require knowledge of both the
+    // previous and the current block.
+    const size_t first = position - kMaxTreeCompLength + 1;
+    const size_t last = std::min(position, first + num_bytes);
+    for (size_t ix = first; ix < last; ++ix) {
+      // Here ix + kMaxTreeCompLength <= position + num_bytes, i.e. the end of
+      // the current block, and we have at least kMaxTreeCompLength tail in
+      // the ringbuffer, as Store() requires.
+      Store(ringbuffer, ringbuffer_mask, ix);
+    }
+  }
+
  static const size_t kMaxNumMatches = 64 + kMaxTreeSearchDepth;

 private:
@ -928,8 +950,7 @@ struct Hashers {
      case 10:
        hash_h10->Init(lgwin, 0, size, false);
        for (size_t i = 0; i + kMaxTreeCompLength - 1 < size; ++i) {
-          hash_h10->Store(dict, std::numeric_limits<size_t>::max(),
-                          i, size - i);
+          hash_h10->Store(dict, std::numeric_limits<size_t>::max(), i);
        }
        break;
      default: break;
--- a/enc/histogram.cc
+++ b/enc/histogram.cc
@ -50,8 +50,8 @@ void BuildHistograms(
      prev_byte = ringbuffer[pos & mask];
      ++pos;
    }
-    pos += cmd.copy_len_;
-    if (cmd.copy_len_ > 0) {
+    pos += cmd.copy_len();
+    if (cmd.copy_len()) {
      prev_byte2 = ringbuffer[(pos - 2) & mask];
      prev_byte = ringbuffer[(pos - 1) & mask];
      if (cmd.cmd_prefix_ >= 128) {
--- a/enc/metablock.cc
+++ b/enc/metablock.cc
@ -258,8 +258,8 @@ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
      lit_blocks.AddSymbol(ringbuffer[pos & mask]);
      ++pos;
    }
-    pos += cmd.copy_len_;
-    if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
+    pos += cmd.copy_len();
+    if (cmd.copy_len() && cmd.cmd_prefix_ >= 128) {
      dist_blocks.AddSymbol(cmd.dist_prefix_);
    }
  }
@ -488,8 +488,8 @@ void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
      prev_byte = literal;
      ++pos;
    }
-    pos += cmd.copy_len_;
-    if (cmd.copy_len_ > 0) {
+    pos += cmd.copy_len();
+    if (cmd.copy_len()) {
      prev_byte2 = ringbuffer[(pos - 2) & mask];
      prev_byte = ringbuffer[(pos - 1) & mask];
      if (cmd.cmd_prefix_ >= 128) {
@ -515,20 +515,25 @@ void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
 void OptimizeHistograms(size_t num_direct_distance_codes,
                        size_t distance_postfix_bits,
                        MetaBlockSplit* mb) {
+  uint8_t* good_for_rle = new uint8_t[kNumCommandPrefixes];
  for (size_t i = 0; i < mb->literal_histograms.size(); ++i) {
-    OptimizeHuffmanCountsForRle(256, &mb->literal_histograms[i].data_[0]);
+    OptimizeHuffmanCountsForRle(256, &mb->literal_histograms[i].data_[0],
+                                good_for_rle);
  }
  for (size_t i = 0; i < mb->command_histograms.size(); ++i) {
    OptimizeHuffmanCountsForRle(kNumCommandPrefixes,
-                                &mb->command_histograms[i].data_[0]);
+                                &mb->command_histograms[i].data_[0],
+                                good_for_rle);
  }
  size_t num_distance_codes =
      kNumDistanceShortCodes + num_direct_distance_codes +
      (48u << distance_postfix_bits);
  for (size_t i = 0; i < mb->distance_histograms.size(); ++i) {
    OptimizeHuffmanCountsForRle(num_distance_codes,
-                                &mb->distance_histograms[i].data_[0]);
+                                &mb->distance_histograms[i].data_[0],
+                                good_for_rle);
  }
+  delete[] good_for_rle;
 }

 }  // namespace brotli
--- a/enc/prefix.h
+++ b/enc/prefix.h
@ -60,7 +60,7 @@ inline void PrefixEncodeCopyDistance(size_t distance_code,
    return;
  }
  distance_code -= kNumDistanceShortCodes + num_direct_codes;  /* >= 0 */
-  distance_code += (1 << (postfix_bits + 2));  /* > 0 */
+  distance_code += (1u << (postfix_bits + 2u));  /* > 0 */
  size_t bucket = Log2FloorNonZero(distance_code) - 1;
  size_t postfix_mask = (1 << postfix_bits) - 1;
  size_t postfix = distance_code & postfix_mask;
--- a/enc/ringbuffer.h
+++ b/enc/ringbuffer.h
@ -31,24 +31,52 @@ class RingBuffer {
      : size_(1u << window_bits),
        mask_((1u << window_bits) - 1),
        tail_size_(1u << tail_bits),
-        pos_(0) {
-    static const size_t kSlackForEightByteHashingEverywhere = 7;
-    const size_t buflen = size_ + tail_size_;
-    data_ = new uint8_t[2 + buflen + kSlackForEightByteHashingEverywhere];
-    buffer_ = data_ + 2;
-    for (size_t i = 0; i < kSlackForEightByteHashingEverywhere; ++i) {
-      buffer_[buflen + i] = 0;
-    }
-    // Initialize the last two bytes and their copy to zero.
-    buffer_[-2] = buffer_[size_ - 2] = 0;
-    buffer_[-1] = buffer_[size_ - 1] = 0;
-  }
+        total_size_(size_ + tail_size_),
+        cur_size_(0),
+        pos_(0),
+        data_(0),
+        buffer_(0) {}
+
  ~RingBuffer(void) {
-    delete [] data_;
+    free(data_);
+  }
+
+  // (Re)allocates data_ so that it holds buflen bytes plus a slack region
+  // before and after, and fills both slack regions with zeros.
+  inline void InitBuffer(const uint32_t buflen) {
+    static const size_t kSlackForEightByteHashingEverywhere = 7;
+    const size_t alloc_size =
+        2 + buflen + kSlackForEightByteHashingEverywhere;
+    data_ = static_cast<uint8_t*>(realloc(data_, alloc_size));
+    cur_size_ = buflen;
+    // buffer_ starts two bytes into the allocation; those two leading bytes
+    // are the slack region in front of the data.
+    buffer_ = data_ + 2;
+    buffer_[-1] = 0;
+    buffer_[-2] = 0;
+    // Zero the slack region that follows the cur_size_ data bytes.
+    uint8_t* const back_slack = buffer_ + cur_size_;
+    for (size_t k = 0; k != kSlackForEightByteHashingEverywhere; ++k) {
+      back_slack[k] = 0;
+    }
+  }

  // Push bytes into the ring buffer.
  void Write(const uint8_t *bytes, size_t n) {
+    if (pos_ == 0 && n < tail_size_) {
+      // Special case for the first write: to process the first block, we don't
+      // need to allocate the whole ringbuffer and we don't need the tail
+      // either. However, we do this memory usage optimization only if the
+      // first write is less than the tail size, which is also the input block
+      // size, otherwise it is likely that other blocks will follow and we
+      // will need to reallocate to the full size anyway.
+      pos_ = static_cast<uint32_t>(n);
+      InitBuffer(pos_);
+      memcpy(buffer_, bytes, n);
+      return;
+    }
+    if (cur_size_ < total_size_) {
+      // Lazily allocate the full buffer.
+      InitBuffer(total_size_);
+      // Initialize the last two bytes to zero, so that we don't have to worry
+      // later when we copy the last two bytes to the first two positions.
+      buffer_[size_ - 2] = 0;
+      buffer_[size_ - 1] = 0;
+    }
    const size_t masked_pos = pos_ & mask_;
    // The length of the writes is limited so that we do not need to worry
    // about a write
@ -60,7 +88,7 @@ class RingBuffer {
      // Split into two writes.
      // Copy into the end of the buffer, including the tail buffer.
      memcpy(&buffer_[masked_pos], bytes,
-             std::min(n, (size_ + tail_size_) - masked_pos));
+             std::min(n, total_size_ - masked_pos));
      // Copy into the beginning of the buffer
      memcpy(&buffer_[0], bytes + (size_ - masked_pos),
             n - (size_ - masked_pos));
@ -100,7 +128,9 @@ class RingBuffer {
  const uint32_t size_;
  const uint32_t mask_;
  const uint32_t tail_size_;
+  const uint32_t total_size_;

+  uint32_t cur_size_;
  // Position to write in the ring buffer.
  uint32_t pos_;
  // The actual ring buffer containing the copy of the last two bytes, the data,
--- a/enc/transform.h
+++ b/enc/transform.h
@ -197,7 +197,7 @@ static size_t ToUpperCase(uint8_t *p, size_t len) {
 inline std::string TransformWord(
    WordTransformType transform_type, const uint8_t* word, size_t len) {
  if (transform_type <= kOmitLast9) {
-    if (len <= transform_type) {
+    if (len <= static_cast<size_t>(transform_type)) {
      return std::string();
    }
    return std::string(word, word + len - transform_type);
--- a/enc/types.h
+++ b/enc/types.h
@ -24,4 +24,6 @@ typedef __int64 int64_t;
 #include <stdint.h>
 #endif  /* defined(_MSC_VER) && (_MSC_VER < 1600) */

+/* Combines two halves into one 64-bit value: `high` is converted to uint64_t
+   and shifted into bits 32..63, and `low` is OR-ed into the result. Both
+   arguments are parenthesized so that expression arguments (for example
+   `c ? a : b`) expand with the intended precedence. */
+#define MAKE_UINT64_T(high, low) ((((uint64_t)(high)) << 32) | (low))
+
 #endif  /* BROTLI_ENC_TYPES_H_ */