Mirror of https://github.com/google/brotli.git (synced 2024-11-22 03:30:07 +00:00)
Push updates to the Brotli encoder and decoder upstream.
This commit contains a batch of changes that were made to the Brotli
library since March 2014. Most important changes:

* Fix BrotliDecompressedSize() to work for an uncompressed plus an empty
  meta-block.
* Move serialization functions into their own file.
* Fix storing of the meta-block header for the last empty meta-block.
* Add a fast version of the hasher.
This commit is contained in: parent e3980e3bd7, commit a597255213

dec/decode.c (71 lines changed)
@@ -50,8 +50,9 @@ static const int kDistanceContextBits = 2;

 #define HUFFMAN_TABLE_BITS      8
 #define HUFFMAN_TABLE_MASK      0xff
-/* This is a rough estimate, not an exact bound. */
-#define HUFFMAN_MAX_TABLE_SIZE  2048
+/* Maximum possible Huffman table size for an alphabet size of 704, max code
+ * length 15 and root table bits 8. */
+#define HUFFMAN_MAX_TABLE_SIZE  1080

 #define CODE_LENGTH_CODES 18
 static const uint8_t kCodeLengthCodeOrder[CODE_LENGTH_CODES] = {
@@ -633,22 +634,62 @@ int CopyUncompressedBlockToOutput(BrotliOutput output, int len, int pos,
 int BrotliDecompressedSize(size_t encoded_size,
                            const uint8_t* encoded_buffer,
                            size_t* decoded_size) {
-  BrotliMemInput memin;
-  BrotliInput input = BrotliInitMemInput(encoded_buffer, encoded_size, &memin);
-  BrotliBitReader br;
-  int meta_block_len;
-  int input_end;
-  int is_uncompressed;
-  if (!BrotliInitBitReader(&br, input)) {
+  int i;
+  uint64_t val = 0;
+  int bit_pos = 0;
+  int is_last;
+  int is_uncompressed = 0;
+  int size_nibbles;
+  int meta_block_len = 0;
+  if (encoded_size == 0) {
     return 0;
   }
-  DecodeWindowBits(&br);
-  DecodeMetaBlockLength(&br, &meta_block_len, &input_end, &is_uncompressed);
-  if (!input_end) {
-    return 0;
+  /* Look at the first 8 bytes, it is enough to decode the length of the first
+     meta-block. */
+  for (i = 0; i < encoded_size && i < 8; ++i) {
+    val |= (uint64_t)encoded_buffer[i] << (8 * i);
   }
-  *decoded_size = (size_t)meta_block_len;
-  return 1;
+  /* Skip the window bits. */
+  bit_pos += (val & 1) ? 4 : 1;
+  /* Decode the ISLAST bit. */
+  is_last = (val >> bit_pos) & 1;
+  ++bit_pos;
+  if (is_last) {
+    /* Decode the ISEMPTY bit, if it is set to 1, we are done. */
+    if ((val >> bit_pos) & 1) {
+      *decoded_size = 0;
+      return 1;
+    }
+    ++bit_pos;
+  }
+  /* Decode the length of the first meta-block. */
+  size_nibbles = (int)((val >> bit_pos) & 3) + 4;
+  bit_pos += 2;
+  for (i = 0; i < size_nibbles; ++i) {
+    meta_block_len |= (int)((val >> bit_pos) & 0xf) << (4 * i);
+    bit_pos += 4;
+  }
+  ++meta_block_len;
+  if (is_last) {
+    /* If this meta-block is the only one, we are done. */
+    *decoded_size = (size_t)meta_block_len;
+    return 1;
+  }
+  is_uncompressed = (val >> bit_pos) & 1;
+  ++bit_pos;
+  if (is_uncompressed) {
+    /* If the first meta-block is uncompressed, we skip it and look at the
+       first two bits (ISLAST and ISEMPTY) of the next meta-block, and if
+       both are set to 1, we have a stream with an uncompressed meta-block
+       followed by an empty one, so the decompressed size is the size of the
+       first meta-block. */
+    int offset = ((bit_pos + 7) >> 3) + meta_block_len;
+    if (offset < encoded_size && ((encoded_buffer[offset] & 3) == 3)) {
+      *decoded_size = (size_t)meta_block_len;
+      return 1;
+    }
+  }
+  return 0;
 }

 int BrotliDecompressBuffer(size_t encoded_size,
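Annotation (not part of the commit): a worked example of the new header peek, with byte values chosen purely for illustration. For encoded_buffer = {0xE2, 0x5F}, val = 0x5FE2 and the bits unpack as:

    val & 1 == 0                        -> one-bit WBITS code, bit_pos = 1
    ISLAST  = (val >> 1) & 1 == 1
    ISEMPTY = (val >> 2) & 1 == 0       -> bit_pos = 3
    MNIBBLES - 4 = (val >> 3) & 3 == 0  -> 4 nibbles, bit_pos = 5
    nibbles = (val >> 5) & 0xffff == 0x02FF

so meta_block_len ends up as 0x2FF, *decoded_size = 0x2FF + 1 = 768, and the function returns 1.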
@@ -27,7 +27,8 @@ extern "C" {

 /* Sets *decoded_size to the decompressed size of the given encoded stream. */
 /* This function only works if the encoded buffer has a single meta block, */
-/* and this meta block must have the "is last" bit set. */
+/* or if it has two meta-blocks, where the first is uncompressed and the */
+/* second is empty. */
 /* Returns 1 on success, 0 on failure. */
 int BrotliDecompressedSize(size_t encoded_size,
                            const uint8_t* encoded_buffer,
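Annotation (not part of the commit): a minimal sketch of how the declaration above might be called; the wrapper function below is hypothetical, not part of the library.

    #include <stddef.h>
    #include <stdint.h>
    #include "dec/decode.h"

    /* Returns the decompressed size when the fast path applies (single
       meta-block, or uncompressed + empty), and 0 otherwise. */
    size_t PeekDecodedSize(const uint8_t* encoded, size_t encoded_size) {
      size_t decoded_size = 0;
      if (BrotliDecompressedSize(encoded_size, encoded, &decoded_size)) {
        return decoded_size;
      }
      return 0;
    }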
@@ -15,8 +15,8 @@
    Size-checked memory allocation.
 */

-#ifndef BROTLI_UTILS_UTILS_H_
-#define BROTLI_UTILS_UTILS_H_
+#ifndef BROTLI_DEC_SAFE_MALLOC_H_
+#define BROTLI_DEC_SAFE_MALLOC_H_

 #include <assert.h>

@@ -42,4 +42,4 @@ void* BrotliSafeMalloc(uint64_t nmemb, size_t size);

 } /* extern "C" */
 #endif

-#endif  /* BROTLI_UTILS_UTILS_H_ */
+#endif  /* BROTLI_DEC_SAFE_MALLOC_H_ */
@@ -23,173 +23,230 @@

 namespace brotli {

-template<typename Hasher>
+template<typename Hasher, bool kUseCostModel, bool kUseDictionary>
 void CreateBackwardReferences(size_t num_bytes,
                               size_t position,
                               const uint8_t* ringbuffer,
-                              const float* literal_cost,
                               size_t ringbuffer_mask,
+                              const float* literal_cost,
+                              size_t literal_cost_mask,
                               const size_t max_backward_limit,
+                              const double base_min_score,
+                              const int quality,
                               Hasher* hasher,
-                              std::vector<Command>* commands) {
-  // Length heuristic that seems to help probably by better selection
-  // of lazy matches of similar lengths.
-  int insert_length = 0;
+                              int* dist_cache,
+                              int* last_insert_len,
+                              Command* commands,
+                              int* num_commands) {
+  if (num_bytes >= 3 && position >= 3) {
+    // Prepare the hashes for three last bytes of the last write.
+    // These could not be calculated before, since they require knowledge
+    // of both the previous and the current block.
+    hasher->Store(&ringbuffer[(position - 3) & ringbuffer_mask],
+                  position - 3);
+    hasher->Store(&ringbuffer[(position - 2) & ringbuffer_mask],
+                  position - 2);
+    hasher->Store(&ringbuffer[(position - 1) & ringbuffer_mask],
+                  position - 1);
+  }
+  const Command * const orig_commands = commands;
+  int insert_length = *last_insert_len;
   size_t i = position & ringbuffer_mask;
   const int i_diff = position - i;
   const size_t i_end = i + num_bytes;

-  const int random_heuristics_window_size = 512;
+  // For speed up heuristics for random data.
+  const int random_heuristics_window_size = quality < 9 ? 64 : 512;
   int apply_random_heuristics = i + random_heuristics_window_size;

-  double average_cost = 0.0;
-  for (int k = position; k < position + num_bytes; ++k) {
-    average_cost += literal_cost[k & ringbuffer_mask];
-  }
-  average_cost /= num_bytes;
-  hasher->set_average_cost(average_cost);
+  double average_cost = 5.4;
+  if (kUseCostModel) {
+    average_cost = 0.0;
+    for (int k = position; k < position + num_bytes; ++k) {
+      average_cost += literal_cost[k & literal_cost_mask];
+    }
+    average_cost /= num_bytes;
+  }

   // M1 match is for considering for two repeated copies, if moving
   // one literal from the previous copy to the current one allows the
   // current copy to be more efficient (because the way static dictionary
   // codes words). M1 matching improves text compression density by ~0.15 %.
   bool match_found_M1 = false;
-  size_t best_len_M1 = 0;
-  size_t best_len_code_M1 = 0;
-  size_t best_dist_M1 = 0;
+  int best_len_M1 = 0;
+  int best_len_code_M1 = 0;
+  int best_dist_M1 = 0;
   double best_score_M1 = 0;
-  while (i + 2 < i_end) {
-    size_t best_len = 0;
-    size_t best_len_code = 0;
-    size_t best_dist = 0;
-    double best_score = 0;
+  while (i + 3 < i_end) {
+    int max_length = i_end - i;
     size_t max_distance = std::min(i + i_diff, max_backward_limit);
-    bool in_dictionary;
-    hasher->set_insert_length(insert_length);
+    double min_score = base_min_score;
+    if (kUseCostModel && insert_length < 8) {
+      double cost_diff[8] =
+          { 0.1, 0.038, 0.019, 0.013, 0.001, 0.001, 0.001, 0.001 };
+      min_score += cost_diff[insert_length];
+    }
+    int best_len = 0;
+    int best_len_code = 0;
+    int best_dist = 0;
+    double best_score = min_score;
     bool match_found = hasher->FindLongestMatch(
-        ringbuffer, literal_cost, ringbuffer_mask,
-        i + i_diff, i_end - i, max_distance,
-        &best_len, &best_len_code, &best_dist, &best_score,
-        &in_dictionary);
-    bool best_in_dictionary = in_dictionary;
+        ringbuffer, ringbuffer_mask,
+        literal_cost, literal_cost_mask, average_cost,
+        dist_cache, i + i_diff, max_length, max_distance,
+        &best_len, &best_len_code, &best_dist, &best_score);
     if (match_found) {
-      if (match_found_M1 && best_score_M1 > best_score) {
+      if (kUseDictionary && match_found_M1 && best_score_M1 > best_score) {
         // Two copies after each other. Take the last literal from the
         // last copy, and use it as the first of this one.
-        (commands->rbegin())->copy_length_ -= 1;
-        (commands->rbegin())->copy_length_code_ -= 1;
+        Command prev_cmd = commands[-1];
+        commands[-1] = Command(prev_cmd.insert_len_,
+                               prev_cmd.copy_len_ - 1,
+                               prev_cmd.copy_len_ - 1,
+                               prev_cmd.DistanceCode());
         hasher->Store(ringbuffer + i, i + i_diff);
         --i;
         best_len = best_len_M1;
         best_len_code = best_len_code_M1;
         best_dist = best_dist_M1;
         best_score = best_score_M1;
-        // in_dictionary doesn't need to be correct, but it is the only
-        // reason why M1 matching should be beneficial here. Setting it here
-        // will only disable further M1 matching against this copy.
-        best_in_dictionary = true;
-        in_dictionary = true;
       } else {
         // Found a match. Let's look for something even better ahead.
         int delayed_backward_references_in_row = 0;
-        while (i + 4 < i_end &&
-               delayed_backward_references_in_row < 4) {
-          size_t best_len_2 = 0;
-          size_t best_len_code_2 = 0;
-          size_t best_dist_2 = 0;
-          double best_score_2 = 0;
+        for (;;) {
+          --max_length;
+          int best_len_2 = quality < 4 ? std::min(best_len - 1, max_length) : 0;
+          int best_len_code_2 = 0;
+          int best_dist_2 = 0;
+          double best_score_2 = min_score;
           max_distance = std::min(i + i_diff + 1, max_backward_limit);
           hasher->Store(ringbuffer + i, i + i_diff);
           match_found = hasher->FindLongestMatch(
-              ringbuffer, literal_cost, ringbuffer_mask,
-              i + i_diff + 1, i_end - i - 1, max_distance,
-              &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2,
-              &in_dictionary);
-          double cost_diff_lazy = 0;
-          if (best_len >= 4) {
-            cost_diff_lazy +=
-                literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
-          }
-          {
-            const int tail_length = best_len_2 - best_len + 1;
-            for (int k = 0; k < tail_length; ++k) {
-              cost_diff_lazy -=
-                  literal_cost[(i + best_len + k) & ringbuffer_mask] -
-                  average_cost;
-            }
-          }
-          // If we are not inserting any symbols, inserting one is more
-          // expensive than if we were inserting symbols anyways.
-          if (insert_length < 1) {
-            cost_diff_lazy += 0.97;
-          }
-          // Add bias to slightly avoid lazy matching.
-          cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
-          cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
+              ringbuffer, ringbuffer_mask,
+              literal_cost, literal_cost_mask, average_cost,
+              dist_cache, i + i_diff + 1, max_length, max_distance,
+              &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2);
+          double cost_diff_lazy = 7.0;
+          if (kUseCostModel) {
+            cost_diff_lazy = 0.0;
+            if (best_len >= 4) {
+              cost_diff_lazy +=
+                  literal_cost[(i + 4) & literal_cost_mask] - average_cost;
+            }
+            {
+              const int tail_length = best_len_2 - best_len + 1;
+              for (int k = 0; k < tail_length; ++k) {
+                cost_diff_lazy -=
+                    literal_cost[(i + best_len + k) & literal_cost_mask] -
+                    average_cost;
+              }
+            }
+            // If we are not inserting any symbols, inserting one is more
+            // expensive than if we were inserting symbols anyways.
+            if (insert_length < 1) {
+              cost_diff_lazy += 0.97;
+            }
+            // Add bias to slightly avoid lazy matching.
+            cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
+            cost_diff_lazy += 0.04 * literal_cost[i & literal_cost_mask];
+          }

           if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
             // Ok, let's just write one byte for now and start a match from the
             // next byte.
+            ++i;
             ++insert_length;
-            ++delayed_backward_references_in_row;
             best_len = best_len_2;
             best_len_code = best_len_code_2;
             best_dist = best_dist_2;
             best_score = best_score_2;
-            best_in_dictionary = in_dictionary;
-            i++;
-          } else {
-            break;
+            if (++delayed_backward_references_in_row < 4) {
+              continue;
+            }
           }
+          break;
         }
       }
       apply_random_heuristics =
           i + 2 * best_len + random_heuristics_window_size;
-      Command cmd;
-      cmd.insert_length_ = insert_length;
-      cmd.copy_length_ = best_len;
-      cmd.copy_length_code_ = best_len_code;
-      cmd.copy_distance_ = best_dist;
-      commands->push_back(cmd);
-      insert_length = 0;
-      ++i;
-      if (best_dist <= std::min(i + i_diff, max_backward_limit)) {
-        hasher->set_last_distance(best_dist);
+      max_distance = std::min(i + i_diff, max_backward_limit);
+      int distance_code = best_dist + 16;
+      if (best_dist <= max_distance) {
+        if (best_dist == dist_cache[0]) {
+          distance_code = 1;
+        } else if (best_dist == dist_cache[1]) {
+          distance_code = 2;
+        } else if (best_dist == dist_cache[2]) {
+          distance_code = 3;
+        } else if (best_dist == dist_cache[3]) {
+          distance_code = 4;
+        } else if (quality > 1 && best_dist >= 6) {
+          for (int k = 4; k < kNumDistanceShortCodes; ++k) {
+            int idx = kDistanceCacheIndex[k];
+            int candidate = dist_cache[idx] + kDistanceCacheOffset[k];
+            static const int kLimits[16] = { 0, 0, 0, 0,
+                                             6, 6, 11, 11,
+                                             11, 11, 11, 11,
+                                             12, 12, 12, 12 };
+            if (best_dist == candidate && best_dist >= kLimits[k]) {
+              distance_code = k + 1;
+              break;
+            }
+          }
+        }
+        if (distance_code > 1) {
+          dist_cache[3] = dist_cache[2];
+          dist_cache[2] = dist_cache[1];
+          dist_cache[1] = dist_cache[0];
+          dist_cache[0] = best_dist;
+        }
       }

-      // Copy all copied literals to the hasher, except the last one.
-      // We cannot store the last one yet, otherwise we couldn't find
-      // the possible M1 match.
-      for (int j = 1; j < best_len - 1; ++j) {
-        if (i + 2 < i_end) {
-          hasher->Store(ringbuffer + i, i + i_diff);
-        }
-        ++i;
-      }
-      // Prepare M1 match.
-      if (hasher->HasStaticDictionary() &&
-          best_len >= 4 && i + 20 < i_end && !best_in_dictionary) {
-        max_distance = std::min(i + i_diff, max_backward_limit);
-        match_found_M1 = hasher->FindLongestMatch(
-            ringbuffer, literal_cost, ringbuffer_mask,
-            i + i_diff, i_end - i, max_distance,
-            &best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1,
-            &in_dictionary);
-      } else {
-        match_found_M1 = false;
-        in_dictionary = false;
-      }
-      // This byte is just moved from the previous copy to the current,
-      // that is no gain.
-      best_score_M1 -= literal_cost[i & ringbuffer_mask];
-      // Adjust for losing the opportunity for lazy matching.
-      best_score_M1 -= 3.75;
-
-      // Store the last one of the match.
-      if (i + 2 < i_end) {
-        hasher->Store(ringbuffer + i, i + i_diff);
-      }
-      ++i;
+      Command cmd(insert_length, best_len, best_len_code, distance_code);
+      *commands++ = cmd;
+      insert_length = 0;
+      if (kUseDictionary) {
+        ++i;
+        // Copy all copied literals to the hasher, except the last one.
+        // We cannot store the last one yet, otherwise we couldn't find
+        // the possible M1 match.
+        for (int j = 1; j < best_len - 1; ++j) {
+          if (i + 3 < i_end) {
+            hasher->Store(ringbuffer + i, i + i_diff);
+          }
+          ++i;
+        }
+        // Prepare M1 match.
+        if (hasher->HasStaticDictionary() &&
+            best_len >= 4 && i + 20 < i_end && best_dist <= max_distance) {
+          max_distance = std::min(i + i_diff, max_backward_limit);
+          best_score_M1 = min_score;
+          match_found_M1 = hasher->FindLongestMatch(
+              ringbuffer, ringbuffer_mask,
+              literal_cost, literal_cost_mask, average_cost,
+              dist_cache, i + i_diff, i_end - i, max_distance,
+              &best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1);
+        } else {
+          match_found_M1 = false;
+        }
+        if (kUseCostModel) {
+          // This byte is just moved from the previous copy to the current,
+          // that is no gain.
+          best_score_M1 -= literal_cost[i & literal_cost_mask];
+          // Adjust for losing the opportunity for lazy matching.
+          best_score_M1 -= 3.75;
+        }
+        // Store the last one of the match.
+        if (i + 3 < i_end) {
+          hasher->Store(ringbuffer + i, i + i_diff);
+        }
+        ++i;
+      } else {
+        // Put the hash keys into the table, if there are enough
+        // bytes left.
+        for (int j = 1; j < best_len; ++j) {
+          hasher->Store(&ringbuffer[i + j], i + i_diff + j);
+        }
+        i += best_len;
+      }
     } else {
-      match_found_M1 = false;
       ++insert_length;
@@ -214,7 +271,7 @@ void CreateBackwardReferences(size_t num_bytes,
           insert_length += 4;
         }
       } else {
-        int i_jump = std::min(i + 8, i_end - 2);
+        int i_jump = std::min(i + 8, i_end - 3);
         for (; i < i_jump; i += 2) {
           hasher->Store(ringbuffer + i, i + i_diff);
           insert_length += 2;
@@ -224,44 +281,92 @@ void CreateBackwardReferences(size_t num_bytes,
     }
   }
   insert_length += (i_end - i);
-
-  if (insert_length > 0) {
-    Command cmd;
-    cmd.insert_length_ = insert_length;
-    cmd.copy_length_ = 0;
-    cmd.copy_distance_ = 0;
-    commands->push_back(cmd);
-  }
+  *last_insert_len = insert_length;
+  *num_commands += (commands - orig_commands);
 }

 void CreateBackwardReferences(size_t num_bytes,
                               size_t position,
                               const uint8_t* ringbuffer,
-                              const float* literal_cost,
                               size_t ringbuffer_mask,
+                              const float* literal_cost,
+                              size_t literal_cost_mask,
                               const size_t max_backward_limit,
+                              const double base_min_score,
+                              const int quality,
                               Hashers* hashers,
-                              Hashers::Type hash_type,
-                              std::vector<Command>* commands) {
+                              int hash_type,
+                              int* dist_cache,
+                              int* last_insert_len,
+                              Command* commands,
+                              int* num_commands) {
   switch (hash_type) {
-    case Hashers::HASH_15_8_4:
-      CreateBackwardReferences(
-          num_bytes, position, ringbuffer, literal_cost,
-          ringbuffer_mask, max_backward_limit,
-          hashers->hash_15_8_4.get(),
-          commands);
+    case 1:
+      CreateBackwardReferences<Hashers::H1, false, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h1.get(), dist_cache, last_insert_len,
+          commands, num_commands);
       break;
-    case Hashers::HASH_15_8_2:
-      CreateBackwardReferences(
-          num_bytes, position, ringbuffer, literal_cost,
-          ringbuffer_mask, max_backward_limit,
-          hashers->hash_15_8_2.get(),
-          commands);
+    case 2:
+      CreateBackwardReferences<Hashers::H2, false, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h2.get(), dist_cache, last_insert_len,
+          commands, num_commands);
       break;
+    case 3:
+      CreateBackwardReferences<Hashers::H3, false, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h3.get(), dist_cache, last_insert_len,
+          commands, num_commands);
+      break;
+    case 4:
+      CreateBackwardReferences<Hashers::H4, false, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h4.get(), dist_cache, last_insert_len,
+          commands, num_commands);
+      break;
+    case 5:
+      CreateBackwardReferences<Hashers::H5, false, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h5.get(), dist_cache, last_insert_len,
+          commands, num_commands);
+      break;
+    case 6:
+      CreateBackwardReferences<Hashers::H6, false, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h6.get(), dist_cache, last_insert_len,
+          commands, num_commands);
+      break;
+    case 7:
+      CreateBackwardReferences<Hashers::H7, true, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h7.get(), dist_cache, last_insert_len,
+          commands, num_commands);
+      break;
+    case 8:
+      CreateBackwardReferences<Hashers::H8, true, true>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h8.get(), dist_cache, last_insert_len,
+          commands, num_commands);
+      break;
+    case 9:
+      CreateBackwardReferences<Hashers::H9, true, false>(
+          num_bytes, position, ringbuffer, ringbuffer_mask,
+          literal_cost, literal_cost_mask, max_backward_limit, base_min_score,
+          quality, hashers->hash_h9.get(), dist_cache, last_insert_len,
+          commands, num_commands);
+      break;
     default:
       break;
   }
 }

 }  // namespace brotli
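Annotation (not part of the commit): the dist_cache logic above implements Brotli's distance short codes: codes 1-4 reuse the four most recent distances verbatim, codes 5-16 cover small offsets from cached distances (guarded by kLimits), and anything without a short code is emitted as best_dist + 16. A stripped-down sketch of just the exact-hit part, with a helper name of our choosing:

    // Returns 1..4 if dist equals one of the last four distances (most
    // recent first), or 0 if no exact short code applies.
    static int ExactShortDistanceCode(int dist, const int dist_cache[4]) {
      for (int k = 0; k < 4; ++k) {
        if (dist == dist_cache[k]) {
          return k + 1;
        }
      }
      return 0;
    }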
@@ -28,12 +28,18 @@ namespace brotli {
 void CreateBackwardReferences(size_t num_bytes,
                               size_t position,
                               const uint8_t* ringbuffer,
-                              const float* literal_cost,
                               size_t ringbuffer_mask,
+                              const float* literal_cost,
+                              size_t literal_cost_mask,
                               const size_t max_backward_limit,
+                              const double base_min_score,
+                              const int quality,
                               Hashers* hashers,
-                              Hashers::Type hash_type,
-                              std::vector<Command>* commands);
+                              int hash_type,
+                              int* dist_cache,
+                              int* last_insert_len,
+                              Command* commands,
+                              int* num_commands);

 }  // namespace brotli
@@ -91,7 +91,7 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {

   // create huffman tree of huffman tree
   uint8_t cost[kCodeLengthCodes] = { 0 };
-  CreateHuffmanTree(histogram, kCodeLengthCodes, 7, cost);
+  CreateHuffmanTree(histogram, kCodeLengthCodes, 7, 9, cost);
   // account for rle extra bits
   cost[16] += 2;
   cost[17] += 3;
@@ -123,7 +123,7 @@ double PopulationCost(const Histogram<kSize>& histogram) {
     return 20 + histogram.total_count_;
   }
   uint8_t depth[kSize] = { 0 };
-  CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
+  CreateHuffmanTree(&histogram.data_[0], kSize, 15, 9, depth);
   int bits = 0;
   for (int i = 0; i < kSize; ++i) {
     bits += histogram.data_[i] * depth[i];
@@ -33,7 +33,7 @@ namespace brotli {

 static const int kMaxLiteralHistograms = 100;
 static const int kMaxCommandHistograms = 50;
-static const double kLiteralBlockSwitchCost = 26;
+static const double kLiteralBlockSwitchCost = 28.1;
 static const double kCommandBlockSwitchCost = 13.5;
 static const double kDistanceBlockSwitchCost = 14.6;
 static const int kLiteralStrideLength = 70;
@@ -51,7 +51,7 @@ void CopyLiteralsToByteArray(const std::vector<Command>& cmds,
   // Count how many we have.
   size_t total_length = 0;
   for (int i = 0; i < cmds.size(); ++i) {
-    total_length += cmds[i].insert_length_;
+    total_length += cmds[i].insert_len_;
   }
   if (total_length == 0) {
     return;
@@ -64,9 +64,9 @@ void CopyLiteralsToByteArray(const std::vector<Command>& cmds,
   size_t pos = 0;
   size_t from_pos = 0;
   for (int i = 0; i < cmds.size() && pos < total_length; ++i) {
-    memcpy(&(*literals)[pos], data + from_pos, cmds[i].insert_length_);
-    pos += cmds[i].insert_length_;
-    from_pos += cmds[i].insert_length_ + cmds[i].copy_length_;
+    memcpy(&(*literals)[pos], data + from_pos, cmds[i].insert_len_);
+    pos += cmds[i].insert_len_;
+    from_pos += cmds[i].insert_len_ + cmds[i].copy_len_;
   }
 }

@@ -75,9 +75,9 @@ void CopyCommandsToByteArray(const std::vector<Command>& cmds,
                              std::vector<uint8_t>* distance_prefixes) {
   for (int i = 0; i < cmds.size(); ++i) {
     const Command& cmd = cmds[i];
-    insert_and_copy_codes->push_back(cmd.command_prefix_);
-    if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
-      distance_prefixes->push_back(cmd.distance_prefix_);
+    insert_and_copy_codes->push_back(cmd.cmd_prefix_);
+    if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
+      distance_prefixes->push_back(cmd.dist_prefix_);
     }
   }
 }
@@ -301,7 +301,7 @@ void SplitByteVector(const std::vector<DataType>& data,
                      const double block_switch_cost,
                      BlockSplit* split) {
   if (data.empty()) {
-    split->num_types_ = 0;
+    split->num_types_ = 1;
     return;
   } else if (data.size() < kMinLengthForBlockSplitting) {
     split->num_types_ = 1;
@@ -376,7 +376,7 @@ void SplitBlockByTotalLength(const std::vector<Command>& all_commands,
   std::vector<Command> cur_block;
   for (int i = 0; i < all_commands.size(); ++i) {
     const Command& cmd = all_commands[i];
-    int cmd_length = cmd.insert_length_ + cmd.copy_length_;
+    int cmd_length = cmd.insert_len_ + cmd.copy_len_;
     if (total_length > length_limit) {
       blocks->push_back(cur_block);
       cur_block.clear();
@@ -29,8 +29,7 @@ namespace brotli {

 struct BlockSplit {
   int num_types_;
-  std::vector<uint8_t> types_;
-  std::vector<int> type_codes_;
+  std::vector<int> types_;
   std::vector<int> lengths_;
 };

enc/brotli_bit_stream.cc (new file, 575 lines)
@@ -0,0 +1,575 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Brotli bit stream functions to support the low level format. There are no
// compression algorithms here, just the right ordering of bits to match the
// specs.

#include "./brotli_bit_stream.h"

#include <algorithm>
#include <limits>
#include <vector>

#include "./bit_cost.h"
#include "./entropy_encode.h"
#include "./fast_log.h"
#include "./prefix.h"
#include "./write_bits.h"

namespace brotli {

// returns false if fail
// nibblesbits represents the 2 bits to encode MNIBBLES (0-3)
bool EncodeMlen(size_t length, int* bits, int* numbits, int* nibblesbits) {
  length--;  // MLEN - 1 is encoded
  int lg = length == 0 ? 1 : Log2Floor(length) + 1;
  if (lg > 28) return false;
  int mnibbles = (lg < 16 ? 16 : (lg + 3)) / 4;
  *nibblesbits = mnibbles - 4;
  *numbits = mnibbles * 4;
  *bits = length;
  return true;
}
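// Worked example (annotation, not part of the original file): for MLEN = 1000,
// length - 1 = 999 needs Log2Floor(999) + 1 = 10 bits, so mnibbles = 16 / 4
// = 4 and the function sets *nibblesbits = 0, *numbits = 16, *bits = 999:
// MNIBBLES is stored as 0 (meaning 4 nibbles) followed by 999 in 16 bits.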
void StoreVarLenUint8(int n, int* storage_ix, uint8_t* storage) {
  if (n == 0) {
    WriteBits(1, 0, storage_ix, storage);
  } else {
    WriteBits(1, 1, storage_ix, storage);
    int nbits = Log2Floor(n);
    WriteBits(3, nbits, storage_ix, storage);
    WriteBits(nbits, n - (1 << nbits), storage_ix, storage);
  }
}
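// Worked example (annotation, not part of the original file): n = 0 is the
// single bit 0; n = 5 is written as bit 1, then nbits = Log2Floor(5) = 2 in
// three bits, then 5 - (1 << 2) = 1 in two bits.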
bool StoreCompressedMetaBlockHeader(bool final_block,
                                    int length,
                                    int* storage_ix,
                                    uint8_t* storage) {
  // Write ISLAST bit.
  WriteBits(1, final_block, storage_ix, storage);
  // Write ISEMPTY bit.
  if (final_block) {
    WriteBits(1, length == 0, storage_ix, storage);
    if (length == 0) {
      return true;
    }
  }

  if (length == 0) {
    // Only the last meta-block can be empty.
    return false;
  }

  int lenbits;
  int nlenbits;
  int nibblesbits;
  if (!EncodeMlen(length, &lenbits, &nlenbits, &nibblesbits)) {
    return false;
  }

  WriteBits(2, nibblesbits, storage_ix, storage);
  WriteBits(nlenbits, lenbits, storage_ix, storage);

  if (!final_block) {
    // Write ISUNCOMPRESSED bit.
    WriteBits(1, 0, storage_ix, storage);
  }
  return true;
}

bool StoreUncompressedMetaBlockHeader(int length,
                                      int* storage_ix,
                                      uint8_t* storage) {
  // Write ISLAST bit. Uncompressed block cannot be the last one, so set to 0.
  WriteBits(1, 0, storage_ix, storage);
  int lenbits;
  int nlenbits;
  int nibblesbits;
  if (!EncodeMlen(length, &lenbits, &nlenbits, &nibblesbits)) {
    return false;
  }
  WriteBits(2, nibblesbits, storage_ix, storage);
  WriteBits(nlenbits, lenbits, storage_ix, storage);
  // Write ISUNCOMPRESSED bit.
  WriteBits(1, 1, storage_ix, storage);
  return true;
}
void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    const int num_codes,
    const uint8_t *code_length_bitdepth,
    int *storage_ix,
    uint8_t *storage) {
  static const uint8_t kStorageOrder[kCodeLengthCodes] = {
    1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
  };
  // The bit lengths of the Huffman code over the code length alphabet
  // are compressed with the following static Huffman code:
  //   Symbol   Code
  //   ------   ----
  //   0          00
  //   1        1110
  //   2         110
  //   3          01
  //   4          10
  //   5        1111
  static const uint8_t kHuffmanBitLengthHuffmanCodeSymbols[6] = {
     0, 7, 3, 2, 1, 15
  };
  static const uint8_t kHuffmanBitLengthHuffmanCodeBitLengths[6] = {
    2, 4, 3, 2, 2, 4
  };

  // Throw away trailing zeros:
  int codes_to_store = kCodeLengthCodes;
  if (num_codes > 1) {
    for (; codes_to_store > 0; --codes_to_store) {
      if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
        break;
      }
    }
  }
  int skip_some = 0;  // skips none.
  if (code_length_bitdepth[kStorageOrder[0]] == 0 &&
      code_length_bitdepth[kStorageOrder[1]] == 0) {
    skip_some = 2;  // skips two.
    if (code_length_bitdepth[kStorageOrder[2]] == 0) {
      skip_some = 3;  // skips three.
    }
  }
  WriteBits(2, skip_some, storage_ix, storage);
  for (int i = skip_some; i < codes_to_store; ++i) {
    uint8_t l = code_length_bitdepth[kStorageOrder[i]];
    WriteBits(kHuffmanBitLengthHuffmanCodeBitLengths[l],
              kHuffmanBitLengthHuffmanCodeSymbols[l], storage_ix, storage);
  }
}

void StoreHuffmanTreeToBitMask(
    const std::vector<uint8_t> &huffman_tree,
    const std::vector<uint8_t> &huffman_tree_extra_bits,
    const uint8_t *code_length_bitdepth,
    const std::vector<uint16_t> &code_length_bitdepth_symbols,
    int * __restrict storage_ix,
    uint8_t * __restrict storage) {
  for (int i = 0; i < huffman_tree.size(); ++i) {
    int ix = huffman_tree[i];
    WriteBits(code_length_bitdepth[ix], code_length_bitdepth_symbols[ix],
              storage_ix, storage);
    // Extra bits
    switch (ix) {
      case 16:
        WriteBits(2, huffman_tree_extra_bits[i], storage_ix, storage);
        break;
      case 17:
        WriteBits(3, huffman_tree_extra_bits[i], storage_ix, storage);
        break;
    }
  }
}
void StoreSimpleHuffmanTree(const uint8_t* depths,
                            int symbols[4],
                            int num_symbols,
                            int max_bits,
                            int *storage_ix, uint8_t *storage) {
  // value of 1 indicates a simple Huffman code
  WriteBits(2, 1, storage_ix, storage);
  WriteBits(2, num_symbols - 1, storage_ix, storage);  // NSYM - 1

  // Sort
  for (int i = 0; i < num_symbols; i++) {
    for (int j = i + 1; j < num_symbols; j++) {
      if (depths[symbols[j]] < depths[symbols[i]]) {
        std::swap(symbols[j], symbols[i]);
      }
    }
  }

  if (num_symbols == 2) {
    WriteBits(max_bits, symbols[0], storage_ix, storage);
    WriteBits(max_bits, symbols[1], storage_ix, storage);
  } else if (num_symbols == 3) {
    WriteBits(max_bits, symbols[0], storage_ix, storage);
    WriteBits(max_bits, symbols[1], storage_ix, storage);
    WriteBits(max_bits, symbols[2], storage_ix, storage);
  } else {
    WriteBits(max_bits, symbols[0], storage_ix, storage);
    WriteBits(max_bits, symbols[1], storage_ix, storage);
    WriteBits(max_bits, symbols[2], storage_ix, storage);
    WriteBits(max_bits, symbols[3], storage_ix, storage);
    // tree-select
    WriteBits(1, depths[symbols[0]] == 1 ? 1 : 0, storage_ix, storage);
  }
}

// num = alphabet size
// depths = symbol depths
void StoreHuffmanTree(const uint8_t* depths, size_t num,
                      int quality,
                      int *storage_ix, uint8_t *storage) {
  // Write the Huffman tree into the brotli-representation.
  std::vector<uint8_t> huffman_tree;
  std::vector<uint8_t> huffman_tree_extra_bits;
  // TODO(user): Consider allocating these from stack.
  huffman_tree.reserve(256);
  huffman_tree_extra_bits.reserve(256);
  WriteHuffmanTree(depths, num, &huffman_tree, &huffman_tree_extra_bits);

  // Calculate the statistics of the Huffman tree in brotli-representation.
  int huffman_tree_histogram[kCodeLengthCodes] = { 0 };
  for (int i = 0; i < huffman_tree.size(); ++i) {
    ++huffman_tree_histogram[huffman_tree[i]];
  }

  int num_codes = 0;
  int code = 0;
  for (int i = 0; i < kCodeLengthCodes; ++i) {
    if (huffman_tree_histogram[i]) {
      if (num_codes == 0) {
        code = i;
        num_codes = 1;
      } else if (num_codes == 1) {
        num_codes = 2;
        break;
      }
    }
  }

  // Calculate another Huffman tree to use for compressing both the
  // earlier Huffman tree with.
  // TODO(user): Consider allocating these from stack.
  uint8_t code_length_bitdepth[kCodeLengthCodes] = { 0 };
  std::vector<uint16_t> code_length_bitdepth_symbols(kCodeLengthCodes);
  CreateHuffmanTree(&huffman_tree_histogram[0], kCodeLengthCodes,
                    5, quality, &code_length_bitdepth[0]);
  ConvertBitDepthsToSymbols(code_length_bitdepth, kCodeLengthCodes,
                            code_length_bitdepth_symbols.data());

  // Now, we have all the data, let's start storing it
  StoreHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth,
                                         storage_ix, storage);

  if (num_codes == 1) {
    code_length_bitdepth[code] = 0;
  }

  // Store the real huffman tree now.
  StoreHuffmanTreeToBitMask(huffman_tree,
                            huffman_tree_extra_bits,
                            &code_length_bitdepth[0],
                            code_length_bitdepth_symbols,
                            storage_ix, storage);
}
void BuildAndStoreHuffmanTree(const int *histogram,
                              const int length,
                              const int quality,
                              uint8_t* depth,
                              uint16_t* bits,
                              int* storage_ix,
                              uint8_t* storage) {
  int count = 0;
  int s4[4] = { 0 };
  for (size_t i = 0; i < length; i++) {
    if (histogram[i]) {
      if (count < 4) {
        s4[count] = i;
      } else if (quality < 3 && count > 4) {
        break;
      }
      count++;
    }
  }

  int max_bits_counter = length - 1;
  int max_bits = 0;
  while (max_bits_counter) {
    max_bits_counter >>= 1;
    ++max_bits;
  }

  if (count <= 1) {
    WriteBits(4, 1, storage_ix, storage);
    WriteBits(max_bits, s4[0], storage_ix, storage);
    return;
  }

  if (length >= 50 && count >= 16 && quality >= 3) {
    std::vector<int> counts(length);
    memcpy(&counts[0], histogram, sizeof(counts[0]) * length);
    OptimizeHuffmanCountsForRle(length, &counts[0]);
    CreateHuffmanTree(&counts[0], length, 15, quality, depth);
  } else {
    CreateHuffmanTree(histogram, length, 15, quality, depth);
  }
  ConvertBitDepthsToSymbols(depth, length, bits);

  if (count <= 4) {
    StoreSimpleHuffmanTree(depth, s4, count, max_bits, storage_ix, storage);
  } else {
    StoreHuffmanTree(depth, length, quality, storage_ix, storage);
  }
}

int IndexOf(const std::vector<int>& v, int value) {
  for (int i = 0; i < v.size(); ++i) {
    if (v[i] == value) return i;
  }
  return -1;
}

void MoveToFront(std::vector<int>* v, int index) {
  int value = (*v)[index];
  for (int i = index; i > 0; --i) {
    (*v)[i] = (*v)[i - 1];
  }
  (*v)[0] = value;
}

std::vector<int> MoveToFrontTransform(const std::vector<int>& v) {
  if (v.empty()) return v;
  std::vector<int> mtf(*max_element(v.begin(), v.end()) + 1);
  for (int i = 0; i < mtf.size(); ++i) mtf[i] = i;
  std::vector<int> result(v.size());
  for (int i = 0; i < v.size(); ++i) {
    int index = IndexOf(mtf, v[i]);
    result[i] = index;
    MoveToFront(&mtf, index);
  }
  return result;
}
// Finds runs of zeros in v_in and replaces them with a prefix code of the run
// length plus extra bits in *v_out and *extra_bits. Non-zero values in v_in
// are shifted by *max_run_length_prefix. Will not create prefix codes bigger
// than the initial value of *max_run_length_prefix. The prefix code of run
// length L is simply Log2Floor(L) and the number of extra bits is the same
// as the prefix code.
void RunLengthCodeZeros(const std::vector<int>& v_in,
                        int* max_run_length_prefix,
                        std::vector<int>* v_out,
                        std::vector<int>* extra_bits) {
  int max_reps = 0;
  for (int i = 0; i < v_in.size();) {
    for (; i < v_in.size() && v_in[i] != 0; ++i) ;
    int reps = 0;
    for (; i < v_in.size() && v_in[i] == 0; ++i) {
      ++reps;
    }
    max_reps = std::max(reps, max_reps);
  }
  int max_prefix = max_reps > 0 ? Log2Floor(max_reps) : 0;
  *max_run_length_prefix = std::min(max_prefix, *max_run_length_prefix);
  for (int i = 0; i < v_in.size();) {
    if (v_in[i] != 0) {
      v_out->push_back(v_in[i] + *max_run_length_prefix);
      extra_bits->push_back(0);
      ++i;
    } else {
      int reps = 1;
      for (uint32_t k = i + 1; k < v_in.size() && v_in[k] == 0; ++k) {
        ++reps;
      }
      i += reps;
      while (reps) {
        if (reps < (2 << *max_run_length_prefix)) {
          int run_length_prefix = Log2Floor(reps);
          v_out->push_back(run_length_prefix);
          extra_bits->push_back(reps - (1 << run_length_prefix));
          break;
        } else {
          v_out->push_back(*max_run_length_prefix);
          extra_bits->push_back((1 << *max_run_length_prefix) - 1);
          reps -= (2 << *max_run_length_prefix) - 1;
        }
      }
    }
  }
}
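// Worked example (annotation, not part of the original file): on
// v_in = {0, 0, 0, 0, 0, 3}, max_reps = 5, so *max_run_length_prefix is
// capped at Log2Floor(5) = 2. With prefix 2, the run of five zeros becomes
// symbol 2 with extra bits 5 - (1 << 2) = 1, and the value 3 is shifted to
// 3 + 2 = 5, giving *v_out = {2, 5} and *extra_bits = {1, 0}.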
// Returns a maximum zero-run-length-prefix value such that run-length coding
// zeros in v with this maximum prefix value and then encoding the resulting
// histogram and entropy-coding v produces the least amount of bits.
int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
  int min_cost = std::numeric_limits<int>::max();
  int best_max_prefix = 0;
  for (int max_prefix = 0; max_prefix <= 16; ++max_prefix) {
    std::vector<int> rle_symbols;
    std::vector<int> extra_bits;
    int max_run_length_prefix = max_prefix;
    RunLengthCodeZeros(v, &max_run_length_prefix, &rle_symbols, &extra_bits);
    if (max_run_length_prefix < max_prefix) break;
    HistogramContextMap histogram;
    for (int i = 0; i < rle_symbols.size(); ++i) {
      histogram.Add(rle_symbols[i]);
    }
    int bit_cost = PopulationCost(histogram);
    if (max_prefix > 0) {
      bit_cost += 4;
    }
    for (int i = 1; i <= max_prefix; ++i) {
      bit_cost += histogram.data_[i] * i;  // extra bits
    }
    if (bit_cost < min_cost) {
      min_cost = bit_cost;
      best_max_prefix = max_prefix;
    }
  }
  return best_max_prefix;
}
void EncodeContextMap(const std::vector<int>& context_map,
                      int num_clusters,
                      int* storage_ix, uint8_t* storage) {
  StoreVarLenUint8(num_clusters - 1, storage_ix, storage);

  if (num_clusters == 1) {
    return;
  }

  std::vector<int> transformed_symbols = MoveToFrontTransform(context_map);
  std::vector<int> rle_symbols;
  std::vector<int> extra_bits;
  int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols);
  RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
                     &rle_symbols, &extra_bits);
  HistogramContextMap symbol_histogram;
  for (int i = 0; i < rle_symbols.size(); ++i) {
    symbol_histogram.Add(rle_symbols[i]);
  }
  bool use_rle = max_run_length_prefix > 0;
  WriteBits(1, use_rle, storage_ix, storage);
  if (use_rle) {
    WriteBits(4, max_run_length_prefix - 1, storage_ix, storage);
  }
  EntropyCodeContextMap symbol_code;
  memset(symbol_code.depth_, 0, sizeof(symbol_code.depth_));
  memset(symbol_code.bits_, 0, sizeof(symbol_code.bits_));
  BuildAndStoreHuffmanTree(symbol_histogram.data_,
                           num_clusters + max_run_length_prefix,
                           9,  // quality
                           symbol_code.depth_, symbol_code.bits_,
                           storage_ix, storage);
  for (int i = 0; i < rle_symbols.size(); ++i) {
    WriteBits(symbol_code.depth_[rle_symbols[i]],
              symbol_code.bits_[rle_symbols[i]],
              storage_ix, storage);
    if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) {
      WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage);
    }
  }
  WriteBits(1, 1, storage_ix, storage);  // use move-to-front
}
void StoreBlockSwitch(const BlockSplitCode& code,
                      const int block_ix,
                      int* storage_ix,
                      uint8_t* storage) {
  if (block_ix > 0) {
    int typecode = code.type_code[block_ix];
    WriteBits(code.type_depths[typecode], code.type_bits[typecode],
              storage_ix, storage);
  }
  int lencode = code.length_prefix[block_ix];
  WriteBits(code.length_depths[lencode], code.length_bits[lencode],
            storage_ix, storage);
  WriteBits(code.length_nextra[block_ix], code.length_extra[block_ix],
            storage_ix, storage);
}

void BuildAndStoreBlockSplitCode(const std::vector<int>& types,
                                 const std::vector<int>& lengths,
                                 const int num_types,
                                 const int quality,
                                 BlockSplitCode* code,
                                 int* storage_ix,
                                 uint8_t* storage) {
  const int num_blocks = types.size();
  std::vector<int> type_histo(num_types + 2);
  std::vector<int> length_histo(26);
  int last_type = 1;
  int second_last_type = 0;
  code->type_code.resize(num_blocks);
  code->length_prefix.resize(num_blocks);
  code->length_nextra.resize(num_blocks);
  code->length_extra.resize(num_blocks);
  code->type_depths.resize(num_types + 2);
  code->type_bits.resize(num_types + 2);
  code->length_depths.resize(26);
  code->length_bits.resize(26);
  for (int i = 0; i < num_blocks; ++i) {
    int type = types[i];
    int type_code = (type == last_type + 1 ? 1 :
                     type == second_last_type ? 0 :
                     type + 2);
    second_last_type = last_type;
    last_type = type;
    code->type_code[i] = type_code;
    if (i > 0) ++type_histo[type_code];
    GetBlockLengthPrefixCode(lengths[i],
                             &code->length_prefix[i],
                             &code->length_nextra[i],
                             &code->length_extra[i]);
    ++length_histo[code->length_prefix[i]];
  }
  StoreVarLenUint8(num_types - 1, storage_ix, storage);
  if (num_types > 1) {
    BuildAndStoreHuffmanTree(&type_histo[0], num_types + 2, quality,
                             &code->type_depths[0], &code->type_bits[0],
                             storage_ix, storage);
    BuildAndStoreHuffmanTree(&length_histo[0], 26, quality,
                             &code->length_depths[0], &code->length_bits[0],
                             storage_ix, storage);
    StoreBlockSwitch(*code, 0, storage_ix, storage);
  }
}

void StoreTrivialContextMap(int num_types,
                            int context_bits,
                            int* storage_ix,
                            uint8_t* storage) {
  StoreVarLenUint8(num_types - 1, storage_ix, storage);
  if (num_types > 1) {
    int repeat_code = context_bits - 1;
    int repeat_bits = (1 << repeat_code) - 1;
    int alphabet_size = num_types + repeat_code;
    std::vector<int> histogram(alphabet_size);
    std::vector<uint8_t> depths(alphabet_size);
    std::vector<uint16_t> bits(alphabet_size);
    // Write RLEMAX.
    WriteBits(1, 1, storage_ix, storage);
    WriteBits(4, repeat_code - 1, storage_ix, storage);
    histogram[repeat_code] = num_types;
    histogram[0] = 1;
    for (int i = context_bits; i < alphabet_size; ++i) {
      histogram[i] = 1;
    }
    BuildAndStoreHuffmanTree(&histogram[0], alphabet_size, 1,
                             &depths[0], &bits[0],
                             storage_ix, storage);
    for (int i = 0; i < num_types; ++i) {
      int code = (i == 0 ? 0 : i + context_bits - 1);
      WriteBits(depths[code], bits[code], storage_ix, storage);
      WriteBits(depths[repeat_code], bits[repeat_code], storage_ix, storage);
      WriteBits(repeat_code, repeat_bits, storage_ix, storage);
    }
    // Write IMTF (inverse-move-to-front) bit.
    WriteBits(1, 1, storage_ix, storage);
  }
}

}  // namespace brotli
enc/brotli_bit_stream.h (new file, 109 lines)
@@ -0,0 +1,109 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Functions to convert brotli-related data structures into the
// brotli bit stream. The functions here operate under
// assumption that there is enough space in the storage, i.e., there are
// no out-of-range checks anywhere.
//
// These functions do bit addressing into a byte array. The byte array
// is called "storage" and the index to the bit is called storage_ix
// in function arguments.

#ifndef BROTLI_ENC_BROTLI_BIT_STREAM_H_
#define BROTLI_ENC_BROTLI_BIT_STREAM_H_

#include <stddef.h>
#include <stdint.h>
#include <vector>

namespace brotli {

// All Store functions here will use a storage_ix, which is always the bit
// position for the current storage.

// Stores a number between 0 and 255.
void StoreVarLenUint8(int n, int* storage_ix, uint8_t* storage);

// Stores the compressed meta-block header.
bool StoreCompressedMetaBlockHeader(bool final_block,
                                    int length,
                                    int* storage_ix,
                                    uint8_t* storage);

// Stores the uncompressed meta-block header.
bool StoreUncompressedMetaBlockHeader(int length,
                                      int* storage_ix,
                                      uint8_t* storage);

// Stores a context map where the histogram type is always the block type.
void StoreTrivialContextMap(int num_types,
                            int context_bits,
                            int* storage_ix,
                            uint8_t* storage);

void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    const int num_codes,
    const uint8_t *code_length_bitdepth,
    int *storage_ix,
    uint8_t *storage);

// Builds a Huffman tree from histogram[0:length] into depth[0:length] and
// bits[0:length] and stores the encoded tree to the bit stream.
void BuildAndStoreHuffmanTree(const int *histogram,
                              const int length,
                              const int quality,
                              uint8_t* depth,
                              uint16_t* bits,
                              int* storage_ix,
                              uint8_t* storage);

// Encodes the given context map to the bit stream. The number of different
// histogram ids is given by num_clusters.
void EncodeContextMap(const std::vector<int>& context_map,
                      int num_clusters,
                      int* storage_ix, uint8_t* storage);

// Data structure that stores everything that is needed to encode each
// block switch command.
struct BlockSplitCode {
  std::vector<int> type_code;
  std::vector<int> length_prefix;
  std::vector<int> length_nextra;
  std::vector<int> length_extra;
  std::vector<uint8_t> type_depths;
  std::vector<uint16_t> type_bits;
  std::vector<uint8_t> length_depths;
  std::vector<uint16_t> length_bits;
};

// Builds a BlockSplitCode data structure from the block split given by the
// vector of block types and block lengths and stores it to the bit stream.
void BuildAndStoreBlockSplitCode(const std::vector<int>& types,
                                 const std::vector<int>& lengths,
                                 const int num_types,
                                 const int quality,
                                 BlockSplitCode* code,
                                 int* storage_ix,
                                 uint8_t* storage);

// Stores the block switch command with index block_ix to the bit stream.
void StoreBlockSwitch(const BlockSplitCode& code,
                      const int block_ix,
                      int* storage_ix,
                      uint8_t* storage);

}  // namespace brotli

#endif  // BROTLI_ENC_BROTLI_BIT_STREAM_H_
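Annotation (not part of the commit): a hedged sketch of how two of these functions compose when writing a final meta-block header. Buffer sizing, the intervening header fields, and the real encoder state are all elided, and the function name is ours; the 6 literal context bits match the format's literal context size.

    #include <stdint.h>
    #include "enc/brotli_bit_stream.h"

    void SketchWriteHeader(uint8_t* storage) {  // storage: caller-zeroed buffer
      int storage_ix = 0;  // bit position into storage
      // Final, non-empty meta-block of 1000 bytes (MNIBBLES/MLEN via EncodeMlen).
      brotli::StoreCompressedMetaBlockHeader(true, 1000, &storage_ix, storage);
      // One literal block type with the trivial context map.
      brotli::StoreTrivialContextMap(1, 6, &storage_ix, storage);
    }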
@@ -20,6 +20,7 @@
 #include <math.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <algorithm>
 #include <complex>
 #include <map>
 #include <set>
@@ -42,7 +43,7 @@ struct HistogramPair {
 };

 struct HistogramPairComparator {
-  bool operator()(const HistogramPair& p1, const HistogramPair& p2) {
+  bool operator()(const HistogramPair& p1, const HistogramPair& p2) const {
     if (p1.cost_diff != p2.cost_diff) {
       return p1.cost_diff > p2.cost_diff;
     }
@@ -59,8 +60,8 @@ inline double ClusterCostDiff(int size_a, int size_b) {

 // Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
 // it is below a threshold, stores the pair (idx1, idx2) in the *pairs heap.
-template<int kSize>
-void CompareAndPushToHeap(const Histogram<kSize>* out,
+template<typename HistogramType>
+void CompareAndPushToHeap(const HistogramType* out,
                           const int* cluster_size,
                           int idx1, int idx2,
                           std::vector<HistogramPair>* pairs) {
@@ -90,7 +91,7 @@ void CompareAndPushToHeap(const Histogram<kSize>* out,
   } else {
     double threshold = pairs->empty() ? 1e99 :
         std::max(0.0, (*pairs)[0].cost_diff);
-    Histogram<kSize> combo = out[idx1];
+    HistogramType combo = out[idx1];
     combo.AddHistogram(out[idx2]);
     double cost_combo = PopulationCost(combo);
     if (cost_combo < threshold - p.cost_diff) {
@@ -105,8 +106,8 @@ void CompareAndPushToHeap(const Histogram<kSize>* out,
   }
 }

-template<int kSize>
-void HistogramCombine(Histogram<kSize>* out,
+template<typename HistogramType>
+void HistogramCombine(HistogramType* out,
                       int* cluster_size,
                       int* symbols,
                       int symbols_size,
@@ -178,22 +179,22 @@ void HistogramCombine(Histogram<kSize>* out,
 // Histogram refinement

 // What is the bit cost of moving histogram from cur_symbol to candidate.
-template<int kSize>
-double HistogramBitCostDistance(const Histogram<kSize>& histogram,
-                                const Histogram<kSize>& candidate) {
+template<typename HistogramType>
+double HistogramBitCostDistance(const HistogramType& histogram,
+                                const HistogramType& candidate) {
   if (histogram.total_count_ == 0) {
     return 0.0;
   }
-  Histogram<kSize> tmp = histogram;
+  HistogramType tmp = histogram;
   tmp.AddHistogram(candidate);
   return PopulationCost(tmp) - candidate.bit_cost_;
 }

 // Find the best 'out' histogram for each of the 'in' histograms.
 // Note: we assume that out[]->bit_cost_ is already up-to-date.
-template<int kSize>
-void HistogramRemap(const Histogram<kSize>* in, int in_size,
-                    Histogram<kSize>* out, int* symbols) {
+template<typename HistogramType>
+void HistogramRemap(const HistogramType* in, int in_size,
+                    HistogramType* out, int* symbols) {
   std::set<int> all_symbols;
   for (int i = 0; i < in_size; ++i) {
     all_symbols.insert(symbols[i]);
@@ -224,10 +225,10 @@ void HistogramRemap(const Histogram<kSize>* in, int in_size,

 // Reorder histograms in *out so that the new symbols in *symbols come in
 // increasing order.
-template<int kSize>
-void HistogramReindex(std::vector<Histogram<kSize> >* out,
+template<typename HistogramType>
+void HistogramReindex(std::vector<HistogramType>* out,
                       std::vector<int>* symbols) {
-  std::vector<Histogram<kSize> > tmp(*out);
+  std::vector<HistogramType> tmp(*out);
   std::map<int, int> new_index;
   int next_index = 0;
   for (int i = 0; i < symbols->size(); ++i) {
@@ -246,11 +247,11 @@ void HistogramReindex(std::vector<Histogram<kSize> >* out,
 // Clusters similar histograms in 'in' together, the selected histograms are
 // placed in 'out', and for each index in 'in', *histogram_symbols will
 // indicate which of the 'out' histograms is the best approximation.
-template<int kSize>
-void ClusterHistograms(const std::vector<Histogram<kSize> >& in,
+template<typename HistogramType>
+void ClusterHistograms(const std::vector<HistogramType>& in,
                        int num_contexts, int num_blocks,
                        int max_histograms,
-                       std::vector<Histogram<kSize> >* out,
+                       std::vector<HistogramType>* out,
                        std::vector<int>* histogram_symbols) {
   const int in_size = num_contexts * num_blocks;
   std::vector<int> cluster_size(in_size, 1);
142
enc/command.h
142
enc/command.h
@ -18,31 +18,131 @@
#define BROTLI_ENC_COMMAND_H_

#include <stdint.h>
#include "./fast_log.h"

namespace brotli {

// Command holds a sequence of literals and a backward reference copy.
class Command {
 public:
  // distance_code_ is initialized to 17 because it refers to the distance
  // code of a backward distance of 1, this way the last insert-only command
  // won't use the last-distance short code, and accordingly distance_prefix_ is
  // set to 16
  Command() : insert_length_(0), copy_length_(0), copy_length_code_(0),
              copy_distance_(0), distance_code_(17),
              distance_prefix_(16), command_prefix_(0),
              distance_extra_bits_(0), distance_extra_bits_value_(0) {}
static inline void GetDistCode(int distance_code,
                               uint16_t* code, uint32_t* extra) {
  distance_code -= 1;
  if (distance_code < 16) {
    *code = distance_code;
    *extra = 0;
  } else {
    distance_code -= 12;
    int numextra = Log2FloorNonZero(distance_code) - 1;
    int prefix = distance_code >> numextra;
    *code = 12 + 2 * numextra + prefix;
    *extra = (numextra << 24) | (distance_code - (prefix << numextra));
  }
}

  uint32_t insert_length_;
  uint32_t copy_length_;
  uint32_t copy_length_code_;
  uint32_t copy_distance_;
  // Values <= 16 are short codes, values > 16 are distances shifted by 16.
  uint32_t distance_code_;
  uint16_t distance_prefix_;
  uint16_t command_prefix_;
  int distance_extra_bits_;
  uint32_t distance_extra_bits_value_;
static int insbase[] = { 0, 1, 2, 3, 4, 5, 6, 8, 10, 14, 18, 26, 34, 50, 66,
                         98, 130, 194, 322, 578, 1090, 2114, 6210, 22594 };
static int insextra[] = { 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
                          5, 6, 7, 8, 9, 10, 12, 14, 24 };
static int copybase[] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 18, 22, 30, 38,
                          54, 70, 102, 134, 198, 326, 582, 1094, 2118 };
static int copyextra[] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4,
                           4, 5, 5, 6, 7, 8, 9, 10, 24 };

static inline int GetInsertLengthCode(int insertlen) {
  if (insertlen < 6) {
    return insertlen;
  } else if (insertlen < 130) {
    insertlen -= 2;
    int nbits = Log2FloorNonZero(insertlen) - 1;
    return (nbits << 1) + (insertlen >> nbits) + 2;
  } else if (insertlen < 2114) {
    return Log2FloorNonZero(insertlen - 66) + 10;
  } else if (insertlen < 6210) {
    return 21;
  } else if (insertlen < 22594) {
    return 22;
  } else {
    return 23;
  }
}

static inline int GetCopyLengthCode(int copylen) {
  if (copylen < 10) {
    return copylen - 2;
  } else if (copylen < 134) {
    copylen -= 6;
    int nbits = Log2FloorNonZero(copylen) - 1;
    return (nbits << 1) + (copylen >> nbits) + 4;
  } else if (copylen < 2118) {
    return Log2FloorNonZero(copylen - 70) + 12;
  } else {
    return 23;
  }
}

static inline int CombineLengthCodes(
    int inscode, int copycode, int distancecode) {
  int bits64 = (copycode & 0x7u) | ((inscode & 0x7u) << 3);
  if (distancecode == 0 && inscode < 8 && copycode < 16) {
    return (copycode < 8) ? bits64 : (bits64 | 64);
  } else {
    // "To convert an insert-and-copy length code to an insert length code and
    // a copy length code, the following table can be used"
    static const int cells[9] = { 2, 3, 6, 4, 5, 8, 7, 9, 10 };
    return (cells[(copycode >> 3) + 3 * (inscode >> 3)] << 6) | bits64;
  }
}

static inline void GetLengthCode(int insertlen, int copylen, int distancecode,
                                 uint16_t* code, uint64_t* extra) {
  int inscode = GetInsertLengthCode(insertlen);
  int copycode = GetCopyLengthCode(copylen);
  uint64_t insnumextra = insextra[inscode];
  uint64_t numextra = insnumextra + copyextra[copycode];
  uint64_t insextraval = insertlen - insbase[inscode];
  uint64_t copyextraval = copylen - copybase[copycode];
  *code = CombineLengthCodes(inscode, copycode, distancecode);
  *extra = (numextra << 48) | (copyextraval << insnumextra) | insextraval;
}

struct Command {
  Command() {}

  Command(int insertlen, int copylen, int copylen_code, int distance_code)
      : insert_len_(insertlen), copy_len_(copylen) {
    GetDistCode(distance_code, &dist_prefix_, &dist_extra_);
    GetLengthCode(insertlen, copylen_code, dist_prefix_,
                  &cmd_prefix_, &cmd_extra_);
  }

  Command(int insertlen)
      : insert_len_(insertlen), copy_len_(0), dist_prefix_(16), dist_extra_(0) {
    GetLengthCode(insertlen, 4, dist_prefix_, &cmd_prefix_, &cmd_extra_);
  }

  int DistanceCode() const {
    if (dist_prefix_ < 16) {
      return dist_prefix_ + 1;
    }
    int nbits = dist_extra_ >> 24;
    int extra = dist_extra_ & 0xffffff;
    int prefix = dist_prefix_ - 12 - 2 * nbits;
    return (prefix << nbits) + extra + 13;
  }

  int DistanceContext() const {
    int r = cmd_prefix_ >> 6;
    int c = cmd_prefix_ & 7;
    if ((r == 0 || r == 2 || r == 4 || r == 7) && (c <= 2)) {
      return c;
    }
    return 3;
  }

  int insert_len_;
  int copy_len_;
  uint16_t cmd_prefix_;
  uint16_t dist_prefix_;
  uint64_t cmd_extra_;
  uint32_t dist_extra_;
};

} // namespace brotli

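Note: the new GetDistCode/DistanceCode pair above round-trips a distance code through the (prefix, packed-extra) representation, with the bit count stashed in the top byte of the extra field. A minimal standalone sketch of that round trip, mirroring the two functions from the diff (the main() harness is illustrative only):

#include <cassert>
#include <cstdint>

static inline int Log2FloorNonZero(uint32_t n) {
  int result = 0;
  while (n >>= 1) ++result;
  return result;
}

// Mirrors GetDistCode: split a distance code into a prefix code and a
// packed (nbits << 24 | value) extra field.
static void PackDistCode(int distance_code, uint16_t* code, uint32_t* extra) {
  distance_code -= 1;
  if (distance_code < 16) {
    *code = distance_code;
    *extra = 0;
  } else {
    distance_code -= 12;
    int numextra = Log2FloorNonZero(distance_code) - 1;
    int prefix = distance_code >> numextra;
    *code = 12 + 2 * numextra + prefix;
    *extra = (numextra << 24) | (distance_code - (prefix << numextra));
  }
}

// Mirrors Command::DistanceCode: invert the packing above.
static int UnpackDistCode(uint16_t prefix_code, uint32_t extra) {
  if (prefix_code < 16) return prefix_code + 1;
  int nbits = extra >> 24;
  int rest = extra & 0xffffff;
  int prefix = prefix_code - 12 - 2 * nbits;
  return (prefix << nbits) + rest + 13;
}

int main() {
  for (int dc = 1; dc <= 100000; ++dc) {
    uint16_t code; uint32_t extra;
    PackDistCode(dc, &code, &extra);
    assert(UnpackDistCode(code, extra) == dc);
  }
  return 0;
}
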
@ -17,6 +17,8 @@
#ifndef BROTLI_ENC_DICTIONARY_H_
#define BROTLI_ENC_DICTIONARY_H_

#include <stdint.h>

static const uint8_t kBrotliDictionary[] = {
  0x74, 0x69, 0x6d, 0x65, 0x64, 0x6f, 0x77, 0x6e, 0x6c, 0x69, 0x66, 0x65, 0x6c,
  0x65, 0x66, 0x74, 0x62, 0x61, 0x63, 0x6b, 0x63, 0x6f, 0x64, 0x65, 0x64, 0x61,

589
enc/encode.cc
@ -22,6 +22,7 @@
#include "./backward_references.h"
#include "./bit_cost.h"
#include "./block_splitter.h"
#include "./brotli_bit_stream.h"
#include "./cluster.h"
#include "./context.h"
#include "./transform.h"
@ -65,19 +66,6 @@ double TotalBitCost(const std::vector<Histogram<kSize> >& histograms) {
  return retval;
}

void EncodeVarLenUint8(int n, int* storage_ix, uint8_t* storage) {
  if (n == 0) {
    WriteBits(1, 0, storage_ix, storage);
  } else {
    WriteBits(1, 1, storage_ix, storage);
    int nbits = Log2Floor(n);
    WriteBits(3, nbits, storage_ix, storage);
    if (nbits > 0) {
      WriteBits(nbits, n - (1 << nbits), storage_ix, storage);
    }
  }
}

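Note: EncodeVarLenUint8, removed from this file by the commit ("move serialization functions into their own file"), writes 0 as a single 0 bit and any n in 1..255 as a 1 bit, a 3-bit exponent, and the remaining low bits. A hedged sketch of the matching decoder; the BitReader type and its ReadBits are hypothetical stand-ins, not the library's API:

#include <cstddef>
#include <cstdint>

// Hypothetical LSB-first bit reader; any reader with ReadBits(n) works.
struct BitReader {
  const uint8_t* data;
  size_t bit_pos;
  uint32_t ReadBits(int n) {
    uint32_t v = 0;
    for (int i = 0; i < n; ++i, ++bit_pos) {
      v |= ((data[bit_pos >> 3] >> (bit_pos & 7)) & 1u) << i;
    }
    return v;
  }
};

// Inverse of EncodeVarLenUint8: "0" -> 0; "1", 3-bit nbits, then nbits
// low bits of n - (1 << nbits) -> n.
int DecodeVarLenUint8(BitReader* br) {
  if (br->ReadBits(1) == 0) return 0;
  int nbits = br->ReadBits(3);
  return (1 << nbits) + br->ReadBits(nbits);
}
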
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  // ASCII
  if ((input[0] & 0x80) == 0) {
@ -168,134 +156,6 @@ void EncodeMetaBlockLength(size_t meta_block_size,
  }
}

void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    const uint8_t* code_length_bitdepth,
    int* storage_ix, uint8_t* storage) {
  static const uint8_t kStorageOrder[kCodeLengthCodes] = {
    1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
  };
  // Throw away trailing zeros:
  int codes_to_store = kCodeLengthCodes;
  for (; codes_to_store > 0; --codes_to_store) {
    if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
      break;
    }
  }
  int num_codes = 0;
  for (int i = 0; i < codes_to_store; ++i) {
    if (code_length_bitdepth[kStorageOrder[i]] != 0) {
      ++num_codes;
    }
  }
  if (num_codes == 1) {
    codes_to_store = kCodeLengthCodes;
  }
  int skip_some = 0; // skips none.
  if (code_length_bitdepth[kStorageOrder[0]] == 0 &&
      code_length_bitdepth[kStorageOrder[1]] == 0) {
    skip_some = 2; // skips two.
    if (code_length_bitdepth[kStorageOrder[2]] == 0) {
      skip_some = 3; // skips three.
    }
  }
  WriteBits(2, skip_some, storage_ix, storage);
  for (int i = skip_some; i < codes_to_store; ++i) {
    uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
    uint8_t bits[] = { 0, 7, 3, 2, 1, 15 };
    int v = code_length_bitdepth[kStorageOrder[i]];
    WriteBits(len[v], bits[v], storage_ix, storage);
  }
}

void StoreHuffmanTreeToBitMask(
    const uint8_t* huffman_tree,
    const uint8_t* huffman_tree_extra_bits,
    const int huffman_tree_size,
    const EntropyCode<kCodeLengthCodes>& entropy,
    int* storage_ix, uint8_t* storage) {
  for (int i = 0; i < huffman_tree_size; ++i) {
    const int ix = huffman_tree[i];
    const int extra_bits = huffman_tree_extra_bits[i];
    if (entropy.count_ > 1) {
      WriteBits(entropy.depth_[ix], entropy.bits_[ix], storage_ix, storage);
    }
    switch (ix) {
      case 16:
        WriteBits(2, extra_bits, storage_ix, storage);
        break;
      case 17:
        WriteBits(3, extra_bits, storage_ix, storage);
        break;
    }
  }
}

template<int kSize>
void StoreHuffmanCodeSimple(
    const EntropyCode<kSize>& code, int alphabet_size,
    int max_bits, int* storage_ix, uint8_t* storage) {
  const uint8_t *depth = &code.depth_[0];
  int symbols[4];
  // Quadratic sort.
  int k, j;
  for (k = 0; k < code.count_; ++k) {
    symbols[k] = code.symbols_[k];
  }
  for (k = 0; k < code.count_; ++k) {
    for (j = k + 1; j < code.count_; ++j) {
      if (depth[symbols[j]] < depth[symbols[k]]) {
        int t = symbols[k];
        symbols[k] = symbols[j];
        symbols[j] = t;
      }
    }
  }
  // Small tree marker to encode 1-4 symbols.
  WriteBits(2, 1, storage_ix, storage);
  WriteBits(2, code.count_ - 1, storage_ix, storage);
  for (int i = 0; i < code.count_; ++i) {
    WriteBits(max_bits, symbols[i], storage_ix, storage);
  }
  if (code.count_ == 4) {
    if (depth[symbols[0]] == 2 &&
        depth[symbols[1]] == 2 &&
        depth[symbols[2]] == 2 &&
        depth[symbols[3]] == 2) {
      WriteBits(1, 0, storage_ix, storage);
    } else {
      WriteBits(1, 1, storage_ix, storage);
    }
  }
}

template<int kSize>
void StoreHuffmanCodeComplex(
    const EntropyCode<kSize>& code, int alphabet_size,
    int* storage_ix, uint8_t* storage) {
  const uint8_t *depth = &code.depth_[0];
  uint8_t huffman_tree[kSize];
  uint8_t huffman_tree_extra_bits[kSize];
  int huffman_tree_size = 0;
  WriteHuffmanTree(depth,
                   alphabet_size,
                   &huffman_tree[0],
                   &huffman_tree_extra_bits[0],
                   &huffman_tree_size);
  Histogram<kCodeLengthCodes> huffman_tree_histogram;
  memset(huffman_tree_histogram.data_, 0, sizeof(huffman_tree_histogram.data_));
  for (int i = 0; i < huffman_tree_size; ++i) {
    huffman_tree_histogram.Add(huffman_tree[i]);
  }
  EntropyCode<kCodeLengthCodes> huffman_tree_entropy;
  BuildEntropyCode(huffman_tree_histogram, 5, kCodeLengthCodes,
                   &huffman_tree_entropy);
  StoreHuffmanTreeOfHuffmanTreeToBitMask(
      &huffman_tree_entropy.depth_[0], storage_ix, storage);
  StoreHuffmanTreeToBitMask(&huffman_tree[0], &huffman_tree_extra_bits[0],
                            huffman_tree_size, huffman_tree_entropy,
                            storage_ix, storage);
}

template<int kSize>
void BuildAndStoreEntropyCode(const Histogram<kSize>& histogram,
                              const int tree_limit,
@ -304,45 +164,8 @@ void BuildAndStoreEntropyCode(const Histogram<kSize>& histogram,
                              int* storage_ix, uint8_t* storage) {
  memset(code->depth_, 0, sizeof(code->depth_));
  memset(code->bits_, 0, sizeof(code->bits_));
  memset(code->symbols_, 0, sizeof(code->symbols_));
  code->count_ = 0;

  int max_bits_counter = alphabet_size - 1;
  int max_bits = 0;
  while (max_bits_counter) {
    max_bits_counter >>= 1;
    ++max_bits;
  }

  for (size_t i = 0; i < alphabet_size; i++) {
    if (histogram.data_[i] > 0) {
      if (code->count_ < 4) code->symbols_[code->count_] = i;
      ++code->count_;
    }
  }

  if (code->count_ <= 1) {
    WriteBits(2, 1, storage_ix, storage);
    WriteBits(2, 0, storage_ix, storage);
    WriteBits(max_bits, code->symbols_[0], storage_ix, storage);
    return;
  }

  if (alphabet_size >= 50 && code->count_ >= 16) {
    std::vector<int> counts(alphabet_size);
    memcpy(&counts[0], histogram.data_, sizeof(counts[0]) * alphabet_size);
    OptimizeHuffmanCountsForRle(alphabet_size, &counts[0]);
    CreateHuffmanTree(&counts[0], alphabet_size, tree_limit, code->depth_);
  } else {
    CreateHuffmanTree(histogram.data_, alphabet_size, tree_limit, code->depth_);
  }
  ConvertBitDepthsToSymbols(code->depth_, alphabet_size, code->bits_);

  if (code->count_ <= 4) {
    StoreHuffmanCodeSimple(*code, alphabet_size, max_bits, storage_ix, storage);
  } else {
    StoreHuffmanCodeComplex(*code, alphabet_size, storage_ix, storage);
  }
  BuildAndStoreHuffmanTree(histogram.data_, alphabet_size, 9,
                           code->depth_, code->bits_, storage_ix, storage);
}

template<int kSize>
@ -362,324 +185,45 @@ void BuildAndStoreEntropyCodes(
void EncodeCommand(const Command& cmd,
                   const EntropyCodeCommand& entropy,
                   int* storage_ix, uint8_t* storage) {
  int code = cmd.command_prefix_;
  int code = cmd.cmd_prefix_;
  WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
  if (code >= 128) {
    code -= 128;
  }
  int insert_extra_bits = InsertLengthExtraBits(code);
  uint64_t insert_extra_bits_val =
      cmd.insert_length_ - InsertLengthOffset(code);
  int copy_extra_bits = CopyLengthExtraBits(code);
  uint64_t copy_extra_bits_val = cmd.copy_length_code_ - CopyLengthOffset(code);
  if (insert_extra_bits > 0) {
    WriteBits(insert_extra_bits, insert_extra_bits_val, storage_ix, storage);
  }
  if (copy_extra_bits > 0) {
    WriteBits(copy_extra_bits, copy_extra_bits_val, storage_ix, storage);
  int nextra = cmd.cmd_extra_ >> 48;
  uint64_t extra = cmd.cmd_extra_ & 0xffffffffffffULL;
  if (nextra > 0) {
    WriteBits(nextra, extra, storage_ix, storage);
  }
}

void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy,
                        int* storage_ix, uint8_t* storage) {
  int code = cmd.distance_prefix_;
  int extra_bits = cmd.distance_extra_bits_;
  uint64_t extra_bits_val = cmd.distance_extra_bits_value_;
  int code = cmd.dist_prefix_;
  int extra_bits = cmd.dist_extra_ >> 24;
  uint64_t extra_bits_val = cmd.dist_extra_ & 0xffffff;
  WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
  if (extra_bits > 0) {
    WriteBits(extra_bits, extra_bits_val, storage_ix, storage);
  }
}

void ComputeDistanceShortCodes(std::vector<Command>* cmds,
                               size_t pos,
                               const size_t max_backward,
                               int* dist_ringbuffer,
                               size_t* ringbuffer_idx) {
  static const int kIndexOffset[16] = {
    3, 2, 1, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2
  };
  static const int kValueOffset[16] = {
    0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
  };
  for (int i = 0; i < cmds->size(); ++i) {
    pos += (*cmds)[i].insert_length_;
    size_t max_distance = std::min(pos, max_backward);
    int cur_dist = (*cmds)[i].copy_distance_;
    int dist_code = cur_dist + 16;
    if (cur_dist <= max_distance) {
      if (cur_dist == 0) break;
      int limits[16] = { 0, 0, 0, 0,
                         6, 6, 11, 11,
                         11, 11, 11, 11,
                         12, 12, 12, 12 };
      for (int k = 0; k < 16; ++k) {
        // Only accept more popular choices.
        if (cur_dist < limits[k]) {
          // Typically unpopular ranges, don't replace a short distance
          // with them.
          continue;
        }
        int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] +
                    kValueOffset[k]);
        if (cur_dist == comp) {
          dist_code = k + 1;
          break;
        }
      }
      if (dist_code > 1) {
        dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist;
        ++(*ringbuffer_idx);
      }
      pos += (*cmds)[i].copy_length_;
    } else {
      int word_idx = cur_dist - max_distance - 1;
      const std::string word =
          GetTransformedDictionaryWord((*cmds)[i].copy_length_code_, word_idx);
      pos += word.size();
    }
    (*cmds)[i].distance_code_ = dist_code;
void RecomputeDistancePrefixes(std::vector<Command>* cmds,
                               int num_direct_distance_codes,
                               int distance_postfix_bits) {
  if (num_direct_distance_codes == 0 &&
      distance_postfix_bits == 0) {
    return;
  }
}

void ComputeCommandPrefixes(std::vector<Command>* cmds,
                            int num_direct_distance_codes,
                            int distance_postfix_bits) {
  for (int i = 0; i < cmds->size(); ++i) {
    Command* cmd = &(*cmds)[i];
    cmd->command_prefix_ = CommandPrefix(cmd->insert_length_,
                                         cmd->copy_length_code_);
    if (cmd->copy_length_code_ > 0) {
      PrefixEncodeCopyDistance(cmd->distance_code_,
    if (cmd->copy_len_ > 0 && cmd->cmd_prefix_ >= 128) {
      PrefixEncodeCopyDistance(cmd->DistanceCode(),
                               num_direct_distance_codes,
                               distance_postfix_bits,
                               &cmd->distance_prefix_,
                               &cmd->distance_extra_bits_,
                               &cmd->distance_extra_bits_value_);
    }
    if (cmd->command_prefix_ < 128 && cmd->distance_prefix_ == 0) {
      cmd->distance_prefix_ = 0xffff;
    } else {
      cmd->command_prefix_ += 128;
                               &cmd->dist_prefix_,
                               &cmd->dist_extra_);
    }
  }
}

int IndexOf(const std::vector<int>& v, int value) {
  for (int i = 0; i < v.size(); ++i) {
    if (v[i] == value) return i;
  }
  return -1;
}

void MoveToFront(std::vector<int>* v, int index) {
  int value = (*v)[index];
  for (int i = index; i > 0; --i) {
    (*v)[i] = (*v)[i - 1];
  }
  (*v)[0] = value;
}

std::vector<int> MoveToFrontTransform(const std::vector<int>& v) {
  if (v.empty()) return v;
  std::vector<int> mtf(*max_element(v.begin(), v.end()) + 1);
  for (int i = 0; i < mtf.size(); ++i) mtf[i] = i;
  std::vector<int> result(v.size());
  for (int i = 0; i < v.size(); ++i) {
    int index = IndexOf(mtf, v[i]);
    result[i] = index;
    MoveToFront(&mtf, index);
  }
  return result;
}

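Note: MoveToFrontTransform above replaces each value with its current rank in the move-to-front list, so repeated context-map entries become runs of zeros for the run-length pass that follows. A small illustrative trace (values chosen for the example):

// input:  { 2, 2, 2, 0, 0, 1 }
// mtf = { 0, 1, 2 }: 2 is at index 2, emit 2, list becomes { 2, 0, 1 }
//   2 now at index 0, emit 0; emit 0 again
//   0 at index 1, emit 1, list becomes { 0, 2, 1 }
//   0 at index 0, emit 0; 1 at index 2, emit 2
// output: { 2, 0, 0, 1, 0, 2 }
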
// Finds runs of zeros in v_in and replaces them with a prefix code of the run
// length plus extra bits in *v_out and *extra_bits. Non-zero values in v_in are
// shifted by *max_length_prefix. Will not create prefix codes bigger than the
// initial value of *max_run_length_prefix. The prefix code of run length L is
// simply Log2Floor(L) and the number of extra bits is the same as the prefix
// code.
void RunLengthCodeZeros(const std::vector<int>& v_in,
                        int* max_run_length_prefix,
                        std::vector<int>* v_out,
                        std::vector<int>* extra_bits) {
  int max_reps = 0;
  for (int i = 0; i < v_in.size();) {
    for (; i < v_in.size() && v_in[i] != 0; ++i) ;
    int reps = 0;
    for (; i < v_in.size() && v_in[i] == 0; ++i) {
      ++reps;
    }
    max_reps = std::max(reps, max_reps);
  }
  int max_prefix = max_reps > 0 ? Log2Floor(max_reps) : 0;
  *max_run_length_prefix = std::min(max_prefix, *max_run_length_prefix);
  for (int i = 0; i < v_in.size();) {
    if (v_in[i] != 0) {
      v_out->push_back(v_in[i] + *max_run_length_prefix);
      extra_bits->push_back(0);
      ++i;
    } else {
      int reps = 1;
      for (uint32_t k = i + 1; k < v_in.size() && v_in[k] == 0; ++k) {
        ++reps;
      }
      i += reps;
      while (reps) {
        if (reps < (2 << *max_run_length_prefix)) {
          int run_length_prefix = Log2Floor(reps);
          v_out->push_back(run_length_prefix);
          extra_bits->push_back(reps - (1 << run_length_prefix));
          break;
        } else {
          v_out->push_back(*max_run_length_prefix);
          extra_bits->push_back((1 << *max_run_length_prefix) - 1);
          reps -= (2 << *max_run_length_prefix) - 1;
        }
      }
    }
  }
}

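Note: for a run of L zeros the emitted prefix is Log2Floor(L) with the same number of extra bits, so L is recoverable as (1 << prefix) + extra. A short worked trace (example values):

// v_in = { 5, 0, 0, 0, 0, 0, 0, 3 }, *max_run_length_prefix = 6 on entry
// longest zero run is 6 -> max_prefix = Log2Floor(6) = 2, clamped to 2
// output: 5+2=7 (extra 0); run of 6: prefix Log2Floor(6)=2,
//   extra 6-(1<<2)=2; then 3+2=5 (extra 0)
// v_out = { 7, 2, 5 }, extra_bits = { 0, 2, 0 }; the decoder rebuilds
// the run as (1 << 2) + 2 = 6 zeros.
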
// Returns a maximum zero-run-length-prefix value such that run-length coding
// zeros in v with this maximum prefix value and then encoding the resulting
// histogram and entropy-coding v produces the least amount of bits.
int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
  int min_cost = std::numeric_limits<int>::max();
  int best_max_prefix = 0;
  for (int max_prefix = 0; max_prefix <= 16; ++max_prefix) {
    std::vector<int> rle_symbols;
    std::vector<int> extra_bits;
    int max_run_length_prefix = max_prefix;
    RunLengthCodeZeros(v, &max_run_length_prefix, &rle_symbols, &extra_bits);
    if (max_run_length_prefix < max_prefix) break;
    HistogramContextMap histogram;
    for (int i = 0; i < rle_symbols.size(); ++i) {
      histogram.Add(rle_symbols[i]);
    }
    int bit_cost = PopulationCost(histogram);
    if (max_prefix > 0) {
      bit_cost += 4;
    }
    for (int i = 1; i <= max_prefix; ++i) {
      bit_cost += histogram.data_[i] * i; // extra bits
    }
    if (bit_cost < min_cost) {
      min_cost = bit_cost;
      best_max_prefix = max_prefix;
    }
  }
  return best_max_prefix;
}

void EncodeContextMap(const std::vector<int>& context_map,
                      int num_clusters,
                      int* storage_ix, uint8_t* storage) {
  EncodeVarLenUint8(num_clusters - 1, storage_ix, storage);

  if (num_clusters == 1) {
    return;
  }

  std::vector<int> transformed_symbols = MoveToFrontTransform(context_map);
  std::vector<int> rle_symbols;
  std::vector<int> extra_bits;
  int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols);
  RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
                     &rle_symbols, &extra_bits);
  HistogramContextMap symbol_histogram;
  for (int i = 0; i < rle_symbols.size(); ++i) {
    symbol_histogram.Add(rle_symbols[i]);
  }
  bool use_rle = max_run_length_prefix > 0;
  WriteBits(1, use_rle, storage_ix, storage);
  if (use_rle) {
    WriteBits(4, max_run_length_prefix - 1, storage_ix, storage);
  }
  EntropyCodeContextMap symbol_code;
  BuildAndStoreEntropyCode(symbol_histogram, 15,
                           num_clusters + max_run_length_prefix,
                           &symbol_code,
                           storage_ix, storage);
  for (int i = 0; i < rle_symbols.size(); ++i) {
    WriteBits(symbol_code.depth_[rle_symbols[i]],
              symbol_code.bits_[rle_symbols[i]],
              storage_ix, storage);
    if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) {
      WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage);
    }
  }
  WriteBits(1, 1, storage_ix, storage); // use move-to-front
}

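Note: EncodeContextMap chains the two helpers above: move-to-front first, then zero run-length coding, then a single entropy code over the combined alphabet. A sketch of the stream layout it produces (field order and widths taken from the code above):

// VarLenUint8(num_clusters - 1)
// if num_clusters > 1:
//   1 bit   use_rle
//   4 bits  max_run_length_prefix - 1       (only if use_rle)
//   entropy code over num_clusters + max_run_length_prefix symbols
//   for each RLE symbol: its code, plus `symbol` extra bits when
//     0 < symbol <= max_run_length_prefix
//   1 bit   always 1: the decoder applies the inverse move-to-front
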
struct BlockSplitCode {
  EntropyCodeBlockType block_type_code;
  EntropyCodeBlockLength block_len_code;
};

void EncodeBlockLength(const EntropyCodeBlockLength& entropy,
                       int length,
                       int* storage_ix, uint8_t* storage) {
  int len_code = BlockLengthPrefix(length);
  int extra_bits = BlockLengthExtraBits(len_code);
  int extra_bits_value = length - BlockLengthOffset(len_code);
  WriteBits(entropy.depth_[len_code], entropy.bits_[len_code],
            storage_ix, storage);
  if (extra_bits > 0) {
    WriteBits(extra_bits, extra_bits_value, storage_ix, storage);
  }
}

void ComputeBlockTypeShortCodes(BlockSplit* split) {
  if (split->num_types_ <= 1) {
    split->num_types_ = 1;
    return;
  }
  int ringbuffer[2] = { 0, 1 };
  size_t index = 0;
  for (int i = 0; i < split->types_.size(); ++i) {
    int type = split->types_[i];
    int type_code;
    if (type == ringbuffer[index & 1]) {
      type_code = 0;
    } else if (type == ringbuffer[(index - 1) & 1] + 1) {
      type_code = 1;
    } else {
      type_code = type + 2;
    }
    ringbuffer[index & 1] = type;
    ++index;
    split->type_codes_.push_back(type_code);
  }
}

void BuildAndEncodeBlockSplitCode(const BlockSplit& split,
                                  BlockSplitCode* code,
                                  int* storage_ix, uint8_t* storage) {
  EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage);

  if (split.num_types_ == 1) {
    return;
  }

  HistogramBlockType type_histo;
  for (int i = 1; i < split.type_codes_.size(); ++i) {
    type_histo.Add(split.type_codes_[i]);
  }
  HistogramBlockLength length_histo;
  for (int i = 0; i < split.lengths_.size(); ++i) {
    length_histo.Add(BlockLengthPrefix(split.lengths_[i]));
  }
  BuildAndStoreEntropyCode(type_histo, 15, split.num_types_ + 2,
                           &code->block_type_code,
                           storage_ix, storage);
  BuildAndStoreEntropyCode(length_histo, 15, kNumBlockLenPrefixes,
                           &code->block_len_code,
                           storage_ix, storage);
  EncodeBlockLength(code->block_len_code, split.lengths_[0],
                    storage_ix, storage);
}

void MoveAndEncode(const BlockSplitCode& code,
                   BlockSplitIterator* it,
                   int* storage_ix, uint8_t* storage) {
@ -687,11 +231,7 @@ void MoveAndEncode(const BlockSplitCode& code,
    ++it->idx_;
    it->type_ = it->split_.types_[it->idx_];
    it->length_ = it->split_.lengths_[it->idx_];
    int type_code = it->split_.type_codes_[it->idx_];
    WriteBits(code.block_type_code.depth_[type_code],
              code.block_type_code.bits_[type_code],
              storage_ix, storage);
    EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage);
    StoreBlockSwitch(code, it->idx_, storage_ix, storage);
  }
  --it->length_;
}
@ -727,17 +267,14 @@ void BuildMetaBlock(const EncodingParams& params,
  if (cmds.empty()) {
    return;
  }
  ComputeCommandPrefixes(&mb->cmds,
                         mb->params.num_direct_distance_codes,
                         mb->params.distance_postfix_bits);
  RecomputeDistancePrefixes(&mb->cmds,
                            mb->params.num_direct_distance_codes,
                            mb->params.distance_postfix_bits);
  SplitBlock(mb->cmds,
             &ringbuffer[pos & mask],
             &mb->literal_split,
             &mb->command_split,
             &mb->distance_split);
  ComputeBlockTypeShortCodes(&mb->literal_split);
  ComputeBlockTypeShortCodes(&mb->command_split);
  ComputeBlockTypeShortCodes(&mb->distance_split);

  mb->literal_context_modes.resize(mb->literal_split.num_types_,
                                   mb->params.literal_context_mode);
@ -786,7 +323,7 @@ size_t MetaBlockLength(const std::vector<Command>& cmds) {
  size_t length = 0;
  for (int i = 0; i < cmds.size(); ++i) {
    const Command& cmd = cmds[i];
    length += cmd.insert_length_ + cmd.copy_length_;
    length += cmd.insert_len_ + cmd.copy_len_;
  }
  return length;
}
@ -807,12 +344,24 @@ void StoreMetaBlock(const MetaBlock& mb,
  BlockSplitCode literal_split_code;
  BlockSplitCode command_split_code;
  BlockSplitCode distance_split_code;
  BuildAndEncodeBlockSplitCode(mb.literal_split, &literal_split_code,
                               storage_ix, storage);
  BuildAndEncodeBlockSplitCode(mb.command_split, &command_split_code,
                               storage_ix, storage);
  BuildAndEncodeBlockSplitCode(mb.distance_split, &distance_split_code,
                               storage_ix, storage);
  BuildAndStoreBlockSplitCode(mb.literal_split.types_,
                              mb.literal_split.lengths_,
                              mb.literal_split.num_types_,
                              9, // quality
                              &literal_split_code,
                              storage_ix, storage);
  BuildAndStoreBlockSplitCode(mb.command_split.types_,
                              mb.command_split.lengths_,
                              mb.command_split.num_types_,
                              9, // quality
                              &command_split_code,
                              storage_ix, storage);
  BuildAndStoreBlockSplitCode(mb.distance_split.types_,
                              mb.distance_split.lengths_,
                              mb.distance_split.num_types_,
                              9, // quality
                              &distance_split_code,
                              storage_ix, storage);
  WriteBits(2, mb.params.distance_postfix_bits, storage_ix, storage);
  WriteBits(4,
            mb.params.num_direct_distance_codes >>
@ -844,7 +393,7 @@ void StoreMetaBlock(const MetaBlock& mb,
    const Command& cmd = mb.cmds[i];
    MoveAndEncode(command_split_code, &command_it, storage_ix, storage);
    EncodeCommand(cmd, command_codes[command_it.type_], storage_ix, storage);
    for (int j = 0; j < cmd.insert_length_; ++j) {
    for (int j = 0; j < cmd.insert_len_; ++j) {
      MoveAndEncode(literal_split_code, &literal_it, storage_ix, storage);
      int histogram_idx = literal_it.type_;
      uint8_t prev_byte = *pos > 0 ? ringbuffer[(*pos - 1) & mask] : 0;
@ -859,16 +408,14 @@ void StoreMetaBlock(const MetaBlock& mb,
                  storage_ix, storage);
      ++(*pos);
    }
    if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) {
    if (*pos < end_pos && cmd.cmd_prefix_ >= 128) {
      MoveAndEncode(distance_split_code, &distance_it, storage_ix, storage);
      int context = (distance_it.type_ << 2) +
          ((cmd.copy_length_code_ > 4) ? 3 : cmd.copy_length_code_ - 2);
      int context = (distance_it.type_ << 2) + cmd.DistanceContext();
      int histogram_index = mb.distance_context_map[context];
      size_t max_distance = std::min(*pos, (size_t)kMaxBackwardDistance);
      EncodeCopyDistance(cmd, distance_codes[histogram_index],
                         storage_ix, storage);
    }
    *pos += cmd.copy_length_;
    *pos += cmd.copy_len_;
  }
}

@ -876,20 +423,19 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
    : params_(params),
      window_bits_(kWindowBits),
      hashers_(new Hashers()),
      dist_ringbuffer_idx_(0),
      input_pos_(0),
      ringbuffer_(kRingBufferBits, kMetaBlockSizeBits),
      literal_cost_(1 << kRingBufferBits),
      storage_ix_(0),
      storage_(new uint8_t[2 << kMetaBlockSizeBits]) {
  dist_ringbuffer_[0] = 16;
  dist_ringbuffer_[1] = 15;
  dist_ringbuffer_[2] = 11;
  dist_ringbuffer_[3] = 4;
  dist_cache_[0] = 4;
  dist_cache_[1] = 11;
  dist_cache_[2] = 15;
  dist_cache_[3] = 16;
  storage_[0] = 0;
  switch (params.mode) {
    case BrotliParams::MODE_TEXT: hash_type_ = Hashers::HASH_15_8_4; break;
    case BrotliParams::MODE_FONT: hash_type_ = Hashers::HASH_15_8_2; break;
    case BrotliParams::MODE_TEXT: hash_type_ = 8; break;
    case BrotliParams::MODE_FONT: hash_type_ = 9; break;
    default: break;
  }
  hashers_->Init(hash_type_);
@ -942,7 +488,7 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size,
                                      uint8_t* encoded_buffer) {
  static const double kMinUTF8Ratio = 0.75;
  bool utf8_mode = false;
  std::vector<Command> commands;
  std::vector<Command> commands((input_size + 1) >> 1);
  if (input_size > 0) {
    ringbuffer_.Write(input_buffer, input_size);
    utf8_mode = IsMostlyUTF8(
@ -957,17 +503,26 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size,
          kRingBufferMask, kRingBufferMask,
          ringbuffer_.start(), &literal_cost_[0]);
    }
    int last_insert_len = 0;
    int num_commands = 0;
    double base_min_score = 8.115;
    CreateBackwardReferences(
        input_size, input_pos_,
        ringbuffer_.start(),
        &literal_cost_[0],
        kRingBufferMask, kMaxBackwardDistance,
        ringbuffer_.start(), kRingBufferMask,
        &literal_cost_[0], kRingBufferMask,
        kMaxBackwardDistance,
        base_min_score,
        9, // quality
        hashers_.get(),
        hash_type_,
        &commands);
    ComputeDistanceShortCodes(&commands, input_pos_, kMaxBackwardDistance,
                              dist_ringbuffer_,
                              &dist_ringbuffer_idx_);
        dist_cache_,
        &last_insert_len,
        &commands[0],
        &num_commands);
    commands.resize(num_commands);
    if (last_insert_len > 0) {
      commands.push_back(Command(last_insert_len));
    }
  }
  EncodingParams params;
  params.num_direct_distance_codes =
@ -1015,7 +570,6 @@ void BrotliCompressor::FinishStream(
  WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer);
}


int BrotliCompressBuffer(BrotliParams params,
                         size_t input_size,
                         const uint8_t* input_buffer,
@ -1049,7 +603,6 @@ int BrotliCompressBuffer(BrotliParams params,
    *encoded_size += output_size;
    max_output_size -= output_size;
  }

  return 1;
}


@ -59,7 +59,6 @@ class BrotliCompressor {
  // sets *encoded_size to the number of bytes written.
  void FinishStream(size_t* encoded_size, uint8_t* encoded_buffer);


 private:
  // Initializes the hasher with the hashes of dictionary words.
  void StoreDictionaryWordHashes();
@ -67,12 +66,11 @@ class BrotliCompressor {
  BrotliParams params_;
  int window_bits_;
  std::unique_ptr<Hashers> hashers_;
  Hashers::Type hash_type_;
  int dist_ringbuffer_[4];
  size_t dist_ringbuffer_idx_;
  int hash_type_;
  size_t input_pos_;
  RingBuffer ringbuffer_;
  std::vector<float> literal_cost_;
  int dist_cache_[4];
  int storage_ix_;
  uint8_t* storage_;
  static StaticDictionary *static_dictionary_;
@ -87,7 +85,6 @@ int BrotliCompressBuffer(BrotliParams params,
                         size_t* encoded_size,
                         uint8_t* encoded_buffer);


} // namespace brotli

#endif // BROTLI_ENC_ENCODE_H_

@ -42,7 +42,7 @@ struct HuffmanTree {

HuffmanTree::HuffmanTree() {}

// Sort the root nodes, least popular first.
// Sort the root nodes, least popular first, break ties by value.
bool SortHuffmanTree(const HuffmanTree &v0, const HuffmanTree &v1) {
  if (v0.total_count_ == v1.total_count_) {
    return v0.index_right_or_value_ > v1.index_right_or_value_;
@ -50,6 +50,11 @@ bool SortHuffmanTree(const HuffmanTree &v0, const HuffmanTree &v1) {
  return v0.total_count_ < v1.total_count_;
}

// Sort the root nodes, least popular first.
bool SortHuffmanTreeFast(const HuffmanTree &v0, const HuffmanTree &v1) {
  return v0.total_count_ < v1.total_count_;
}

void SetDepth(const HuffmanTree &p,
              HuffmanTree *pool,
              uint8_t *depth,
@ -83,6 +88,7 @@ void SetDepth(const HuffmanTree &p,
void CreateHuffmanTree(const int *data,
                       const int length,
                       const int tree_limit,
                       const int quality,
                       uint8_t *depth) {
  // For block sizes below 64 kB, we never need to do a second iteration
  // of this loop. Probably all of our block sizes will be smaller than
@ -105,8 +111,11 @@ void CreateHuffmanTree(const int *data,
      break;
    }

    std::sort(tree.begin(), tree.end(), SortHuffmanTree);

    if (quality > 1) {
      std::sort(tree.begin(), tree.end(), SortHuffmanTree);
    } else {
      std::sort(tree.begin(), tree.end(), SortHuffmanTreeFast);
    }
    // The nodes are:
    // [0, n): the sorted leaf nodes that we start with.
    // [n]: we add a sentinel here.
@ -158,12 +167,12 @@ void CreateHuffmanTree(const int *data,
  }
}

void Reverse(uint8_t* v, int start, int end) {
void Reverse(std::vector<uint8_t>* v, int start, int end) {
  --end;
  while (start < end) {
    int tmp = v[start];
    v[start] = v[end];
    v[end] = tmp;
    int tmp = (*v)[start];
    (*v)[start] = (*v)[end];
    (*v)[end] = tmp;
    ++start;
    --end;
  }
@ -173,75 +182,65 @@ void WriteHuffmanTreeRepetitions(
    const int previous_value,
    const int value,
    int repetitions,
    uint8_t* tree,
    uint8_t* extra_bits,
    int* tree_size) {
    std::vector<uint8_t> *tree,
    std::vector<uint8_t> *extra_bits_data) {
  if (previous_value != value) {
    tree[*tree_size] = value;
    extra_bits[*tree_size] = 0;
    ++(*tree_size);
    tree->push_back(value);
    extra_bits_data->push_back(0);
    --repetitions;
  }
  if (repetitions == 7) {
    tree[*tree_size] = value;
    extra_bits[*tree_size] = 0;
    ++(*tree_size);
    tree->push_back(value);
    extra_bits_data->push_back(0);
    --repetitions;
  }
  if (repetitions < 3) {
    for (int i = 0; i < repetitions; ++i) {
      tree[*tree_size] = value;
      extra_bits[*tree_size] = 0;
      ++(*tree_size);
      tree->push_back(value);
      extra_bits_data->push_back(0);
    }
  } else {
    repetitions -= 3;
    int start = *tree_size;
    int start = tree->size();
    while (repetitions >= 0) {
      tree[*tree_size] = 16;
      extra_bits[*tree_size] = repetitions & 0x3;
      ++(*tree_size);
      tree->push_back(16);
      extra_bits_data->push_back(repetitions & 0x3);
      repetitions >>= 2;
      --repetitions;
    }
    Reverse(tree, start, *tree_size);
    Reverse(extra_bits, start, *tree_size);
    Reverse(tree, start, tree->size());
    Reverse(extra_bits_data, start, tree->size());
  }
}

void WriteHuffmanTreeRepetitionsZeros(
    int repetitions,
    uint8_t* tree,
    uint8_t* extra_bits,
    int* tree_size) {
    std::vector<uint8_t> *tree,
    std::vector<uint8_t> *extra_bits_data) {
  if (repetitions == 11) {
    tree[*tree_size] = 0;
    extra_bits[*tree_size] = 0;
    ++(*tree_size);
    tree->push_back(0);
    extra_bits_data->push_back(0);
    --repetitions;
  }
  if (repetitions < 3) {
    for (int i = 0; i < repetitions; ++i) {
      tree[*tree_size] = 0;
      extra_bits[*tree_size] = 0;
      ++(*tree_size);
      tree->push_back(0);
      extra_bits_data->push_back(0);
    }
  } else {
    repetitions -= 3;
    int start = *tree_size;
    int start = tree->size();
    while (repetitions >= 0) {
      tree[*tree_size] = 17;
      extra_bits[*tree_size] = repetitions & 0x7;
      ++(*tree_size);
      tree->push_back(17);
      extra_bits_data->push_back(repetitions & 0x7);
      repetitions >>= 3;
      --repetitions;
    }
    Reverse(tree, start, *tree_size);
    Reverse(extra_bits, start, *tree_size);
    Reverse(tree, start, tree->size());
    Reverse(extra_bits_data, start, tree->size());
  }
}


int OptimizeHuffmanCountsForRle(int length, int* counts) {
  int stride;
  int limit;
@ -371,7 +370,6 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
  return 1;
}


static void DecideOverRleUse(const uint8_t* depth, const int length,
                             bool *use_rle_for_non_zero,
                             bool *use_rle_for_zero) {
@ -379,20 +377,10 @@ static void DecideOverRleUse(const uint8_t* depth, const int length,
  int total_reps_non_zero = 0;
  int count_reps_zero = 0;
  int count_reps_non_zero = 0;
  int new_length = length;
  for (int i = 0; i < length; ++i) {
    if (depth[length - i - 1] == 0) {
      --new_length;
    } else {
      break;
    }
  }
  for (uint32_t i = 0; i < new_length;) {
  for (uint32_t i = 0; i < length;) {
    const int value = depth[i];
    int reps = 1;
    // Find rle coding for longer codes.
    // Shorter codes seem not to benefit from rle.
    for (uint32_t k = i + 1; k < new_length && depth[k] == value; ++k) {
    for (uint32_t k = i + 1; k < length && depth[k] == value; ++k) {
      ++reps;
    }
    if (reps >= 3 && value == 0) {
@ -411,48 +399,51 @@ static void DecideOverRleUse(const uint8_t* depth, const int length,
  *use_rle_for_zero = total_reps_zero > 2;
}


void WriteHuffmanTree(const uint8_t* depth, const int length,
                      uint8_t* tree,
                      uint8_t* extra_bits_data,
                      int* huffman_tree_size) {
void WriteHuffmanTree(const uint8_t* depth,
                      uint32_t length,
                      std::vector<uint8_t> *tree,
                      std::vector<uint8_t> *extra_bits_data) {
  int previous_value = 8;

  // Throw away trailing zeros.
  int new_length = length;
  for (int i = 0; i < length; ++i) {
    if (depth[length - i - 1] == 0) {
      --new_length;
    } else {
      break;
    }
  }

  // First gather statistics on if it is a good idea to do rle.
  bool use_rle_for_non_zero;
  bool use_rle_for_zero;
  DecideOverRleUse(depth, length, &use_rle_for_non_zero, &use_rle_for_zero);
  bool use_rle_for_non_zero = false;
  bool use_rle_for_zero = false;
  if (length > 50) {
    // Find rle coding for longer codes.
    // Shorter codes seem not to benefit from rle.
    DecideOverRleUse(depth, new_length,
                     &use_rle_for_non_zero, &use_rle_for_zero);
  }

  // Actual rle coding.
  for (uint32_t i = 0; i < length;) {
  for (uint32_t i = 0; i < new_length;) {
    const int value = depth[i];
    int reps = 1;
    if (length > 50) {
      // Find rle coding for longer codes.
      // Shorter codes seem not to benefit from rle.
      if ((value != 0 && use_rle_for_non_zero) ||
          (value == 0 && use_rle_for_zero)) {
        for (uint32_t k = i + 1; k < length && depth[k] == value; ++k) {
          ++reps;
        }
    if ((value != 0 && use_rle_for_non_zero) ||
        (value == 0 && use_rle_for_zero)) {
      for (uint32_t k = i + 1; k < new_length && depth[k] == value; ++k) {
        ++reps;
      }
    }
    if (value == 0) {
      WriteHuffmanTreeRepetitionsZeros(reps, tree, extra_bits_data,
                                       huffman_tree_size);
      WriteHuffmanTreeRepetitionsZeros(reps, tree, extra_bits_data);
    } else {
      WriteHuffmanTreeRepetitions(previous_value, value, reps, tree,
                                  extra_bits_data, huffman_tree_size);
      WriteHuffmanTreeRepetitions(previous_value,
                                  value, reps, tree, extra_bits_data);
      previous_value = value;
    }
    i += reps;
  }
  // Throw away trailing zeros.
  for (; *huffman_tree_size > 0; --(*huffman_tree_size)) {
    if (tree[*huffman_tree_size - 1] > 0 && tree[*huffman_tree_size - 1] < 17) {
      break;
    }
  }
}

namespace {

@ -19,6 +19,7 @@

#include <stdint.h>
#include <string.h>
#include <vector>
#include "./histogram.h"
#include "./prefix.h"

@ -36,6 +37,7 @@ namespace brotli {
void CreateHuffmanTree(const int *data,
                       const int length,
                       const int tree_limit,
                       const int quality,
                       uint8_t *depth);

// Change the population counts in a way that the consequent
@ -46,14 +48,13 @@ void CreateHuffmanTree(const int *data,
// counts contains the population counts.
int OptimizeHuffmanCountsForRle(int length, int* counts);


// Write a huffman tree from bit depths into the bitstream representation
// of a Huffman tree. The generated Huffman tree is to be compressed once
// more using a Huffman tree
void WriteHuffmanTree(const uint8_t* depth, const int length,
                      uint8_t* tree,
                      uint8_t* extra_bits_data,
                      int* huffman_tree_size);
void WriteHuffmanTree(const uint8_t* depth,
                      uint32_t num,
                      std::vector<uint8_t> *tree,
                      std::vector<uint8_t> *extra_bits_data);

// Get the actual bit values for a tree of bit depths.
void ConvertBitDepthsToSymbols(const uint8_t *depth, int len, uint16_t *bits);
@ -70,34 +71,6 @@ struct EntropyCode {
  int symbols_[4];
};

template<int kSize>
void BuildEntropyCode(const Histogram<kSize>& histogram,
                      const int tree_limit,
                      const int alphabet_size,
                      EntropyCode<kSize>* code) {
  memset(code->depth_, 0, sizeof(code->depth_));
  memset(code->bits_, 0, sizeof(code->bits_));
  memset(code->symbols_, 0, sizeof(code->symbols_));
  code->count_ = 0;
  if (histogram.total_count_ == 0) return;
  for (int i = 0; i < kSize; ++i) {
    if (histogram.data_[i] > 0) {
      if (code->count_ < 4) code->symbols_[code->count_] = i;
      ++code->count_;
    }
  }
  if (alphabet_size >= 50 && code->count_ >= 16) {
    int counts[kSize];
    memcpy(counts, &histogram.data_[0], sizeof(counts[0]) * kSize);
    OptimizeHuffmanCountsForRle(alphabet_size, counts);
    CreateHuffmanTree(counts, alphabet_size, tree_limit, &code->depth_[0]);
  } else {
    CreateHuffmanTree(&histogram.data_[0], alphabet_size, tree_limit,
                      &code->depth_[0]);
  }
  ConvertBitDepthsToSymbols(&code->depth_[0], alphabet_size, &code->bits_[0]);
}

static const int kCodeLengthCodes = 18;

// Literal entropy code.

@ -46,6 +46,16 @@ inline int Log2Floor(uint32_t n) {
#endif
}

static inline int Log2FloorNonZero(uint32_t n) {
#ifdef __GNUC__
  return 31 ^ __builtin_clz(n);
#else
  unsigned int result = 0;
  while (n >>= 1) result++;
  return result;
#endif
}

// Return ceiling(log2(n)) for positive integer n. Returns -1 iff n == 0.
inline int Log2Ceiling(uint32_t n) {
  int floor = Log2Floor(n);

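Note: the two branches of the new Log2FloorNonZero agree because `31 ^ __builtin_clz(n)` equals `31 - __builtin_clz(n)` for nonzero n (clz is at most 31), which is the index of the highest set bit. A small self-check of the two implementations (the test harness is illustrative only):

#include <cassert>
#include <cstdint>

static int Log2FloorPortable(uint32_t n) {
  int result = 0;
  while (n >>= 1) ++result;
  return result;
}

int main() {
#ifdef __GNUC__
  for (uint32_t n = 1; n < (1u << 20); ++n) {
    // 31 ^ clz(n) == 31 - clz(n) for n != 0, since clz(n) <= 31.
    assert((31 ^ __builtin_clz(n)) == Log2FloorPortable(n));
  }
#endif
  return 0;
}
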
@ -19,6 +19,8 @@

#include <stdint.h>

#include <stddef.h>

#include "./port.h"

namespace brotli {

417
enc/hash.h
@ -31,10 +31,18 @@
#include "./fast_log.h"
#include "./find_match_length.h"
#include "./port.h"
#include "./prefix.h"
#include "./static_dict.h"

namespace brotli {

static const int kDistanceCacheIndex[] = {
  0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
};
static const int kDistanceCacheOffset[] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};

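Note: these two tables drive the short-distance probe at the bottom of this file: candidate i reuses a recently seen distance from the cache, nudged by a small offset. A minimal sketch of how a candidate backward distance is formed (the cache contents match the encoder's initial dist_cache_ values from the constructor above; the harness is illustrative):

#include <cstdio>

static const int kDistanceCacheIndex[16] = {
  0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
};
static const int kDistanceCacheOffset[16] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};

int main() {
  int distance_cache[4] = { 4, 11, 15, 16 };  // encoder's initial values
  for (int i = 0; i < 16; ++i) {
    // Same expression as the FindLongestMatch loop below.
    int backward = distance_cache[kDistanceCacheIndex[i]] +
                   kDistanceCacheOffset[i];
    printf("short code %2d -> backward %d\n", i, backward);
  }
  return 0;
}
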
// kHashMul32 multiplier has these properties:
// * The multiplier must be odd. Otherwise we may lose the highest bit.
// * No long streaks of 1s or 0s.
@ -75,59 +83,194 @@ inline uint32_t Hash(const uint8_t *data) {
// when it is not much longer and the bit cost for encoding it is more
// than the saved literals.
inline double BackwardReferenceScore(double average_cost,
                                     double start_cost4,
                                     double start_cost3,
                                     double start_cost2,
                                     int copy_length,
                                     int backward_reference_offset) {
  double retval = 0;
  switch (copy_length) {
    case 2: retval = start_cost2; break;
    case 3: retval = start_cost3; break;
    default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
  }
  retval -= 1.20 * Log2Floor(backward_reference_offset);
  return retval;
  return (copy_length * average_cost -
          1.20 * Log2Floor(backward_reference_offset));
}

inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
                                                      double start_cost4,
                                                      double start_cost3,
                                                      double start_cost2,
                                                      int copy_length,
                                                      int distance_short_code) {
  double retval = 0;
  switch (copy_length) {
    case 2: retval = start_cost2; break;
    case 3: retval = start_cost3; break;
    default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
  }
  static const double kDistanceShortCodeBitCost[16] = {
    -0.6, 0.95, 1.17, 1.27,
    0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
    1.05, 1.05, 1.15, 1.15, 1.25, 1.25
  };
  retval -= kDistanceShortCodeBitCost[distance_short_code];
  return retval;
  return (average_cost * copy_length
          - kDistanceShortCodeBitCost[distance_short_code]);
}

// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize). Starting from the
// given index, kBucketSweep buckets are used to store values of a key.
template <int kBucketBits, int kBucketSweep>
class HashLongestMatchQuickly {
 public:
  HashLongestMatchQuickly() {
    Reset();
  }
  void Reset() {
    // It is not strictly necessary to fill this buffer here, but
    // not filling will make the results of the compression stochastic
    // (but correct). This is because random data would cause the
    // system to find accidentally good backward references here and there.
    std::fill(&buckets_[0],
              &buckets_[sizeof(buckets_) / sizeof(buckets_[0])],
              0);
  }
  // Look at 4 bytes at data.
  // Compute a hash from these, and store the value somewhere within
  // [ix .. ix+3].
  inline void Store(const uint8_t *data, const int ix) {
    const uint32_t key = Hash<kBucketBits, 4>(data);
    // Wiggle the value with the bucket sweep range.
    const uint32_t off = (static_cast<uint32_t>(ix) >> 3) % kBucketSweep;
    buckets_[key + off] = ix;
  }

  // Store hashes for a range of data.
  void StoreHashes(const uint8_t *data, size_t len, int startix, int mask) {
    for (int p = 0; p < len; ++p) {
      Store(&data[p & mask], startix + p);
    }
  }

  bool HasStaticDictionary() const { return false; }

  // Find a longest backward match of &ring_buffer[cur_ix & ring_buffer_mask]
  // up to the length of max_length.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the index (&data[index]) of the start of the best match into
  // best_distance_out.
  inline bool FindLongestMatch(const uint8_t * __restrict ring_buffer,
                               const size_t ring_buffer_mask,
                               const float* __restrict literal_cost,
                               const size_t literal_cost_mask,
                               const double average_cost,
                               const int* __restrict distance_cache,
                               const uint32_t cur_ix,
                               const uint32_t max_length,
                               const uint32_t max_backward,
                               int * __restrict best_len_out,
                               int * __restrict best_len_code_out,
                               int * __restrict best_distance_out,
                               double* __restrict best_score_out) {
    const int best_len_in = *best_len_out;
    const int cur_ix_masked = cur_ix & ring_buffer_mask;
    int compare_char = ring_buffer[cur_ix_masked + best_len_in];
    double best_score = *best_score_out;
    int best_len = best_len_in;
    int backward = distance_cache[0];
    size_t prev_ix = cur_ix - backward;
    bool match_found = false;
    if (prev_ix < cur_ix) {
      prev_ix &= ring_buffer_mask;
      if (compare_char == ring_buffer[prev_ix + best_len]) {
        int len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                           &ring_buffer[cur_ix_masked],
                                           max_length);
        if (len >= 4) {
          best_score = BackwardReferenceScoreUsingLastDistance(average_cost,
                                                               len, 0);
          best_len = len;
          *best_len_out = len;
          *best_len_code_out = len;
          *best_distance_out = backward;
          *best_score_out = best_score;
          compare_char = ring_buffer[cur_ix_masked + best_len];
          if (kBucketSweep == 1) {
            return true;
          } else {
            match_found = true;
          }
        }
      }
    }
    const uint32_t key = Hash<kBucketBits, 4>(&ring_buffer[cur_ix_masked]);
    if (kBucketSweep == 1) {
      // Only one to look for, don't bother to prepare for a loop.
      prev_ix = buckets_[key];
      backward = cur_ix - prev_ix;
      prev_ix &= ring_buffer_mask;
      if (compare_char != ring_buffer[prev_ix + best_len_in]) {
        return false;
      }
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        return false;
      }
      const int len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                               &ring_buffer[cur_ix_masked],
                                               max_length);
      if (len >= 4) {
        *best_len_out = len;
        *best_len_code_out = len;
        *best_distance_out = backward;
        *best_score_out = BackwardReferenceScore(average_cost, len, backward);
        return true;
      } else {
        return false;
      }
    } else {
      uint32_t *bucket = buckets_ + key;
      prev_ix = *bucket++;
      for (int i = 0; i < kBucketSweep; ++i, prev_ix = *bucket++) {
        const int backward = cur_ix - prev_ix;
        prev_ix &= ring_buffer_mask;
        if (compare_char != ring_buffer[prev_ix + best_len]) {
          continue;
        }
        if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
          continue;
        }
        const int len =
            FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                     &ring_buffer[cur_ix_masked],
                                     max_length);
        if (len >= 4) {
          const double score = BackwardReferenceScore(average_cost,
                                                      len, backward);
          if (best_score < score) {
            best_score = score;
            best_len = len;
            *best_len_out = best_len;
            *best_len_code_out = best_len;
            *best_distance_out = backward;
            *best_score_out = score;
            compare_char = ring_buffer[cur_ix_masked + best_len];
            match_found = true;
          }
        }
      }
      return match_found;
    }
  }

 private:
  static const uint32_t kBucketSize = 1 << kBucketBits;
  uint32_t buckets_[kBucketSize + kBucketSweep];
};

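Note: the fast hasher added here trades match quality for speed: one multiplicative hash of 4 bytes, and only kBucketSweep slots per key, with the slot picked from position bits so consecutive stores do not all evict each other. A standalone sketch of that store policy; the layout mirrors the class above, and the kHashMul32 value is an assumption taken from the surrounding file (it is not shown in this hunk):

#include <cstdint>
#include <cstring>

const int kBucketBits = 16;
const int kBucketSweep = 4;
uint32_t buckets[(1 << kBucketBits) + kBucketSweep];

// Multiplicative hash of the first 4 bytes, as in Hash<kBucketBits, 4>.
uint32_t Hash4(const uint8_t* data) {
  const uint32_t kHashMul32 = 0x1e35a7bd;  // assumed file-level constant
  uint32_t h;
  memcpy(&h, data, 4);  // unaligned little-endian load, as brotli does
  h *= kHashMul32;
  return h >> (32 - kBucketBits);
}

void Store(const uint8_t* data, int ix) {
  // Same "wiggle" as HashLongestMatchQuickly::Store: the slot within the
  // sweep depends on position bits, so nearby positions can coexist.
  uint32_t off = (static_cast<uint32_t>(ix) >> 3) % kBucketSweep;
  buckets[Hash4(data) + off] = ix;
}
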
// A (forgetful) hash table to the data seen by the compressor, to
|
||||
// help create backward references to previous data.
|
||||
//
|
||||
// This is a hash map of fixed size (kBucketSize) to a ring buffer of
|
||||
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
|
||||
// index positions of the given hash key in the compressed data.
template <int kBucketBits, int kBlockBits, int kMinLength>
template <int kBucketBits,
          int kBlockBits,
          int kMinLength,
          int kNumLastDistancesToCheck,
          bool kUseCostModel,
          bool kUseDictionary>
class HashLongestMatch {
 public:
  HashLongestMatch()
      : last_distance1_(4),
        last_distance2_(11),
        last_distance3_(15),
        last_distance4_(16),
        insert_length_(0),
        average_cost_(5.4),
        static_dict_(NULL) {
  HashLongestMatch() : static_dict_(NULL) {
    Reset();
  }
  void Reset() {
@ -166,72 +309,58 @@ class HashLongestMatch {
  // into best_distance_out.
  // Write the score of the best match into best_score_out.
  bool FindLongestMatch(const uint8_t * __restrict data,
                        const float * __restrict literal_cost,
                        const size_t ring_buffer_mask,
                        const float * __restrict literal_cost,
                        const size_t literal_cost_mask,
                        const double average_cost,
                        const int* __restrict distance_cache,
                        const uint32_t cur_ix,
                        uint32_t max_length,
                        const uint32_t max_backward,
                        size_t * __restrict best_len_out,
                        size_t * __restrict best_len_code_out,
                        size_t * __restrict best_distance_out,
                        double * __restrict best_score_out,
                        bool * __restrict in_dictionary) {
    *in_dictionary = true;
                        int * __restrict best_len_out,
                        int * __restrict best_len_code_out,
                        int * __restrict best_distance_out,
                        double * __restrict best_score_out) {
    *best_len_code_out = 0;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    const double start_cost4 = literal_cost == NULL ? 20 :
        literal_cost[cur_ix_masked] +
        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
        literal_cost[(cur_ix + 2) & ring_buffer_mask] +
        literal_cost[(cur_ix + 3) & ring_buffer_mask];
    const double start_cost3 = literal_cost == NULL ? 15 :
        literal_cost[cur_ix_masked] +
        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
        literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
    double start_cost2 = literal_cost == NULL ? 10 :
        literal_cost[cur_ix_masked] +
        literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
    double start_cost_diff4 = 0.0;
    double start_cost_diff3 = 0.0;
    double start_cost_diff2 = 0.0;
    if (kUseCostModel) {
      start_cost_diff4 = literal_cost == NULL ? 0 :
          literal_cost[cur_ix & literal_cost_mask] +
          literal_cost[(cur_ix + 1) & literal_cost_mask] +
          literal_cost[(cur_ix + 2) & literal_cost_mask] +
          literal_cost[(cur_ix + 3) & literal_cost_mask] -
          4 * average_cost;
      start_cost_diff3 = literal_cost == NULL ? 0 :
          literal_cost[cur_ix & literal_cost_mask] +
          literal_cost[(cur_ix + 1) & literal_cost_mask] +
          literal_cost[(cur_ix + 2) & literal_cost_mask] -
          3 * average_cost + 0.3;
      start_cost_diff2 = literal_cost == NULL ? 0 :
          literal_cost[cur_ix & literal_cost_mask] +
          literal_cost[(cur_ix + 1) & literal_cost_mask] -
          2 * average_cost + 1.2;
    }
    bool match_found = false;
    // Don't accept a short copy from far away.
    double best_score = 8.115;
    if (insert_length_ < 4) {
      double cost_diff[4] = { 0.10, 0.04, 0.02, 0.01 };
      best_score += cost_diff[insert_length_];
    }
    size_t best_len = *best_len_out;
    double best_score = *best_score_out;
    int best_len = *best_len_out;
    *best_len_out = 0;
    size_t best_ix = 1;
    // Try last distance first.
    for (int i = 0; i < 16; ++i) {
      size_t prev_ix = cur_ix;
      switch(i) {
        case 0: prev_ix -= last_distance1_; break;
        case 1: prev_ix -= last_distance2_; break;
        case 2: prev_ix -= last_distance3_; break;
        case 3: prev_ix -= last_distance4_; break;

        case 4: prev_ix -= last_distance1_ - 1; break;
        case 5: prev_ix -= last_distance1_ + 1; break;
        case 6: prev_ix -= last_distance1_ - 2; break;
        case 7: prev_ix -= last_distance1_ + 2; break;
        case 8: prev_ix -= last_distance1_ - 3; break;
        case 9: prev_ix -= last_distance1_ + 3; break;

        case 10: prev_ix -= last_distance2_ - 1; break;
        case 11: prev_ix -= last_distance2_ + 1; break;
        case 12: prev_ix -= last_distance2_ - 2; break;
        case 13: prev_ix -= last_distance2_ + 2; break;
        case 14: prev_ix -= last_distance2_ - 3; break;
        case 15: prev_ix -= last_distance2_ + 3; break;
      }
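In the new loop below, this switch is replaced by two small lookup tables.
Their definitions are outside this hunk, but reconstructing them from the
sixteen cases above, they would presumably contain:

// Assumed table contents, derived from the switch above:
static const int kDistanceCacheIndex[] = {
  0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
};
static const int kDistanceCacheOffset[] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3,
};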
    for (int i = 0; i < kNumLastDistancesToCheck; ++i) {
      const int idx = kDistanceCacheIndex[i];
      const int backward = distance_cache[idx] + kDistanceCacheOffset[i];
      size_t prev_ix = cur_ix - backward;
      if (prev_ix >= cur_ix) {
        continue;
      }
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward > max_backward)) {
        continue;
      }
      prev_ix &= ring_buffer_mask;

      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
@ -245,29 +374,30 @@ class HashLongestMatch {
      // Comparing for >= 2 does not change the semantics, but simply avoids
      // a few unnecessary binary logarithms in the backward reference score,
      // since we are not interested in such short matches.
      const double score = BackwardReferenceScoreUsingLastDistance(
          average_cost_,
          start_cost4,
          start_cost3,
          start_cost2,
          len, i);
      double score = BackwardReferenceScoreUsingLastDistance(
          average_cost, len, i);
      if (kUseCostModel) {
        switch (len) {
          case 2: score += start_cost_diff2; break;
          case 3: score += start_cost_diff3; break;
          default: score += start_cost_diff4;
        }
      }
      if (best_score < score) {
        best_score = score;
        best_len = len;
        best_ix = backward;
        *best_len_out = best_len;
        *best_len_code_out = best_len;
        *best_distance_out = best_ix;
        *best_distance_out = backward;
        *best_score_out = best_score;
        match_found = true;
        *in_dictionary = backward > max_backward;
      }
    }
  }
    if (kMinLength == 2) {
      int stop = int(cur_ix) - 64;
      if (stop < 0) { stop = 0; }
      start_cost2 -= 1.0;
      start_cost_diff2 -= 1.0;
      for (int i = cur_ix - 1; i > stop; --i) {
        size_t prev_ix = i;
        const size_t backward = cur_ix - prev_ix;
@ -280,15 +410,15 @@ class HashLongestMatch {
        continue;
      }
      int len = 2;
      const double score = start_cost2 - 2.3 * Log2Floor(backward);
      const double score =
          average_cost * 2 - 2.3 * Log2Floor(backward) + start_cost_diff2;

      if (best_score < score) {
        best_score = score;
        best_len = len;
        best_ix = backward;
        *best_len_out = best_len;
        *best_len_code_out = best_len;
        *best_distance_out = best_ix;
        *best_distance_out = backward;
        match_found = true;
      }
    }
@ -316,26 +446,24 @@ class HashLongestMatch {
        // Comparing for >= 3 does not change the semantics, but simply
        // avoids a few unnecessary binary logarithms in the backward
        // reference score, since we are not interested in such short matches.
        const double score = BackwardReferenceScore(average_cost_,
                                                    start_cost4,
                                                    start_cost3,
                                                    start_cost2,
                                                    len, backward);
        double score = BackwardReferenceScore(average_cost,
                                              len, backward);
        if (kUseCostModel) {
          score += (len >= 4) ? start_cost_diff4 : start_cost_diff3;
        }
        if (best_score < score) {
          best_score = score;
          best_len = len;
          best_ix = backward;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = best_ix;
          *best_distance_out = backward;
          *best_score_out = best_score;
          match_found = true;
          *in_dictionary = false;
        }
        }
      }
    }
    if (static_dict_ != NULL) {
    if (kUseDictionary && static_dict_ != NULL) {
      // We decide based on first 4 bytes how many bytes to test for.
      int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
      int maxlen = static_dict_->GetLength(prefix);
@ -346,21 +474,17 @@ class HashLongestMatch {
      int word_id;
      if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
        const size_t backward = max_backward + word_id + 1;
        const double score = BackwardReferenceScore(average_cost_,
                                                    start_cost4,
                                                    start_cost3,
                                                    start_cost2,
                                                    len, backward);
        const double score = (BackwardReferenceScore(average_cost,
                                                     len, backward) +
                              start_cost_diff4);
        if (best_score < score) {
          best_score = score;
          best_len = len;
          best_ix = backward;
          *best_len_out = best_len;
          *best_len_code_out = copy_len_code;
          *best_distance_out = best_ix;
          *best_distance_out = backward;
          *best_score_out = best_score;
          match_found = true;
          *in_dictionary = true;
        }
      }
    }
@ -368,21 +492,6 @@ class HashLongestMatch {
    return match_found;
  }

  void set_last_distance(int v) {
    if (last_distance1_ != v) {
      last_distance4_ = last_distance3_;
      last_distance3_ = last_distance2_;
      last_distance2_ = last_distance1_;
      last_distance1_ = v;
    }
  }

  int last_distance() const { return last_distance1_; }

  void set_insert_length(int v) { insert_length_ = v; }

  void set_average_cost(double v) { average_cost_ = v; }

 private:
  // Number of hash buckets.
  static const uint32_t kBucketSize = 1 << kBucketBits;
@ -400,46 +509,48 @@ class HashLongestMatch {
  // Buckets, each containing kBlockSize backward references.
  int buckets_[kBucketSize][kBlockSize];

  int last_distance1_;
  int last_distance2_;
  int last_distance3_;
  int last_distance4_;

  // Cost adjustment for how many literals we are planning to insert
  // anyway.
  int insert_length_;

  double average_cost_;

  const StaticDictionary *static_dict_;
};

struct Hashers {
  enum Type {
    HASH_15_8_4 = 0,
    HASH_15_8_2 = 1,
  };
  typedef HashLongestMatchQuickly<16, 1> H1;
  typedef HashLongestMatchQuickly<17, 4> H2;
  typedef HashLongestMatch<14, 4, 4, 4, false, false> H3;
  typedef HashLongestMatch<14, 5, 4, 4, false, false> H4;
  typedef HashLongestMatch<15, 6, 4, 10, false, false> H5;
  typedef HashLongestMatch<15, 7, 4, 10, false, false> H6;
  typedef HashLongestMatch<15, 8, 4, 16, true, false> H7;
  typedef HashLongestMatch<15, 8, 4, 16, true, true> H8;
  typedef HashLongestMatch<15, 8, 2, 16, true, false> H9;

  void Init(Type type) {
  void Init(int type) {
    switch (type) {
      case HASH_15_8_4:
        hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>());
        break;
      case HASH_15_8_2:
        hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>());
        break;
      default:
        break;
      case 1: hash_h1.reset(new H1); break;
      case 2: hash_h2.reset(new H2); break;
      case 3: hash_h3.reset(new H3); break;
      case 4: hash_h4.reset(new H4); break;
      case 5: hash_h5.reset(new H5); break;
      case 6: hash_h6.reset(new H6); break;
      case 7: hash_h7.reset(new H7); break;
      case 8: hash_h8.reset(new H8); break;
      case 9: hash_h9.reset(new H9); break;
      default: break;
    }
  }

  void SetStaticDictionary(const StaticDictionary *dict) {
    if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict);
    if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict);
    if (hash_h8.get() != NULL) hash_h8->SetStaticDictionary(dict);
  }

  std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4;
  std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2;
  std::unique_ptr<H1> hash_h1;
  std::unique_ptr<H2> hash_h2;
  std::unique_ptr<H3> hash_h3;
  std::unique_ptr<H4> hash_h4;
  std::unique_ptr<H5> hash_h5;
  std::unique_ptr<H6> hash_h6;
  std::unique_ptr<H7> hash_h7;
  std::unique_ptr<H8> hash_h8;
  std::unique_ptr<H9> hash_h9;
};
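A hypothetical call site for the new numeric Init() selection; the function
name and the chosen type value are illustrative only, not part of this diff:

// Pick a hasher type, then seed it with the shared static dictionary.
void SetupHashers(Hashers* hashers, const StaticDictionary* dict) {
  int hash_type = 5;                   // assumed choice: allocates hash_h5
  hashers->Init(hash_type);
  hashers->SetStaticDictionary(dict);  // only takes effect for types that
                                       // use the dictionary (H8 above)
}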

} // namespace brotli

enc/histogram.cc
@ -45,8 +45,8 @@ void BuildHistograms(
    const Command &cmd = cmds[i];
    insert_and_copy_it.Next();
    (*insert_and_copy_histograms)[insert_and_copy_it.type_].Add(
        cmd.command_prefix_);
    for (int j = 0; j < cmd.insert_length_; ++j) {
        cmd.cmd_prefix_);
    for (int j = 0; j < cmd.insert_len_; ++j) {
      literal_it.Next();
      uint8_t prev_byte = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
      uint8_t prev_byte2 = pos > 1 ? ringbuffer[(pos - 2) & mask] : 0;
@ -55,12 +55,12 @@ void BuildHistograms(
      (*literal_histograms)[context].Add(ringbuffer[pos & mask]);
      ++pos;
    }
    pos += cmd.copy_length_;
    if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
    pos += cmd.copy_len_;
    if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
      dist_it.Next();
      int context = (dist_it.type_ << kDistanceContextBits) +
          ((cmd.copy_length_code_ > 4) ? 3 : cmd.copy_length_code_ - 2);
      (*copy_dist_histograms)[context].Add(cmd.distance_prefix_);
          cmd.DistanceContext();
      (*copy_dist_histograms)[context].Add(cmd.dist_prefix_);
    }
  }
}
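The new cmd.DistanceContext() accessor folds the inlined copy-length test
into the Command class. Its definition is not part of this diff; judging
from the expression it replaces, and from the cmd_prefix_/copy_len_ renames
above, it is presumably equivalent to:

// Assumed shape, reconstructed from the old inline expression above
// (the real definition lives with the Command class, outside this diff):
int DistanceContext() const {
  return (copy_len_code_ > 4) ? 3 : copy_len_code_ - 2;
}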
@ -77,7 +77,7 @@ void BuildLiteralHistogramsForBlockType(
  BlockSplitIterator literal_it(literal_split);
  for (int i = 0; i < cmds.size(); ++i) {
    const Command &cmd = cmds[i];
    for (int j = 0; j < cmd.insert_length_; ++j) {
    for (int j = 0; j < cmd.insert_len_; ++j) {
      literal_it.Next();
      if (literal_it.type_ == block_type) {
        uint8_t prev_byte = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
@ -87,7 +87,7 @@ void BuildLiteralHistogramsForBlockType(
      }
      ++pos;
    }
    pos += cmd.copy_length_;
    pos += cmd.copy_len_;
  }
}

63
enc/prefix.h
@ -19,6 +19,7 @@
#define BROTLI_ENC_PREFIX_H_

#include <stdint.h>
#include "./fast_log.h"

namespace brotli {

@ -29,22 +30,56 @@ static const int kNumBlockLenPrefixes = 26;
static const int kNumDistanceShortCodes = 16;
static const int kNumDistancePrefixes = 520;

int CommandPrefix(int insert_length, int copy_length);
int InsertLengthExtraBits(int prefix);
int InsertLengthOffset(int prefix);
int CopyLengthExtraBits(int prefix);
int CopyLengthOffset(int prefix);
// Represents the range of values belonging to a prefix code:
// [offset, offset + 2^nbits)
struct PrefixCodeRange {
  int offset;
  int nbits;
};

void PrefixEncodeCopyDistance(int distance_code,
                              int num_direct_codes,
                              int shift_bits,
                              uint16_t* prefix,
                              int* nbits,
                              uint32_t* extra_bits);
static const PrefixCodeRange kBlockLengthPrefixCode[kNumBlockLenPrefixes] = {
  { 1, 2}, { 5, 2}, { 9, 2}, { 13, 2},
  { 17, 3}, { 25, 3}, { 33, 3}, { 41, 3},
  { 49, 4}, { 65, 4}, { 81, 4}, { 97, 4},
  { 113, 5}, { 145, 5}, { 177, 5}, { 209, 5},
  { 241, 6}, { 305, 6}, { 369, 7}, { 497, 8},
  { 753, 9}, { 1265, 10}, {2289, 11}, {4337, 12},
  {8433, 13}, {16625, 24}
};

int BlockLengthPrefix(int length);
int BlockLengthExtraBits(int prefix);
int BlockLengthOffset(int prefix);
inline void GetBlockLengthPrefixCode(int len,
                                     int* code, int* n_extra, int* extra) {
  *code = 0;
  while (*code < 25 && len >= kBlockLengthPrefixCode[*code + 1].offset) {
    ++(*code);
  }
  *n_extra = kBlockLengthPrefixCode[*code].nbits;
  *extra = len - kBlockLengthPrefixCode[*code].offset;
}
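As a quick sanity check of the lookup, encoding a hypothetical block length
of 70 gives:

// Worked example: len = 70 falls in the range [65, 81) of entry 9.
int code, n_extra, extra;
GetBlockLengthPrefixCode(70, &code, &n_extra, &extra);
// Now: code == 9, n_extra == 4 (from {65, 4}), extra == 70 - 65 == 5.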

inline void PrefixEncodeCopyDistance(int distance_code,
                                     int num_direct_codes,
                                     int postfix_bits,
                                     uint16_t* code,
                                     uint32_t* extra_bits) {
  distance_code -= 1;
  if (distance_code < kNumDistanceShortCodes + num_direct_codes) {
    *code = distance_code;
    *extra_bits = 0;
    return;
  }
  distance_code -= kNumDistanceShortCodes + num_direct_codes;
  distance_code += (1 << (postfix_bits + 2));
  int bucket = Log2Floor(distance_code) - 1;
  int postfix_mask = (1 << postfix_bits) - 1;
  int postfix = distance_code & postfix_mask;
  int prefix = (distance_code >> bucket) & 1;
  int offset = (2 + prefix) << bucket;
  int nbits = bucket - postfix_bits;
  *code = kNumDistanceShortCodes + num_direct_codes +
      ((2 * (nbits - 1) + prefix) << postfix_bits) + postfix;
  *extra_bits = (nbits << 24) | ((distance_code - offset) >> postfix_bits);
}
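A hypothetical trace makes the packing visible: the extra-bit count lands in
the top byte of *extra_bits and the extra-bit value in the low bits:

// Hypothetical inputs: no direct codes, no postfix bits configured.
uint16_t code;
uint32_t extra;
PrefixEncodeCopyDistance(17, 0, 0, &code, &extra);
// Trace: 17 - 1 = 16 is not a short code (16 is not < 16); then
// 16 - 16 = 0, plus (1 << 2) gives 4; bucket = Log2Floor(4) - 1 = 1,
// postfix = 0, prefix = 0, offset = 4, nbits = 1. So code == 16 and
// extra == (1 << 24) | 0 == 0x01000000: one extra bit of value 0.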

} // namespace brotli

enc/ringbuffer.h
@ -17,6 +17,9 @@
#ifndef BROTLI_ENC_RINGBUFFER_H_
#define BROTLI_ENC_RINGBUFFER_H_

#include <stddef.h>
#include <stdint.h>

// A RingBuffer(window_bits, tail_bits) contains `1 << window_bits' bytes of
// data in a circular manner: writing a byte writes it to
// `position() % (1 << window_bits)'. For convenience, the RingBuffer array
@ -26,10 +29,10 @@ class RingBuffer {
 public:
  RingBuffer(int window_bits, int tail_bits)
      : window_bits_(window_bits), tail_bits_(tail_bits), pos_(0) {
    static const int kSlackForThreeByteHashingEverywhere = 2;
    static const int kSlackForFourByteHashingEverywhere = 3;
    const int buflen = (1 << window_bits_) + (1 << tail_bits_);
    buffer_ = new uint8_t[buflen + kSlackForThreeByteHashingEverywhere];
    for (int i = 0; i < kSlackForThreeByteHashingEverywhere; ++i) {
    buffer_ = new uint8_t[buflen + kSlackForFourByteHashingEverywhere];
    for (int i = 0; i < kSlackForFourByteHashingEverywhere; ++i) {
      buffer_[buflen + i] = 0;
    }
  }
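A minimal sketch of the circular write described in the comment above
(simplified: the real class also keeps the extra tail and slack bytes for
hashing convenience, which this sketch omits):

#include <stddef.h>
#include <stdint.h>

// Window size is a power of two, so "% (1 << window_bits)" reduces to a
// cheap bit-mask on the write position.
struct RingWriteSketch {
  uint8_t* buffer_;
  size_t pos_;
  size_t mask_;  // (1 << window_bits) - 1

  void Write(const uint8_t* bytes, size_t n) {
    for (size_t i = 0; i < n; ++i) {
      buffer_[(pos_ + i) & mask_] = bytes[i];  // wraps around transparently
    }
    pos_ += n;
  }
};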

enc/write_bits.h
@ -54,6 +54,7 @@ inline void WriteBits(int n_bits,
#ifdef BIT_WRITER_DEBUG
  printf("WriteBits %2d 0x%016llx %10d\n", n_bits, bits, *pos);
#endif
  assert(bits < 1ULL << n_bits);
#ifdef IS_LITTLE_ENDIAN
  // This branch of the code can write up to 56 bits at a time,
  // 7 bits are lost by being perhaps already in *p and at least
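The hunk is truncated here, but the little-endian fast path it describes can
be sketched as follows; this is illustrative, not the verbatim upstream
code, and the unaligned 64-bit store is emulated with memcpy:

#include <stdint.h>
#include <string.h>

// OR the new bits, pre-shifted by the bit offset within the current byte,
// into one unaligned 64-bit little-endian word. With up to 7 bits already
// occupied in *p, at most 56 new bits fit safely per call.
inline void WriteBitsSketch(int n_bits, uint64_t bits,
                            int* pos, uint8_t* array) {
  uint8_t* p = &array[*pos >> 3];  // byte holding the next free bit
  uint64_t v = *p;                 // keep the bits already written there
  v |= bits << (*pos & 7);
  memcpy(p, &v, sizeof(v));        // unaligned store of the merged word
  *pos += n_bits;
}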