From 0454ab4ec0cfe0dee125269a5ee065f7774d6a9d Mon Sep 17 00:00:00 2001
From: Zoltan Szabadka <szabadka@google.com>
Date: Fri, 14 Feb 2014 15:04:23 +0100
Subject: [PATCH] Updates to Brotli compression format, decoder and encoder

This commit contains a batch of changes that were made to the Brotli
compression algorithm in the last month. Most important changes:

   * Fixes to the spec.
   * Change of code length code order.
   * Use a 2-level Huffman lookup table in the decoder.
   * Faster uncompressed meta-block decoding.
   * Optimized encoding of the Huffman code.
   * Detection of UTF-8 input encoding.
   * UTF-8 based literal cost modeling for improved
     backward reference selection.
---
 brotlispec.txt             |  25 +-
 dec/bit_reader.c           |   4 +-
 dec/bit_reader.h           |  31 ++-
 dec/decode.c               | 538 ++++++++++++++++++++-----------------
 dec/decode.h               |   2 +
 dec/huffman.c              | 309 ++++++++-------------
 dec/huffman.h              |  49 +---
 enc/backward_references.cc | 152 +++++++----
 enc/bit_cost.h             |   2 +-
 enc/block_splitter.cc      |   6 +-
 enc/encode.cc              | 189 +++++++++----
 enc/entropy_encode.cc      | 119 +++++++-
 enc/entropy_encode.h       |   2 +-
 enc/hash.h                 |  13 +-
 enc/literal_cost.cc        |  99 +++++++
 enc/literal_cost.h         |   7 +-
 16 files changed, 912 insertions(+), 635 deletions(-)

diff --git a/brotlispec.txt b/brotlispec.txt
index 23de497..087c939 100644
--- a/brotlispec.txt
+++ b/brotlispec.txt
@@ -498,11 +498,11 @@ Abstract
                Symbol   Code
                ------   ----
                0          00
-               1        1010
-               2         100
-               3          11
-               4          01
-               5        1011
+               1        1110
+               2         110
+               3          01
+               4          10
+               5        1111
 
       We can now define the format of the complex Huffman code as
       follows:
@@ -513,7 +513,7 @@ Abstract
 
 
             Code lengths for symbols in the code length alphabet given
-               just above, in the order: 1, 2, 3, 4, 0, 17, 5, 6, 16, 7,
+               just above, in the order: 1, 2, 3, 4, 0, 5, 17, 6, 16, 7,
                8, 9, 10, 11, 12, 13, 14, 15
 
                The code lengths of code length symbols are between 0 and
@@ -572,7 +572,7 @@ Abstract
          6: last distance - 2
          7: last distance + 2
          8: last distance - 3
-         9: last disatnce + 3
+         9: last distance + 3
         10: second last distance - 1
         11: second last distance + 1
         12: second last distance - 2
@@ -647,7 +647,7 @@ Abstract
       ---- ---- ------    ---- ---- -------   ---- ---- -------
        0    0     0        8    2    10-13    16    6   130-193
        1    0     1        9    2    14-17    17    7   194-321
-       2    0     2       10    3    18-25    18    8   322-527
+       2    0     2       10    3    18-25    18    8   322-577
        3    0     3       11    3    26-33    19    9   578-1089
        4    0     4       12    4    34-49    20   10   1090-2113
        5    0     5       13    4    50-65    21   12   2114-6209
@@ -681,7 +681,7 @@ Abstract
                  |         |         |
                  +---------+---------+---------+
                  |         |         |         |
-            0-7  | 128-191 | 192-255 | 383-447 |
+            0-7  | 128-191 | 192-255 | 384-447 |
                  |         |         |         |
                  +---------+---------+---------+
                  |         |         |         |
@@ -689,7 +689,7 @@ Abstract
                  |         |         |         |
                  +---------+---------+---------+
                  |         |         |         |
-           16-23 | 448-551 | 576-639 | 640-703 |
+           16-23 | 448-511 | 576-639 | 640-703 |
                  |         |         |         |
                  +---------+---------+---------+
 
@@ -1008,9 +1008,10 @@ Abstract
             1 bit:  ISEMPTY, set to 1 if the meta-block is empty, this
                     field is only present if ISLAST bit is set, since
                     only the last meta-block can be empty
-            2 bits: MNIBBLES, (# of nibbles to represent the length) - 4
+            2 bits: MNIBBLES - 4, where MNIBBLES is # of nibbles to represent
+                    the length
 
-            (MNIBBLES + 4) x 4 bits: MLEN - 1, where MLEN is the length
+            MNIBBLES x 4 bits: MLEN - 1, where MLEN is the length
                of the meta-block in the input data in bytes
 
             1 bit:  ISUNCOMPRESSED, if set to 1, any bits of input up to
diff --git a/dec/bit_reader.c b/dec/bit_reader.c
index 25e33e3..981dccf 100644
--- a/dec/bit_reader.c
+++ b/dec/bit_reader.c
@@ -33,7 +33,7 @@ int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input) {
   br->val_ = 0;
   br->pos_ = 0;
   br->bit_pos_ = 0;
-  br->bits_left_ = 64;
+  br->bit_end_pos_ = 0;
   br->eos_ = 0;
   if (!BrotliReadMoreInput(br)) {
     return 0;
@@ -42,7 +42,7 @@ int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input) {
     br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
     ++br->pos_;
   }
-  return (br->bits_left_ > 64);
+  return (br->bit_end_pos_ > 0);
 }
 
 #if defined(__cplusplus) || defined(c_plusplus)
diff --git a/dec/bit_reader.h b/dec/bit_reader.h
index 551cc14..3221cdd 100644
--- a/dec/bit_reader.h
+++ b/dec/bit_reader.h
@@ -31,7 +31,7 @@ extern "C" {
 #define BROTLI_IBUF_SIZE          (2 * BROTLI_READ_SIZE + 32)
 #define BROTLI_IBUF_MASK          (2 * BROTLI_READ_SIZE - 1)
 
-#define UNALIGNED_COPY64(dst, src) *(uint64_t*)(dst) = *(const uint64_t*)(src)
+#define UNALIGNED_COPY64(dst, src) memcpy(dst, src, 8)
 
 static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
   0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
@@ -42,13 +42,13 @@ typedef struct {
   /* Input byte buffer, consist of a ringbuffer and a "slack" region where */
   /* bytes from the start of the ringbuffer are copied. */
   uint8_t buf_[BROTLI_IBUF_SIZE];
-  uint8_t*    buf_ptr_;    /* next input will write here */
-  BrotliInput input_;      /* input callback */
-  uint64_t    val_;        /* pre-fetched bits */
-  uint32_t    pos_;        /* byte position in stream */
-  uint32_t    bit_pos_;    /* current bit-reading position in val_ */
-  uint32_t    bits_left_;  /* how many valid bits left */
-  int         eos_;        /* input stream is finished */
+  uint8_t*    buf_ptr_;      /* next input will write here */
+  BrotliInput input_;        /* input callback */
+  uint64_t    val_;          /* pre-fetched bits */
+  uint32_t    pos_;          /* byte position in stream */
+  uint32_t    bit_pos_;      /* current bit-reading position in val_ */
+  uint32_t    bit_end_pos_;  /* bit-reading end position from LSB of val_ */
+  int         eos_;          /* input stream is finished */
 } BrotliBitReader;
 
 int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input);
@@ -65,7 +65,7 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br,
 #ifdef BROTLI_DECODE_DEBUG
   uint32_t n_bits = val - br->bit_pos_;
   const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
-  printf("[BrotliReadBits]  %010ld %2d  val: %6x\n",
+  printf("[BrotliReadBits]  %010d %2d  val: %6x\n",
          (br->pos_ << 3) + br->bit_pos_ - 64, n_bits, bval);
 #endif
   br->bit_pos_ = val;
@@ -78,7 +78,7 @@ static BROTLI_INLINE void ShiftBytes(BrotliBitReader* const br) {
     br->val_ |= ((uint64_t)br->buf_[br->pos_ & BROTLI_IBUF_MASK]) << 56;
     ++br->pos_;
     br->bit_pos_ -= 8;
-    br->bits_left_ -= 8;
+    br->bit_end_pos_ -= 8;
   }
 }
 
@@ -95,10 +95,10 @@ static BROTLI_INLINE void ShiftBytes(BrotliBitReader* const br) {
    every 32 bytes of input is read.
 */
 static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
-  if (br->bits_left_ > 320) {
+  if (br->bit_end_pos_ > 256) {
     return 1;
   } else if (br->eos_) {
-    return br->bit_pos_ <= br->bits_left_;
+    return br->bit_pos_ <= br->bit_end_pos_;
   } else {
     uint8_t* dst = br->buf_ptr_;
     int bytes_read = BrotliRead(br->input_, dst, BROTLI_READ_SIZE);
@@ -131,7 +131,7 @@ static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
     } else {
       br->buf_ptr_ = br->buf_;
     }
-    br->bits_left_ += ((uint32_t)bytes_read << 3);
+    br->bit_end_pos_ += ((uint32_t)bytes_read << 3);
     return 1;
   }
 }
@@ -147,7 +147,7 @@ static BROTLI_INLINE void BrotliFillBitWindow(BrotliBitReader* const br) {
         br->buf_ + (br->pos_ & BROTLI_IBUF_MASK)) << 24;
     br->pos_ += 5;
     br->bit_pos_ -= 40;
-    br->bits_left_ -= 40;
+    br->bit_end_pos_ -= 40;
 #else
     ShiftBytes(br);
 #endif
@@ -155,14 +155,13 @@ static BROTLI_INLINE void BrotliFillBitWindow(BrotliBitReader* const br) {
 }
 
 /* Reads the specified number of bits from Read Buffer. */
-/* Requires that n_bits is positive. */
 static BROTLI_INLINE uint32_t BrotliReadBits(
     BrotliBitReader* const br, int n_bits) {
   uint32_t val;
   BrotliFillBitWindow(br);
   val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
 #ifdef BROTLI_DECODE_DEBUG
-  printf("[BrotliReadBits]  %010ld %2d  val: %6x\n",
+  printf("[BrotliReadBits]  %010d %2d  val: %6x\n",
          (br->pos_ << 3) + br->bit_pos_ - 64, n_bits, val);
 #endif
   br->bit_pos_ += (uint32_t)n_bits;
diff --git a/dec/decode.c b/dec/decode.c
index a8e41ab..070abf1 100644
--- a/dec/decode.c
+++ b/dec/decode.c
@@ -46,9 +46,14 @@ static const int kNumBlockLengthCodes = 26;
 static const int kLiteralContextBits = 6;
 static const int kDistanceContextBits = 2;
 
+#define HUFFMAN_TABLE_BITS      8
+#define HUFFMAN_TABLE_MASK      0xff
+/* This is a rough estimate, not an exact bound. */
+#define HUFFMAN_MAX_TABLE_SIZE  2048
+
 #define CODE_LENGTH_CODES 18
 static const uint8_t kCodeLengthCodeOrder[CODE_LENGTH_CODES] = {
-  1, 2, 3, 4, 0, 17, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 };
 
 #define NUM_DISTANCE_SHORT_CODES 16
@@ -104,36 +109,19 @@ static void DecodeMetaBlockLength(BrotliBitReader* br,
 }
 
 /* Decodes the next Huffman code from bit-stream. */
-static BROTLI_INLINE int ReadSymbol(const HuffmanTree* tree,
+static BROTLI_INLINE int ReadSymbol(const HuffmanCode* table,
                                     BrotliBitReader* br) {
-  uint32_t bits;
-  uint32_t bitpos;
-  int lut_ix;
-  uint8_t lut_bits;
-  const HuffmanTreeNode* node = tree->root_;
+  int nbits;
   BrotliFillBitWindow(br);
-  bits = BrotliPrefetchBits(br);
-  bitpos = br->bit_pos_;
-  /* Check if we find the bit combination from the Huffman lookup table. */
-  lut_ix = bits & (HUFF_LUT - 1);
-  lut_bits = tree->lut_bits_[lut_ix];
-  if (lut_bits <= HUFF_LUT_BITS) {
-    BrotliSetBitPos(br, bitpos + lut_bits);
-    return tree->lut_symbol_[lut_ix];
+  table += (int)(br->val_ >> br->bit_pos_) & HUFFMAN_TABLE_MASK;
+  nbits = table->bits - HUFFMAN_TABLE_BITS;
+  if (nbits > 0) {
+    br->bit_pos_ += HUFFMAN_TABLE_BITS;
+    table += table->value;
+    table += (int)(br->val_ >> br->bit_pos_) & ((1 << nbits) - 1);
   }
-  node += tree->lut_jump_[lut_ix];
-  bitpos += HUFF_LUT_BITS;
-  bits >>= HUFF_LUT_BITS;
-
-  /* Decode the value from a binary tree. */
-  assert(node != NULL);
-  do {
-    node = HuffmanTreeNextNode(node, bits & 1);
-    bits >>= 1;
-    ++bitpos;
-  } while (HuffmanTreeNodeIsNotLeaf(node));
-  BrotliSetBitPos(br, bitpos);
-  return node->symbol_;
+  br->bit_pos_ += table->bits;
+  return table->value;
 }
 
 static void PrintUcharVector(const uint8_t* v, int len) {
@@ -145,47 +133,34 @@ static int ReadHuffmanCodeLengths(
     const uint8_t* code_length_code_lengths,
     int num_symbols, uint8_t* code_lengths,
     BrotliBitReader* br) {
-  int ok = 0;
-  int symbol;
+  int symbol = 0;
   uint8_t prev_code_len = kDefaultCodeLength;
   int repeat = 0;
-  uint8_t repeat_length = 0;
+  uint8_t repeat_code_len = 0;
   int space = 32768;
-  HuffmanTree tree;
+  HuffmanCode table[32];
 
-  if (!BrotliHuffmanTreeBuildImplicit(&tree, code_length_code_lengths,
-                                      CODE_LENGTH_CODES)) {
+  if (!BrotliBuildHuffmanTable(table, 5,
+                               code_length_code_lengths,
+                               CODE_LENGTH_CODES)) {
     printf("[ReadHuffmanCodeLengths] Building code length tree failed: ");
     PrintUcharVector(code_length_code_lengths, CODE_LENGTH_CODES);
     return 0;
   }
 
-  if (!BrotliReadMoreInput(br)) {
-    printf("[ReadHuffmanCodeLengths] Unexpected end of input.\n");
-    return 0;
-  }
-
-  symbol = 0;
-  while (symbol + repeat < num_symbols && space > 0) {
+  while (symbol < num_symbols && space > 0) {
+    const HuffmanCode* p = table;
     uint8_t code_len;
     if (!BrotliReadMoreInput(br)) {
       printf("[ReadHuffmanCodeLengths] Unexpected end of input.\n");
-      goto End;
-    }
-    code_len = (uint8_t)ReadSymbol(&tree, br);
-    BROTLI_LOG_UINT(symbol);
-    BROTLI_LOG_UINT(repeat);
-    BROTLI_LOG_UINT(repeat_length);
-    BROTLI_LOG_UINT(code_len);
-    if ((code_len < kCodeLengthRepeatCode) ||
-        (code_len == kCodeLengthRepeatCode && repeat_length == 0) ||
-        (code_len > kCodeLengthRepeatCode && repeat_length > 0)) {
-      while (repeat > 0) {
-        code_lengths[symbol++] = repeat_length;
-        --repeat;
-      }
+      return 0;
     }
+    BrotliFillBitWindow(br);
+    p += (br->val_ >> br->bit_pos_) & 31;
+    br->bit_pos_ += p->bits;
+    code_len = (uint8_t)p->value;
     if (code_len < kCodeLengthRepeatCode) {
+      repeat = 0;
       code_lengths[symbol++] = code_len;
       if (code_len != 0) {
         prev_code_len = code_len;
@@ -193,47 +168,46 @@ static int ReadHuffmanCodeLengths(
       }
     } else {
       const int extra_bits = code_len - 14;
-      int i = repeat;
+      int old_repeat;
+      int repeat_delta;
+      uint8_t new_len = 0;
+      if (code_len == kCodeLengthRepeatCode) {
+        new_len =  prev_code_len;
+      }
+      if (repeat_code_len != new_len) {
+        repeat = 0;
+        repeat_code_len = new_len;
+      }
+      old_repeat = repeat;
       if (repeat > 0) {
         repeat -= 2;
         repeat <<= extra_bits;
       }
       repeat += (int)BrotliReadBits(br, extra_bits) + 3;
-      if (repeat + symbol > num_symbols) {
-        goto End;
+      repeat_delta = repeat - old_repeat;
+      if (symbol + repeat_delta > num_symbols) {
+        return 0;
       }
-      if (code_len == kCodeLengthRepeatCode) {
-        repeat_length = prev_code_len;
-        for (; i < repeat; ++i) {
-          space -= 32768 >> repeat_length;
-        }
-      } else {
-        repeat_length = 0;
+      memset(&code_lengths[symbol], repeat_code_len, (size_t)repeat_delta);
+      symbol += repeat_delta;
+      if (repeat_code_len != 0) {
+        space -= repeat_delta << (15 - repeat_code_len);
       }
     }
   }
   if (space != 0) {
     printf("[ReadHuffmanCodeLengths] space = %d\n", space);
-    goto End;
+    return 0;
   }
-  if (symbol + repeat > num_symbols) {
-    printf("[ReadHuffmanCodeLengths] symbol + repeat > num_symbols "
-           "(%d + %d vs %d)\n", symbol, repeat, num_symbols);
-    goto End;
-  }
-  while (repeat-- > 0) code_lengths[symbol++] = repeat_length;
-  while (symbol < num_symbols) code_lengths[symbol++] = 0;
-  ok = 1;
-
- End:
-  BrotliHuffmanTreeRelease(&tree);
-  return ok;
+  memset(&code_lengths[symbol], 0, (size_t)(num_symbols - symbol));
+  return 1;
 }
 
 static int ReadHuffmanCode(int alphabet_size,
-                           HuffmanTree* tree,
+                           HuffmanCode* table,
                            BrotliBitReader* br) {
   int ok = 1;
+  int table_size = 0;
   int simple_code_or_skip;
   uint8_t* code_lengths = NULL;
 
@@ -290,109 +264,49 @@ static int ReadHuffmanCode(int alphabet_size,
     int i;
     uint8_t code_length_code_lengths[CODE_LENGTH_CODES] = { 0 };
     int space = 32;
-    for (i = simple_code_or_skip;
-         i < CODE_LENGTH_CODES && space > 0; ++i) {
-      int code_len_idx = kCodeLengthCodeOrder[i];
-      uint8_t v = (uint8_t)BrotliReadBits(br, 2);
-      if (v == 1) {
-        v = (uint8_t)BrotliReadBits(br, 1);
-        if (v == 0) {
-          v = 2;
-        } else {
-          v = (uint8_t)BrotliReadBits(br, 1);
-          if (v == 0) {
-            v = 1;
-          } else {
-            v = 5;
-          }
-        }
-      } else if (v == 2) {
-        v = 4;
-      }
+    /* Static Huffman code for the code length code lengths */
+    static const HuffmanCode huff[16] = {
+      {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1},
+      {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 5},
+    };
+    for (i = simple_code_or_skip; i < CODE_LENGTH_CODES && space > 0; ++i) {
+      const int code_len_idx = kCodeLengthCodeOrder[i];
+      const HuffmanCode* p = huff;
+      uint8_t v;
+      BrotliFillBitWindow(br);
+      p += (br->val_ >> br->bit_pos_) & 15;
+      br->bit_pos_ += p->bits;
+      v = (uint8_t)p->value;
       code_length_code_lengths[code_len_idx] = v;
       BROTLI_LOG_ARRAY_INDEX(code_length_code_lengths, code_len_idx);
       if (v != 0) {
         space -= (32 >> v);
       }
     }
-    ok = ReadHuffmanCodeLengths(code_length_code_lengths, alphabet_size,
-                                code_lengths, br);
+    ok = ReadHuffmanCodeLengths(code_length_code_lengths,
+                                alphabet_size, code_lengths, br);
   }
   if (ok) {
-    ok = BrotliHuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size);
-    if (!ok) {
-      printf("[ReadHuffmanCode] HuffmanTreeBuildImplicit failed: ");
+    table_size = BrotliBuildHuffmanTable(table, HUFFMAN_TABLE_BITS,
+                                         code_lengths, alphabet_size);
+    if (table_size == 0) {
+      printf("[ReadHuffmanCode] BuildHuffmanTable failed: ");
       PrintUcharVector(code_lengths, alphabet_size);
     }
   }
   free(code_lengths);
-  return ok;
+  return table_size;
 }
 
-static int ReadCopyDistance(const HuffmanTree* tree,
-                            int num_direct_codes,
-                            int postfix_bits,
-                            int postfix_mask,
-                            BrotliBitReader* br) {
+static BROTLI_INLINE int ReadBlockLength(const HuffmanCode* table,
+                                         BrotliBitReader* br) {
   int code;
   int nbits;
-  int postfix;
-  int offset;
-  code = ReadSymbol(tree, br);
-  if (code < num_direct_codes) {
-    return code;
-  }
-  code -= num_direct_codes;
-  postfix = code & postfix_mask;
-  code >>= postfix_bits;
-  nbits = (code >> 1) + 1;
-  offset = ((2 + (code & 1)) << nbits) - 4;
-  return (num_direct_codes +
-          ((offset + (int)BrotliReadBits(br, nbits)) << postfix_bits) +
-          postfix);
-}
-
-static int ReadBlockLength(const HuffmanTree* tree, BrotliBitReader* br) {
-  int code;
-  int nbits;
-  code = ReadSymbol(tree, br);
+  code = ReadSymbol(table, br);
   nbits = kBlockLengthPrefixCode[code].nbits;
   return kBlockLengthPrefixCode[code].offset + (int)BrotliReadBits(br, nbits);
 }
 
-static void ReadInsertAndCopy(const HuffmanTree* tree,
-                              int* insert_len,
-                              int* copy_len,
-                              int* copy_dist,
-                              BrotliBitReader* br) {
-  int code;
-  int range_idx;
-  int insert_code;
-  int insert_extra_bits;
-  int copy_code;
-  int copy_extra_bits;
-  code = ReadSymbol(tree, br);
-  range_idx = code >> 6;
-  if (range_idx >= 2) {
-    range_idx -= 2;
-    *copy_dist = -1;
-  } else {
-    *copy_dist = 0;
-  }
-  insert_code = kInsertRangeLut[range_idx] + ((code >> 3) & 7);
-  copy_code = kCopyRangeLut[range_idx] + (code & 7);
-  *insert_len = kInsertLengthPrefixCode[insert_code].offset;
-  insert_extra_bits = kInsertLengthPrefixCode[insert_code].nbits;
-  if (insert_extra_bits > 0) {
-    *insert_len += (int)BrotliReadBits(br, insert_extra_bits);
-  }
-  *copy_len = kCopyLengthPrefixCode[copy_code].offset;
-  copy_extra_bits = kCopyLengthPrefixCode[copy_code].nbits;
-  if (copy_extra_bits > 0) {
-    *copy_len += (int)BrotliReadBits(br, copy_extra_bits);
-  }
-}
-
 static int TranslateShortCodes(int code, int* ringbuffer, int index) {
   int val;
   if (code < NUM_DISTANCE_SHORT_CODES) {
@@ -429,24 +343,22 @@ static void InverseMoveToFrontTransform(uint8_t* v, int v_len) {
 typedef struct {
   int alphabet_size;
   int num_htrees;
-  HuffmanTree* htrees;
+  HuffmanCode* codes;
+  HuffmanCode** htrees;
 } HuffmanTreeGroup;
 
 static void HuffmanTreeGroupInit(HuffmanTreeGroup* group, int alphabet_size,
                                  int ntrees) {
-  int i;
   group->alphabet_size = alphabet_size;
   group->num_htrees = ntrees;
-  group->htrees = (HuffmanTree*)malloc(sizeof(HuffmanTree) * (size_t)ntrees);
-  for (i = 0; i < ntrees; ++i) {
-    group->htrees[i].root_ = NULL;
-  }
+  group->codes = (HuffmanCode*)malloc(
+      sizeof(HuffmanCode) * (size_t)(ntrees * HUFFMAN_MAX_TABLE_SIZE));
+  group->htrees = (HuffmanCode**)malloc(sizeof(HuffmanCode*) * (size_t)ntrees);
 }
 
 static void HuffmanTreeGroupRelease(HuffmanTreeGroup* group) {
-  int i;
-  for (i = 0; i < group->num_htrees; ++i) {
-    BrotliHuffmanTreeRelease(&group->htrees[i]);
+  if (group->codes) {
+    free(group->codes);
   }
   if (group->htrees) {
     free(group->htrees);
@@ -456,8 +368,13 @@ static void HuffmanTreeGroupRelease(HuffmanTreeGroup* group) {
 static int HuffmanTreeGroupDecode(HuffmanTreeGroup* group,
                                   BrotliBitReader* br) {
   int i;
+  int table_size;
+  HuffmanCode* next = group->codes;
   for (i = 0; i < group->num_htrees; ++i) {
-    if (!ReadHuffmanCode(group->alphabet_size, &group->htrees[i], br)) {
+    group->htrees[i] = next;
+    table_size = ReadHuffmanCode(group->alphabet_size, next, br);
+    next += table_size;
+    if (table_size == 0) {
       return 0;
     }
   }
@@ -469,6 +386,10 @@ static int DecodeContextMap(int context_map_size,
                             uint8_t** context_map,
                             BrotliBitReader* br) {
   int ok = 1;
+  int use_rle_for_zeros;
+  int max_run_length_prefix = 0;
+  HuffmanCode* table;
+  int i;
   if (!BrotliReadMoreInput(br)) {
     printf("[DecodeContextMap] Unexpected end of input.\n");
     return 0;
@@ -487,55 +408,54 @@ static int DecodeContextMap(int context_map_size,
     return 1;
   }
 
-  {
-    HuffmanTree tree_index_htree;
-    int use_rle_for_zeros = (int)BrotliReadBits(br, 1);
-    int max_run_length_prefix = 0;
-    int i;
-    if (use_rle_for_zeros) {
-      max_run_length_prefix = (int)BrotliReadBits(br, 4) + 1;
+  use_rle_for_zeros = (int)BrotliReadBits(br, 1);
+  if (use_rle_for_zeros) {
+    max_run_length_prefix = (int)BrotliReadBits(br, 4) + 1;
+  }
+  table = (HuffmanCode*)malloc(HUFFMAN_MAX_TABLE_SIZE * sizeof(*table));
+  if (table == NULL) {
+    return 0;
+  }
+  if (!ReadHuffmanCode(*num_htrees + max_run_length_prefix, table, br)) {
+    ok = 0;
+    goto End;
+  }
+  for (i = 0; i < context_map_size;) {
+    int code;
+    if (!BrotliReadMoreInput(br)) {
+      printf("[DecodeContextMap] Unexpected end of input.\n");
+      ok = 0;
+      goto End;
     }
-    if (!ReadHuffmanCode(*num_htrees + max_run_length_prefix,
-                         &tree_index_htree, br)) {
-      return 0;
-    }
-    for (i = 0; i < context_map_size;) {
-      int code;
-      if (!BrotliReadMoreInput(br)) {
-        printf("[DecodeContextMap] Unexpected end of input.\n");
-        ok = 0;
-        goto End;
-      }
-      code = ReadSymbol(&tree_index_htree, br);
-      if (code == 0) {
+    code = ReadSymbol(table, br);
+    if (code == 0) {
+      (*context_map)[i] = 0;
+      ++i;
+    } else if (code <= max_run_length_prefix) {
+      int reps = 1 + (1 << code) + (int)BrotliReadBits(br, code);
+      while (--reps) {
+        if (i >= context_map_size) {
+          ok = 0;
+          goto End;
+        }
         (*context_map)[i] = 0;
         ++i;
-      } else if (code <= max_run_length_prefix) {
-        int reps = 1 + (1 << code) + (int)BrotliReadBits(br, code);
-        while (--reps) {
-          if (i >= context_map_size) {
-            ok = 0;
-            goto End;
-          }
-          (*context_map)[i] = 0;
-          ++i;
-        }
-      } else {
-        (*context_map)[i] = (uint8_t)(code - max_run_length_prefix);
-        ++i;
       }
+    } else {
+      (*context_map)[i] = (uint8_t)(code - max_run_length_prefix);
+      ++i;
     }
-   End:
-    BrotliHuffmanTreeRelease(&tree_index_htree);
   }
   if (BrotliReadBits(br, 1)) {
     InverseMoveToFrontTransform(*context_map, context_map_size);
   }
+End:
+  free(table);
   return ok;
 }
 
 static BROTLI_INLINE void DecodeBlockType(const int max_block_type,
-                                          const HuffmanTree* trees,
+                                          const HuffmanCode* trees,
                                           int tree_type,
                                           int* block_types,
                                           int* ringbuffers,
@@ -543,7 +463,7 @@ static BROTLI_INLINE void DecodeBlockType(const int max_block_type,
                                           BrotliBitReader* br) {
   int* ringbuffer = ringbuffers + tree_type * 2;
   int* index = indexes + tree_type;
-  int type_code = ReadSymbol(trees + tree_type, br);
+  int type_code = ReadSymbol(&trees[tree_type * HUFFMAN_MAX_TABLE_SIZE], br);
   int block_type;
   if (type_code == 0) {
     block_type = ringbuffer[*index & 1];
@@ -608,6 +528,92 @@ static BROTLI_INLINE void IncrementalCopyFastPath(
   }
 }
 
+int CopyUncompressedBlockToOutput(BrotliOutput output, int len, int pos,
+                                  uint8_t* ringbuffer, int ringbuffer_mask,
+                                  BrotliBitReader* br) {
+  const int rb_size = ringbuffer_mask + 1;
+  uint8_t* ringbuffer_end = ringbuffer + rb_size;
+  int rb_pos = pos & ringbuffer_mask;
+  int br_pos = br->pos_ & BROTLI_IBUF_MASK;
+  int nbytes;
+
+  /* For short lengths copy byte-by-byte */
+  if (len < 8 || br->bit_pos_ + (uint32_t)(len << 3) < br->bit_end_pos_) {
+    while (len-- > 0) {
+      if (!BrotliReadMoreInput(br)) {
+        return 0;
+      }
+      ringbuffer[rb_pos++]= (uint8_t)BrotliReadBits(br, 8);
+      if (rb_pos == rb_size) {
+        if (BrotliWrite(output, ringbuffer, (size_t)rb_size) < rb_size) {
+          return 0;
+        }
+        rb_pos = 0;
+      }
+    }
+    return 1;
+  }
+
+  if (br->bit_end_pos_ < 64) {
+    return 0;
+  }
+
+  /* Copy remaining 0-8 bytes from br->val_ to ringbuffer. */
+  while (br->bit_pos_ < 64) {
+    ringbuffer[rb_pos] = (uint8_t)(br->val_ >> br->bit_pos_);
+    br->bit_pos_ += 8;
+    ++rb_pos;
+    --len;
+  }
+
+  /* Copy remaining bytes from br->buf_ to ringbuffer. */
+  nbytes = (int)(br->bit_end_pos_ - br->bit_pos_) >> 3;
+  if (br_pos + nbytes > BROTLI_IBUF_MASK) {
+    int tail = BROTLI_IBUF_MASK + 1 - br_pos;
+    memcpy(&ringbuffer[rb_pos], &br->buf_[br_pos], (size_t)tail);
+    nbytes -= tail;
+    rb_pos += tail;
+    len -= tail;
+    br_pos = 0;
+  }
+  memcpy(&ringbuffer[rb_pos], &br->buf_[br_pos], (size_t)nbytes);
+  rb_pos += nbytes;
+  len -= nbytes;
+
+  /* If we wrote past the logical end of the ringbuffer, copy the tail of the
+     ringbuffer to its beginning and flush the ringbuffer to the output. */
+  if (rb_pos >= rb_size) {
+    if (BrotliWrite(output, ringbuffer, (size_t)rb_size) < rb_size) {
+      return 0;
+    }
+    rb_pos -= rb_size;
+    memcpy(ringbuffer, ringbuffer_end, (size_t)rb_pos);
+  }
+
+  /* If we have more to copy than the remaining size of the ringbuffer, then we
+     first fill the ringbuffer from the input and then flush the ringbuffer to
+     the output */
+  while (rb_pos + len >= rb_size) {
+    nbytes = rb_size - rb_pos;
+    if (BrotliRead(br->input_, &ringbuffer[rb_pos], (size_t)nbytes) < nbytes ||
+        BrotliWrite(output, ringbuffer, (size_t)rb_size) < nbytes) {
+      return 0;
+    }
+    len -= nbytes;
+    rb_pos = 0;
+  }
+
+  /* Copy straight from the input onto the ringbuffer. The ringbuffer will be
+     flushed to the output at a later time. */
+  if (BrotliRead(br->input_, &ringbuffer[rb_pos], (size_t)len) < len) {
+    return 0;
+  }
+
+  /* Restore the state of the bit reader. */
+  BrotliInitBitReader(br, br->input_);
+  return 1;
+}
+
 int BrotliDecompressedSize(size_t encoded_size,
                            const uint8_t* encoded_buffer,
                            size_t* decoded_size) {
@@ -662,11 +668,15 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
   uint8_t prev_byte1 = 0;
   uint8_t prev_byte2 = 0;
   HuffmanTreeGroup hgroup[3];
+  HuffmanCode* block_type_trees = NULL;
+  HuffmanCode* block_len_trees = NULL;
   BrotliBitReader br;
 
-  /* 16 bytes would be enough, but we add some more slack for transforms */
-  /* to work at the end of the ringbuffer. */
-  static const int kRingBufferWriteAheadSlack = 128;
+  /* We need the slack region for the following reasons:
+       - always doing two 8-byte copies for fast backward copying
+       - transforms
+       - flushing the input ringbuffer when decoding uncompressed blocks */
+  static const int kRingBufferWriteAheadSlack = 128 + BROTLI_READ_SIZE;
 
   static const int kMaxDictionaryWordLength = 0;
 
@@ -688,6 +698,16 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
   }
   ringbuffer_end = ringbuffer + ringbuffer_size;
 
+  if (ok) {
+    block_type_trees = (HuffmanCode*)malloc(
+        3 * HUFFMAN_MAX_TABLE_SIZE * sizeof(HuffmanCode));
+    block_len_trees = (HuffmanCode*)malloc(
+        3 * HUFFMAN_MAX_TABLE_SIZE * sizeof(HuffmanCode));
+    if (block_type_trees == NULL || block_len_trees == NULL) {
+      ok = 0;
+    }
+  }
+
   while (!input_end && ok) {
     int meta_block_remaining_len = 0;
     int is_uncompressed;
@@ -696,8 +716,6 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
     int num_block_types[3] = { 1, 1, 1 };
     int block_type_rb[6] = { 0, 1, 0, 1, 0, 1 };
     int block_type_rb_index[3] = { 0 };
-    HuffmanTree block_type_trees[3];
-    HuffmanTree block_len_trees[3];
     int distance_postfix_bits;
     int num_direct_distance_codes;
     int distance_postfix_mask;
@@ -716,12 +734,11 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
     int context_lookup_offset1 = 0;
     int context_lookup_offset2 = 0;
     uint8_t context_mode;
+    HuffmanCode* htree_command;
 
     for (i = 0; i < 3; ++i) {
-      hgroup[i].num_htrees = 0;
+      hgroup[i].codes = NULL;
       hgroup[i].htrees = NULL;
-      block_type_trees[i].root_ = NULL;
-      block_len_trees[i].root_ = NULL;
     }
 
     if (!BrotliReadMoreInput(&br)) {
@@ -738,31 +755,25 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
     }
     if (is_uncompressed) {
       BrotliSetBitPos(&br, (br.bit_pos_ + 7) & (uint32_t)(~7UL));
-      while (meta_block_remaining_len) {
-        ringbuffer[pos & ringbuffer_mask] = (uint8_t)BrotliReadBits(&br, 8);
-        if ((pos & ringbuffer_mask) == ringbuffer_mask) {
-          if (BrotliWrite(output, ringbuffer, (size_t)ringbuffer_size) < 0) {
-            ok = 0;
-            goto End;
-          }
-        }
-        ++pos;
-        --meta_block_remaining_len;
-      }
+      ok = CopyUncompressedBlockToOutput(output, meta_block_remaining_len, pos,
+                                         ringbuffer, ringbuffer_mask, &br);
+      pos += meta_block_remaining_len;
       goto End;
     }
     for (i = 0; i < 3; ++i) {
-      block_type_trees[i].root_ = NULL;
-      block_len_trees[i].root_ = NULL;
       num_block_types[i] = DecodeVarLenUint8(&br) + 1;
       if (num_block_types[i] >= 2) {
-        if (!ReadHuffmanCode(
-                num_block_types[i] + 2, &block_type_trees[i], &br) ||
-            !ReadHuffmanCode(kNumBlockLengthCodes, &block_len_trees[i], &br)) {
+        if (!ReadHuffmanCode(num_block_types[i] + 2,
+                             &block_type_trees[i * HUFFMAN_MAX_TABLE_SIZE],
+                             &br) ||
+            !ReadHuffmanCode(kNumBlockLengthCodes,
+                             &block_len_trees[i * HUFFMAN_MAX_TABLE_SIZE],
+                             &br)) {
           ok = 0;
           goto End;
         }
-        block_length[i] = ReadBlockLength(&block_len_trees[i], &br);
+        block_length[i] = ReadBlockLength(
+            &block_len_trees[i * HUFFMAN_MAX_TABLE_SIZE], &br);
         block_type_rb_index[i] = 1;
       }
     }
@@ -822,8 +833,13 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
     context_mode = context_modes[block_type[0]];
     context_lookup_offset1 = kContextLookupOffsets[context_mode];
     context_lookup_offset2 = kContextLookupOffsets[context_mode + 1];
+    htree_command = hgroup[1].htrees[0];
 
     while (meta_block_remaining_len > 0) {
+      int cmd_code;
+      int range_idx;
+      int insert_code;
+      int copy_code;
       int insert_length;
       int copy_length;
       int distance_code;
@@ -841,11 +857,25 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
         DecodeBlockType(num_block_types[1],
                         block_type_trees, 1, block_type, block_type_rb,
                         block_type_rb_index, &br);
-        block_length[1] = ReadBlockLength(&block_len_trees[1], &br);
+        block_length[1] = ReadBlockLength(
+            &block_len_trees[HUFFMAN_MAX_TABLE_SIZE], &br);
+        htree_command = hgroup[1].htrees[block_type[1]];
       }
       --block_length[1];
-      ReadInsertAndCopy(&hgroup[1].htrees[block_type[1]],
-                        &insert_length, &copy_length, &distance_code, &br);
+      cmd_code = ReadSymbol(htree_command, &br);
+      range_idx = cmd_code >> 6;
+      if (range_idx >= 2) {
+        range_idx -= 2;
+        distance_code = -1;
+      } else {
+        distance_code = 0;
+      }
+      insert_code = kInsertRangeLut[range_idx] + ((cmd_code >> 3) & 7);
+      copy_code = kCopyRangeLut[range_idx] + (cmd_code & 7);
+      insert_length = kInsertLengthPrefixCode[insert_code].offset +
+          (int)BrotliReadBits(&br, kInsertLengthPrefixCode[insert_code].nbits);
+      copy_length = kCopyLengthPrefixCode[copy_code].offset +
+          (int)BrotliReadBits(&br, kCopyLengthPrefixCode[copy_code].nbits);
       BROTLI_LOG_UINT(insert_length);
       BROTLI_LOG_UINT(copy_length);
       BROTLI_LOG_UINT(distance_code);
@@ -859,7 +889,7 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
           DecodeBlockType(num_block_types[0],
                           block_type_trees, 0, block_type, block_type_rb,
                           block_type_rb_index, &br);
-          block_length[0] = ReadBlockLength(&block_len_trees[0], &br);
+          block_length[0] = ReadBlockLength(block_len_trees, &br);
           context_offset = block_type[0] << kLiteralContextBits;
           context_map_slice = context_map + context_offset;
           context_mode = context_modes[block_type[0]];
@@ -872,7 +902,7 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
         literal_htree_index = context_map_slice[context];
         --block_length[0];
         prev_byte2 = prev_byte1;
-        prev_byte1 = (uint8_t)ReadSymbol(&hgroup[0].htrees[literal_htree_index],
+        prev_byte1 = (uint8_t)ReadSymbol(hgroup[0].htrees[literal_htree_index],
                                          &br);
         ringbuffer[pos & ringbuffer_mask] = prev_byte1;
         BROTLI_LOG_UINT(literal_htree_index);
@@ -899,7 +929,8 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
           DecodeBlockType(num_block_types[2],
                           block_type_trees, 2, block_type, block_type_rb,
                           block_type_rb_index, &br);
-          block_length[2] = ReadBlockLength(&block_len_trees[2], &br);
+          block_length[2] = ReadBlockLength(
+              &block_len_trees[2 * HUFFMAN_MAX_TABLE_SIZE], &br);
           dist_htree_index = (uint8_t)block_type[2];
           dist_context_offset = block_type[2] << kDistanceContextBits;
           dist_context_map_slice = dist_context_map + dist_context_offset;
@@ -907,11 +938,20 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
         --block_length[2];
         context = (uint8_t)(copy_length > 4 ? 3 : copy_length - 2);
         dist_htree_index = dist_context_map_slice[context];
-        distance_code = ReadCopyDistance(&hgroup[2].htrees[dist_htree_index],
-                                         num_direct_distance_codes,
-                                         distance_postfix_bits,
-                                         distance_postfix_mask,
-                                         &br);
+        distance_code = ReadSymbol(hgroup[2].htrees[dist_htree_index], &br);
+        if (distance_code >= num_direct_distance_codes) {
+          int nbits;
+          int postfix;
+          int offset;
+          distance_code -= num_direct_distance_codes;
+          postfix = distance_code & distance_postfix_mask;
+          distance_code >>= distance_postfix_bits;
+          nbits = (distance_code >> 1) + 1;
+          offset = ((2 + (distance_code & 1)) << nbits) - 4;
+          distance_code = num_direct_distance_codes +
+              ((offset + (int)BrotliReadBits(&br, nbits)) <<
+               distance_postfix_bits) + postfix;
+        }
       }
 
       /* Convert the distance code to the actual distance by possibly looking */
@@ -1004,8 +1044,6 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
     }
     for (i = 0; i < 3; ++i) {
       HuffmanTreeGroupRelease(&hgroup[i]);
-      BrotliHuffmanTreeRelease(&block_type_trees[i]);
-      BrotliHuffmanTreeRelease(&block_len_trees[i]);
     }
   }
 
@@ -1015,6 +1053,12 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
     }
     free(ringbuffer);
   }
+  if (block_type_trees != 0) {
+    free(block_type_trees);
+  }
+  if (block_len_trees != 0) {
+    free(block_len_trees);
+  }
   return ok;
 }
 
diff --git a/dec/decode.h b/dec/decode.h
index 9182438..ec6d65e 100644
--- a/dec/decode.h
+++ b/dec/decode.h
@@ -26,6 +26,8 @@ extern "C" {
 #endif
 
 /* Sets *decoded_size to the decompressed size of the given encoded stream. */
+/* This function only works if the encoded buffer has a single meta block, */
+/* and this meta block must have the "is last" bit set. */
 /* Returns 1 on success, 0 on failure. */
 int BrotliDecompressedSize(size_t encoded_size,
                            const uint8_t* encoded_buffer,
diff --git a/dec/huffman.c b/dec/huffman.c
index 20e6223..12493a9 100644
--- a/dec/huffman.c
+++ b/dec/huffman.c
@@ -12,11 +12,12 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 
-   Utilities for building and looking up Huffman trees.
+   Utilities for building Huffman decoding tables.
 */
 
 #include <assert.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include "./huffman.h"
 #include "./safe_malloc.h"
@@ -25,231 +26,137 @@
 extern "C" {
 #endif
 
-#define NON_EXISTENT_SYMBOL (-1)
-#define MAX_ALLOWED_CODE_LENGTH      15
+#define MAX_LENGTH 15
 
-static void TreeNodeInit(HuffmanTreeNode* const node) {
-  node->children_ = -1;   /* means: 'unassigned so far' */
-}
-
-static int NodeIsEmpty(const HuffmanTreeNode* const node) {
-  return (node->children_ < 0);
-}
-
-static int IsFull(const HuffmanTree* const tree) {
-  return (tree->num_nodes_ == tree->max_nodes_);
-}
-
-static void AssignChildren(HuffmanTree* const tree,
-                           HuffmanTreeNode* const node) {
-  HuffmanTreeNode* const children = tree->root_ + tree->num_nodes_;
-  node->children_ = (int)(children - node);
-  assert(children - node == (int)(children - node));
-  tree->num_nodes_ += 2;
-  TreeNodeInit(children + 0);
-  TreeNodeInit(children + 1);
-}
-
-static int TreeInit(HuffmanTree* const tree, int num_leaves) {
-  assert(tree != NULL);
-  tree->root_ = NULL;
-  if (num_leaves == 0) return 0;
-  /* We allocate maximum possible nodes in the tree at once. */
-  /* Note that a Huffman tree is a full binary tree; and in a full binary */
-  /* tree with L leaves, the total number of nodes N = 2 * L - 1. */
-  tree->max_nodes_ = 2 * num_leaves - 1;
-  assert(tree->max_nodes_ < (1 << 16));   /* limit for the lut_jump_ table */
-  tree->root_ = (HuffmanTreeNode*)BrotliSafeMalloc((uint64_t)tree->max_nodes_,
-                                                  sizeof(*tree->root_));
-  if (tree->root_ == NULL) return 0;
-  TreeNodeInit(tree->root_);  /* Initialize root. */
-  tree->num_nodes_ = 1;
-  memset(tree->lut_bits_, 255, sizeof(tree->lut_bits_));
-  memset(tree->lut_jump_, 0, sizeof(tree->lut_jump_));
-  return 1;
-}
-
-void BrotliHuffmanTreeRelease(HuffmanTree* const tree) {
-  if (tree != NULL) {
-    if (tree->root_ != NULL) {
-      free(tree->root_);
-    }
-    tree->root_ = NULL;
-    tree->max_nodes_ = 0;
-    tree->num_nodes_ = 0;
+/* Returns reverse(reverse(key, len) + 1, len), where reverse(key, len) is the
+   bit-wise reversal of the len least significant bits of key. */
+static BROTLI_INLINE int GetNextKey(int key, int len) {
+  int step = 1 << (len - 1);
+  while (key & step) {
+    step >>= 1;
   }
+  return (key & (step - 1)) + step;
 }
 
-/* Utility: converts Huffman code lengths to corresponding Huffman codes. */
-/* 'huff_codes' should be pre-allocated. */
-/* Returns false in case of error (memory allocation, invalid codes). */
-static int HuffmanCodeLengthsToCodes(const uint8_t* const code_lengths,
-                                     int code_lengths_size,
-                                     int* const huff_codes) {
-  int symbol;
-  int code_len;
-  int code_length_hist[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
-  int curr_code;
-  int next_codes[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
-  int max_code_length = 0;
+/* Stores code in table[0], table[step], table[2*step], ..., table[end] */
+/* Assumes that end is an integer multiple of step */
+static BROTLI_INLINE void ReplicateValue(HuffmanCode* table,
+                                         int step, int end,
+                                         HuffmanCode code) {
+  do {
+    end -= step;
+    table[end] = code;
+  } while (end > 0);
+}
 
-  assert(code_lengths != NULL);
-  assert(code_lengths_size > 0);
-  assert(huff_codes != NULL);
-
-  /* Calculate max code length. */
-  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-    if (code_lengths[symbol] > max_code_length) {
-      max_code_length = code_lengths[symbol];
-    }
+/* Returns the table width of the next 2nd level table. count is the histogram
+   of bit lengths for the remaining symbols, len is the code length of the next
+   processed symbol */
+static BROTLI_INLINE int NextTableBitSize(const int* const count,
+                                          int len, int root_bits) {
+  int left = 1 << (len - root_bits);
+  while (len < MAX_LENGTH) {
+    left -= count[len];
+    if (left <= 0) break;
+    ++len;
+    left <<= 1;
   }
-  if (max_code_length > MAX_ALLOWED_CODE_LENGTH) return 0;
+  return len - root_bits;
+}
 
-  /* Calculate code length histogram. */
-  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-    ++code_length_hist[code_lengths[symbol]];
-  }
-  code_length_hist[0] = 0;
+int BrotliBuildHuffmanTable(HuffmanCode* root_table,
+                            int root_bits,
+                            const uint8_t* const code_lengths,
+                            int code_lengths_size) {
+  HuffmanCode code;    /* current table entry */
+  HuffmanCode* table;  /* next available space in table */
+  int len;             /* current code length */
+  int symbol;          /* symbol index in original or sorted table */
+  int key;             /* reversed prefix code */
+  int step;            /* step size to replicate values in current table */
+  int low;             /* low bits for current root entry */
+  int mask;            /* mask for low bits */
+  int table_bits;      /* key length of current table */
+  int table_size;      /* size of current table */
+  int total_size;      /* sum of root table size and 2nd level table sizes */
+  int* sorted;         /* symbols sorted by code length */
+  int count[MAX_LENGTH + 1] = { 0 };  /* number of codes of each length */
+  int offset[MAX_LENGTH + 1];  /* offsets in sorted table for each length */
 
-  /* Calculate the initial values of 'next_codes' for each code length. */
-  /* next_codes[code_len] denotes the code to be assigned to the next symbol */
-  /* of code length 'code_len'. */
-  curr_code = 0;
-  next_codes[0] = -1;  /* Unused, as code length = 0 implies */
-                       /* code doesn't exist. */
-  for (code_len = 1; code_len <= max_code_length; ++code_len) {
-    curr_code = (curr_code + code_length_hist[code_len - 1]) << 1;
-    next_codes[code_len] = curr_code;
+  sorted = (int*)malloc((size_t)code_lengths_size * sizeof(*sorted));
+  if (sorted == NULL) {
+    return 0;
   }
 
-  /* Get symbols. */
-  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-    if (code_lengths[symbol] > 0) {
-      huff_codes[symbol] = next_codes[code_lengths[symbol]]++;
-    } else {
-      huff_codes[symbol] = NON_EXISTENT_SYMBOL;
-    }
+  /* build histogram of code lengths */
+  for (symbol = 0; symbol < code_lengths_size; symbol++) {
+    count[code_lengths[symbol]]++;
   }
-  return 1;
-}
 
-static const uint8_t kReverse7[128] = {
-  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
-  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
-  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
-  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
-  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
-  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
-  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
-  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
-};
-
-static int ReverseBitsShort(int bits, int num_bits) {
-  return kReverse7[bits] >> (7 - num_bits);
-}
-
-static int TreeAddSymbol(HuffmanTree* const tree,
-                         int symbol, int code, int code_length) {
-  int step = HUFF_LUT_BITS;
-  int base_code;
-  HuffmanTreeNode* node = tree->root_;
-  const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
-  assert(symbol == (int16_t)symbol);
-  if (code_length <= HUFF_LUT_BITS) {
-    int i = 1 << (HUFF_LUT_BITS - code_length);
-    base_code = ReverseBitsShort(code, code_length);
-    do {
-      int idx;
-      --i;
-      idx = base_code | (i << code_length);
-      tree->lut_symbol_[idx] = (int16_t)symbol;
-      tree->lut_bits_[idx] = (uint8_t)code_length;
-    } while (i > 0);
-  } else {
-    base_code = ReverseBitsShort((code >> (code_length - HUFF_LUT_BITS)),
-                                 HUFF_LUT_BITS);
+  /* generate offsets into sorted symbol table by code length */
+  offset[1] = 0;
+  for (len = 1; len < MAX_LENGTH; len++) {
+    offset[len + 1] = offset[len] + count[len];
   }
-  while (code_length-- > 0) {
-    if (node >= max_node) {
-      return 0;
-    }
-    if (NodeIsEmpty(node)) {
-      if (IsFull(tree)) return 0;    /* error: too many symbols. */
-      AssignChildren(tree, node);
-    } else if (!HuffmanTreeNodeIsNotLeaf(node)) {
-      return 0;  /* leaf is already occupied. */
-    }
-    node += node->children_ + ((code >> code_length) & 1);
-    if (--step == 0) {
-      tree->lut_jump_[base_code] = (int16_t)(node - tree->root_);
-    }
-  }
-  if (NodeIsEmpty(node)) {
-    node->children_ = 0;      /* turn newly created node into a leaf. */
-  } else if (HuffmanTreeNodeIsNotLeaf(node)) {
-    return 0;   /* trying to assign a symbol to already used code. */
-  }
-  node->symbol_ = symbol;  /* Add symbol in this node. */
-  return 1;
-}
 
-int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const uint8_t* const code_lengths,
-                                   int code_lengths_size) {
-  int symbol;
-  int num_symbols = 0;
-  int root_symbol = 0;
-
-  assert(tree != NULL);
-  assert(code_lengths != NULL);
-
-  /* Find out number of symbols and the root symbol. */
-  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-    if (code_lengths[symbol] > 0) {
-      /* Note: code length = 0 indicates non-existent symbol. */
-      ++num_symbols;
-      root_symbol = symbol;
+  /* sort symbols by length, by symbol order within each length */
+  for (symbol = 0; symbol < code_lengths_size; symbol++) {
+    if (code_lengths[symbol] != 0) {
+      sorted[offset[code_lengths[symbol]]++] = symbol;
     }
   }
 
-  /* Initialize the tree. Will fail for num_symbols = 0 */
-  if (!TreeInit(tree, num_symbols)) return 0;
+  table = root_table;
+  table_bits = root_bits;
+  table_size = 1 << table_bits;
+  total_size = table_size;
 
-  /* Build tree. */
-  if (num_symbols == 1) {  /* Trivial case. */
-    const int max_symbol = code_lengths_size;
-    if (root_symbol < 0 || root_symbol >= max_symbol) {
-      BrotliHuffmanTreeRelease(tree);
-      return 0;
+  /* special case code with only one value */
+  if (offset[MAX_LENGTH] == 1) {
+    code.bits = 0;
+    code.value = (uint16_t)sorted[0];
+    for (key = 0; key < total_size; ++key) {
+      table[key] = code;
     }
-    return TreeAddSymbol(tree, root_symbol, 0, 0);
-  } else {  /* Normal case. */
-    int ok = 0;
+    free(sorted);
+    return total_size;
+  }
 
-    /* Get Huffman codes from the code lengths. */
-    int* const codes =
-        (int*)BrotliSafeMalloc((uint64_t)code_lengths_size, sizeof(*codes));
-    if (codes == NULL) goto End;
-
-    if (!HuffmanCodeLengthsToCodes(code_lengths, code_lengths_size, codes)) {
-      goto End;
+  /* fill in root table */
+  key = 0;
+  symbol = 0;
+  for (len = 1, step = 2; len <= root_bits; ++len, step <<= 1) {
+    for (; count[len] > 0; --count[len]) {
+      code.bits = (uint8_t)(len);
+      code.value = (uint16_t)sorted[symbol++];
+      ReplicateValue(&table[key], step, table_size, code);
+      key = GetNextKey(key, len);
     }
+  }
 
-    /* Add symbols one-by-one. */
-    for (symbol = 0; symbol < code_lengths_size; ++symbol) {
-      if (code_lengths[symbol] > 0) {
-        if (!TreeAddSymbol(tree, symbol, codes[symbol], code_lengths[symbol])) {
-          goto End;
-        }
+  /* fill in 2nd level tables and add pointers to root table */
+  mask = total_size - 1;
+  low = -1;
+  for (len = root_bits + 1, step = 2; len <= MAX_LENGTH; ++len, step <<= 1) {
+    for (; count[len] > 0; --count[len]) {
+      if ((key & mask) != low) {
+        table += table_size;
+        table_bits = NextTableBitSize(count, len, root_bits);
+        table_size = 1 << table_bits;
+        total_size += table_size;
+        low = key & mask;
+        root_table[low].bits = (uint8_t)(table_bits + root_bits);
+        root_table[low].value = (uint16_t)((table - root_table) - low);
       }
+      code.bits = (uint8_t)(len - root_bits);
+      code.value = (uint16_t)sorted[symbol++];
+      ReplicateValue(&table[key >> root_bits], step, table_size, code);
+      key = GetNextKey(key, len);
     }
-    ok = 1;
- End:
-    free(codes);
-    ok = ok && IsFull(tree);
-    if (!ok) BrotliHuffmanTreeRelease(tree);
-    return ok;
   }
+
+  free(sorted);
+  return total_size;
 }
 
 #if defined(__cplusplus) || defined(c_plusplus)
diff --git a/dec/huffman.h b/dec/huffman.h
index fbd0744..834b316 100644
--- a/dec/huffman.h
+++ b/dec/huffman.h
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 
-   Utilities for building and looking up Huffman trees.
+   Utilities for building Huffman decoding tables.
 */
 
 #ifndef BROTLI_DEC_HUFFMAN_H_
@@ -25,48 +25,17 @@
 extern "C" {
 #endif
 
-/* A node of a Huffman tree. */
 typedef struct {
-  int symbol_;
-  int children_;  /* delta offset to both children (contiguous) or 0 if leaf. */
-} HuffmanTreeNode;
+  uint8_t bits;     /* number of bits used for this symbol */
+  uint16_t value;   /* symbol value or table offset */
+} HuffmanCode;
 
-/* Huffman Tree. */
-#define HUFF_LUT_BITS 7
-#define HUFF_LUT (1U << HUFF_LUT_BITS)
-typedef struct HuffmanTree HuffmanTree;
-struct HuffmanTree {
-  /* Fast lookup for short bit lengths. */
-  uint8_t lut_bits_[HUFF_LUT];
-  int16_t lut_symbol_[HUFF_LUT];
-  int16_t lut_jump_[HUFF_LUT];
-  /* Complete tree for lookups. */
-  HuffmanTreeNode* root_;   /* all the nodes, starting at root. */
-  int max_nodes_;           /* max number of nodes */
-  int num_nodes_;           /* number of currently occupied nodes */
-};
-
-/* Returns true if the given node is not a leaf of the Huffman tree. */
-static BROTLI_INLINE int HuffmanTreeNodeIsNotLeaf(
-    const HuffmanTreeNode* const node) {
-  return node->children_;
-}
-
-/* Go down one level. Most critical function. 'right_child' must be 0 or 1. */
-static BROTLI_INLINE const HuffmanTreeNode* HuffmanTreeNextNode(
-    const HuffmanTreeNode* node, int right_child) {
-  return node + node->children_ + right_child;
-}
-
-/* Releases the nodes of the Huffman tree. */
-/* Note: It does NOT free 'tree' itself. */
-void BrotliHuffmanTreeRelease(HuffmanTree* const tree);
-
-/* Builds Huffman tree assuming code lengths are implicitly in symbol order. */
+/* Builds Huffman lookup table assuming code lengths are in symbol order. */
 /* Returns false in case of error (invalid tree or memory error). */
-int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const uint8_t* const code_lengths,
-                                   int code_lengths_size);
+int BrotliBuildHuffmanTable(HuffmanCode* root_table,
+                            int root_bits,
+                            const uint8_t* const code_lengths,
+                            int code_lengths_size);
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    /* extern "C" */
diff --git a/enc/backward_references.cc b/enc/backward_references.cc
index 0e7f89b..f76d7d4 100644
--- a/enc/backward_references.cc
+++ b/enc/backward_references.cc
@@ -45,66 +45,97 @@ void CreateBackwardReferences(size_t num_bytes,
   average_cost /= num_bytes;
   hasher->set_average_cost(average_cost);
 
+  // M1 match is for considering for two repeated copies, if moving
+  // one literal form the previous copy to the current one allows the
+  // current copy to be more efficient (because the way static dictionary
+  // codes words). M1 matching improves text compression density by ~0.15 %.
+  bool match_found_M1 = false;
+  size_t best_len_M1 = 0;
+  size_t best_len_code_M1 = 0;
+  size_t best_dist_M1 = 0;
+  double best_score_M1 = 0;
   while (i + 2 < i_end) {
     size_t best_len = 0;
     size_t best_len_code = 0;
     size_t best_dist = 0;
     double best_score = 0;
     size_t max_distance = std::min(i + i_diff, max_backward_limit);
+    bool in_dictionary;
     hasher->set_insert_length(insert_length);
     bool match_found = hasher->FindLongestMatch(
         ringbuffer, literal_cost, ringbuffer_mask,
         i + i_diff, i_end - i, max_distance,
-        &best_len, &best_len_code, &best_dist, &best_score);
+        &best_len, &best_len_code, &best_dist, &best_score, &in_dictionary);
+    bool best_in_dictionary = in_dictionary;
     if (match_found) {
-      // Found a match. Let's look for something even better ahead.
-      int delayed_backward_references_in_row = 0;
-      while (i + 4 < i_end &&
-             delayed_backward_references_in_row < 4) {
-        size_t best_len_2 = 0;
-        size_t best_len_code_2 = 0;
-        size_t best_dist_2 = 0;
-        double best_score_2 = 0;
-        max_distance = std::min(i + i_diff + 1, max_backward_limit);
+      if (match_found_M1 && best_score_M1 > best_score) {
+        // Two copies after each other. Take the last literal from the
+        // last copy, and use it as the first of this one.
+        (commands->rbegin())->copy_length_ -= 1;
+        (commands->rbegin())->copy_length_code_ -= 1;
         hasher->Store(ringbuffer + i, i + i_diff);
-        match_found = hasher->FindLongestMatch(
-            ringbuffer, literal_cost, ringbuffer_mask,
-            i + i_diff + 1, i_end - i - 1, max_distance,
-            &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2);
-        double cost_diff_lazy = 0;
-        if (best_len >= 4) {
-          cost_diff_lazy +=
-              literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
-        }
-        {
-          const int tail_length = best_len_2 - best_len + 1;
-          for (int k = 0; k < tail_length; ++k) {
-            cost_diff_lazy -=
-                literal_cost[(i + best_len + k) & ringbuffer_mask] -
-                average_cost;
+        --i;
+        best_len = best_len_M1;
+        best_len_code = best_len_code_M1;
+        best_dist = best_dist_M1;
+        best_score = best_score_M1;
+        // in_dictionary doesn't need to be correct, but it is the only
+        // reason why M1 matching should be beneficial here. Setting it here
+        // will only disable further M1 matching against this copy.
+        best_in_dictionary = true;
+        in_dictionary = true;
+      } else {
+        // Found a match. Let's look for something even better ahead.
+        int delayed_backward_references_in_row = 0;
+        while (i + 4 < i_end &&
+               delayed_backward_references_in_row < 4) {
+          size_t best_len_2 = 0;
+          size_t best_len_code_2 = 0;
+          size_t best_dist_2 = 0;
+          double best_score_2 = 0;
+          max_distance = std::min(i + i_diff + 1, max_backward_limit);
+          hasher->Store(ringbuffer + i, i + i_diff);
+          match_found = hasher->FindLongestMatch(
+              ringbuffer, literal_cost, ringbuffer_mask,
+              i + i_diff + 1, i_end - i - 1, max_distance,
+              &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2,
+              &in_dictionary);
+          double cost_diff_lazy = 0;
+          if (best_len >= 4) {
+            cost_diff_lazy +=
+                literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
           }
-        }
-        // If we are not inserting any symbols, inserting one is more
-        // expensive than if we were inserting symbols anyways.
-        if (insert_length < 1) {
-          cost_diff_lazy += 0.97;
-        }
-        // Add bias to slightly avoid lazy matching.
-        cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
-        cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
+          {
+            const int tail_length = best_len_2 - best_len + 1;
+            for (int k = 0; k < tail_length; ++k) {
+              cost_diff_lazy -=
+                  literal_cost[(i + best_len + k) & ringbuffer_mask] -
+                  average_cost;
+            }
+          }
+          // If we are not inserting any symbols, inserting one is more
+          // expensive than if we were inserting symbols anyways.
+          if (insert_length < 1) {
+            cost_diff_lazy += 0.97;
+          }
+          // Add bias to slightly avoid lazy matching.
+          cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
+          cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
 
-        if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
-          // Ok, let's just write one byte for now and start a match from the
-          // next byte.
-          ++insert_length;
-          ++delayed_backward_references_in_row;
-          best_len = best_len_2;
-          best_len_code = best_len_code_2;
-          best_dist = best_dist_2;
-          best_score = best_score_2;
-          i++;
-        } else {
-          break;
+          if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
+            // Ok, let's just write one byte for now and start a match from the
+            // next byte.
+            ++insert_length;
+            ++delayed_backward_references_in_row;
+            best_len = best_len_2;
+            best_len_code = best_len_code_2;
+            best_dist = best_dist_2;
+            best_score = best_score_2;
+            best_in_dictionary = in_dictionary;
+            i++;
+          } else {
+            break;
+          }
         }
       }
       Command cmd;
@@ -117,13 +148,40 @@ void CreateBackwardReferences(size_t num_bytes,
 
       insert_length = 0;
       ++i;
-      for (int j = 1; j < best_len; ++j) {
+      // Copy all copied literals to the hasher, except the last one.
+      // We cannot store the last one yet, otherwise we couldn't find
+      // the possible M1 match.
+      for (int j = 1; j < best_len - 1; ++j) {
         if (i + 2 < i_end) {
           hasher->Store(ringbuffer + i, i + i_diff);
         }
         ++i;
       }
+      // Prepare M1 match.
+      if (best_len >= 4 && i + 20 < i_end && !best_in_dictionary) {
+        max_distance = std::min(i + i_diff, max_backward_limit);
+        match_found_M1 = hasher->FindLongestMatch(
+            ringbuffer, literal_cost, ringbuffer_mask,
+            i + i_diff, i_end - i, max_distance,
+            &best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1,
+            &in_dictionary);
+      } else {
+        match_found_M1 = false;
+        in_dictionary = false;
+      }
+      // This byte is just moved from the previous copy to the current,
+      // that is no gain.
+      best_score_M1 -= literal_cost[i & ringbuffer_mask];
+      // Adjust for losing the opportunity for lazy matching.
+      best_score_M1 -= 3.75;
+
+      // Store the last one of the match.
+      if (i + 2 < i_end) {
+        hasher->Store(ringbuffer + i, i + i_diff);
+      }
+      ++i;
     } else {
+      match_found_M1 = false;
       ++insert_length;
       hasher->Store(ringbuffer + i, i + i_diff);
       ++i;
diff --git a/enc/bit_cost.h b/enc/bit_cost.h
index c769455..c2fd3e4 100644
--- a/enc/bit_cost.h
+++ b/enc/bit_cost.h
@@ -93,7 +93,7 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {
   cost[17] += 3;
 
   int tree_size = 0;
-  int bits = 6 + 3 * max_depth;  // huffman tree of huffman tree cost
+  int bits = 6 + 2 * max_depth;  // huffman tree of huffman tree cost
   for (int i = 0; i < kCodeLengthCodes; ++i) {
     bits += histogram[i] * cost[i];  // huffman tree bit cost
     tree_size += histogram[i];
diff --git a/enc/block_splitter.cc b/enc/block_splitter.cc
index 34363c4..57c1e90 100644
--- a/enc/block_splitter.cc
+++ b/enc/block_splitter.cc
@@ -31,16 +31,16 @@
 
 namespace brotli {
 
-static const int kMaxLiteralHistograms = 48;
+static const int kMaxLiteralHistograms = 100;
 static const int kMaxCommandHistograms = 50;
 static const double kLiteralBlockSwitchCost = 26;
 static const double kCommandBlockSwitchCost = 13.5;
 static const double kDistanceBlockSwitchCost = 14.6;
 static const int kLiteralStrideLength = 70;
 static const int kCommandStrideLength = 40;
-static const int kSymbolsPerLiteralHistogram = 550;
+static const int kSymbolsPerLiteralHistogram = 544;
 static const int kSymbolsPerCommandHistogram = 530;
-static const int kSymbolsPerDistanceHistogram = 550;
+static const int kSymbolsPerDistanceHistogram = 544;
 static const int kMinLengthForBlockSplitting = 128;
 static const int kIterMulForRefining = 2;
 static const int kMinItersForRefining = 100;
diff --git a/enc/encode.cc b/enc/encode.cc
index b492421..6603eec 100644
--- a/enc/encode.cc
+++ b/enc/encode.cc
@@ -77,6 +77,68 @@ void EncodeVarLenUint8(int n, int* storage_ix, uint8_t* storage) {
   }
 }
 
+int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
+  // ASCII
+  if ((input[0] & 0x80) == 0) {
+    *symbol = input[0];
+    if (*symbol > 0) {
+      return 1;
+    }
+  }
+  // 2-byte UTF8
+  if (size > 1 &&
+      (input[0] & 0xe0) == 0xc0 &&
+      (input[1] & 0xc0) == 0x80) {
+    *symbol = (((input[0] & 0x1f) << 6) |
+               (input[1] & 0x3f));
+    if (*symbol > 0x7f) {
+      return 2;
+    }
+  }
+  // 3-byte UFT8
+  if (size > 2 &&
+      (input[0] & 0xf0) == 0xe0 &&
+      (input[1] & 0xc0) == 0x80 &&
+      (input[2] & 0xc0) == 0x80) {
+    *symbol = (((input[0] & 0x0f) << 12) |
+               ((input[1] & 0x3f) << 6) |
+               (input[2] & 0x3f));
+    if (*symbol > 0x7ff) {
+      return 3;
+    }
+  }
+  // 4-byte UFT8
+  if (size > 3 &&
+      (input[0] & 0xf8) == 0xf0 &&
+      (input[1] & 0xc0) == 0x80 &&
+      (input[2] & 0xc0) == 0x80 &&
+      (input[3] & 0xc0) == 0x80) {
+    *symbol = (((input[0] & 0x07) << 18) |
+               ((input[1] & 0x3f) << 12) |
+               ((input[2] & 0x3f) << 6) |
+               (input[3] & 0x3f));
+    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
+      return 4;
+    }
+  }
+  // Not UTF8, emit a special symbol above the UTF8-code space
+  *symbol = 0x110000 | input[0];
+  return 1;
+}
+
+// Returns true if at least min_fraction of the data is UTF8-encoded.
+bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
+  size_t size_utf8 = 0;
+  size_t pos = 0;
+  while (pos < length) {
+    int symbol;
+    int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos);
+    pos += bytes_read;
+    if (symbol < 0x110000) size_utf8 += bytes_read;
+  }
+  return size_utf8 > min_fraction * length;
+}
+
 void EncodeMetaBlockLength(size_t meta_block_size,
                            bool is_last,
                            bool is_uncompressed,
@@ -118,7 +180,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
     const uint8_t* code_length_bitdepth,
     int* storage_ix, uint8_t* storage) {
   static const uint8_t kStorageOrder[kCodeLengthCodes] = {
-    1, 2, 3, 4, 0, 17, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
   };
   // Throw away trailing zeros:
   int codes_to_store = kCodeLengthCodes;
@@ -147,7 +209,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
   WriteBits(2, skip_some, storage_ix, storage);
   for (int i = skip_some; i < codes_to_store; ++i) {
     uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
-    uint8_t bits[] = { 0, 5, 1, 3, 2, 13 };
+    uint8_t bits[] = { 0, 7, 3, 2, 1, 15 };
     int v = code_length_bitdepth[kStorageOrder[i]];
     WriteBits(len[v], bits[v], storage_ix, storage);
   }
@@ -175,54 +237,49 @@ void StoreHuffmanTreeToBitMask(
 }
 
 template<int kSize>
-void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
-                      int* storage_ix, uint8_t* storage) {
+void StoreHuffmanCodeSimple(
+    const EntropyCode<kSize>& code, int alphabet_size,
+    int max_bits,
+    int* storage_ix, uint8_t* storage) {
   const uint8_t *depth = &code.depth_[0];
-  int max_bits_counter = alphabet_size - 1;
-  int max_bits = 0;
-  while (max_bits_counter) {
-    max_bits_counter >>= 1;
-    ++max_bits;
+  int symbols[4];
+  // Quadratic sort.
+  int k, j;
+  for (k = 0; k < code.count_; ++k) {
+    symbols[k] = code.symbols_[k];
   }
-  if (code.count_ == 0) {   // emit minimal tree for empty cases
-    // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0
-    WriteBits(4 + max_bits, 0x1, storage_ix, storage);
-    return;
-  }
-  if (code.count_ <= 4) {
-    int symbols[4];
-    // Quadratic sort.
-    int k, j;
-    for (k = 0; k < code.count_; ++k) {
-      symbols[k] = code.symbols_[k];
-    }
-    for (k = 0; k < code.count_; ++k) {
-      for (j = k + 1; j < code.count_; ++j) {
-        if (depth[symbols[j]] < depth[symbols[k]]) {
-          int t = symbols[k];
-          symbols[k] = symbols[j];
-          symbols[j] = t;
-        }
+  for (k = 0; k < code.count_; ++k) {
+    for (j = k + 1; j < code.count_; ++j) {
+      if (depth[symbols[j]] < depth[symbols[k]]) {
+        int t = symbols[k];
+        symbols[k] = symbols[j];
+        symbols[j] = t;
       }
     }
-    // Small tree marker to encode 1-4 symbols.
-    WriteBits(2, 1, storage_ix, storage);
-    WriteBits(2, code.count_ - 1, storage_ix, storage);
-    for (int i = 0; i < code.count_; ++i) {
-      WriteBits(max_bits, symbols[i], storage_ix, storage);
-    }
-    if (code.count_ == 4) {
-      if (depth[symbols[0]] == 2 &&
-          depth[symbols[1]] == 2 &&
-          depth[symbols[2]] == 2 &&
-          depth[symbols[3]] == 2) {
-        WriteBits(1, 0, storage_ix, storage);
-      } else {
-        WriteBits(1, 1, storage_ix, storage);
-      }
-    }
-    return;
   }
+  // Small tree marker to encode 1-4 symbols.
+  WriteBits(2, 1, storage_ix, storage);
+  WriteBits(2, code.count_ - 1, storage_ix, storage);
+  for (int i = 0; i < code.count_; ++i) {
+    WriteBits(max_bits, symbols[i], storage_ix, storage);
+  }
+  if (code.count_ == 4) {
+    if (depth[symbols[0]] == 2 &&
+        depth[symbols[1]] == 2 &&
+        depth[symbols[2]] == 2 &&
+        depth[symbols[3]] == 2) {
+      WriteBits(1, 0, storage_ix, storage);
+    } else {
+      WriteBits(1, 1, storage_ix, storage);
+    }
+  }
+}
+
+template<int kSize>
+void StoreHuffmanCodeComplex(
+    const EntropyCode<kSize>& code, int alphabet_size,
+    int* storage_ix, uint8_t* storage) {
+  const uint8_t *depth = &code.depth_[0];
   uint8_t huffman_tree[kSize];
   uint8_t huffman_tree_extra_bits[kSize];
   int huffman_tree_size = 0;
@@ -246,6 +303,31 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
                             storage_ix, storage);
 }
 
+
+template<int kSize>
+void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
+                      int* storage_ix, uint8_t* storage) {
+  int max_bits_counter = alphabet_size - 1;
+  int max_bits = 0;
+  while (max_bits_counter) {
+    max_bits_counter >>= 1;
+    ++max_bits;
+  }
+  if (code.count_ == 0) {
+    // Emit a minimal tree for empty cases.
+    // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0
+    WriteBits(4 + max_bits, 0x1, storage_ix, storage);
+  } else if (code.count_ <= 4) {
+    StoreHuffmanCodeSimple(
+        code, alphabet_size, max_bits,
+        storage_ix, storage);
+  } else {
+    StoreHuffmanCodeComplex(
+        code, alphabet_size,
+        storage_ix, storage);
+  }
+}
+
 template<int kSize>
 void StoreHuffmanCodes(const std::vector<EntropyCode<kSize> >& codes,
                        int alphabet_size,
@@ -798,12 +880,23 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size,
                                       const bool is_last,
                                       size_t* encoded_size,
                                       uint8_t* encoded_buffer) {
+  static const double kMinUTF8Ratio = 0.75;
+  bool utf8_mode = false;
   std::vector<Command> commands;
   if (input_size > 0) {
     ringbuffer_.Write(input_buffer, input_size);
-    EstimateBitCostsForLiterals(input_pos_, input_size,
-                                kRingBufferMask, ringbuffer_.start(),
-                                &literal_cost_[0]);
+    utf8_mode = IsMostlyUTF8(
+      &ringbuffer_.start()[input_pos_ & kRingBufferMask],
+      input_size, kMinUTF8Ratio);
+    if (utf8_mode) {
+      EstimateBitCostsForLiteralsUTF8(input_pos_, input_size,
+                                      kRingBufferMask, ringbuffer_.start(),
+                                      &literal_cost_[0]);
+    } else {
+      EstimateBitCostsForLiterals(input_pos_, input_size,
+                                  kRingBufferMask, ringbuffer_.start(),
+                                  &literal_cost_[0]);
+    }
     CreateBackwardReferences(input_size, input_pos_,
                              ringbuffer_.start(),
                              &literal_cost_[0],
diff --git a/enc/entropy_encode.cc b/enc/entropy_encode.cc
index e4c6b20..1ec50f1 100644
--- a/enc/entropy_encode.cc
+++ b/enc/entropy_encode.cc
@@ -182,6 +182,12 @@ void WriteHuffmanTreeRepetitions(
     ++(*tree_size);
     --repetitions;
   }
+  if (repetitions == 7) {
+    tree[*tree_size] = value;
+    extra_bits[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
   if (repetitions < 3) {
     for (int i = 0; i < repetitions; ++i) {
       tree[*tree_size] = value;
@@ -208,6 +214,12 @@ void WriteHuffmanTreeRepetitionsZeros(
     uint8_t* tree,
     uint8_t* extra_bits,
     int* tree_size) {
+  if (repetitions == 11) {
+    tree[*tree_size] = 0;
+    extra_bits[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
   if (repetitions < 3) {
     for (int i = 0; i < repetitions; ++i) {
       tree[*tree_size] = 0;
@@ -230,11 +242,6 @@ void WriteHuffmanTreeRepetitionsZeros(
 }
 
 
-// Heuristics for selecting the stride ranges to collapse.
-int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
-  return abs(a - b) < 4;
-}
-
 int OptimizeHuffmanCountsForRle(int length, int* counts) {
   int stride;
   int limit;
@@ -251,6 +258,35 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
       break;
     }
   }
+  {
+    int nonzeros = 0;
+    int smallest_nonzero = 1 << 30;
+    for (i = 0; i < length; ++i) {
+      if (counts[i] != 0) {
+        ++nonzeros;
+        if (smallest_nonzero > counts[i]) {
+          smallest_nonzero = counts[i];
+        }
+      }
+    }
+    if (nonzeros < 5) {
+      // Small histogram will model it well.
+      return 1;
+    }
+    int zeros = length - nonzeros;
+    if (smallest_nonzero < 4) {
+      if (zeros < 6) {
+        for (i = 1; i < length - 1; ++i) {
+          if (counts[i - 1] != 0 && counts[i] == 0 && counts[i + 1] != 0) {
+            counts[i] = 1;
+          }
+        }
+      }
+    }
+    if (nonzeros < 28) {
+      return 1;
+    }
+  }
   // 2) Let's mark all population counts that already can be encoded
   // with an rle code.
   good_for_rle = (uint8_t*)calloc(length, 1);
@@ -282,13 +318,15 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
     }
   }
   // 3) Let's replace those population counts that lead to more rle codes.
+  // Math here is in 24.8 fixed point representation.
+  const int streak_limit = 1240;
   stride = 0;
-  limit = (counts[0] + counts[1] + counts[2]) / 3 + 1;
+  limit = 256 * (counts[0] + counts[1] + counts[2]) / 3 + 420;
   sum = 0;
   for (i = 0; i < length + 1; ++i) {
     if (i == length || good_for_rle[i] ||
         (i != 0 && good_for_rle[i - 1]) ||
-        !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
+        abs(256 * counts[i] - limit) >= streak_limit) {
       if (stride >= 4 || (stride >= 3 && sum == 0)) {
         int k;
         // The stride must end, collapse what we have, if we have enough (4).
@@ -311,9 +349,9 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
       if (i < length - 2) {
         // All interesting strides have a count of at least 4,
         // at least when non-zeros.
-        limit = (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 1;
+        limit = 256 * (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 420;
       } else if (i < length) {
-        limit = counts[i];
+        limit = 256 * counts[i];
       } else {
         limit = 0;
       }
@@ -322,7 +360,10 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
     if (i != length) {
       sum += counts[i];
       if (stride >= 4) {
-        limit = (sum + stride / 2) / stride;
+        limit = (256 * sum + stride / 2) / stride;
+      }
+      if (stride == 4) {
+        limit += 120;
       }
     }
   }
@@ -331,16 +372,70 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
 }
 
 
+static void DecideOverRleUse(const uint8_t* depth, const int length,
+                             bool *use_rle_for_non_zero,
+                             bool *use_rle_for_zero) {
+  int total_reps_zero = 0;
+  int total_reps_non_zero = 0;
+  int count_reps_zero = 0;
+  int count_reps_non_zero = 0;
+  int new_length = length;
+  for (int i = 0; i < length; ++i) {
+    if (depth[length - i - 1] == 0) {
+      --new_length;
+    } else {
+      break;
+    }
+  }
+  for (uint32_t i = 0; i < new_length;) {
+    const int value = depth[i];
+    int reps = 1;
+    // Find rle coding for longer codes.
+    // Shorter codes seem not to benefit from rle.
+    for (uint32_t k = i + 1; k < new_length && depth[k] == value; ++k) {
+      ++reps;
+    }
+    if (reps >= 3 && value == 0) {
+      total_reps_zero += reps;
+      ++count_reps_zero;
+    }
+    if (reps >= 4 && value != 0) {
+      total_reps_non_zero += reps;
+      ++count_reps_non_zero;
+    }
+    i += reps;
+  }
+  total_reps_non_zero -= count_reps_non_zero * 2;
+  total_reps_zero -= count_reps_zero * 2;
+  *use_rle_for_non_zero = total_reps_non_zero > 2;
+  *use_rle_for_zero = total_reps_zero > 2;
+}
+
+
 void WriteHuffmanTree(const uint8_t* depth, const int length,
                       uint8_t* tree,
                       uint8_t* extra_bits_data,
                       int* huffman_tree_size) {
   int previous_value = 8;
+
+  // First gather statistics on if it is a good idea to do rle.
+  bool use_rle_for_non_zero;
+  bool use_rle_for_zero;
+  DecideOverRleUse(depth, length, &use_rle_for_non_zero, &use_rle_for_zero);
+
+  // Actual rle coding.
   for (uint32_t i = 0; i < length;) {
     const int value = depth[i];
     int reps = 1;
-    for (uint32_t k = i + 1; k < length && depth[k] == value; ++k) {
-      ++reps;
+    if (length > 50) {
+      // Find rle coding for longer codes.
+      // Shorter codes seem not to benefit from rle.
+      if ((value != 0 && use_rle_for_non_zero) ||
+          (value == 0 && use_rle_for_zero)) {
+        for (uint32_t k = i + 1; k < length && depth[k] == value; ++k) {
+          ++reps;
+        }
+      }
     }
     if (value == 0) {
       WriteHuffmanTreeRepetitionsZeros(reps, tree, extra_bits_data,
diff --git a/enc/entropy_encode.h b/enc/entropy_encode.h
index 89c3e1a..aabb9a5 100644
--- a/enc/entropy_encode.h
+++ b/enc/entropy_encode.h
@@ -86,7 +86,7 @@ void BuildEntropyCode(const Histogram<kSize>& histogram,
       ++code->count_;
     }
   }
-  if (code->count_ >= 64) {
+  if (alphabet_size >= 50 && code->count_ >= 16) {
     int counts[kSize];
     memcpy(counts, &histogram.data_[0], sizeof(counts[0]) * kSize);
     OptimizeHuffmanCountsForRle(alphabet_size, counts);
diff --git a/enc/hash.h b/enc/hash.h
index cb38e8f..920c88b 100644
--- a/enc/hash.h
+++ b/enc/hash.h
@@ -150,7 +150,10 @@ class HashLongestMatch {
                         size_t * __restrict best_len_out,
                         size_t * __restrict best_len_code_out,
                         size_t * __restrict best_distance_out,
-                        double * __restrict best_score_out) {
+                        double * __restrict best_score_out,
+                        bool * __restrict in_dictionary) {
+    *in_dictionary = true;
+    *best_len_code_out = 0;
     const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
     const double start_cost4 = literal_cost == NULL ? 20 :
         literal_cost[cur_ix_masked] +
@@ -166,9 +169,9 @@ class HashLongestMatch {
         literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
     bool match_found = false;
     // Don't accept a short copy from far away.
-    double best_score = 8.25;
+    double best_score = 8.11;
     if (insert_length_ < 4) {
-      double cost_diff[4] = { 0.20, 0.09, 0.05, 0.03 };
+      double cost_diff[4] = { 0.10, 0.04, 0.02, 0.01 };
       best_score += cost_diff[insert_length_];
     }
     size_t best_len = *best_len_out;
@@ -235,6 +238,7 @@ class HashLongestMatch {
           *best_distance_out = best_ix;
           *best_score_out = best_score;
           match_found = true;
+          *in_dictionary = backward > max_backward;
         }
       }
     }
@@ -257,7 +261,7 @@ class HashLongestMatch {
         continue;
       }
       int len = 2;
-      const double score = start_cost2 - 1.70 * Log2Floor(backward);
+      const double score = start_cost2 - 2.3 * Log2Floor(backward);
 
       if (best_score < score) {
         best_score = score;
@@ -309,6 +313,7 @@ class HashLongestMatch {
             *best_distance_out = best_ix;
             *best_score_out = best_score;
             match_found = true;
+            *in_dictionary = false;
           }
         }
       }
diff --git a/enc/literal_cost.cc b/enc/literal_cost.cc
index bf05a98..a944599 100644
--- a/enc/literal_cost.cc
+++ b/enc/literal_cost.cc
@@ -22,6 +22,104 @@
 
 namespace brotli {
 
+static int UTF8Position(int last, int c, int clamp) {
+  if (c < 128) {
+    return 0;  // Next one is the 'Byte 1' again.
+  } else if (c >= 192) {
+    return std::min(1, clamp);  // Next one is the 'Byte 2' of utf-8 encoding.
+  } else {
+    // Let's decide over the last byte if this ends the sequence.
+    if (last < 0xe0) {
+      return 0;  // Completed two or three byte coding.
+    } else {
+      return std::min(2, clamp);  // Next one is the 'Byte 3' of utf-8 encoding.
+    }
+  }
+}
+
+static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
+                                     const uint8_t *data) {
+  int counts[3] = { 0 };
+  int max_utf8 = 1;  // should be 2, but 1 compresses better.
+  int last_c = 0;
+  int utf8_pos = 0;
+  for (int i = 0; i < len; ++i) {
+    int c = data[(pos + i) & mask];
+    utf8_pos = UTF8Position(last_c, c, 2);
+    ++counts[utf8_pos];
+    last_c = c;
+  }
+  if (counts[2] < 500) {
+    max_utf8 = 1;
+  }
+  if (counts[1] + counts[2] < 25) {
+    max_utf8 = 0;
+  }
+  return max_utf8;
+}
+
+void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
+                                     const uint8_t *data, float *cost) {
+
+  // max_utf8 is 0 (normal ascii single byte modeling),
+  // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
+  const int max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
+  int histogram[3][256] = { { 0 } };
+  int window_half = 495;
+  int in_window = std::min(static_cast<size_t>(window_half), len);
+  int in_window_utf8[3] = { 0 };
+
+  // Bootstrap histograms.
+  int last_c = 0;
+  int utf8_pos = 0;
+  for (int i = 0; i < in_window; ++i) {
+    int c = data[(pos + i) & mask];
+    ++histogram[utf8_pos][c];
+    ++in_window_utf8[utf8_pos];
+    utf8_pos = UTF8Position(last_c, c, max_utf8);
+    last_c = c;
+  }
+
+  // Compute bit costs with sliding window.
+  for (int i = 0; i < len; ++i) {
+    if (i - window_half >= 0) {
+      // Remove a byte in the past.
+      int c = (i - window_half - 1) < 0 ?
+          0 : data[(pos + i - window_half - 1) & mask];
+      int last_c = (i - window_half - 2) < 0 ?
+          0 : data[(pos + i - window_half - 2) & mask];
+      int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
+      --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
+      --in_window_utf8[utf8_pos2];
+    }
+    if (i + window_half < len) {
+      // Add a byte in the future.
+      int c = (i + window_half - 1) < 0 ?
+          0 : data[(pos + i + window_half - 1) & mask];
+      int last_c = (i + window_half - 2) < 0 ?
+          0 : data[(pos + i + window_half - 2) & mask];
+      int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
+      ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
+      ++in_window_utf8[utf8_pos2];
+    }
+    int c = i < 1 ? 0 : data[(pos + i - 1) & mask];
+    int last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
+    int utf8_pos = UTF8Position(last_c, c, max_utf8);
+    int masked_pos = (pos + i) & mask;
+    int histo = histogram[utf8_pos][data[masked_pos]];
+    if (histo == 0) {
+      histo = 1;
+    }
+    cost[masked_pos] = log2(static_cast<double>(in_window_utf8[utf8_pos])
+                            / histo);
+    cost[masked_pos] += 0.02905;
+    if (cost[masked_pos] < 1.0) {
+      cost[masked_pos] *= 0.5;
+      cost[masked_pos] += 0.5;
+    }
+  }
+}
+
 void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
                                  const uint8_t *data, float *cost) {
   int histogram[256] = { 0 };
@@ -59,4 +157,5 @@ void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
   }
 }
 
+
 }  // namespace brotli
diff --git a/enc/literal_cost.h b/enc/literal_cost.h
index fd7f325..ca39a4e 100644
--- a/enc/literal_cost.h
+++ b/enc/literal_cost.h
@@ -26,7 +26,12 @@ namespace brotli {
 // ringbuffer (data, mask) will take entropy coded and writes these estimates
 // to the ringbuffer (cost, mask).
 void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
-                                 const uint8_t *data, float *cost);
+                                 const uint8_t *data,
+                                 float *cost);
+
+void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
+                                     const uint8_t *data,
+                                     float *cost);
 
 }  // namespace brotli