Updates to Brotli compression format, decoder and encoder

This commit contains a batch of changes that were made to the Brotli compression algorithm in the last three weeks. Most important changes:
* Added UTF8 context model for good text compression.
* Simplified context modeling by having only 4 context modes.
* Per-block context mode selection.
* Faster backward copying and bit reading functions.
* More efficient histogram coding.
* Streaming support for the decoder and encoder.
2024-11-22 03:30:07 +00:00 · 2013-11-15 19:02:17 +01:00 · 2013-11-15 19:02:17 +01:00 · c6b9c7c5c8
commit c6b9c7c5c8
parent c66e4e3e4f
23 changed files with 1647 additions and 870 deletions
--- a/dec/bit_reader.c
+++ b/dec/bit_reader.c
@ -15,6 +15,7 @@
 // Bit reading helpers
 #include <assert.h>
 #include <stdlib.h>
 #include "./bit_reader.h"
@ -22,99 +23,24 @@
 extern "C" {
 #endif
-#define MAX_NUM_BIT_READ 25
+int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input) {
 #define LBITS 64      // Number of bits prefetched.
 #define WBITS 32      // Minimum number of bytes needed after
                      // BrotliFillBitWindow.
 #define LOG8_WBITS 4  // Number of bytes needed to store WBITS bits.
 static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
 };
 void BrotliInitBitReader(BrotliBitReader* const br,
                         const uint8_t* const start,
                         size_t length) {
  size_t i;
  assert(br != NULL);
  assert(start != NULL);
  assert(length < 0xfffffff8u);   // can't happen with a RIFF chunk.
-  br->buf_ = start;
+  br->input_ = input;
  br->len_ = length;
  br->val_ = 0;
  br->pos_ = 0;
  br->bit_pos_ = 0;
  br->end_pos_ = 0;
  br->eos_ = 0;
-  br->error_ = 0;
+  if (!BrotliReadMoreInput(br)) {
-  for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
+    return 0;
  }
  for (i = 0; i < sizeof(br->val_); ++i) {
    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
    ++br->pos_;
  }
-}
+  return (br->end_pos_ > 0);
 // Points the reader at a new input buffer `buf` of `len` bytes.
 // The read position pos_ is left unchanged; eos_ is recomputed and becomes
 // non-zero when pos_ is already at or past `len`.
 void BrotliBitReaderSetBuffer(BrotliBitReader* const br,
                               const uint8_t* const buf, size_t len) {
  assert(br != NULL);
  assert(buf != NULL);
  assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
  br->eos_ = (br->pos_ >= len);
  br->buf_ = buf;
  br->len_ = len;
 }
 // If not at EOS, reload up to LBITS byte-by-byte.
 // For every whole byte already consumed (bit_pos_ >= 8), and only while
 // unread bytes remain in buf_ (pos_ < len_), drops the low byte of val_ and
 // loads the next input byte into the top byte of the LBITS-wide window.
 static void ShiftBytes(BrotliBitReader* const br) {
  while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
    br->val_ >>= 8;
    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (LBITS - 8);
    ++br->pos_;
    br->bit_pos_ -= 8;
  }
 }
 // Refills val_ once at least WBITS bits of it have been consumed; does
 // nothing otherwise.
 void BrotliFillBitWindow(BrotliBitReader* const br) {
  if (br->bit_pos_ >= WBITS) {
 #if (defined(__x86_64__) || defined(_M_X64))
    // Fast path: taken only while more than sizeof(val_) bytes remain after
    // pos_, so the 8-byte load below stays within the first len_ bytes of buf_.
    if (br->pos_ + sizeof(br->val_) < br->len_) {
      br->val_ >>= WBITS;
      br->bit_pos_ -= WBITS;
      // The expression below needs a little-endian arch to work correctly.
      // This gives a large speedup for decoding speed.
      br->val_ |= *(const uint64_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
      br->pos_ += LOG8_WBITS;
      return;
    }
 #endif
    ShiftBytes(br);       // Slow path.
    // Every input byte has been loaded and every bit of the window consumed.
    if (br->pos_ == br->len_ && br->bit_pos_ == LBITS) {
      br->eos_ = 1;
    }
  }
 }
 // Returns the next n_bits bits from the window and advances the bit position.
 // A zero-bit read is always accepted. Otherwise, if eos_ is already set or
 // n_bits is not below MAX_NUM_BIT_READ, sets error_ and returns 0.
 uint32_t BrotliReadBits(BrotliBitReader* const br, int n_bits) {
  assert(n_bits >= 0);
  // Flag an error if end_of_stream or n_bits is more than allowed limit.
  if (n_bits == 0 || (!br->eos_ && n_bits < MAX_NUM_BIT_READ)) {
    // Extract the bits at the current position, masked down to n_bits.
    const uint32_t val =
        (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
    const int new_bits = br->bit_pos_ + n_bits;
    br->bit_pos_ = new_bits;
    // If this read is going to cross the read buffer, set the eos flag.
    if (br->pos_ == br->len_) {
      if (new_bits >= LBITS) {
        br->eos_ = 1;
      }
    }
    // Pull in whole bytes to replace the ones just consumed.
    ShiftBytes(br);
    return val;
  } else {
    br->error_ = 1;
    return 0;
  }
 }
 #if defined(__cplusplus) || defined(c_plusplus)
--- a/dec/bit_reader.h
+++ b/dec/bit_reader.h
@ -17,34 +17,39 @@
 #ifndef BROTLI_DEC_BIT_READER_H_
 #define BROTLI_DEC_BIT_READER_H_
 #include <string.h>
 #include "./streams.h"
 #include "./types.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #define BROTLI_MAX_NUM_BIT_READ   25
 #define BROTLI_READ_SIZE          4096
 #define BROTLI_IBUF_SIZE          (2 * BROTLI_READ_SIZE + 32)
 #define BROTLI_IBUF_MASK          (2 * BROTLI_READ_SIZE - 1)
 #define UNALIGNED_COPY64(dst, src) *(uint64_t*)(dst) = *(const uint64_t*)(src)
 static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
 };
 typedef struct {
  // Input byte buffer, consisting of a ringbuffer and a "slack" region to which
  // bytes from the start of the ringbuffer are copied.
  uint8_t buf_[BROTLI_IBUF_SIZE];
  BrotliInput input_;    // input callback
  uint64_t    val_;      // pre-fetched bits
-  const uint8_t* buf_;        // input byte buffer
+  size_t      pos_;      // byte position in stream
  size_t         len_;        // buffer length
  size_t         pos_;        // byte position in buf_
  int         bit_pos_;  // current bit-reading position in val_
-  int            eos_;        // bitstream is finished
+  size_t      end_pos_;  // current end position in stream
-  int            error_;      // an error occurred (buffer overflow attempt...)
+  int         eos_;      // input stream is finished
 } BrotliBitReader;
-void BrotliInitBitReader(BrotliBitReader* const br,
+int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input);
                         const uint8_t* const start,
                         size_t length);
 //  Sets a new data buffer.
 void BrotliBitReaderSetBuffer(BrotliBitReader* const br,
                              const uint8_t* const buffer, size_t length);
 // Reads the specified number of bits from Read Buffer.
 // Flags an error in case end_of_stream or n_bits is more than allowed limit.
 // Flags eos if this read attempt is going to cross the read buffer.
 uint32_t BrotliReadBits(BrotliBitReader* const br, int n_bits);
 // Return the prefetched bits, so they can be looked up.
 static BROTLI_INLINE uint32_t BrotliPrefetchBits(BrotliBitReader* const br) {
@ -57,8 +62,92 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br, int val) {
  br->bit_pos_ = val;
 }
-// Advances the Read buffer by 4 bytes to make room for reading next 32 bits.
+// Reload up to 64 bits byte-by-byte
-void BrotliFillBitWindow(BrotliBitReader* const br);
+static BROTLI_INLINE void ShiftBytes(BrotliBitReader* const br) {
  // For each whole byte already consumed (bit_pos_ >= 8), drop the low byte of
  // val_ and load the next byte into the top byte (bits 56..63). The read
  // index is wrapped with BROTLI_IBUF_MASK; this loop performs no
  // end-of-input check of its own.
  while (br->bit_pos_ >= 8) {
    br->val_ >>= 8;
    br->val_ |= ((uint64_t)br->buf_[br->pos_ & BROTLI_IBUF_MASK]) << 56;
    ++br->pos_;
    br->bit_pos_ -= 8;
  }
 }
 // Fills up the input ringbuffer by calling the input callback.
 //
 // Does nothing if more than 32 bytes are present after the current position.
 //
 // Returns 0 if either:
 //  - the input callback returned an error, or
 //  - there is no more input and the position is past the end of the stream.
 //
 // After the end of the input stream is encountered, 32 additional zero bytes
 // are stored in the ringbuffer; it is therefore safe to call this function
 // after every 32 bytes of input are read.
 static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
  // Enough buffered input remains (pos_ + 32 < end_pos_): nothing to do.
  if (br->pos_ + 32 < br->end_pos_) {
    return 1;
  } else if (br->eos_) {
    // No more input will arrive: succeed only while the current bit position
    // is no more than 64 bits past the end of the stream.
    return (br->pos_ << 3) + br->bit_pos_ <= (br->end_pos_ << 3) + 64;
  } else {
    // Read the next chunk at the current end position, wrapped into the
    // ringbuffer with BROTLI_IBUF_MASK.
    uint8_t* dst = br->buf_ + (br->end_pos_ & BROTLI_IBUF_MASK);
    int bytes_read = BrotliRead(br->input_, dst, BROTLI_READ_SIZE);
    // A negative result from BrotliRead is reported as failure.
    if (bytes_read < 0) {
      return 0;
    }
    // A short read is treated as the end of the input stream.
    if (bytes_read < BROTLI_READ_SIZE) {
      br->eos_ = 1;
      // Store 32 bytes of zero after the stream end.
 #if (defined(__x86_64__) || defined(_M_X64))
      *(uint64_t*)(dst + bytes_read) = 0;
      *(uint64_t*)(dst + bytes_read + 8) = 0;
      *(uint64_t*)(dst + bytes_read + 16) = 0;
      *(uint64_t*)(dst + bytes_read + 24) = 0;
 #else
      memset(dst + bytes_read, 0, 32);
 #endif
    }
    // When the chunk landed at the very start of the buffer, mirror its first
    // 32 bytes into the last 32 bytes of buf_ (offset 2 * BROTLI_READ_SIZE).
    if (dst == br->buf_) {
      // Copy the head of the ringbuffer to the slack region.
 #if (defined(__x86_64__) || defined(_M_X64))
      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 8, br->buf_ + 24);
 #else
      memcpy(br->buf_ + (BROTLI_READ_SIZE << 1), br->buf_, 32);
 #endif
    }
    br->end_pos_ += bytes_read;
    return 1;
  }
 }
 // Advances the Read buffer by 5 bytes to make room for reading next 24 bits.
 static BROTLI_INLINE void BrotliFillBitWindow(BrotliBitReader* const br) {
  // Refill only after at least 40 bits (5 bytes) of val_ have been consumed.
  if (br->bit_pos_ >= 40) {
 #if (defined(__x86_64__) || defined(_M_X64))
    // Drop the 40 lowest bits, then OR in 8 bytes loaded from the buffer at
    // the wrapped read position, shifted above the 24 bits that remain.
    // No end-of-input check is made here.
    br->val_ >>= 40;
    br->bit_pos_ -= 40;
    // The expression below needs a little-endian arch to work correctly.
    // This gives a large speedup for decoding speed.
    br->val_ |= *(const uint64_t*)(
        br->buf_ + (br->pos_ & BROTLI_IBUF_MASK)) << 24;
    br->pos_ += 5;
 #else
    // Other targets reload one byte at a time.
    ShiftBytes(br);
 #endif
  }
 }
 // Reads the specified number of bits from Read Buffer.
 // Requires that n_bits is positive.
 static BROTLI_INLINE uint32_t BrotliReadBits(
    BrotliBitReader* const br, int n_bits) {
  // Refill the window before extracting.
  BrotliFillBitWindow(br);
  // kBitMask[n_bits] keeps the low n_bits bits; n_bits is used as an index
  // into kBitMask without a range check, so it must be below
  // BROTLI_MAX_NUM_BIT_READ.
  const uint32_t val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
  br->bit_pos_ += n_bits;
  return val;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/dec/context.h
+++ b/dec/context.h
@ -12,34 +12,154 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-// Lookup tables to map the previous one to three bytes to a context id.
+// Lookup table to map the previous two bytes to a context id.
 //
 // There are four different context modeling modes defined here:
 //   CONTEXT_LSB6: context id is the least significant 6 bits of the last byte,
 //   CONTEXT_MSB6: context id is the most significant 6 bits of the last byte,
 //   CONTEXT_UTF8: second-order context model tuned for UTF8-encoded text,
 //   CONTEXT_SIGNED: second-order context model tuned for signed integers.
 //
 // The context id for the UTF8 context model is calculated as follows. If p1
 // and p2 are the previous two bytes, we calculate the context as
 //
 //   context = kContextLookup[p1] | kContextLookup[p2 + 256].
 //
 // If the previous two bytes are ASCII characters (i.e. < 128), this will be
 // equivalent to
 //
 //   context = 4 * context1(p1) + context2(p2),
 //
 // where context1 is based on the previous byte in the following way:
 //
 //   0  : non-ASCII control
 //   1  : \t, \n, \r
 //   2  : space
 //   3  : other punctuation
 //   4  : " '
 //   5  : %
 //   6  : ( < [ {
 //   7  : ) > ] }
 //   8  : , ; :
 //   9  : .
 //   10 : =
 //   11 : number
 //   12 : upper-case vowel
 //   13 : upper-case consonant
 //   14 : lower-case vowel
 //   15 : lower-case consonant
 //
 // and context2 is based on the second last byte:
 //
 //   0 : control, space
 //   1 : punctuation
 //   2 : upper-case letter, number
 //   3 : lower-case letter
 //
 // If the last byte is ASCII, and the second last byte is not (in a valid UTF8
 // stream it will be a continuation byte, value between 128 and 191), the
 // context is the same as if the second last byte was an ASCII control or space.
 //
 // If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
 // be a continuation byte and the context id is 2 or 3 depending on the LSB of
 // the last byte and to a lesser extent on the second last byte if it is ASCII.
 //
 // If the last byte is a UTF8 continuation byte, the second last byte can be:
 //   - continuation byte: the next byte is probably ASCII or lead byte (assuming
 //     4-byte UTF8 characters are rare) and the context id is 0 or 1.
 //   - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1
 //   - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3
 //
 // The possible value combinations of the previous two bytes, the range of
 // context ids and the type of the next byte is summarized in the table below:
 //
 // |--------\-----------------------------------------------------------------|
 // |         \                         Last byte                              |
 // | Second   \---------------------------------------------------------------|
 // | last byte \    ASCII            |   cont. byte        |   lead byte      |
 // |            \   (0-127)          |   (128-191)         |   (192-)         |
 // |=============|===================|=====================|==================|
 // |  ASCII      | next: ASCII/lead  |  not valid          |  next: cont.     |
 // |  (0-127)    | context: 4 - 63   |                     |  context: 2 - 3  |
 // |-------------|-------------------|---------------------|------------------|
 // |  cont. byte | next: ASCII/lead  |  next: ASCII/lead   |  next: cont.     |
 // |  (128-191)  | context: 4 - 63   |  context: 0 - 1     |  context: 2 - 3  |
 // |-------------|-------------------|---------------------|------------------|
 // |  lead byte  | not valid         |  next: ASCII/lead   |  not valid       |
 // |  (192-207)  |                   |  context: 0 - 1     |                  |
 // |-------------|-------------------|---------------------|------------------|
 // |  lead byte  | not valid         |  next: cont.        |  not valid       |
 // |  (208-)     |                   |  context: 2 - 3     |                  |
 // |-------------|-------------------|---------------------|------------------|
 //
 // The context id for the signed context mode is calculated as:
 //
 //   context = (kContextLookup[512 + p1] << 3) | kContextLookup[512 + p2].
 //
 // For any context modeling modes, the context ids can be calculated by |-ing
 // together two lookups from one table using context model dependent offsets:
 //
 //   context = kContextLookup[offset1 + p1] | kContextLookup[offset2 + p2].
 //
 // where offset1 and offset2 are dependent on the context mode.
 #ifndef BROTLI_DEC_CONTEXT_H_
 #define BROTLI_DEC_CONTEXT_H_
 #include "./types.h"
-static const int kSigned2BitContextLookup[] = {
+enum ContextType {
-  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  CONTEXT_LSB6         = 0,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  CONTEXT_MSB6         = 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  CONTEXT_UTF8         = 2,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  CONTEXT_SIGNED       = 3
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
 };
-static const int kSigned3BitContextLookup[] = {
+// Common context lookup table for all context modes.
 static const uint8_t kContextLookup[1792] = {
  // CONTEXT_UTF8, last byte.
  //
  // ASCII range.
   0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,
  // UTF8 continuation byte range.
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  // UTF8 lead byte range.
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  // CONTEXT_UTF8 second last byte.
  //
  // ASCII range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
  // UTF8 continuation byte range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // UTF8 lead byte range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // CONTEXT_SIGNED, second last byte.
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@ -56,69 +176,85 @@ static const int kSigned3BitContextLookup[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  // CONTEXT_SIGNED, last byte, same as the above values shifted by 3 bits.
   0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
  48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 56,
  // CONTEXT_LSB6, last byte.
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
  // CONTEXT_MSB6, last byte.
   0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
   4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
   8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11,
  12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
  16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
  20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
  24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27,
  28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31,
  32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
  36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
  40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
  44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
  48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51,
  52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55,
  56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59,
  60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63,
  // CONTEXT_{M,L}SB6, second last byte,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
-static const int kSigned4BitContextLookup[] = {
+static const int kContextLookupOffsets[8] = {
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
+  // CONTEXT_LSB6
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+  1024, 1536,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+  // CONTEXT_MSB6
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+  1280, 1536,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+  // CONTEXT_UTF8
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+  0, 256,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+  // CONTEXT_SIGNED
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+  768, 512,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
  11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 15,
 };
 // Context modeling modes understood by Context() and NumContexts().
 // The values are used as indices into kContextSize, and values 0..7 are also
 // used directly by Context() as the right-shift applied to the previous byte.
 enum ContextType {
  CONTEXT_FULL        = 0,
  CONTEXT_MSB7        = 1,
  CONTEXT_MSB6        = 2,
  CONTEXT_MSB5        = 3,
  CONTEXT_MSB4        = 4,
  CONTEXT_MSB3        = 5,
  CONTEXT_MSB2        = 6,
  CONTEXT_MSB1        = 7,
  CONTEXT_IS_ZERO     = 8,
  CONTEXT_SIGNED_2BIT = 9,
  CONTEXT_SIGNED_3BIT = 10,
  CONTEXT_SIGNED_4BIT = 11,
  CONTEXT_SIGNED_MIXED_3BYTE = 12
 };
 // Number of distinct context ids per mode, indexed by the ContextType value
 // (13 entries, one per enumerator).
 static const int kContextSize[] = {
  256, 128, 64, 32, 16, 8, 4, 2, 2, 4, 8, 16, 64,
 };
 // Returns the number of context ids for `mode`. `mode` is used as an index
 // into kContextSize without a range check.
 static BROTLI_INLINE int NumContexts(int mode) {
  return kContextSize[mode];
 }
 // Maps the previous bytes to a context id for the given mode.
 //   prev_byte  - last byte; used by every mode.
 //   prev_byte2 - second last byte; used only by CONTEXT_SIGNED_MIXED_3BYTE.
 //   prev_byte3 - third last byte; used only by CONTEXT_SIGNED_MIXED_3BYTE.
 //   mode       - one of the ContextType values.
 static BROTLI_INLINE uint8_t Context(uint8_t prev_byte, uint8_t prev_byte2,
                                     uint8_t prev_byte3, int mode) {
  switch (mode) {
    case CONTEXT_IS_ZERO:
      // Two contexts: last byte is zero / non-zero.
      return prev_byte == 0 ? 0 : 1;
    case CONTEXT_SIGNED_2BIT:
      return kSigned2BitContextLookup[prev_byte];
    case CONTEXT_SIGNED_3BIT:
      return kSigned3BitContextLookup[prev_byte];
    case CONTEXT_SIGNED_4BIT:
      return kSigned4BitContextLookup[prev_byte];
    case CONTEXT_SIGNED_MIXED_3BYTE:
      // Packs the 3-bit lookup of the last byte, the 2-bit lookup of the
      // second last byte, and a zero test of the third last byte into one id.
      return ((kSigned3BitContextLookup[prev_byte] << 3) +
              (kSigned2BitContextLookup[prev_byte2] << 1) +
              (prev_byte3 == 0 ? 0 : 1));
    default:
      // Every other mode value is applied as a right-shift of the last byte.
      return prev_byte >> mode;
  }
 }
 #endif  // BROTLI_DEC_CONTEXT_H_
--- a/dec/decode.c
+++ b/dec/decode.c
@ -14,7 +14,6 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include "./bit_reader.h"
 #include "./context.h"
 #include "./decode.h"
@ -28,10 +27,10 @@ extern "C" {
 #ifdef BROTLI_DECODE_DEBUG
 #define BROTLI_LOG_UINT(name)                                    \
-  printf("[%s] %s = %lu\n", __func__, #name, (unsigned long)name)
+  printf("[%s] %s = %lu\n", __func__, #name, (unsigned long)(name))
 #define BROTLI_LOG_ARRAY_INDEX(array_name, idx)                  \
  printf("[%s] %s[%lu] = %lu\n", __func__, #array_name, \
-         (unsigned long)idx, (unsigned long)array_name[idx])
+         (unsigned long)(idx), (unsigned long)array_name[idx])
 #else
 #define BROTLI_LOG_UINT(name)
 #define BROTLI_LOG_ARRAY_INDEX(array_name, idx)
@ -46,10 +45,12 @@ static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
 static const int kNumLiteralCodes = 256;
 static const int kNumInsertAndCopyCodes = 704;
 static const int kNumBlockLengthCodes = 26;
 static const int kLiteralContextBits = 6;
 static const int kDistanceContextBits = 2;
 #define CODE_LENGTH_CODES 19
 static const uint8_t kCodeLengthCodeOrder[CODE_LENGTH_CODES] = {
-  17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  1, 2, 3, 4, 0, 17, 18, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 #define NUM_DISTANCE_SHORT_CODES 16
@ -61,23 +62,39 @@ static const int kDistanceShortCodeValueOffset[NUM_DISTANCE_SHORT_CODES] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
 };
-static int DecodeSize(BrotliBitReader* br, size_t* len) {
+static int64_t DecodeSize(BrotliBitReader* br) {
  int size_bytes = BrotliReadBits(br, 3);
  int i = 0;
-  *len = 0;
+  int64_t len = 0;
-  for (; i < size_bytes; ++i) {
+  if (size_bytes == 0) {
-    *len |= BrotliReadBits(br, 8) << (i * 8);
+    return -1;
  }
-  return !br->error_;
+  for (; i < size_bytes; ++i) {
    len |= BrotliReadBits(br, 8) << (i * 8);
  }
  return len;
 }
-static int DecodeMetaBlockLength(int input_size_bits,
+static void DecodeMetaBlockLength(int input_size_bits,
-                                 size_t remaining_length,
+                                  size_t pos,
                                  int64_t input_size,
                                  BrotliBitReader* br,
-                                 size_t* meta_block_length) {
+                                  size_t* meta_block_length,
-  if (BrotliReadBits(br, 1)) {
+                                  int* input_end) {
-    *meta_block_length = remaining_length;
+  *input_end = BrotliReadBits(br, 1);
-    return 1;
+  if (input_size < 0) {
    *meta_block_length = 0;
    if (!*input_end) {
      int size_nibbles = BrotliReadBits(br, 3);
      int i;
      for (i = 0; i < size_nibbles; ++i) {
        *meta_block_length |= BrotliReadBits(br, 4) << (i * 4);
      }
      ++(*meta_block_length);
    }
  } else {
    if (*input_end) {
      *meta_block_length = (size_t)input_size - pos;
    } else {
      int shift = 0;
      *meta_block_length = 0;
@ -90,19 +107,15 @@ static int DecodeMetaBlockLength(int input_size_bits,
        *meta_block_length |= BrotliReadBits(br, input_size_bits) << shift;
      }
      ++(*meta_block_length);
-    return !br->error_;
+    }
  }
 }
 // Decodes the next Huffman code from bit-stream.
 // FillBitWindow(br) needs to be called at minimum every second call
 // to ReadSymbol, in order to pre-fetch enough bits.
 static BROTLI_INLINE int ReadSymbol(const HuffmanTree* tree,
                                    BrotliBitReader* br) {
  if (tree->fixed_bit_length_ > 0) {
    return BrotliReadBits(br, tree->fixed_bit_length_);
  } else {
  const HuffmanTreeNode* node = tree->root_;
  BrotliFillBitWindow(br);
  uint32_t bits = BrotliPrefetchBits(br);
  int bitpos = br->bit_pos_;
  // Check if we find the bit combination from the Huffman lookup table.
@ -126,16 +139,15 @@ static BROTLI_INLINE int ReadSymbol(const HuffmanTree* tree,
  BrotliSetBitPos(br, bitpos);
  return node->symbol_;
 }
 }
-static void PrintIntVector(const int* v, int len) {
+static void PrintUcharVector(const uint8_t* v, int len) {
  while (len-- > 0) printf(" %d", *v++);
  printf("\n");
 }
 static int ReadHuffmanCodeLengths(
-    const int* code_length_code_lengths,
+    const uint8_t* code_length_code_lengths,
-    int num_symbols, int* code_lengths,
+    int num_symbols, uint8_t* code_lengths,
    BrotliBitReader* br) {
  int ok = 0;
  int symbol;
@ -147,10 +159,14 @@ static int ReadHuffmanCodeLengths(
  if (!BrotliHuffmanTreeBuildImplicit(&tree, code_length_code_lengths,
                                      CODE_LENGTH_CODES)) {
    printf("[ReadHuffmanCodeLengths] Building code length tree failed: ");
-    PrintIntVector(code_length_code_lengths, CODE_LENGTH_CODES);
+    PrintUcharVector(code_length_code_lengths, CODE_LENGTH_CODES);
    return 0;
  }
  if (!BrotliReadMoreInput(br)) {
    printf("[ReadHuffmanCodeLengths] Unexpected end of input.\n");
    return 0;
  }
  decode_number_of_code_length_codes = BrotliReadBits(br, 1);
  BROTLI_LOG_UINT(decode_number_of_code_length_codes);
  if (decode_number_of_code_length_codes) {
@ -171,7 +187,10 @@ static int ReadHuffmanCodeLengths(
  while (symbol < num_symbols) {
    int code_len;
    if (max_symbol-- == 0) break;
-    BrotliFillBitWindow(br);
+    if (!BrotliReadMoreInput(br)) {
      printf("[ReadHuffmanCodeLengths] Unexpected end of input.\n");
      goto End;
    }
    code_len = ReadSymbol(&tree, br);
    BROTLI_LOG_UINT(symbol);
    BROTLI_LOG_UINT(code_len);
@ -206,128 +225,101 @@ static int ReadHuffmanCodeLengths(
  return ok;
 }
 // Fixed-point size of the whole code space: a symbol of length L occupies
 // kUnitInterval >> L of it, so a complete prefix code sums to kUnitInterval.
 static const int64_t kUnitInterval = 1LL<<30;
 // Adjusts code_lengths[0..num_symbols) in place so that the code words fill
 // the unit interval as closely as possible. Zero entries mean "symbol unused"
 // and are never modified.
 //
 // Returns 0 if a length is outside [0, 63] or if the lengths over-subscribe
 // the code space even after the longest codes are removed. Returns 1
 // otherwise, including the cases that cannot be completed (no used symbol,
 // or only length-one codes), which are left as they are.
 static int RepairHuffmanCodeLengths(int num_symbols, int* code_lengths) {
  int i;
  int64_t space = kUnitInterval;
  int max_length = 0;
  for (i = 0; i < num_symbols; i++) {
    const int len = code_lengths[i];
    if (len == 0)
      continue;
    // Shifting a 64-bit value by a negative count or by >= 64 is undefined.
    if (len < 0 || len >= 64)
      return 0;
    if (len > max_length)
      max_length = len;
    space -= kUnitInterval >> len;
  }
  // The code which contains one symbol of length one cannot be made optimal.
  if (max_length == 1)
    return 1;
  if (space < 0) {
    int count_longest = 0;
    int new_length = max_length;
    for (i = 0; i < num_symbols; i++) {
      if (code_lengths[i] == max_length)
        count_longest++;
    }
    // Substitute all longest codes with sufficiently longer ones, so that all
    // code words fit into the unit interval. Leftover space will be
    // redistributed later.
    space += count_longest * (kUnitInterval >> max_length);
    if (space < 0)
      return 0;
    while (space < count_longest * (kUnitInterval >> new_length))
      new_length++;
    space -= count_longest * (kUnitInterval >> new_length);
    for (i = 0; i < num_symbols; i++) {
      if (code_lengths[i] == max_length)
        code_lengths[i] = new_length;
    }
  }
  while (space > 0) {
    // Redistribute leftover space in an approximation of a uniform fashion.
    int progressed = 0;
    for (i = 0; i < num_symbols; i++) {
      if (code_lengths[i] > 1 && space >= (kUnitInterval >> code_lengths[i])) {
        space -= kUnitInterval >> code_lengths[i];
        code_lengths[i]--;
        progressed = 1;
      }
      if (space == 0)
        break;
    }
    // No code could be shortened in a full pass (nothing longer than one bit
    // is left), so further passes would spin forever: stop here.
    if (!progressed)
      break;
  }
  return 1;
 }
 static int ReadHuffmanCode(int alphabet_size,
                           HuffmanTree* tree,
                           BrotliBitReader* br) {
-  int ok = 0;
+  int ok = 1;
-  const int simple_code = BrotliReadBits(br, 1);
+  int simple_code;
-  BROTLI_LOG_UINT(simple_code);
+  uint8_t* code_lengths = NULL;
  code_lengths =
      (uint8_t*)BrotliSafeMalloc((uint64_t)alphabet_size,
                                 sizeof(*code_lengths));
  if (code_lengths == NULL) {
    return 0;
  }
  if (!BrotliReadMoreInput(br)) {
    printf("[ReadHuffmanCode] Unexpected end of input.\n");
    return 0;
  }
  simple_code = BrotliReadBits(br, 1);
  BROTLI_LOG_UINT(simple_code);
  if (simple_code) {  // Read symbols, codes & code lengths directly.
-    int symbols[2] = { 0 };
+    int i;
-    int codes[2];
+    int max_bits_counter = alphabet_size - 1;
-    int code_lengths[2];
+    int max_bits = 0;
-    const int num_symbols = BrotliReadBits(br, 1) + 1;
+    int symbols[4] = { 0 };
-    const int first_symbol_len_code = BrotliReadBits(br, 1);
+    const int num_symbols = BrotliReadBits(br, 2) + 1;
-    // The first code is either 1 bit or 8 bit code.
+    while (max_bits_counter) {
-    symbols[0] = BrotliReadBits(br, (first_symbol_len_code == 0) ? 1 : 8);
+      max_bits_counter >>= 1;
-    codes[0] = 0;
+      ++max_bits;
-    code_lengths[0] = num_symbols - 1;
+    }
-    // The second code (if present), is always 8 bit long.
+    memset(code_lengths, 0, alphabet_size);
-    if (num_symbols == 2) {
+    for (i = 0; i < num_symbols; ++i) {
-      symbols[1] = BrotliReadBits(br, 8);
+      symbols[i] = BrotliReadBits(br, max_bits);
-      codes[1] = 1;
+      code_lengths[symbols[i]] = 2;
-      code_lengths[1] = num_symbols - 1;
+    }
    code_lengths[symbols[0]] = 1;
    switch (num_symbols) {
      case 1:
      case 3:
        break;
      case 2:
        code_lengths[symbols[1]] = 1;
        break;
      case 4:
        if (BrotliReadBits(br, 1)) {
          code_lengths[symbols[2]] = 3;
          code_lengths[symbols[3]] = 3;
        } else {
          code_lengths[symbols[0]] = 2;
        }
        break;
    }
    BROTLI_LOG_UINT(num_symbols);
    BROTLI_LOG_UINT(first_symbol_len_code);
    BROTLI_LOG_UINT(symbols[0]);
    BROTLI_LOG_UINT(symbols[1]);
    ok = BrotliHuffmanTreeBuildExplicit(tree, code_lengths, codes, symbols,
                                        alphabet_size, num_symbols);
    if (!ok) {
      printf("[ReadHuffmanCode] HuffmanTreeBuildExplicit failed: ");
      PrintIntVector(code_lengths, num_symbols);
    }
  } else {  // Decode Huffman-coded code lengths.
    int* code_lengths = NULL;
    int i;
-    int code_length_code_lengths[CODE_LENGTH_CODES] = { 0 };
+    uint8_t code_length_code_lengths[CODE_LENGTH_CODES] = { 0 };
    const int num_codes = BrotliReadBits(br, 4) + 4;
    BROTLI_LOG_UINT(num_codes);
    if (num_codes > CODE_LENGTH_CODES) {
      return 0;
    }
-
+    for (i = BrotliReadBits(br, 1) * 2; i < num_codes; ++i) {
    code_lengths =
        (int*)BrotliSafeMalloc((uint64_t)alphabet_size, sizeof(*code_lengths));
    if (code_lengths == NULL) {
      return 0;
    }
    for (i = 0; i < num_codes; ++i) {
      int code_len_idx = kCodeLengthCodeOrder[i];
-      code_length_code_lengths[code_len_idx] = BrotliReadBits(br, 3);
+      int v = BrotliReadBits(br, 2);
      if (v == 3) {
        v = BrotliReadBits(br, 1);
        if (v == 0) {
          v = 2;
        } else {
          v = BrotliReadBits(br, 1);
          if (v == 0) {
            v = 1;
          } else {
            v = 5;
          }
        }
      } else if (v == 1) {
        v = 3;
      } else if (v == 2) {
        v = 4;
      }
      code_length_code_lengths[code_len_idx] = v;
      BROTLI_LOG_ARRAY_INDEX(code_length_code_lengths, code_len_idx);
    }
    ok = ReadHuffmanCodeLengths(code_length_code_lengths, alphabet_size,
-                                code_lengths, br) &&
+                                code_lengths, br);
-         RepairHuffmanCodeLengths(alphabet_size, code_lengths);
+  }
  if (ok) {
    ok = BrotliHuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size);
    if (!ok) {
      printf("[ReadHuffmanCode] HuffmanTreeBuildImplicit failed: ");
-        PrintIntVector(code_lengths, alphabet_size);
+      PrintUcharVector(code_lengths, alphabet_size);
    }
  }
  free(code_lengths);
-  }
+  return ok;
  ok = ok && !br->error_;
  if (!ok) {
    return 0;
  }
  return 1;
 }
 static int ReadCopyDistance(const HuffmanTree* tree,
@ -339,7 +331,6 @@ static int ReadCopyDistance(const HuffmanTree* tree,
  int nbits;
  int postfix;
  int offset;
  BrotliFillBitWindow(br);
  code = ReadSymbol(tree, br);
  if (code < num_direct_codes) {
    return code;
@ -357,7 +348,6 @@ static int ReadCopyDistance(const HuffmanTree* tree,
 static int ReadBlockLength(const HuffmanTree* tree, BrotliBitReader* br) {
  int code;
  int nbits;
  BrotliFillBitWindow(br);
  code = ReadSymbol(tree, br);
  nbits = kBlockLengthPrefixCode[code].nbits;
  return kBlockLengthPrefixCode[code].offset + BrotliReadBits(br, nbits);
@ -371,8 +361,9 @@ static void ReadInsertAndCopy(const HuffmanTree* tree,
  int code;
  int range_idx;
  int insert_code;
  int insert_extra_bits;
  int copy_code;
-  BrotliFillBitWindow(br);
+  int copy_extra_bits;
  code = ReadSymbol(tree, br);
  range_idx = code >> 6;
  if (range_idx >= 2) {
@ -383,27 +374,27 @@ static void ReadInsertAndCopy(const HuffmanTree* tree,
  }
  insert_code = (kInsertRangeLut[range_idx] << 3) + ((code >> 3) & 7);
  copy_code = (kCopyRangeLut[range_idx] << 3) + (code & 7);
-  *insert_len =
+  *insert_len = kInsertLengthPrefixCode[insert_code].offset;
-      kInsertLengthPrefixCode[insert_code].offset +
+  insert_extra_bits = kInsertLengthPrefixCode[insert_code].nbits;
-      BrotliReadBits(br, kInsertLengthPrefixCode[insert_code].nbits);
+  if (insert_extra_bits > 0) {
-  *copy_len =
+    *insert_len += BrotliReadBits(br, insert_extra_bits);
-      kCopyLengthPrefixCode[copy_code].offset +
+  }
-      BrotliReadBits(br, kCopyLengthPrefixCode[copy_code].nbits);
+  *copy_len = kCopyLengthPrefixCode[copy_code].offset;
  copy_extra_bits = kCopyLengthPrefixCode[copy_code].nbits;
  if (copy_extra_bits > 0) {
    *copy_len += BrotliReadBits(br, copy_extra_bits);
  }
 }
-static int TranslateShortCodes(int code, int* ringbuffer, size_t* index) {
+static int TranslateShortCodes(int code, int* ringbuffer, size_t index) {
  int val;
  if (code < NUM_DISTANCE_SHORT_CODES) {
-    int index_offset = kDistanceShortCodeIndexOffset[code];
+    index += kDistanceShortCodeIndexOffset[code];
-    int value_offset = kDistanceShortCodeValueOffset[code];
+    index &= 3;
-    val = ringbuffer[(*index + index_offset) & 3] + value_offset;
+    val = ringbuffer[index] + kDistanceShortCodeValueOffset[code];
  } else {
    val = code - NUM_DISTANCE_SHORT_CODES + 1;
  }
  if (code > 0) {
    ringbuffer[*index & 3] = val;
    ++(*index);
  }
  return val;
 }
@ -453,41 +444,24 @@ static int HuffmanTreeGroupDecode(HuffmanTreeGroup* group,
                                  BrotliBitReader* br) {
  int i;
  for (i = 0; i < group->num_htrees; ++i) {
-    ReadHuffmanCode(group->alphabet_size, &group->htrees[i], br);
+    if (!ReadHuffmanCode(group->alphabet_size, &group->htrees[i], br)) {
      return 0;
    }
  }
  return 1;
 }
-static int DecodeContextMap(int num_block_types,
+static int DecodeContextMap(int context_map_size,
                            int stream_type,
                            int* context_mode,
                            int* contexts_per_block,
                            int* num_htrees,
                            uint8_t** context_map,
                            BrotliBitReader* br) {
-  int context_map_size;
+  int ok = 1;
-  int use_context = BrotliReadBits(br, 1);
+  if (!BrotliReadMoreInput(br)) {
-  if (!use_context) {
+    printf("[DecodeContextMap] Unexpected end of input.\n");
-    *context_mode = 0;
+    return 0;
    *contexts_per_block = 1;
    *context_map = NULL;
    *num_htrees = num_block_types;
    return 1;
  }
  switch (stream_type) {
    case 0:
      *context_mode = BrotliReadBits(br, 4);
      *contexts_per_block = NumContexts(*context_mode);
      break;
    case 2:
      *context_mode = 1;
      *contexts_per_block = 4;
      break;
  }
  context_map_size = *contexts_per_block * num_block_types;
  *num_htrees = BrotliReadBits(br, 8) + 1;
  BROTLI_LOG_UINT(*context_mode);
  BROTLI_LOG_UINT(context_map_size);
  BROTLI_LOG_UINT(*num_htrees);
@ -511,13 +485,19 @@ static int DecodeContextMap(int num_block_types,
    if (use_rle_for_zeros) {
      max_run_length_prefix = BrotliReadBits(br, 4) + 1;
    }
-    ReadHuffmanCode(*num_htrees + max_run_length_prefix,
+    if (!ReadHuffmanCode(*num_htrees + max_run_length_prefix,
-                    &tree_index_htree, br);
+                         &tree_index_htree, br)) {
      return 0;
    }
    if (use_rle_for_zeros) {
      int i;
      for (i = 0; i < context_map_size;) {
        int code;
-        BrotliFillBitWindow(br);
+        if (!BrotliReadMoreInput(br)) {
          printf("[DecodeContextMap] Unexpected end of input.\n");
          ok = 0;
          goto End;
        }
        code = ReadSymbol(&tree_index_htree, br);
        if (code == 0) {
          (*context_map)[i] = 0;
@ -536,16 +516,21 @@ static int DecodeContextMap(int num_block_types,
    } else {
      int i;
      for (i = 0; i < context_map_size; ++i) {
-        BrotliFillBitWindow(br);
+        if (!BrotliReadMoreInput(br)) {
          printf("[DecodeContextMap] Unexpected end of input.\n");
          ok = 0;
          goto End;
        }
        (*context_map)[i] = ReadSymbol(&tree_index_htree, br);
      }
    }
   End:
    BrotliHuffmanTreeRelease(&tree_index_htree);
  }
  if (BrotliReadBits(br, 1)) {
    InverseMoveToFrontTransform(*context_map, context_map_size);
  }
-  return 1;
+  return ok;
 }
 static BROTLI_INLINE void DecodeBlockType(const HuffmanTree* trees,
@ -570,39 +555,116 @@ static BROTLI_INLINE void DecodeBlockType(const HuffmanTree* trees,
  ++(*index);
 }
 // Copies len bytes from src to dst in eight-byte steps. Because every step
 // moves a full eight bytes, up to ten bytes beyond the requested range may
 // be written.
 //
 // When dst lies fewer than eight bytes after src, the two ranges describe a
 // repeating pattern shorter than eight bytes, and a plain eight-byte loop
 // would copy bytes that have not been produced yet. The pattern is therefore
 // widened first. Starting from this buffer, with the eight-byte <src> and
 // <dst> windows marked:
 //
 //    abxxxxxxxxxxxx
 //    [------]           src
 //      [------]         dst
 //
 // one eight-byte copy from <src> to <dst> repeats the pattern once, and
 // <dst> can then advance by the gap (two bytes here) while <src> stays put:
 //
 //    ababxxxxxxxxxx
 //    [------]           src
 //        [------]       dst
 //
 // This is repeated until the windows no longer overlap, after which the
 // ordinary eight-bytes-at-a-time loop finishes the job. A single byte
 // repeated many times is handled well this way, at little cost to the
 // general case.
 //
 // Worst case for the overshoot: dst - src == 1 and len == 1. The last copy
 // reads positions [0..7] and writes [4..11] although only position 1 was
 // requested, hence ten excess bytes.
 static BROTLI_INLINE void IncrementalCopyFastPath(
     uint8_t* dst, const uint8_t* src, int len) {
  // Pattern-widening phase; only dst moves, so the gap grows on every pass.
  while (src < dst && dst - src < 8) {
    UNALIGNED_COPY64(dst, src);
    len = len - (dst - src);
    dst = dst + (dst - src);
  }
  // Bulk phase: both pointers advance together.
  for (; len > 0; len -= 8) {
    UNALIGNED_COPY64(dst, src);
    src += 8;
    dst += 8;
  }
 }
 int BrotliDecompressedSize(size_t encoded_size,
                           const uint8_t* encoded_buffer,
                           size_t* decoded_size) {
  BrotliMemInput memin;
  BrotliInput input = BrotliInitMemInput(encoded_buffer, encoded_size, &memin);
  BrotliBitReader br;
-  BrotliInitBitReader(&br, encoded_buffer, encoded_size);
+  if (!BrotliInitBitReader(&br, input)) {
-  return DecodeSize(&br, decoded_size);
+    return 0;
  }
  int64_t size = DecodeSize(&br);
  if (size < 0) {
    return 0;
  }
  *decoded_size = (size_t)size;
  return 1;
 }
 // Decompresses the encoded_size bytes at encoded_buffer into decoded_buffer
 // by wrapping both buffers in memory streams and running BrotliDecompress.
 // On entry *decoded_size is handed to BrotliInitMemOutput as the output
 // length (presumably the capacity of decoded_buffer -- confirm against
 // streams.h); on return it holds the memory output's final position.
 // Returns whatever BrotliDecompress returned.
 int BrotliDecompressBuffer(size_t encoded_size,
                           const uint8_t* encoded_buffer,
                           size_t* decoded_size,
                           uint8_t* decoded_buffer) {
  BrotliMemInput mem_input;
  BrotliMemOutput mem_output;
  BrotliInput source =
      BrotliInitMemInput(encoded_buffer, encoded_size, &mem_input);
  BrotliOutput sink =
      BrotliInitMemOutput(decoded_buffer, *decoded_size, &mem_output);
  const int result = BrotliDecompress(source, sink);
  *decoded_size = mem_output.pos;
  return result;
 }
 int BrotliDecompress(BrotliInput input, BrotliOutput output) {
  int ok = 1;
  int i;
  size_t pos = 0;
-  uint8_t* data = decoded_buffer;
+  int64_t decoded_size;
-  int input_size_bits;
+  int input_size_bits = 0;
  int input_end = 0;
  int window_bits = 0;
  size_t ringbuffer_size;
  size_t ringbuffer_mask;
  uint8_t* ringbuffer;
  uint8_t* ringbuffer_end;
  // This ring buffer holds a few past copy distances that will be used by
  // some special distance codes.
  int dist_rb[4] = { 4, 11, 15, 16 };
  size_t dist_rb_idx = 0;
  // The previous 2 bytes used for context.
  uint8_t prev_byte1 = 0;
  uint8_t prev_byte2 = 0;
  HuffmanTreeGroup hgroup[3];
  BrotliBitReader br;
  BrotliInitBitReader(&br, encoded_buffer, encoded_size);
-  ok = DecodeSize(&br, decoded_size);
+  if (!BrotliInitBitReader(&br, input)) {
-  if (!ok) return 0;
+    return 0;
  }
-  if (*decoded_size == 0) {
+  decoded_size = DecodeSize(&br);
  if (decoded_size == 0) {
    return 1;
  }
-  {
+
-    size_t n = *decoded_size;
+  if (decoded_size > 0) {
    size_t n = (size_t)decoded_size;
    input_size_bits = (n == (n &~ (n - 1))) ? -1 : 0;
    while (n) {
      ++input_size_bits;
@ -610,12 +672,24 @@ int BrotliDecompressBuffer(size_t encoded_size,
    }
  }
-  BROTLI_LOG_UINT(*decoded_size);
+  // Decode window size.
  if ((decoded_size < 0 || input_size_bits > 16) && BrotliReadBits(&br, 1)) {
    window_bits = 17 + BrotliReadBits(&br, 3);
  } else {
    window_bits = 16;
  }
  ringbuffer_size = 1 << window_bits;
  ringbuffer_mask = ringbuffer_size - 1;
  ringbuffer = (uint8_t*)malloc(ringbuffer_size + 16);
  ringbuffer_end = ringbuffer + ringbuffer_size;
  BROTLI_LOG_UINT(decoded_size);
  BROTLI_LOG_UINT(input_size_bits);
-  while (pos < *decoded_size && ok) {
+  while (!input_end && ok) {
    size_t meta_block_len = 0;
-    size_t meta_block_end;
+    size_t meta_block_end_pos;
    size_t block_length[3] = { 0 };
    int block_type[3] = { 0 };
    int num_block_types[3] = { 0 };
@ -628,12 +702,9 @@ int BrotliDecompressBuffer(size_t encoded_size,
    uint32_t distance_postfix_mask;
    int num_distance_codes;
    uint8_t* context_map = NULL;
-    int context_mode;
+    uint8_t* context_modes = NULL;
    int contexts_per_block;
    int num_literal_htrees;
    uint8_t* dist_context_map = NULL;
    int dist_context_mode;
    int dist_contexts_per_block;
    int num_dist_htrees;
    int context_offset = 0;
    uint8_t* context_map_slice = NULL;
@ -641,23 +712,41 @@ int BrotliDecompressBuffer(size_t encoded_size,
    int dist_context_offset = 0;
    uint8_t* dist_context_map_slice = NULL;
    uint8_t dist_htree_index = 0;
    int context_lookup_offset1 = 0;
    int context_lookup_offset2 = 0;
    uint8_t context_mode;
-    BROTLI_LOG_UINT(pos);
+    for (i = 0; i < 3; ++i) {
-    if (!DecodeMetaBlockLength(input_size_bits, *decoded_size - pos,
+      hgroup[i].num_htrees = 0;
-                               &br, &meta_block_len)) {
+      hgroup[i].htrees = NULL;
-      printf("Could not decode meta-block length.\n");
+      block_type_trees[i].root_ = NULL;
      block_len_trees[i].root_ = NULL;
    }
    if (!BrotliReadMoreInput(&br)) {
      printf("[BrotliDecompress] Unexpected end of input.\n");
      ok = 0;
      goto End;
    }
    BROTLI_LOG_UINT(pos);
    DecodeMetaBlockLength(input_size_bits, pos, decoded_size,
                          &br, &meta_block_len, &input_end);
    BROTLI_LOG_UINT(meta_block_len);
-    meta_block_end = pos + meta_block_len;
+    if (meta_block_len == 0) {
      goto End;
    }
    meta_block_end_pos = pos + meta_block_len;
    for (i = 0; i < 3; ++i) {
      block_type_trees[i].root_ = NULL;
      block_len_trees[i].root_ = NULL;
      if (BrotliReadBits(&br, 1)) {
        num_block_types[i] = BrotliReadBits(&br, 8) + 1;
-        ReadHuffmanCode(num_block_types[i] + 2, &block_type_trees[i], &br);
+        if (!ReadHuffmanCode(
-        ReadHuffmanCode(kNumBlockLengthCodes, &block_len_trees[i], &br);
+                num_block_types[i] + 2, &block_type_trees[i], &br) ||
            !ReadHuffmanCode(kNumBlockLengthCodes, &block_len_trees[i], &br)) {
          ok = 0;
          goto End;
        }
        block_length[i] = ReadBlockLength(&block_len_trees[i], &br);
        block_type_rb_index[i] = 1;
      } else {
@ -673,21 +762,32 @@ int BrotliDecompressBuffer(size_t encoded_size,
    BROTLI_LOG_UINT(block_length[1]);
    BROTLI_LOG_UINT(block_length[2]);
    if (!BrotliReadMoreInput(&br)) {
      printf("[BrotliDecompress] Unexpected end of input.\n");
      ok = 0;
      goto End;
    }
    distance_postfix_bits = BrotliReadBits(&br, 2);
    num_direct_distance_codes = NUM_DISTANCE_SHORT_CODES +
        (BrotliReadBits(&br, 4) << distance_postfix_bits);
    distance_postfix_mask = (1 << distance_postfix_bits) - 1;
    num_distance_codes = (num_direct_distance_codes +
                          (48 << distance_postfix_bits));
    context_modes = (uint8_t*)malloc(num_block_types[0]);
    for (i = 0; i < num_block_types[0]; ++i) {
      context_modes[i] = BrotliReadBits(&br, 2) << 1;
      BROTLI_LOG_ARRAY_INDEX(context_modes, i);
    }
    BROTLI_LOG_UINT(num_direct_distance_codes);
    BROTLI_LOG_UINT(distance_postfix_bits);
-    DecodeContextMap(num_block_types[0], 0, &context_mode, &contexts_per_block,
+    if (!DecodeContextMap(num_block_types[0] << kLiteralContextBits,
-                     &num_literal_htrees, &context_map, &br);
+                          &num_literal_htrees, &context_map, &br) ||
-
+        !DecodeContextMap(num_block_types[2] << kDistanceContextBits,
-    DecodeContextMap(num_block_types[2], 2, &dist_context_mode,
+                          &num_dist_htrees, &dist_context_map, &br)) {
-                     &dist_contexts_per_block,
+      ok = 0;
-                     &num_dist_htrees, &dist_context_map, &br);
+      goto End;
    }
    HuffmanTreeGroupInit(&hgroup[0], kNumLiteralCodes, num_literal_htrees);
    HuffmanTreeGroupInit(&hgroup[1], kNumInsertAndCopyCodes,
@ -695,18 +795,32 @@ int BrotliDecompressBuffer(size_t encoded_size,
    HuffmanTreeGroupInit(&hgroup[2], num_distance_codes, num_dist_htrees);
    for (i = 0; i < 3; ++i) {
-      HuffmanTreeGroupDecode(&hgroup[i], &br);
+      if (!HuffmanTreeGroupDecode(&hgroup[i], &br)) {
        ok = 0;
        goto End;
      }
    }
    context_map_slice = context_map;
    dist_context_map_slice = dist_context_map;
    context_mode = context_modes[block_type[0]];
    context_lookup_offset1 = kContextLookupOffsets[context_mode];
    context_lookup_offset2 = kContextLookupOffsets[context_mode + 1];
-    while (pos < meta_block_end) {
+    while (pos < meta_block_end_pos) {
      int insert_length;
      int copy_length;
      int distance_code;
      int distance;
      uint8_t context;
      int j;
      const uint8_t* copy_src;
      uint8_t* copy_dst;
      if (!BrotliReadMoreInput(&br)) {
        printf("[BrotliDecompress] Unexpected end of input.\n");
        ok = 0;
        goto End;
      }
      if (block_length[1] == 0) {
        DecodeBlockType(block_type_trees, 1, block_type, block_type_rb,
                        block_type_rb_index, &br);
@ -719,87 +833,120 @@ int BrotliDecompressBuffer(size_t encoded_size,
      BROTLI_LOG_UINT(copy_length);
      BROTLI_LOG_UINT(distance_code);
      for (j = 0; j < insert_length; ++j) {
        if (!BrotliReadMoreInput(&br)) {
          printf("[BrotliDecompress] Unexpected end of input.\n");
          ok = 0;
          goto End;
        }
        if (block_length[0] == 0) {
          DecodeBlockType(block_type_trees, 0, block_type, block_type_rb,
                          block_type_rb_index, &br);
          block_length[0] = ReadBlockLength(&block_len_trees[0], &br);
-          literal_htree_index = block_type[0];
+          context_offset = block_type[0] << kLiteralContextBits;
          context_offset = block_type[0] * contexts_per_block;
          context_map_slice = context_map + context_offset;
          context_mode = context_modes[block_type[0]];
          context_lookup_offset1 = kContextLookupOffsets[context_mode];
          context_lookup_offset2 = kContextLookupOffsets[context_mode + 1];
        }
-        --block_length[0];
+        context = (kContextLookup[context_lookup_offset1 + prev_byte1] |
-        BrotliFillBitWindow(&br);
+                   kContextLookup[context_lookup_offset2 + prev_byte2]);
        // Figure out htree
        if (contexts_per_block > 1) {
          uint8_t prev_byte = pos > 0 ? data[pos - 1] : 0;
          uint8_t prev_byte2 = pos > 1 ? data[pos - 2] : 0;
          uint8_t prev_byte3 = pos > 2 ? data[pos - 3] : 0;
          uint8_t context = Context(prev_byte, prev_byte2, prev_byte3,
                                    context_mode);
        BROTLI_LOG_UINT(context);
        literal_htree_index = context_map_slice[context];
-        }
+        --block_length[0];
-        data[pos] = ReadSymbol(&hgroup[0].htrees[literal_htree_index], &br);
+        prev_byte2 = prev_byte1;
        prev_byte1 = ReadSymbol(&hgroup[0].htrees[literal_htree_index], &br);
        ringbuffer[pos & ringbuffer_mask] = prev_byte1;
        BROTLI_LOG_UINT(literal_htree_index);
-        BROTLI_LOG_ARRAY_INDEX(data, pos);
+        BROTLI_LOG_ARRAY_INDEX(ringbuffer, pos & ringbuffer_mask);
-        ++pos;
+        if ((pos & ringbuffer_mask) == ringbuffer_mask) {
-      }
+          if (BrotliWrite(output, ringbuffer, ringbuffer_size) < 0) {
      if (br.error_) {
        printf("Read error after decoding literal sequence.\n");
            ok = 0;
            goto End;
          }
-
+        }
-      if (pos == meta_block_end) break;
+        ++pos;
      }
      if (pos == meta_block_end_pos) break;
      if (distance_code < 0) {
        if (!BrotliReadMoreInput(&br)) {
          printf("[BrotliDecompress] Unexpected end of input.\n");
          ok = 0;
          goto End;
        }
        if (block_length[2] == 0) {
          DecodeBlockType(block_type_trees, 2, block_type, block_type_rb,
                          block_type_rb_index, &br);
          block_length[2] = ReadBlockLength(&block_len_trees[2], &br);
          dist_htree_index = block_type[2];
-          dist_context_offset = block_type[2] * dist_contexts_per_block;
+          dist_context_offset = block_type[2] << kDistanceContextBits;
          dist_context_map_slice = dist_context_map + dist_context_offset;
        }
        --block_length[2];
        if (dist_contexts_per_block > 1) {
        uint8_t context = copy_length > 4 ? 3 : copy_length - 2;
        dist_htree_index = dist_context_map_slice[context];
        }
        distance_code = ReadCopyDistance(&hgroup[2].htrees[dist_htree_index],
                                         num_direct_distance_codes,
                                         distance_postfix_bits,
                                         distance_postfix_mask,
                                         &br);
        if (br.error_) {
          printf("Could not read copy distance.\n");
          ok = 0;
          goto End;
        }
      }
      // Convert the distance code to the actual distance by possibly looking
      // up past distances from the ringbuffer.
-      distance = TranslateShortCodes(distance_code, dist_rb, &dist_rb_idx);
+      distance = TranslateShortCodes(distance_code, dist_rb, dist_rb_idx);
      if (distance_code > 0) {
        dist_rb[dist_rb_idx & 3] = distance;
        ++dist_rb_idx;
      }
      BROTLI_LOG_UINT(distance);
-      // Do the actual copy if it is valid.
+      if (pos < (size_t)distance || pos + copy_length > meta_block_end_pos) {
-      if (distance > 0 && pos >= (size_t)distance &&
+        printf("Invalid backward reference. pos: %ld distance: %d "
-          pos + copy_length <= *decoded_size) {
+               "len: %d end: %lu\n", pos, distance, copy_length,
-        int j;
+               (unsigned long)meta_block_end_pos);
-        for (j = 0; j < copy_length; ++j) {
+        ok = 0;
-          data[pos + j] = data[pos + j - distance];
+        goto End;
      }
      copy_src = &ringbuffer[(pos - distance) & ringbuffer_mask];
      copy_dst = &ringbuffer[pos & ringbuffer_mask];
 #if (defined(__x86_64__) || defined(_M_X64))
      if (copy_src + copy_length <= ringbuffer_end &&
          copy_dst + copy_length < ringbuffer_end) {
        if (copy_length <= 16 && distance >= 8) {
          UNALIGNED_COPY64(copy_dst, copy_src);
          UNALIGNED_COPY64(copy_dst + 8, copy_src + 8);
        } else {
          IncrementalCopyFastPath(copy_dst, copy_src, copy_length);
        }
        pos += copy_length;
-      } else {
+        copy_length = 0;
-        printf("Invalid backward reference. pos: %lu distance: %d "
+      }
-               "len: %d end: %lu\n", (unsigned long)pos, distance, copy_length,
+#endif
-               (unsigned long)*decoded_size);
+
      for (j = 0; j < copy_length; ++j) {
        ringbuffer[pos & ringbuffer_mask] =
            ringbuffer[(pos - distance) & ringbuffer_mask];
        if ((pos & ringbuffer_mask) == ringbuffer_mask) {
          if (BrotliWrite(output, ringbuffer, ringbuffer_size) < 0) {
            ok = 0;
            goto End;
          }
        }
        ++pos;
      }
      // When we get here, we must have inserted at least one literal and made
      // a copy of at least length two, therefore accessing the last 2 bytes is
      // valid.
      prev_byte1 = ringbuffer[(pos - 1) & ringbuffer_mask];
      prev_byte2 = ringbuffer[(pos - 2) & ringbuffer_mask];
    }
 End:
    free(context_modes);
    free(context_map);
    free(dist_context_map);
    for (i = 0; i < 3; ++i) {
@ -809,6 +956,10 @@ int BrotliDecompressBuffer(size_t encoded_size,
    }
  }
  if (BrotliWrite(output, ringbuffer, pos & ringbuffer_mask) < 0) {
    ok = 0;
  }
  free(ringbuffer);
  return ok;
 }
--- a/dec/decode.h
+++ b/dec/decode.h
@ -17,6 +17,7 @@
 #ifndef BROTLI_DEC_DECODE_H_
 #define BROTLI_DEC_DECODE_H_
 #include "./streams.h"
 #include "./types.h"
 #if defined(__cplusplus) || defined(c_plusplus)
@ -39,6 +40,10 @@ int BrotliDecompressBuffer(size_t encoded_size,
                           size_t* decoded_size,
                           uint8_t* decoded_buffer);
 // Same as above, but uses the specified input and output callbacks instead of
 // reading from and writing to pre-allocated memory buffers.
 int BrotliDecompress(BrotliInput input, BrotliOutput output);
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/dec/huffman.c
+++ b/dec/huffman.c
@ -24,10 +24,6 @@
 extern "C" {
 #endif
 // Uncomment the following to use look-up table for ReverseBits()
 // (might be faster on some platform)
 // #define USE_LUT_REVERSE_BITS
 #define NON_EXISTENT_SYMBOL (-1)
 #define MAX_ALLOWED_CODE_LENGTH      15
@ -55,7 +51,6 @@ static void AssignChildren(HuffmanTree* const tree,
 static int TreeInit(HuffmanTree* const tree, int num_leaves) {
  assert(tree != NULL);
  tree->fixed_bit_length_ = 0;
  if (num_leaves == 0) return 0;
  // We allocate maximum possible nodes in the tree at once.
  // Note that a Huffman tree is a full binary tree; and in a full binary tree
@ -84,7 +79,7 @@ void BrotliHuffmanTreeRelease(HuffmanTree* const tree) {
 // Utility: converts Huffman code lengths to corresponding Huffman codes.
 // 'huff_codes' should be pre-allocated.
 // Returns false in case of error (memory allocation, invalid codes).
-static int HuffmanCodeLengthsToCodes(const int* const code_lengths,
+static int HuffmanCodeLengthsToCodes(const uint8_t* const code_lengths,
                                     int code_lengths_size,
                                     int* const huff_codes) {
  int symbol;
@ -133,35 +128,21 @@ static int HuffmanCodeLengthsToCodes(const int* const code_lengths,
  return 1;
 }
-#ifndef USE_LUT_REVERSE_BITS
+static const uint8_t kReverse7[128] = {
-
+  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
-static int ReverseBitsShort(int bits, int num_bits) {
+  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
-  int retval = 0;
+  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
-  int i;
+  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
-  assert(num_bits <= 8);   // Not a hard requirement, just for coherency.
+  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
-  for (i = 0; i < num_bits; ++i) {
+  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
-    retval <<= 1;
+  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
-    retval |= bits & 1;
+  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
    bits >>= 1;
  }
  return retval;
 }
 #else
 static const uint8_t kReversedBits[16] = {  // Pre-reversed 4-bit values.
  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
 };
 static int ReverseBitsShort(int bits, int num_bits) {
-  const uint8_t v = (kReversedBits[bits & 0xf] << 4) | kReversedBits[bits >> 4];
+  return kReverse7[bits] >> (7 - num_bits);
  assert(num_bits <= 8);
  return v >> (8 - num_bits);
 }
 #endif
 static int TreeAddSymbol(HuffmanTree* const tree,
                         int symbol, int code, int code_length) {
  int step = HUFF_LUT_BITS;
@ -170,13 +151,14 @@ static int TreeAddSymbol(HuffmanTree* const tree,
  const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
  assert(symbol == (int16_t)symbol);
  if (code_length <= HUFF_LUT_BITS) {
-    int i;
+    int i = 1 << (HUFF_LUT_BITS - code_length);
    base_code = ReverseBitsShort(code, code_length);
-    for (i = 0; i < (1 << (HUFF_LUT_BITS - code_length)); ++i) {
+    do {
      --i;
      const int idx = base_code | (i << code_length);
      tree->lut_symbol_[idx] = (int16_t)symbol;
      tree->lut_bits_[idx] = code_length;
-    }
+    } while (i > 0);
  } else {
    base_code = ReverseBitsShort((code >> (code_length - HUFF_LUT_BITS)),
                                 HUFF_LUT_BITS);
@ -206,7 +188,7 @@ static int TreeAddSymbol(HuffmanTree* const tree,
 }
 int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
+                                   const uint8_t* const code_lengths,
                                   int code_lengths_size) {
  int symbol;
  int num_symbols = 0;
@ -264,41 +246,6 @@ int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
  }
 }
 // Builds a Huffman tree from parallel arrays of code lengths, codes and
 // symbols. Entries whose code is NON_EXISTENT_SYMBOL are skipped. Every other
 // symbol must lie in [0, max_symbol). Returns 1 on success; returns 0 (after
 // releasing the tree) if a symbol is out of range, a symbol cannot be added,
 // or the resulting tree is not full.
 int BrotliHuffmanTreeBuildExplicit(HuffmanTree* const tree,
                                   const int* const code_lengths,
                                   const int* const codes,
                                   const int* const symbols,
                                   int max_symbol,
                                   int num_symbols) {
  int all_added = 1;
  int success;
  int idx;
  assert(tree != NULL);
  assert(code_lengths != NULL);
  assert(codes != NULL);
  assert(symbols != NULL);
  // Tree initialization fails when num_symbols is 0; nothing to release then.
  if (!TreeInit(tree, num_symbols)) {
    return 0;
  }
  // Insert each existing symbol; stop at the first invalid or rejected one.
  for (idx = 0; idx < num_symbols; ++idx) {
    if (codes[idx] == NON_EXISTENT_SYMBOL) {
      continue;
    }
    if (symbols[idx] < 0 || symbols[idx] >= max_symbol ||
        !TreeAddSymbol(tree, symbols[idx], codes[idx], code_lengths[idx])) {
      all_added = 0;
      break;
    }
  }
  // The fullness check only runs when every symbol was added successfully.
  success = all_added && IsFull(tree);
  if (!success) {
    BrotliHuffmanTreeRelease(tree);
  }
  return success;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/dec/huffman.h
+++ b/dec/huffman.h
@ -43,7 +43,6 @@ struct HuffmanTree {
  HuffmanTreeNode* root_;   // all the nodes, starting at root.
  int max_nodes_;           // max number of nodes
  int num_nodes_;           // number of currently occupied nodes
  int fixed_bit_length_;     // If non-zero, uses fixed length coding
 };
 // Returns true if the given node is not a leaf of the Huffman tree.
@ -65,19 +64,9 @@ void BrotliHuffmanTreeRelease(HuffmanTree* const tree);
 // Builds Huffman tree assuming code lengths are implicitly in symbol order.
 // Returns false in case of error (invalid tree or memory error).
 int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
+                                   const uint8_t* const code_lengths,
                                   int code_lengths_size);
 // Build a Huffman tree with explicitly given lists of code lengths, codes
 // and symbols. Verifies that all symbols added are smaller than max_symbol.
 // Returns false in case of an invalid symbol, invalid tree or memory error.
 int BrotliHuffmanTreeBuildExplicit(HuffmanTree* const tree,
                                   const int* const code_lengths,
                                   const int* const codes,
                                   const int* const symbols,
                                   int max_symbol,
                                   int num_symbols);
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/dec/streams.c
+++ b/dec/streams.c
@ -0,0 +1,106 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Functions for streaming input and output.
 #include <string.h>
 #include <unistd.h>
 #include "./streams.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // Input callback for an in-memory source; 'data' must point to a
 // BrotliMemInput. Copies at most 'count' bytes from the current position into
 // 'buf' and advances the position by the number of bytes copied. Returns that
 // number (0 once the region is exhausted), or -1 if the stored position lies
 // past the end of the region.
 int BrotliMemInputFunction(void* data, uint8_t* buf, size_t count) {
  BrotliMemInput* input = (BrotliMemInput*)data;
  size_t remaining;
  if (input->pos > input->length) {
    return -1;
  }
  // Clamp against the bytes that are left instead of computing pos + count:
  // that sum can wrap around for a very large count, which would skip the
  // clamp and let memcpy read past the end of the region.
  remaining = input->length - input->pos;
  if (count > remaining) {
    count = remaining;
  }
  memcpy(buf, input->buffer + input->pos, count);
  input->pos += count;
  return (int)count;
 }
 // Resets 'mem_input' so that it describes the 'length' bytes starting at
 // 'buffer' with the read position at 0, and returns an input whose callback is
 // BrotliMemInputFunction and whose data pointer is 'mem_input'.
 BrotliInput BrotliInitMemInput(const uint8_t* buffer, size_t length,
                               BrotliMemInput* mem_input) {
  BrotliInput result;
  result.data_ = mem_input;
  result.cb_ = &BrotliMemInputFunction;
  mem_input->pos = 0;
  mem_input->length = length;
  mem_input->buffer = buffer;
  return result;
 }
 // Output callback for an in-memory sink; 'data' must point to a
 // BrotliMemOutput. Appends 'count' bytes from 'buf' at the current position
 // and advances the position. Returns 'count' on success, or -1 (writing
 // nothing) if the bytes do not fit in the remaining space.
 int BrotliMemOutputFunction(void* data, const uint8_t* buf, size_t count) {
  BrotliMemOutput* output = (BrotliMemOutput*)data;
  // Compare count against the free space rather than testing pos + count,
  // because that sum can wrap around for a very large count and wrongly pass
  // the bounds check.
  if (output->pos > output->length ||
      count > output->length - output->pos) {
    return -1;
  }
  memcpy(output->buffer + output->pos, buf, count);
  output->pos += count;
  return (int)count;
 }
 // Resets 'mem_output' so that it describes the 'length' bytes starting at
 // 'buffer' with the write position at 0, and returns an output whose callback
 // is BrotliMemOutputFunction and whose data pointer is 'mem_output'.
 BrotliOutput BrotliInitMemOutput(uint8_t* buffer, size_t length,
                                 BrotliMemOutput* mem_output) {
  BrotliOutput result;
  result.data_ = mem_output;
  result.cb_ = &BrotliMemOutputFunction;
  mem_output->pos = 0;
  mem_output->length = length;
  mem_output->buffer = buffer;
  return result;
 }
 // Input callback that reads up to 'count' bytes from standard input into
 // 'buf' and returns the result of read(). 'data' is not used.
 int BrotliStdinInputFunction(void* data, uint8_t* buf, size_t count) {
  const int bytes_read = read(STDIN_FILENO, buf, count);
  (void)data;
  return bytes_read;
 }
 // Returns an input that reads from standard input; it carries no data pointer.
 BrotliInput BrotliStdinInput() {
  BrotliInput stdin_input;
  stdin_input.data_ = NULL;
  stdin_input.cb_ = BrotliStdinInputFunction;
  return stdin_input;
 }
 // Output callback that writes 'count' bytes from 'buf' to standard output and
 // returns the result of write(). 'data' is not used.
 int BrotliStdoutOutputFunction(void* data, const uint8_t* buf, size_t count) {
  const int bytes_written = write(STDOUT_FILENO, buf, count);
  (void)data;
  return bytes_written;
 }
 // Returns an output that writes to standard output; it carries no data
 // pointer.
 BrotliOutput BrotliStdoutOutput() {
  BrotliOutput stdout_output;
  stdout_output.data_ = NULL;
  stdout_output.cb_ = BrotliStdoutOutputFunction;
  return stdout_output;
 }
 // Output callback that writes 'count' bytes from 'buf' to the stdio stream
 // passed as 'data' (a FILE*). Returns 'count' when every byte was accepted by
 // fwrite(), or -1 if the write came up short.
 int BrotliFileOutputFunction(void* data, const uint8_t* buf, size_t count) {
  const size_t written = fwrite(buf, 1, count, (FILE*)data);
  // fwrite() reports failure through a short item count, never a negative
  // value, so map that onto the -1 error result used by output callbacks.
  if (written != count) {
    return -1;
  }
  return (int)written;
 }
 // Returns an output that writes to the stdio stream 'f' through
 // BrotliFileOutputFunction.
 BrotliOutput BrotliFileOutput(FILE* f) {
  BrotliOutput file_output;
  file_output.data_ = f;
  file_output.cb_ = BrotliFileOutputFunction;
  return file_output;
 }
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/dec/streams.h
+++ b/dec/streams.h
@ -0,0 +1,102 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Functions for streaming input and output.
 #ifndef BROTLI_DEC_STREAMS_H_
 #define BROTLI_DEC_STREAMS_H_
 #include <stdio.h>
 #include "./types.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // Function pointer type used to read len bytes into buf. Returns the
 // number of bytes read or -1 on error.
 typedef int (*BrotliInputFunction)(void* data, uint8_t* buf, size_t len);
 // Input callback function with associated data.
 typedef struct {
  BrotliInputFunction cb_;
  void* data_;
 } BrotliInput;
 // Reads len bytes into buf, using the in callback: forwards buf and len,
 // together with the data pointer stored in 'in', to in.cb_ and returns the
 // callback's result unchanged.
 static BROTLI_INLINE int BrotliRead(BrotliInput in, uint8_t* buf, size_t len) {
  return in.cb_(in.data_, buf, len);
 }
 // Function pointer type used to write len bytes from buf. Returns the
 // number of bytes written or -1 on error.
 typedef int (*BrotliOutputFunction)(void* data, const uint8_t* buf, size_t len);
 // Output callback function with associated data.
 typedef struct {
  BrotliOutputFunction cb_;
  void* data_;
 } BrotliOutput;
 // Writes len bytes from buf, using the out callback: forwards buf and len,
 // together with the data pointer stored in 'out', to out.cb_ and returns the
 // callback's result unchanged.
 static BROTLI_INLINE int BrotliWrite(BrotliOutput out,
                                     const uint8_t* buf, size_t len) {
  return out.cb_(out.data_, buf, len);
 }
 // Memory region with position. Used as the data pointer of the input
 // callback BrotliMemInputFunction.
 typedef struct {
  const uint8_t* buffer;  // Start of the region to read from.
  size_t length;          // Size of the region, in bytes.
  size_t pos;             // Offset of the next byte to hand out.
 } BrotliMemInput;
 // Input callback where *data is a BrotliMemInput struct.
 int BrotliMemInputFunction(void* data, uint8_t* buf, size_t count);
 // Returns an input callback that wraps the given memory region.
 BrotliInput BrotliInitMemInput(const uint8_t* buffer, size_t length,
                               BrotliMemInput* mem_input);
 // Output buffer with position. Used as the data pointer of the output
 // callback BrotliMemOutputFunction.
 typedef struct {
  uint8_t* buffer;  // Start of the region to write into.
  size_t length;    // Capacity of the region, in bytes.
  size_t pos;       // Offset at which the next write is stored.
 } BrotliMemOutput;
 // Output callback where *data is a BrotliMemOutput struct.
 int BrotliMemOutputFunction(void* data, const uint8_t* buf, size_t count);
 // Returns an output callback that wraps the given memory region.
 BrotliOutput BrotliInitMemOutput(uint8_t* buffer, size_t length,
                                 BrotliMemOutput* mem_output);
 // Input callback that reads from standard input.
 int BrotliStdinInputFunction(void* data, uint8_t* buf, size_t count);
 BrotliInput BrotliStdinInput();
 // Output callback that writes to standard output.
 int BrotliStdoutOutputFunction(void* data, const uint8_t* buf, size_t count);
 BrotliOutput BrotliStdoutOutput();
 // Output callback that writes to a file.
 int BrotliFileOutputFunction(void* data, const uint8_t* buf, size_t count);
 BrotliOutput BrotliFileOutput(FILE* f);
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 #endif  // BROTLI_DEC_STREAMS_H_
--- a/enc/backward_references.cc
+++ b/enc/backward_references.cc
@ -20,60 +20,64 @@
 #include <vector>
 #include "./command.h"
 #include "./hash.h"
 #include "./literal_cost.h"
 namespace brotli {
-void CreateBackwardReferences(const uint8_t* data,
+void CreateBackwardReferences(size_t num_bytes,
-                              int length,
+                              size_t position,
                              const uint8_t* ringbuffer,
                              const float* literal_cost,
                              size_t ringbuffer_mask,
                              const size_t max_backward_limit,
                              Hasher* hasher,
                              std::vector<Command>* commands) {
  HashLongestMatch<13,11> *hasher = new HashLongestMatch<13,11>;
  float *literal_cost = new float[length];
  EstimateBitCostsForLiterals(length, data, literal_cost);
  hasher->SetLiteralCost(literal_cost);
  // Length heuristic that seems to help probably by better selection
  // of lazy matches of similar lengths.
  int insert_length = 0;
-  size_t i = 0;
+  size_t i = position & ringbuffer_mask;
  const int i_diff = position - i;
  const size_t i_end = i + num_bytes;
  double average_cost = 0.0;
-  for (int i = 0; i < length; ++i) {
+  for (int k = position; k < position + num_bytes; ++k) {
-    average_cost += literal_cost[i];
+    average_cost += literal_cost[k & ringbuffer_mask];
  }
-  average_cost /= length;
+  average_cost /= num_bytes;
  hasher->set_average_cost(average_cost);
-  while (i + 2 < length) {
+  while (i + 2 < i_end) {
    size_t best_len = 0;
    size_t best_dist = 0;
    double best_score = 0;
-    const size_t max_distance = std::min(i, 1UL << 24);
+    const size_t max_distance = std::min(i + i_diff, max_backward_limit);
    hasher->set_insert_length(insert_length);
    bool match_found = hasher->FindLongestMatch(
-        data, i, length - i, max_distance,
+        ringbuffer, literal_cost, ringbuffer_mask,
        i + i_diff, i_end - i, max_distance,
        &best_len, &best_dist, &best_score);
    if (match_found) {
      // Found a match. Let's look for something even better ahead.
      int delayed_backward_references_in_row = 0;
-      while (i + 4 < length &&
+      while (i + 4 < i_end &&
             delayed_backward_references_in_row < 4) {
        size_t best_len_2 = 0;
        size_t best_dist_2 = 0;
        double best_score_2 = 0;
-        hasher->Store(data + i, i);
+        hasher->Store(ringbuffer + i, i + i_diff);
        match_found = hasher->FindLongestMatch(
-            data, i + 1, length - i - 1, max_distance,
+            ringbuffer, literal_cost, ringbuffer_mask,
            i + i_diff + 1, i_end - i - 1, max_distance,
            &best_len_2, &best_dist_2, &best_score_2);
        double cost_diff_lazy = 0;
        if (best_len >= 4) {
-          cost_diff_lazy += hasher->literal_cost(i + 4) - average_cost;
+          cost_diff_lazy +=
              literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
        }
        {
          const int tail_length = best_len_2 - best_len + 1;
          for (int k = 0; k < tail_length; ++k) {
-            cost_diff_lazy -= hasher->literal_cost(i + best_len + k) -
+            cost_diff_lazy -=
                literal_cost[(i + best_len + k) & ringbuffer_mask] -
                average_cost;
          }
        }
@ -84,7 +88,7 @@ void CreateBackwardReferences(const uint8_t* data,
        }
        // Add bias to slightly avoid lazy matching.
        cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
-        cost_diff_lazy += 0.04 * hasher->literal_cost(i);
+        cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
        if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
          // Ok, let's just write one byte for now and start a match from the
@ -109,18 +113,18 @@ void CreateBackwardReferences(const uint8_t* data,
      insert_length = 0;
      ++i;
      for (int j = 1; j < best_len; ++j) {
-        if (i + 2 < length) {
+        if (i + 2 < i_end) {
-          hasher->Store(data + i, i);
+          hasher->Store(ringbuffer + i, i + i_diff);
        }
        ++i;
      }
    } else {
      ++insert_length;
-      hasher->Store(data + i, i);
+      hasher->Store(ringbuffer + i, i + i_diff);
      ++i;
    }
  }
-  insert_length += (length - i);
+  insert_length += (i_end - i);
  if (insert_length > 0) {
    Command cmd;
@ -129,9 +133,6 @@ void CreateBackwardReferences(const uint8_t* data,
    cmd.copy_distance_ = 0;
    commands->push_back(cmd);
  }
  delete[] literal_cost;
  delete hasher;
 }
 }  // namespace brotli
--- a/enc/backward_references.h
+++ b/enc/backward_references.h
@ -20,12 +20,18 @@
 #include <stdint.h>
 #include <vector>
 #include "./hash.h"
 #include "./command.h"
 namespace brotli {
-void CreateBackwardReferences(const uint8_t* data,
+void CreateBackwardReferences(size_t num_bytes,
-                              int length,
+                              size_t position,
                              const uint8_t* ringbuffer,
                              const float* literal_cost,
                              size_t ringbuffer_mask,
                              const size_t max_backward_limit,
                              Hasher* hasher,
                              std::vector<Command>* commands);
 }  // namespace brotli
--- a/enc/bit_cost.h
+++ b/enc/bit_cost.h
@ -122,26 +122,31 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {
 template<int kSize>
 double PopulationCost(const Histogram<kSize>& histogram) {
  if (histogram.total_count_ == 0) {
-    return 4;
+    return 11;
  }
  int symbols[2] = { 0 };
  int count = 0;
-  for (int i = 0; i < kSize && count < 3; ++i) {
+  for (int i = 0; i < kSize && count < 5; ++i) {
    if (histogram.data_[i] > 0) {
      if (count < 2) symbols[count] = i;
      ++count;
    }
  }
-  if (count <= 2 && symbols[0] < 256 && symbols[1] < 256) {
+  if (count == 1) {
-    return ((symbols[0] <= 1 ? 4 : 11) +
+    return 11;
-            (count == 2 ? 8 + histogram.total_count_ : 0));
+  }
  if (count == 2) {
    return 19 + histogram.total_count_;
  }
  uint8_t depth[kSize] = { 0 };
  CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
-  int bits = HuffmanBitCost(depth, kSize);
+  int bits = 0;
  for (int i = 0; i < kSize; ++i) {
    bits += histogram.data_[i] * depth[i];
  }
  if (count == 3) {
    bits += 27;
  } else {
    bits += HuffmanBitCost(depth, kSize);
  }
  return bits;
 }
--- a/enc/context.h
+++ b/enc/context.h
@ -21,25 +21,124 @@
 namespace brotli {
-static const int kSigned2BitContextLookup[] = {
+// Second-order context lookup table for UTF8 byte streams.
 //
 // If p1 and p2 are the previous two bytes, we calculate the context as
 //
 //   context = kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256].
 //
 // If the previous two bytes are ASCII characters (i.e. < 128), this will be
 // equivalent to
 //
 //   context = 4 * context1(p1) + context2(p2),
 //
 // where context1 is based on the previous byte in the following way:
 //
 //   0  : non-ASCII control
 //   1  : \t, \n, \r
 //   2  : space
 //   3  : other punctuation
 //   4  : " '
 //   5  : %
 //   6  : ( < [ {
 //   7  : ) > ] }
 //   8  : , ; :
 //   9  : .
 //   10 : =
 //   11 : number
 //   12 : upper-case vowel
 //   13 : upper-case consonant
 //   14 : lower-case vowel
 //   15 : lower-case consonant
 //
 // and context2 is based on the second last byte:
 //
 //   0 : control, space
 //   1 : punctuation
 //   2 : upper-case letter, number
 //   3 : lower-case letter
 //
 // If the last byte is ASCII, and the second last byte is not (in a valid UTF8
 // stream it will be a continuation byte, value between 128 and 191), the
 // context is the same as if the second last byte was an ASCII control or space.
 //
 // If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
 // be a continuation byte and the context id is 2 or 3 depending on the LSB of
 // the last byte and to a lesser extent on the second last byte if it is ASCII.
 //
 // If the last byte is a UTF8 continuation byte, the second last byte can be:
 //   - continuation byte: the next byte is probably ASCII or lead byte (assuming
 //     4-byte UTF8 characters are rare) and the context id is 0 or 1.
 //   - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1
 //   - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3
 //
 // The possible value combinations of the previous two bytes, the range of
 // context ids and the type of the next byte is summarized in the table below:
 //
 // |--------\-----------------------------------------------------------------|
 // |         \                         Last byte                              |
 // | Second   \---------------------------------------------------------------|
 // | last byte \    ASCII            |   cont. byte        |   lead byte      |
 // |            \   (0-127)          |   (128-191)         |   (192-)         |
 // |=============|===================|=====================|==================|
 // |  ASCII      | next: ASCII/lead  |  not valid          |  next: cont.     |
 // |  (0-127)    | context: 4 - 63   |                     |  context: 2 - 3  |
 // |-------------|-------------------|---------------------|------------------|
 // |  cont. byte | next: ASCII/lead  |  next: ASCII/lead   |  next: cont.     |
 // |  (128-191)  | context: 4 - 63   |  context: 0 - 1     |  context: 2 - 3  |
 // |-------------|-------------------|---------------------|------------------|
 // |  lead byte  | not valid         |  next: ASCII/lead   |  not valid       |
 // |  (192-207)  |                   |  context: 0 - 1     |                  |
 // |-------------|-------------------|---------------------|------------------|
 // |  lead byte  | not valid         |  next: cont.        |  not valid       |
 // |  (208-)     |                   |  context: 2 - 3     |                  |
 // |-------------|-------------------|---------------------|------------------|
 static const uint8_t kUTF8ContextLookup[512] = {
  // Last byte.
  //
  // ASCII range.
   0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,
  // UTF8 continuation byte range.
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  // UTF8 lead byte range.
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  // Second last byte.
  //
  // ASCII range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  // UTF8 continuation byte range.
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // UTF8 lead byte range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
 };
 // Context lookup table for small signed integers.
 static const int kSigned3BitContextLookup[] = {
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@ -59,69 +158,25 @@ static const int kSigned3BitContextLookup[] = {
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
 };
 // Lookup table with 256 entries, indexed by a byte value, whose entries are
 // context ids in the range 0..15. The entries are non-decreasing: the lowest
 // and highest byte values get ids of their own, while the middle of the byte
 // range shares a few wide buckets.
 static const int kSigned4BitContextLookup[] = {
   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
  11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 15,
 };
 enum ContextType {
-  CONTEXT_NONE        = 0,
+  CONTEXT_LSB6         = 0,
-  CONTEXT_FULL        = 1,
+  CONTEXT_MSB6         = 1,
-  CONTEXT_MSB7        = 2,
+  CONTEXT_UTF8         = 2,
-  CONTEXT_MSB6        = 3,
+  CONTEXT_SIGNED       = 3
  CONTEXT_MSB5        = 4,
  CONTEXT_MSB4        = 5,
  CONTEXT_MSB3        = 6,
  CONTEXT_MSB2        = 7,
  CONTEXT_MSB1        = 8,
  CONTEXT_IS_ZERO     = 9,
  CONTEXT_SIGNED_2BIT = 10,
  CONTEXT_SIGNED_3BIT = 11,
  CONTEXT_SIGNED_4BIT = 12,
  CONTEXT_SIGNED_MIXED_3BYTE = 13,
 };
-static const int kContextSize[] = {
+static inline uint8_t Context(uint8_t p1, uint8_t p2, int mode) {
  1, 256, 128, 64, 32, 16, 8, 4, 2, 2, 4, 8, 16, 64,
 };
 static inline int NumContexts(int mode) {
  return kContextSize[mode];
 }
 static inline uint8_t Context(uint8_t prev_byte, uint8_t prev_byte2,
                              uint8_t prev_byte3, int mode) {
  switch (mode) {
-    case CONTEXT_NONE:
+    case CONTEXT_LSB6:
-      return 0;
+      return p1 & 0x3f;
-    case CONTEXT_IS_ZERO:
+    case CONTEXT_MSB6:
-      return prev_byte == 0 ? 0 : 1;
+      return p1 >> 2;
-    case CONTEXT_SIGNED_2BIT:
+    case CONTEXT_UTF8:
-      return kSigned2BitContextLookup[prev_byte];
+      return kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256];
-    case CONTEXT_SIGNED_3BIT:
+    case CONTEXT_SIGNED:
-      return kSigned3BitContextLookup[prev_byte];
+      return (kSigned3BitContextLookup[p1] << 3) + kSigned3BitContextLookup[p2];
    case CONTEXT_SIGNED_4BIT:
      return kSigned4BitContextLookup[prev_byte];
    case CONTEXT_SIGNED_MIXED_3BYTE:
      return ((kSigned3BitContextLookup[prev_byte] << 3) +
              (kSigned2BitContextLookup[prev_byte2] << 1) +
              (prev_byte3 == 0 ? 0 : 1));
    default:
-      return prev_byte >> (mode - 1);
+      return 0;
  }
 }
--- a/enc/encode.cc
+++ b/enc/encode.cc
@ -26,7 +26,9 @@
 #include "./context.h"
 #include "./entropy_encode.h"
 #include "./fast_log.h"
 #include "./hash.h"
 #include "./histogram.h"
 #include "./literal_cost.h"
 #include "./prefix.h"
 #include "./write_bits.h"
@ -41,31 +43,39 @@ double Entropy(const std::vector<Histogram<kSize> >& histograms) {
  return retval;
 }
 // Sums PopulationCost() over every histogram in 'histograms' and returns the
 // total; an empty vector yields 0.
 template<int kSize>
 double TotalBitCost(const std::vector<Histogram<kSize> >& histograms) {
  double total = 0;
  int idx = 0;
  while (idx < histograms.size()) {
    total += PopulationCost(histograms[idx]);
    ++idx;
  }
  return total;
 }
 void EncodeSize(size_t len, int* storage_ix, uint8_t* storage) {
  std::vector<uint8_t> len_bytes;
-  while (len > 0) {
+  do {
    len_bytes.push_back(len & 0xff);
    len >>= 8;
-  };
+  } while (len > 0);
  WriteBits(3, len_bytes.size(), storage_ix, storage);
  for (int i = 0; i < len_bytes.size(); ++i) {
    WriteBits(8, len_bytes[i], storage_ix, storage);
  }
 }
-void EncodeMetaBlockLength(int input_size_bits,
+void EncodeMetaBlockLength(size_t meta_block_size,
                           size_t meta_block_size,
                           bool is_last_meta_block,
                           int* storage_ix, uint8_t* storage) {
-  WriteBits(1, is_last_meta_block, storage_ix, storage);
+  WriteBits(1, 0, storage_ix, storage);
-  if (is_last_meta_block) return;
+  int num_bits = Log2Floor(meta_block_size) + 1;
-  while (input_size_bits > 0) {
+  WriteBits(3, (num_bits + 3) >> 2, storage_ix, storage);
-    WriteBits(8, meta_block_size & 0xff, storage_ix, storage);
+  while (num_bits > 0) {
-    meta_block_size >>= 8;
+    WriteBits(4, meta_block_size & 0xf, storage_ix, storage);
-    input_size_bits -= 8;
+    meta_block_size >>= 4;
    num_bits -= 4;
  }
-  if (input_size_bits > 0) {
+  if (num_bits > 0) {
-    WriteBits(input_size_bits, meta_block_size, storage_ix, storage);
+    WriteBits(num_bits, meta_block_size, storage_ix, storage);
  }
 }
@ -82,7 +92,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    const uint8_t* code_length_bitdepth,
    int* storage_ix, uint8_t* storage) {
  static const uint8_t kStorageOrder[kCodeLengthCodes] = {
-    17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    1, 2, 3, 4, 0, 17, 18, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
  };
  // Throw away trailing zeros:
  int codes_to_store = kCodeLengthCodes;
@ -92,8 +102,16 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    }
  }
  WriteBits(4, codes_to_store - 4, storage_ix, storage);
-  for (int i = 0; i < codes_to_store; ++i) {
+  const int skip_two_first =
-    WriteBits(3, code_length_bitdepth[kStorageOrder[i]], storage_ix, storage);
+      code_length_bitdepth[kStorageOrder[0]] == 0 &&
      code_length_bitdepth[kStorageOrder[1]] == 0;
  WriteBits(1, skip_two_first, storage_ix, storage);
  for (int i = skip_two_first * 2; i < codes_to_store; ++i) {
    uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
    uint8_t bits[] = { 0, 7, 3, 1, 2, 15 };
    int v = code_length_bitdepth[kStorageOrder[i]];
    WriteBits(len[v], bits[v], storage_ix, storage);
  }
 }
@ -124,30 +142,49 @@ void StoreHuffmanTreeToBitMask(
 template<int kSize>
 void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
                      int* storage_ix, uint8_t* storage) {
-  const int kMaxBits = 8;
+  const uint8_t *depth = &code.depth_[0];
-  const int kMaxSymbol = 1 << kMaxBits;
+  int max_bits_counter = alphabet_size - 1;
-
+  int max_bits = 0;
  while (max_bits_counter) {
    max_bits_counter >>= 1;
    ++max_bits;
  }
  if (code.count_ == 0) {   // emit minimal tree for empty cases
-    // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
+    // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0
-    WriteBits(4, 0x01, storage_ix, storage);
+    WriteBits(3 + max_bits, 0x01, storage_ix, storage);
    return;
  }
-  if (code.count_ <= 2 &&
+  if (code.count_ <= 4) {
-      code.symbols_[0] < kMaxSymbol &&
+    int symbols[4];
-      code.symbols_[1] < kMaxSymbol) {
+    // Quadratic sort.
-    // Small tree marker to encode 1 or 2 symbols.
+    int k, j;
    for (k = 0; k < code.count_; ++k) {
      symbols[k] = code.symbols_[k];
    }
    for (k = 0; k < code.count_; ++k) {
      for (j = k + 1; j < code.count_; ++j) {
        if (depth[symbols[j]] < depth[symbols[k]]) {
          int t = symbols[k];
          symbols[k] = symbols[j];
          symbols[j] = t;
        }
      }
    }
    // Small tree marker to encode 1-4 symbols.
    WriteBits(1, 1, storage_ix, storage);
-    WriteBits(1, code.count_ - 1, storage_ix, storage);
+    WriteBits(2, code.count_ - 1, storage_ix, storage);
-    if (code.symbols_[0] <= 1) {
+    for (int i = 0; i < code.count_; ++i) {
-      // Code bit for small (1 bit) symbol value.
+      WriteBits(max_bits, symbols[i], storage_ix, storage);
    }
    if (code.count_ == 4) {
      if (depth[symbols[0]] == 2 &&
          depth[symbols[1]] == 2 &&
          depth[symbols[2]] == 2 &&
          depth[symbols[3]] == 2) {
        WriteBits(1, 0, storage_ix, storage);
      WriteBits(1, code.symbols_[0], storage_ix, storage);
      } else {
        WriteBits(1, 1, storage_ix, storage);
      WriteBits(8, code.symbols_[0], storage_ix, storage);
      }
    if (code.count_ == 2) {
      WriteBits(8, code.symbols_[1], storage_ix, storage);
    }
    return;
  }
@ -156,7 +193,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
  uint8_t huffman_tree[kSize];
  uint8_t huffman_tree_extra_bits[kSize];
  int huffman_tree_size = 0;
-  WriteHuffmanTree(&code.depth_[0],
+  WriteHuffmanTree(depth,
                   alphabet_size,
                   &huffman_tree[0],
                   &huffman_tree_extra_bits[0],
@ -167,7 +204,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
    huffman_tree_histogram.Add(huffman_tree[i]);
  }
  EntropyCode<kCodeLengthCodes> huffman_tree_entropy;
-  BuildEntropyCode(huffman_tree_histogram, 7, kCodeLengthCodes,
+  BuildEntropyCode(huffman_tree_histogram, 5, kCodeLengthCodes,
                   &huffman_tree_entropy);
  Histogram<kCodeLengthCodes> trimmed_histogram = huffman_tree_histogram;
  uint8_t* last_code = &huffman_tree[huffman_tree_size - 1];
@ -178,7 +215,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
  bool write_length = false;
  if (trimmed_size > 1 && trimmed_size < huffman_tree_size) {
    EntropyCode<kCodeLengthCodes> trimmed_entropy;
-    BuildEntropyCode(trimmed_histogram, 7, kCodeLengthCodes, &trimmed_entropy);
+    BuildEntropyCode(trimmed_histogram, 5, kCodeLengthCodes, &trimmed_entropy);
    int huffman_bit_cost = HuffmanTreeBitCost(huffman_tree_histogram,
                                              huffman_tree_entropy);
    int trimmed_bit_cost = HuffmanTreeBitCost(trimmed_histogram,
@ -247,16 +284,15 @@ void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy,
  }
 }
-
+void ComputeDistanceShortCodes(std::vector<Command>* cmds,
-void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
+                               int* dist_ringbuffer,
                               size_t* ringbuffer_idx) {
  static const int kIndexOffset[16] = {
    3, 2, 1, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2
  };
  static const int kValueOffset[16] = {
    0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
  };
  int dist_ringbuffer[4] = { 4, 11, 15, 16 };
  int ringbuffer_idx = 0;
  for (int i = 0; i < cmds->size(); ++i) {
    int cur_dist = (*cmds)[i].copy_distance_;
    if (cur_dist == 0) break;
@ -268,7 +304,7 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
        // with them.
        continue;
      }
-      int comp = (dist_ringbuffer[(ringbuffer_idx + kIndexOffset[k]) & 3] +
+      int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] +
                  kValueOffset[k]);
      if (cur_dist == comp) {
        dist_code = k + 1;
@ -276,8 +312,8 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
      }
    }
    if (dist_code > 1) {
-      dist_ringbuffer[ringbuffer_idx & 3] = cur_dist;
+      dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist;
-      ++ringbuffer_idx;
+      ++(*ringbuffer_idx);
    }
    (*cmds)[i].distance_code_ = dist_code;
  }
@ -414,19 +450,8 @@ int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
 }
 void EncodeContextMap(const std::vector<int>& context_map,
                      int context_mode,
                      int context_mode_bits,
                      int num_clusters,
                      int* storage_ix, uint8_t* storage) {
  if (context_mode == 0) {
    WriteBits(1, 0, storage_ix, storage);  // no context
    return;
  }
  WriteBits(1, 1, storage_ix, storage);  // have context
  if (context_mode_bits > 0) {
    WriteBits(context_mode_bits, context_mode - 1, storage_ix, storage);
  }
  WriteBits(8, num_clusters - 1, storage_ix, storage);
  if (num_clusters == 1 || num_clusters == context_map.size()) {
@ -560,7 +585,6 @@ struct EncodingParams {
  int num_direct_distance_codes;
  int distance_postfix_bits;
  int literal_context_mode;
  int distance_context_mode;
 };
 struct MetaBlock {
@ -569,6 +593,7 @@ struct MetaBlock {
  BlockSplit literal_split;
  BlockSplit command_split;
  BlockSplit distance_split;
  std::vector<int> literal_context_modes;
  std::vector<int> literal_context_map;
  std::vector<int> distance_context_map;
  std::vector<HistogramLiteral> literal_histograms;
@ -578,8 +603,9 @@ struct MetaBlock {
 void BuildMetaBlock(const EncodingParams& params,
                    const std::vector<Command>& cmds,
-                    const uint8_t* input_buffer,
+                    const uint8_t* ringbuffer,
-                    size_t pos,
+                    const size_t pos,
                    const size_t mask,
                    MetaBlock* mb) {
  mb->cmds = cmds;
  mb->params = params;
@ -587,7 +613,7 @@ void BuildMetaBlock(const EncodingParams& params,
                         mb->params.num_direct_distance_codes,
                         mb->params.distance_postfix_bits);
  SplitBlock(mb->cmds,
-             input_buffer + pos,
+             &ringbuffer[pos & mask],
             &mb->literal_split,
             &mb->command_split,
             &mb->distance_split);
@ -595,16 +621,14 @@ void BuildMetaBlock(const EncodingParams& params,
  ComputeBlockTypeShortCodes(&mb->command_split);
  ComputeBlockTypeShortCodes(&mb->distance_split);
-  int num_literal_contexts_per_block_type =
+  mb->literal_context_modes.resize(mb->literal_split.num_types_,
-      NumContexts(mb->params.literal_context_mode);
+                                   mb->params.literal_context_mode);
  int num_literal_contexts =
-      mb->literal_split.num_types_ *
+      mb->literal_split.num_types_ << kLiteralContextBits;
      num_literal_contexts_per_block_type;
  int num_distance_contexts_per_block_type =
      (mb->params.distance_context_mode > 0 ? 4 : 1);
  int num_distance_contexts =
-      mb->distance_split.num_types_ *
+      mb->distance_split.num_types_ << kDistanceContextBits;
      num_distance_contexts_per_block_type;
  std::vector<HistogramLiteral> literal_histograms(num_literal_contexts);
  mb->command_histograms.resize(mb->command_split.num_types_);
  std::vector<HistogramDistance> distance_histograms(num_distance_contexts);
@ -612,10 +636,10 @@ void BuildMetaBlock(const EncodingParams& params,
                  mb->literal_split,
                  mb->command_split,
                  mb->distance_split,
-                  input_buffer,
+                  ringbuffer,
                  pos,
-                  mb->params.literal_context_mode,
+                  mask,
-                  mb->params.distance_context_mode,
+                  mb->literal_context_modes,
                  &literal_histograms,
                  &mb->command_histograms,
                  &distance_histograms);
@ -625,25 +649,21 @@ void BuildMetaBlock(const EncodingParams& params,
  static const int kMaxNumberOfHistograms = 240;
  mb->literal_histograms = literal_histograms;
  if (mb->params.literal_context_mode > 0) {
  ClusterHistograms(literal_histograms,
-                      num_literal_contexts_per_block_type,
+                    1 << kLiteralContextBits,
                    mb->literal_split.num_types_,
                    kMaxNumberOfHistograms,
                    &mb->literal_histograms,
                    &mb->literal_context_map);
  }
  mb->distance_histograms = distance_histograms;
  if (mb->params.distance_context_mode > 0) {
  ClusterHistograms(distance_histograms,
-                      num_distance_contexts_per_block_type,
+                    1 << kDistanceContextBits,
                    mb->distance_split.num_types_,
                    kMaxNumberOfHistograms,
                    &mb->distance_histograms,
                    &mb->distance_context_map);
 }
 }
 size_t MetaBlockLength(const std::vector<Command>& cmds) {
  size_t length = 0;
@ -655,14 +675,13 @@ size_t MetaBlockLength(const std::vector<Command>& cmds) {
 }
 void StoreMetaBlock(const MetaBlock& mb,
-                    const uint8_t* input_buffer,
+                    const uint8_t* ringbuffer,
-                    int input_size_bits,
+                    const size_t mask,
                    bool is_last,
                    size_t* pos,
                    int* storage_ix, uint8_t* storage) {
  size_t length = MetaBlockLength(mb.cmds);
  const size_t end_pos = *pos + length;
-  EncodeMetaBlockLength(input_size_bits, length - 1, is_last,
+  EncodeMetaBlockLength(length - 1,
                        storage_ix, storage);
  BlockSplitCode literal_split_code;
  BlockSplitCode command_split_code;
@ -680,10 +699,11 @@ void StoreMetaBlock(const MetaBlock& mb,
  int num_distance_codes =
      kNumDistanceShortCodes + mb.params.num_direct_distance_codes +
      (48 << mb.params.distance_postfix_bits);
-  EncodeContextMap(mb.literal_context_map, mb.params.literal_context_mode, 4,
+  for (int i = 0; i < mb.literal_split.num_types_; ++i) {
-                   mb.literal_histograms.size(), storage_ix, storage);
+    WriteBits(2, mb.literal_context_modes[i], storage_ix, storage);
-  EncodeContextMap(mb.distance_context_map, mb.params.distance_context_mode, 0,
+  }
-                   mb.distance_histograms.size(), storage_ix, storage);
+  EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(), storage_ix, storage);
  EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(), storage_ix, storage);
  std::vector<EntropyCodeLiteral> literal_codes;
  std::vector<EntropyCodeCommand> command_codes;
  std::vector<EntropyCodeDistance> distance_codes;
@ -705,27 +725,22 @@ void StoreMetaBlock(const MetaBlock& mb,
    for (int j = 0; j < cmd.insert_length_; ++j) {
      MoveAndEncode(literal_split_code, &literal_it, storage_ix, storage);
      int histogram_idx = literal_it.type_;
-      if (mb.params.literal_context_mode > 0) {
+      uint8_t prev_byte = *pos > 0 ? ringbuffer[(*pos - 1) & mask] : 0;
-        uint8_t prev_byte = *pos > 0 ? input_buffer[*pos - 1] : 0;
+      uint8_t prev_byte2 = *pos > 1 ? ringbuffer[(*pos - 2) & mask] : 0;
-        uint8_t prev_byte2 = *pos > 1 ? input_buffer[*pos - 2] : 0;
+      int context = ((literal_it.type_ << kLiteralContextBits) +
-        uint8_t prev_byte3 = *pos > 2 ? input_buffer[*pos - 3] : 0;
+                     Context(prev_byte, prev_byte2,
-        int context = (literal_it.type_ *
+                             mb.literal_context_modes[literal_it.type_]));
                       NumContexts(mb.params.literal_context_mode) +
                       Context(prev_byte, prev_byte2, prev_byte3,
                               mb.params.literal_context_mode));
      histogram_idx = mb.literal_context_map[context];
-      }
+      EntropyEncode(ringbuffer[*pos & mask],
      EntropyEncode(input_buffer[(*pos)++],
                    literal_codes[histogram_idx], storage_ix, storage);
      ++(*pos);
    }
    if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) {
      MoveAndEncode(distance_split_code, &distance_it, storage_ix, storage);
      int histogram_index = distance_it.type_;
-      if (mb.params.distance_context_mode > 0) {
+      int context = (distance_it.type_ << 2) +
-        int context = distance_it.type_ << 2;
+          ((cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2);
        context += (cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2;
      histogram_index = mb.distance_context_map[context];
      }
      EncodeCopyDistance(cmd, distance_codes[histogram_index],
                         storage_ix, storage);
    }
@ -733,45 +748,123 @@ void StoreMetaBlock(const MetaBlock& mb,
  }
 }
 // Base-2 logarithm of the window size. WriteStreamHeader() stores it in the
 // stream header as kWindowBits - 17 in a 3-bit field.
 static const int kWindowBits = 22;
 // To make decoding faster, we allow the decoder to write 16 bytes ahead in
 // its ringbuffer, therefore the encoder has to decrease max distance by this
 // amount.
 static const int kDecoderRingBufferWriteAheadSlack = 16;
 // Largest backward distance handed to CreateBackwardReferences().
 static const int kMaxBackwardDistance =
    (1 << kWindowBits) - kDecoderRingBufferWriteAheadSlack;
 // Base-2 logarithm of the largest input chunk BrotliCompressBuffer() passes
 // to a single WriteMetaBlock() call. Also used as the tail size of the
 // compressor's RingBuffer, and the output storage holds 2 << this many bytes.
 static const int kMetaBlockSizeBits = 21;
 // Base-2 logarithm of the encoder-side ring buffer size; literal_cost_ has
 // 1 << kRingBufferBits entries as well.
 static const int kRingBufferBits = 23;
 // Mask that maps an absolute input position to its ring buffer index.
 static const int kRingBufferMask = (1 << kRingBufferBits) - 1;
 // Builds an empty compressor: allocates the hasher and the output storage,
 // sizes the input ring buffer and the literal cost table, and starts both the
 // input position and the output bit position at zero.
 BrotliCompressor::BrotliCompressor()
    : hasher_(new Hasher),
      dist_ringbuffer_idx_(0),
      input_pos_(0),
      ringbuffer_(kRingBufferBits, kMetaBlockSizeBits),
      literal_cost_(1 << kRingBufferBits),
      storage_ix_(0),
      storage_(new uint8_t[2 << kMetaBlockSizeBits]) {
    // Initial contents of the distance ring buffer.
    static const int kInitialDistances[4] = { 4, 11, 15, 16 };
    for (int slot = 0; slot < 4; ++slot) {
      dist_ringbuffer_[slot] = kInitialDistances[slot];
    }
    // Only the first output byte is cleared up front.
    // NOTE(review): presumably WriteBits() expects the byte it starts in to
    // be zeroed -- confirm against its implementation.
    storage_[0] = 0;
  }
 // Frees the two heap allocations made by the constructor: the output storage
 // array and the hasher.
 BrotliCompressor::~BrotliCompressor() {
  delete[] storage_;
  delete hasher_;
 }
 // Appends the stream header to the internal output buffer: a 3-bit zero
 // (input size is not encoded), then a set bit followed by the 3-bit window
 // size field.
 void BrotliCompressor::WriteStreamHeader() {
  // Don't encode input size.
  const int size_field = 0;
  WriteBits(3, size_field, &storage_ix_, storage_);
  // Encode window size: a set bit, then kWindowBits relative to 17.
  const int window_size_flag = 1;
  const int window_size_field = kWindowBits - 17;
  WriteBits(1, window_size_flag, &storage_ix_, storage_);
  WriteBits(3, window_size_field, &storage_ix_, storage_);
 }
 // Compresses one chunk of input into a meta-block.
 //
 //   input_size, input_buffer: the bytes to compress; they are first appended
 //       to ringbuffer_.
 //   encoded_size: set to the number of bytes copied into encoded_buffer. Its
 //       incoming value is not read.
 //   encoded_buffer: receives every complete byte currently held in the
 //       internal bit buffer.
 //
 // NOTE(review): nothing in this function bounds the copy by the capacity of
 // encoded_buffer; the caller has to supply a large enough buffer.
 void BrotliCompressor::WriteMetaBlock(const size_t input_size,
                                      const uint8_t* input_buffer,
                                      size_t* encoded_size,
                                      uint8_t* encoded_buffer) {
  // Stage the new input in the ring buffer; every later step addresses it by
  // absolute position (input_pos_) combined with kRingBufferMask.
  ringbuffer_.Write(input_buffer, input_size);
  // Fill literal_cost_ with per-literal bit cost estimates for the new bytes.
  EstimateBitCostsForLiterals(input_pos_, input_size,
                              kRingBufferMask, ringbuffer_.start(),
                              &literal_cost_[0]);
  // Turn the new bytes into a list of commands.
  std::vector<Command> commands;
  CreateBackwardReferences(input_size, input_pos_,
                           ringbuffer_.start(),
                           &literal_cost_[0],
                           kRingBufferMask, kMaxBackwardDistance,
                           hasher_,
                           &commands);
  // Assign distance codes using the distance ring buffer, which is a member
  // and therefore carries over from one meta-block to the next.
  ComputeDistanceShortCodes(&commands, dist_ringbuffer_,
                            &dist_ringbuffer_idx_);
  EncodingParams params;
  params.num_direct_distance_codes = 12;
  params.distance_postfix_bits = 1;
  params.literal_context_mode = CONTEXT_SIGNED;
  MetaBlock mb;
  BuildMetaBlock(params, commands, ringbuffer_.start(), input_pos_,
                 kRingBufferMask, &mb);
  // input_pos_ is passed by pointer here, so StoreMetaBlock() can update it.
  StoreMetaBlock(mb, ringbuffer_.start(), kRingBufferMask,
                 &input_pos_, &storage_ix_, storage_);
  // Hand out only the whole bytes written so far.
  size_t output_size = storage_ix_ >> 3;
  memcpy(encoded_buffer, storage_, output_size);
  *encoded_size = output_size;
  // Keep the trailing partial byte: rebase the bit position to the start of
  // storage_ (leaving fewer than 8 pending bits) and move that byte to the
  // front.
  storage_ix_ -= output_size << 3;
  storage_[storage_ix_ >> 3] = storage_[output_size];
 }
 // Terminates the stream: appends one set bit to the internal bit buffer (the
 // class declaration describes this as a zero-length meta-block with the
 // end-of-input bit set), then copies the buffered bits, rounded up to whole
 // bytes, into encoded_buffer and reports that byte count in *encoded_size.
 void BrotliCompressor::FinishStream(
    size_t* encoded_size, uint8_t* encoded_buffer) {
  WriteBits(1, 1, &storage_ix_, storage_);
  // Round the pending bit count up to a whole number of bytes.
  const size_t num_bytes = (storage_ix_ + 7) >> 3;
  memcpy(encoded_buffer, storage_, num_bytes);
  *encoded_size = num_bytes;
 }
 int BrotliCompressBuffer(size_t input_size,
                         const uint8_t* input_buffer,
                         size_t* encoded_size,
                         uint8_t* encoded_buffer) {
  int storage_ix = 0;
  uint8_t* storage = encoded_buffer;
  WriteBitsPrepareStorage(storage_ix, storage);
  EncodeSize(input_size, &storage_ix, storage);
  if (input_size == 0) {
-    *encoded_size = (storage_ix + 7) >> 3;
+    encoded_buffer[0] = 1;
    encoded_buffer[1] = 0;
    *encoded_size = 2;
    return 1;
  }
  int input_size_bits = Log2Ceiling(input_size);
-  std::vector<Command> all_commands;
+  BrotliCompressor compressor;
-  CreateBackwardReferences(input_buffer, input_size, &all_commands);
+  compressor.WriteStreamHeader();
  ComputeDistanceShortCodes(&all_commands);
-  std::vector<std::vector<Command> > meta_block_commands;
+  const int max_block_size = 1 << kMetaBlockSizeBits;
-  SplitBlockByTotalLength(all_commands, input_size, 2 << 20,
+  size_t max_output_size = *encoded_size;
-                          &meta_block_commands);
+  const uint8_t* input_end = input_buffer + input_size;
  *encoded_size = 0;
-  size_t pos = 0;
+  while (input_buffer < input_end) {
-  for (int block_idx = 0; block_idx < meta_block_commands.size(); ++block_idx) {
+    int block_size = max_block_size;
-    const std::vector<Command>& commands = meta_block_commands[block_idx];
+    if (block_size >= input_end - input_buffer) {
-    bool is_last_meta_block = (block_idx + 1 == meta_block_commands.size());
+      block_size = input_end - input_buffer;
-    EncodingParams params;
+    }
-    params.num_direct_distance_codes = 12;
+    size_t output_size = max_output_size;
-    params.distance_postfix_bits = 1;
+    compressor.WriteMetaBlock(block_size, input_buffer,
-    params.literal_context_mode = CONTEXT_SIGNED_MIXED_3BYTE;
+                              &output_size, &encoded_buffer[*encoded_size]);
-    params.distance_context_mode = 1;
+    input_buffer += block_size;
-    MetaBlock mb;
+    *encoded_size += output_size;
-    BuildMetaBlock(params, commands, input_buffer, pos, &mb);
+    max_output_size -= output_size;
    StoreMetaBlock(mb, input_buffer, input_size_bits, is_last_meta_block,
                   &pos, &storage_ix, storage);
  }
-  *encoded_size = (storage_ix + 7) >> 3;
+  size_t output_size = max_output_size;
  compressor.FinishStream(&output_size, &encoded_buffer[*encoded_size]);
  *encoded_size += output_size;
  return 1;
 }
--- a/enc/encode.h
+++ b/enc/encode.h
@ -20,9 +20,45 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string>
 #include <vector>
 #include "./hash.h"
 #include "./ringbuffer.h"
 namespace brotli {
 // Streaming Brotli encoder. As used by BrotliCompressBuffer(): call
 // WriteStreamHeader() once, WriteMetaBlock() once per chunk of input, and
 // FinishStream() at the end.
 //
 // Instances cannot be copied; see the private section.
 class BrotliCompressor {
 public:
  BrotliCompressor();
  ~BrotliCompressor();
  // Writes the stream header into the internal output buffer.
  void WriteStreamHeader();
  // Encodes the data in input_buffer as a meta-block and writes it to
  // encoded_buffer and sets *encoded_size to the number of bytes that was
  // written.
  void WriteMetaBlock(const size_t input_size,
                      const uint8_t* input_buffer,
                      size_t* encoded_size,
                      uint8_t* encoded_buffer);
  // Writes a zero-length meta-block with end-of-input bit set to the
  // internal output buffer and copies the output buffer to encoded_buffer and
  // sets *encoded_size to the number of bytes written.
  void FinishStream(size_t* encoded_size, uint8_t* encoded_buffer);
 private:
  // Copying is disabled (declared here, never defined): hasher_ and storage_
  // are raw owning pointers released by the destructor, so a member-wise copy
  // would free both of them twice.
  BrotliCompressor(const BrotliCompressor&);
  BrotliCompressor& operator=(const BrotliCompressor&);

  // Match finder; allocated in the constructor, deleted in the destructor.
  Hasher* hasher_;
  // Recently used copy distances and the running write index into them; both
  // are passed to ComputeDistanceShortCodes() for every meta-block.
  int dist_ringbuffer_[4];
  size_t dist_ringbuffer_idx_;
  // Absolute position in the input stream; passed by pointer to
  // StoreMetaBlock().
  size_t input_pos_;
  // Sliding window over the input data.
  RingBuffer ringbuffer_;
  // Estimated bit cost per literal, filled by EstimateBitCostsForLiterals().
  std::vector<float> literal_cost_;
  // Bit position of the next write into storage_.
  int storage_ix_;
  // Output bit buffer; allocated in the constructor, freed in the destructor.
  uint8_t* storage_;
 };
 // Compresses the data in input_buffer into encoded_buffer, and sets
 // *encoded_size to the compressed length.
 // Returns 0 if there was an error and 1 otherwise.
--- a/enc/entropy_encode.cc
+++ b/enc/entropy_encode.cc
@ -43,6 +43,9 @@ HuffmanTree::HuffmanTree() {}
 // Ordering predicate for the root nodes, least popular first: a node with a
 // smaller total_count_ comes first; when the counts are equal, the node with
 // the larger index_right_or_value_ comes first.
 bool SortHuffmanTree(const HuffmanTree &v0, const HuffmanTree &v1) {
  if (v0.total_count_ != v1.total_count_) {
    return v0.total_count_ < v1.total_count_;
  }
  return v0.index_right_or_value_ > v1.index_right_or_value_;
 }
@ -276,7 +279,7 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
  }
  // 3) Let's replace those population counts that lead to more rle codes.
  stride = 0;
-  limit = counts[0];
+  limit = (counts[0] + counts[1] + counts[2]) / 3 + 1;
  sum = 0;
  for (i = 0; i < length + 1; ++i) {
    if (i == length || good_for_rle[i] ||
@ -301,11 +304,10 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
      }
      stride = 0;
      sum = 0;
-      if (i < length - 3) {
+      if (i < length - 2) {
        // All interesting strides have a count of at least 4,
        // at least when non-zeros.
-        limit = (counts[i] + counts[i + 1] +
+        limit = (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 1;
                 counts[i + 2] + counts[i + 3] + 2) / 4;
      } else if (i < length) {
        limit = counts[i];
      } else {
@ -329,7 +331,7 @@ void WriteHuffmanTree(const uint8_t* depth, const int length,
                      uint8_t* tree,
                      uint8_t* extra_bits_data,
                      int* huffman_tree_size) {
-  int previous_value = 0;
+  int previous_value = 8;
  for (uint32_t i = 0; i < length;) {
    const int value = depth[i];
    int reps = 1;
--- a/enc/entropy_encode.h
+++ b/enc/entropy_encode.h
@ -66,8 +66,8 @@ struct EntropyCode {
  uint16_t bits_[kSize];
  // How many non-zero depth.
  int count_;
-  // First two symbols with non-zero depth.
+  // First four symbols with non-zero depth.
-  int symbols_[2];
+  int symbols_[4];
 };
 template<int kSize>
@ -82,7 +82,7 @@ void BuildEntropyCode(const Histogram<kSize>& histogram,
  if (histogram.total_count_ == 0) return;
  for (int i = 0; i < kSize; ++i) {
    if (histogram.data_[i] > 0) {
-      if (code->count_ < 2) code->symbols_[code->count_] = i;
+      if (code->count_ < 4) code->symbols_[code->count_] = i;
      ++code->count_;
    }
  }
--- a/enc/hash.h
+++ b/enc/hash.h
@ -103,8 +103,7 @@ template <int kBucketBits, int kBlockBits>
 class HashLongestMatch {
 public:
  HashLongestMatch()
-      : literal_cost_(NULL),
+      : last_distance1_(4),
        last_distance1_(4),
        last_distance2_(11),
        last_distance3_(15),
        last_distance4_(16),
@ -115,10 +114,6 @@ class HashLongestMatch {
  void Reset() {
    std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
  }
  void SetLiteralCost(float *cost) {
    literal_cost_ = cost;
  }
  double literal_cost(int i) const { return literal_cost_[i]; }
  // Look at 3 bytes at data.
  // Compute a hash from these, and store the value of ix at that position.
@ -146,25 +141,27 @@ class HashLongestMatch {
  // into best_distance_out.
  // Write the score of the best match into best_score_out.
  bool FindLongestMatch(const uint8_t * __restrict data,
                        const float * __restrict literal_cost,
                        const size_t ring_buffer_mask,
                        const uint32_t cur_ix,
                        uint32_t max_length,
                        const uint32_t max_backward,
                        size_t * __restrict best_len_out,
                        size_t * __restrict best_distance_out,
                        double * __restrict best_score_out) {
-    const double start_cost4 = literal_cost_ == NULL ? 20 :
+    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
-        literal_cost_[cur_ix] +
+    const double start_cost4 = literal_cost == NULL ? 20 :
-        literal_cost_[cur_ix + 1] +
+        literal_cost[cur_ix_masked] +
-        literal_cost_[cur_ix + 2] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
-        literal_cost_[cur_ix + 3];
+        literal_cost[(cur_ix + 2) & ring_buffer_mask] +
-
+        literal_cost[(cur_ix + 3) & ring_buffer_mask];
-    const double start_cost3 = literal_cost_ == NULL ? 15 :
+    const double start_cost3 = literal_cost == NULL ? 15 :
-        literal_cost_[cur_ix] +
+        literal_cost[cur_ix_masked] +
-        literal_cost_[cur_ix + 1] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
-        literal_cost_[cur_ix + 2] + 0.3;
+        literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
-    double start_cost2 = literal_cost_ == NULL ? 10 :
+    double start_cost2 = literal_cost == NULL ? 10 :
-        literal_cost_[cur_ix] +
+        literal_cost[cur_ix_masked] +
-        literal_cost_[cur_ix + 1] + 1.2;
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
    bool match_found = false;
    // Don't accept a short copy from far away.
    double best_score = 8.25;
@ -177,7 +174,7 @@ class HashLongestMatch {
    size_t best_ix = 1;
    // Try last distance first.
    for (int i = 0; i < 16; ++i) {
-      int prev_ix = cur_ix;
+      size_t prev_ix = cur_ix;
      switch(i) {
        case 0: prev_ix -= last_distance1_; break;
        case 1: prev_ix -= last_distance2_; break;
@ -205,11 +202,13 @@ class HashLongestMatch {
      if (PREDICT_FALSE(backward > max_backward)) {
        continue;
      }
-      if (data[cur_ix + best_len] != data[prev_ix + best_len]) {
+      prev_ix &= ring_buffer_mask;
      if (data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
-          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix], max_length);
+          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len >= 3 || (len == 2 && i < 2)) {
        // Comparing for >= 2 does not change the semantics, but just saves for
        // a few unnecessary binary logarithms in backward reference score,
@ -234,7 +233,7 @@ class HashLongestMatch {
        }
      }
    }
-    const uint32_t key = Hash3Bytes(&data[cur_ix], kBucketBits);
+    const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits);
    const uint32_t * __restrict const bucket = &buckets_[key][0];
    const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    int stop = int(cur_ix) - 64;
@ -247,8 +246,9 @@ class HashLongestMatch {
      if (PREDICT_FALSE(backward > max_backward)) {
        break;
      }
-      if (data[cur_ix] != data[prev_ix] ||
+      prev_ix &= ring_buffer_mask;
-          data[cur_ix + 1] != data[prev_ix + 1]) {
+      if (data[cur_ix_masked] != data[prev_ix] ||
          data[cur_ix_masked + 1] != data[prev_ix + 1]) {
        continue;
      }
      int len = 2;
@ -269,11 +269,13 @@ class HashLongestMatch {
      if (PREDICT_FALSE(backward > max_backward)) {
        break;
      }
-      if (data[cur_ix + best_len] != data[prev_ix + best_len]) {
+      prev_ix &= ring_buffer_mask;
      if (data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
-          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix], max_length);
+          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len >= 3) {
        // Comparing for >= 3 does not change the semantics, but just saves for
        // a few unnecessary binary logarithms in backward reference score,
@ -333,10 +335,6 @@ class HashLongestMatch {
  // Buckets containing kBlockSize of backward references.
  uint32_t buckets_[kBucketSize][kBlockSize];
  // Model of how much the ith literal costs to encode using
  // the entropy model.
  float *literal_cost_;
  int last_distance1_;
  int last_distance2_;
  int last_distance3_;
@ -349,6 +347,8 @@ class HashLongestMatch {
  double average_cost_;
 };
 typedef HashLongestMatch<13, 11> Hasher;
 }  // namespace brotli
 #endif  // BROTLI_ENC_HASH_H_
--- a/enc/histogram.cc
+++ b/enc/histogram.cc
@ -31,10 +31,10 @@ void BuildHistograms(
    const BlockSplit& literal_split,
    const BlockSplit& insert_and_copy_split,
    const BlockSplit& dist_split,
-    const uint8_t* input_buffer,
+    const uint8_t* ringbuffer,
    size_t pos,
-    int context_mode,
+    size_t mask,
-    int distance_context_mode,
+    const std::vector<int>& context_modes,
    std::vector<HistogramLiteral>* literal_histograms,
    std::vector<HistogramCommand>* insert_and_copy_histograms,
    std::vector<HistogramDistance>* copy_dist_histograms) {
@ -48,25 +48,47 @@ void BuildHistograms(
        cmd.command_prefix_);
    for (int j = 0; j < cmd.insert_length_; ++j) {
      literal_it.Next();
-      uint8_t prev_byte = pos > 0 ? input_buffer[pos - 1] : 0;
+      uint8_t prev_byte = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
-      uint8_t prev_byte2 = pos > 1 ? input_buffer[pos - 2] : 0;
+      uint8_t prev_byte2 = pos > 1 ? ringbuffer[(pos - 2) & mask] : 0;
-      uint8_t prev_byte3 = pos > 2 ? input_buffer[pos - 3] : 0;
+      int context = (literal_it.type_ << kLiteralContextBits) +
-      int context = (literal_it.type_ * NumContexts(context_mode) +
+          Context(prev_byte, prev_byte2, context_modes[literal_it.type_]);
-                     Context(prev_byte, prev_byte2, prev_byte3, context_mode));
+      (*literal_histograms)[context].Add(ringbuffer[pos & mask]);
      (*literal_histograms)[context].Add(input_buffer[pos]);
      ++pos;
    }
    pos += cmd.copy_length_;
    if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
      dist_it.Next();
-      int context = dist_it.type_;
+      int context = (dist_it.type_ << kDistanceContextBits) +
-      if (distance_context_mode > 0) {
+          ((cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2);
        context <<= 2;
        context += (cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2;
      }
      (*copy_dist_histograms)[context].Add(cmd.distance_prefix_);
    }
  }
 }
 void BuildLiteralHistogramsForBlockType(
    const std::vector<Command>& cmds,
    const BlockSplit& literal_split,
    const uint8_t* ringbuffer,
    size_t pos,
    size_t mask,
    int block_type,
    int context_mode,
    std::vector<HistogramLiteral>* histograms) {
  BlockSplitIterator literal_it(literal_split);
  for (int i = 0; i < cmds.size(); ++i) {
    const Command &cmd = cmds[i];
    for (int j = 0; j < cmd.insert_length_; ++j) {
      literal_it.Next();
      if (literal_it.type_ == block_type) {
        uint8_t prev_byte = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
        uint8_t prev_byte2 = pos > 1 ? ringbuffer[(pos - 2) & mask] : 0;
        int context = Context(prev_byte, prev_byte2, context_mode);
        (*histograms)[context].Add(ringbuffer[pos & mask]);
      }
      ++pos;
    }
    pos += cmd.copy_length_;
  }
 }
 }  // namespace brotli
--- a/enc/histogram.h
+++ b/enc/histogram.h
@ -79,19 +79,32 @@ typedef Histogram<kNumCommandPrefixes> HistogramCommand;
 typedef Histogram<kNumDistancePrefixes> HistogramDistance;
 typedef Histogram<kNumBlockLenPrefixes> HistogramBlockLength;
 // Width in bits of the context part of a literal histogram index:
 // BuildHistograms() forms the index as
 // (block type << kLiteralContextBits) + Context(...).
 static const int kLiteralContextBits = 6;
 // Width in bits of the context part of a distance histogram index:
 // BuildHistograms() forms the index as
 // (block type << kDistanceContextBits) + a value in [0, 3] derived from the
 // copy length.
 static const int kDistanceContextBits = 2;
 void BuildHistograms(
    const std::vector<Command>& cmds,
    const BlockSplit& literal_split,
    const BlockSplit& insert_and_copy_split,
    const BlockSplit& dist_split,
-    const uint8_t* input_buffer,
+    const uint8_t* ringbuffer,
    size_t pos,
-    int context_mode,
+    size_t mask,
-    int distance_context_mode,
+    const std::vector<int>& context_modes,
    std::vector<HistogramLiteral>* literal_histograms,
    std::vector<HistogramCommand>* insert_and_copy_histograms,
    std::vector<HistogramDistance>* copy_dist_histograms);
 // Adds the literals that belong to block type block_type to *histograms.
 // The commands are walked starting at position pos of the ring buffer
 // (ringbuffer, mask); each matching literal goes into the histogram indexed
 // by Context() of its two preceding bytes under context_mode. *histograms
 // must already have an entry for every context index that can occur.
 void BuildLiteralHistogramsForBlockType(
    const std::vector<Command>& cmds,
    const BlockSplit& literal_split,
    const uint8_t* ringbuffer,
    size_t pos,
    size_t mask,
    int block_type,
    int context_mode,
    std::vector<HistogramLiteral>* histograms);
 }  // namespace brotli
 #endif  // BROTLI_ENC_HISTOGRAM_H_
--- a/enc/literal_cost.cc
+++ b/enc/literal_cost.cc
@ -22,37 +22,39 @@
 namespace brotli {
-void EstimateBitCostsForLiterals(size_t len, const uint8_t *data, float *cost) {
+void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
                                 const uint8_t *data, float *cost) {
  int histogram[256] = { 0 };
  int window_half = 2000;
  int in_window = std::min(static_cast<size_t>(window_half), len);
  // Bootstrap histogram.
  for (int i = 0; i < in_window; ++i) {
-    ++histogram[data[i]];
+    ++histogram[data[(pos + i) & mask]];
  }
  // Compute bit costs with sliding window.
  for (int i = 0; i < len; ++i) {
    if (i - window_half >= 0) {
      // Remove a byte in the past.
-      --histogram[data[i - window_half]];
+      --histogram[data[(pos + i - window_half) & mask]];
      --in_window;
    }
    if (i + window_half < len) {
      // Add a byte in the future.
-      ++histogram[data[i + window_half]];
+      ++histogram[data[(pos + i + window_half) & mask]];
      ++in_window;
    }
-    int histo = histogram[data[i]];
+    int masked_pos = (pos + i) & mask;
    int histo = histogram[data[masked_pos]];
    if (histo == 0) {
      histo = 1;
    }
-    cost[i] = log2(static_cast<double>(in_window) / histo);
+    cost[masked_pos] = log2(static_cast<double>(in_window) / histo);
-    cost[i] += 0.03;
+    cost[masked_pos] += 0.03;
-    if (cost[i] < 1.0) {
+    if (cost[masked_pos] < 1.0) {
-      cost[i] *= 0.5;
+      cost[masked_pos] *= 0.5;
-      cost[i] += 0.5;
+      cost[masked_pos] += 0.5;
    }
  }
 }
--- a/enc/literal_cost.h
+++ b/enc/literal_cost.h
@ -22,9 +22,11 @@
 namespace brotli {
-// Input: length of data, and the bytes.
+// Estimates how many bits the literals in the interval [pos, pos + len) in the
-// Output: estimate of how many bits the literal will take entropy coded.
+// ringbuffer (data, mask) will take entropy coded and writes these estimates
-void EstimateBitCostsForLiterals(size_t len, const uint8_t *data, float *cost);
+// to the ringbuffer (cost, mask).
 void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
                                 const uint8_t *data, float *cost);
 }  // namespace brotli
--- a/enc/ringbuffer.h
+++ b/enc/ringbuffer.h
@ -0,0 +1,89 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Sliding window over the input data.
 #ifndef BROTLI_ENC_RINGBUFFER_H_
 #define BROTLI_ENC_RINGBUFFER_H_
 // A RingBuffer(window_bits, tail_bits) contains `1 << window_bits' bytes of
 // data in a circular manner: writing a byte writes it to
 // `position() % (1 << window_bits)'. For convenience, the RingBuffer array
 // contains another copy of the first `1 << tail_bits' bytes:
 // buffer_[i] == buffer_[i + (1 << window_bits)] if i < (1 << tail_bits).
 class RingBuffer {
 public:
  // Allocates storage for a window of `1 << window_bits' bytes followed by a
  // tail of `1 << tail_bits' bytes and two extra slack bytes. Only the slack
  // bytes are zeroed here; the window and the tail stay uninitialized until
  // they are written.
  RingBuffer(int window_bits, int tail_bits)
      : window_bits_(window_bits), tail_bits_(tail_bits), pos_(0) {
    // NOTE(review): judging by its name, the slack presumably lets a
    // three-byte hash read start at any buffer position -- confirm with the
    // hashing code.
    static const int kSlackForThreeByteHashingEverywhere = 2;
    const int buflen = (1 << window_bits_) + (1 << tail_bits_);
    buffer_ = new uint8_t[buflen + kSlackForThreeByteHashingEverywhere];
    for (int i = 0; i < kSlackForThreeByteHashingEverywhere; ++i) {
      buffer_[buflen + i] = 0;
    }
  }
  ~RingBuffer() {
    delete [] buffer_;
  }
  // Push bytes into the ring buffer.
  //
  // Copies `n' bytes from `bytes' to the current write position, wrapping
  // around to the start of the window when the end is reached, and advances
  // the logical position by `n'. Bytes that land in the first
  // `1 << tail_bits' positions of the window are also mirrored into the tail.
  //
  // A write may wrap around the window at most once: the wrapped remainder
  // is copied to the start of the buffer without any further bounds check,
  // so callers must keep `n' small enough for that remainder to fit.
  void Write(const uint8_t *bytes, size_t n) {
    const size_t window_size = static_cast<size_t>(1) << window_bits_;
    const size_t tail_size = static_cast<size_t>(1) << tail_bits_;
    const size_t masked_pos = pos_ & (window_size - 1);
    // Mirror into the tail first; this only has an effect when the write
    // starts inside the mirrored prefix of the window.
    WriteTail(bytes, n);
    if (masked_pos + n <= window_size) {
      // A single write fits.
      memcpy(&buffer_[masked_pos], bytes, n);
    } else {
      // Split into two writes.
      const size_t until_wrap = window_size - masked_pos;
      // Copy into the end of the buffer, including the tail buffer, so that
      // the wrapped bytes are mirrored in the tail as well.
      memcpy(&buffer_[masked_pos], bytes,
             std::min(n, (window_size + tail_size) - masked_pos));
      // Copy the wrapped remainder into the beginning of the buffer.
      memcpy(&buffer_[0], bytes + until_wrap, n - until_wrap);
    }
    pos_ += n;
  }
  // Logical cursor position in the ring buffer: the total number of bytes
  // written so far (not reduced modulo the window size).
  size_t position() const { return pos_; }
  // Pointer to the first byte of the underlying storage.
  uint8_t *start() { return &buffer_[0]; }
  const uint8_t *start() const { return &buffer_[0]; }
 private:
  // The buffer is owned through a raw pointer, so an implicit copy would
  // free it twice. Copying is disabled: declared private, never defined.
  RingBuffer(const RingBuffer&);
  void operator=(const RingBuffer&);
  // Mirrors the start of a write into the tail when the write begins inside
  // the first `1 << tail_bits' bytes of the window; otherwise does nothing.
  void WriteTail(const uint8_t *bytes, size_t n) {
    const size_t window_size = static_cast<size_t>(1) << window_bits_;
    const size_t tail_size = static_cast<size_t>(1) << tail_bits_;
    const size_t masked_pos = pos_ & (window_size - 1);
    if (masked_pos < tail_size) {
      // Just fill the tail buffer with the beginning data.
      const size_t p = window_size + masked_pos;
      memcpy(&buffer_[p], bytes, std::min(n, tail_size - masked_pos));
    }
  }
  // Size of the ringbuffer is (1 << window_bits) + (1 << tail_bits).
  const int window_bits_;
  const int tail_bits_;
  // Position to write in the ring buffer.
  size_t pos_;
  // The actual ring buffer containing the data and the copy of the beginning
  // as a tail.
  uint8_t *buffer_;
 };
 #endif  // BROTLI_ENC_RINGBUFFER_H_