Updates to Brotli compression format, decoder and encoder

This commit contains a batch of changes that were made to the Brotli compression algorithm in the last three weeks. Most important changes: * Added UTF8 context model for good text compression. * Simplified context modeling by having only 4 context modes. * Per-block context mode selection. * Faster backward copying and bit reading functions. * More efficient histogram coding. * Streaming support for the decoder and encoder.
2024-11-21 19:20:09 +00:00 · 2013-11-15 19:02:17 +01:00 · 2013-11-15 19:02:17 +01:00 · c6b9c7c5c8
commit c6b9c7c5c8
parent c66e4e3e4f
23 changed files with 1647 additions and 870 deletions
--- a/dec/bit_reader.c
+++ b/dec/bit_reader.c
@ -15,6 +15,7 @@
 // Bit reading helpers

 #include <assert.h>
+#include <stdlib.h>

 #include "./bit_reader.h"

@ -22,99 +23,24 @@
 extern "C" {
 #endif

-#define MAX_NUM_BIT_READ 25
-
-#define LBITS 64      // Number of bits prefetched.
-#define WBITS 32      // Minimum number of bytes needed after
-                      // BrotliFillBitWindow.
-#define LOG8_WBITS 4  // Number of bytes needed to store WBITS bits.
-
-static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
-  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
-  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
-};
-
-void BrotliInitBitReader(BrotliBitReader* const br,
-                         const uint8_t* const start,
-                         size_t length) {
+int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input) {
  size_t i;
  assert(br != NULL);
-  assert(start != NULL);
-  assert(length < 0xfffffff8u);   // can't happen with a RIFF chunk.

-  br->buf_ = start;
-  br->len_ = length;
+  br->input_ = input;
  br->val_ = 0;
  br->pos_ = 0;
  br->bit_pos_ = 0;
+  br->end_pos_ = 0;
  br->eos_ = 0;
-  br->error_ = 0;
-  for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
+  if (!BrotliReadMoreInput(br)) {
+    return 0;
+  }
+  for (i = 0; i < sizeof(br->val_); ++i) {
    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
    ++br->pos_;
  }
-}
-
-void BrotliBitReaderSetBuffer(BrotliBitReader* const br,
-                              const uint8_t* const buf, size_t len) {
-  assert(br != NULL);
-  assert(buf != NULL);
-  assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
-  br->eos_ = (br->pos_ >= len);
-  br->buf_ = buf;
-  br->len_ = len;
-}
-
-// If not at EOS, reload up to LBITS byte-by-byte
-static void ShiftBytes(BrotliBitReader* const br) {
-  while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
-    br->val_ >>= 8;
-    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (LBITS - 8);
-    ++br->pos_;
-    br->bit_pos_ -= 8;
-  }
-}
-
-void BrotliFillBitWindow(BrotliBitReader* const br) {
-  if (br->bit_pos_ >= WBITS) {
-#if (defined(__x86_64__) || defined(_M_X64))
-    if (br->pos_ + sizeof(br->val_) < br->len_) {
-      br->val_ >>= WBITS;
-      br->bit_pos_ -= WBITS;
-      // The expression below needs a little-endian arch to work correctly.
-      // This gives a large speedup for decoding speed.
-      br->val_ |= *(const uint64_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
-      br->pos_ += LOG8_WBITS;
-      return;
-    }
-#endif
-    ShiftBytes(br);       // Slow path.
-    if (br->pos_ == br->len_ && br->bit_pos_ == LBITS) {
-      br->eos_ = 1;
-    }
-  }
-}
-
-uint32_t BrotliReadBits(BrotliBitReader* const br, int n_bits) {
-  assert(n_bits >= 0);
-  // Flag an error if end_of_stream or n_bits is more than allowed limit.
-  if (n_bits == 0 || (!br->eos_ && n_bits < MAX_NUM_BIT_READ)) {
-    const uint32_t val =
-        (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
-    const int new_bits = br->bit_pos_ + n_bits;
-    br->bit_pos_ = new_bits;
-    // If this read is going to cross the read buffer, set the eos flag.
-    if (br->pos_ == br->len_) {
-      if (new_bits >= LBITS) {
-        br->eos_ = 1;
-      }
-    }
-    ShiftBytes(br);
-    return val;
-  } else {
-    br->error_ = 1;
-    return 0;
-  }
+  return (br->end_pos_ > 0);
 }

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/dec/bit_reader.h
+++ b/dec/bit_reader.h
@ -17,34 +17,39 @@
 #ifndef BROTLI_DEC_BIT_READER_H_
 #define BROTLI_DEC_BIT_READER_H_

+#include <string.h>
+#include "./streams.h"
 #include "./types.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+#define BROTLI_MAX_NUM_BIT_READ   25
+#define BROTLI_READ_SIZE          4096
+#define BROTLI_IBUF_SIZE          (2 * BROTLI_READ_SIZE + 32)
+#define BROTLI_IBUF_MASK          (2 * BROTLI_READ_SIZE - 1)
+
+#define UNALIGNED_COPY64(dst, src) *(uint64_t*)(dst) = *(const uint64_t*)(src)
+
+static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
+  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
+  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
+};
+
 typedef struct {
-  uint64_t       val_;        // pre-fetched bits
-  const uint8_t* buf_;        // input byte buffer
-  size_t         len_;        // buffer length
-  size_t         pos_;        // byte position in buf_
-  int            bit_pos_;    // current bit-reading position in val_
-  int            eos_;        // bitstream is finished
-  int            error_;      // an error occurred (buffer overflow attempt...)
+  // Input byte buffer, consists of a ringbuffer and a "slack" region where
+  // bytes from the start of the ringbuffer are copied.
+  uint8_t buf_[BROTLI_IBUF_SIZE];
+  BrotliInput input_;    // input callback
+  uint64_t    val_;      // pre-fetched bits
+  size_t      pos_;      // byte position in stream
+  int         bit_pos_;  // current bit-reading position in val_
+  size_t      end_pos_;  // current end position in stream
+  int         eos_;      // input stream is finished
 } BrotliBitReader;

-void BrotliInitBitReader(BrotliBitReader* const br,
-                         const uint8_t* const start,
-                         size_t length);
-
-//  Sets a new data buffer.
-void BrotliBitReaderSetBuffer(BrotliBitReader* const br,
-                              const uint8_t* const buffer, size_t length);
-
-// Reads the specified number of bits from Read Buffer.
-// Flags an error in case end_of_stream or n_bits is more than allowed limit.
-// Flags eos if this read attempt is going to cross the read buffer.
-uint32_t BrotliReadBits(BrotliBitReader* const br, int n_bits);
+int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input);

 // Return the prefetched bits, so they can be looked up.
 static BROTLI_INLINE uint32_t BrotliPrefetchBits(BrotliBitReader* const br) {
@ -57,8 +62,92 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br, int val) {
  br->bit_pos_ = val;
 }

-// Advances the Read buffer by 4 bytes to make room for reading next 32 bits.
-void BrotliFillBitWindow(BrotliBitReader* const br);
+// Reload up to 64 bits byte-by-byte
+static BROTLI_INLINE void ShiftBytes(BrotliBitReader* const br) {
+  while (br->bit_pos_ >= 8) {
+    br->val_ >>= 8;
+    br->val_ |= ((uint64_t)br->buf_[br->pos_ & BROTLI_IBUF_MASK]) << 56;
+    ++br->pos_;
+    br->bit_pos_ -= 8;
+  }
+}
+
+// Fills up the input ringbuffer by calling the input callback.
+//
+// Does nothing if more than 32 bytes are present after the current position.
+//
+// Returns 0 if either:
+//  - the input callback returned an error, or
+//  - there is no more input and the position is past the end of the stream.
+//
+// After encountering the end of the input stream, 32 additional zero bytes are
+// copied to the ringbuffer; therefore it is safe to call this function after
+// every 32 bytes of input are read.
+static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
+  if (br->pos_ + 32 < br->end_pos_) {
+    return 1;
+  } else if (br->eos_) {
+    return (br->pos_ << 3) + br->bit_pos_ <= (br->end_pos_ << 3) + 64;
+  } else {
+    uint8_t* dst = br->buf_ + (br->end_pos_ & BROTLI_IBUF_MASK);
+    int bytes_read = BrotliRead(br->input_, dst, BROTLI_READ_SIZE);
+    if (bytes_read < 0) {
+      return 0;
+    }
+    if (bytes_read < BROTLI_READ_SIZE) {
+      br->eos_ = 1;
+      // Store 32 bytes of zero after the stream end.
+#if (defined(__x86_64__) || defined(_M_X64))
+      *(uint64_t*)(dst + bytes_read) = 0;
+      *(uint64_t*)(dst + bytes_read + 8) = 0;
+      *(uint64_t*)(dst + bytes_read + 16) = 0;
+      *(uint64_t*)(dst + bytes_read + 24) = 0;
+#else
+      memset(dst + bytes_read, 0, 32);
+#endif
+    }
+    if (dst == br->buf_) {
+      // Copy the head of the ringbuffer to the slack region.
+#if (defined(__x86_64__) || defined(_M_X64))
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 8, br->buf_ + 24);
+#else
+      memcpy(br->buf_ + (BROTLI_READ_SIZE << 1), br->buf_, 32);
+#endif
+    }
+    br->end_pos_ += bytes_read;
+    return 1;
+  }
+}
+
+// Advances the Read buffer by 5 bytes to make room for reading next 24 bits.
+static BROTLI_INLINE void BrotliFillBitWindow(BrotliBitReader* const br) {
+  if (br->bit_pos_ >= 40) {
+#if (defined(__x86_64__) || defined(_M_X64))
+    br->val_ >>= 40;
+    br->bit_pos_ -= 40;
+    // The expression below needs a little-endian arch to work correctly.
+    // This gives a large speedup for decoding speed.
+    br->val_ |= *(const uint64_t*)(
+        br->buf_ + (br->pos_ & BROTLI_IBUF_MASK)) << 24;
+    br->pos_ += 5;
+#else
+    ShiftBytes(br);
+#endif
+  }
+}
+
+// Reads the specified number of bits from Read Buffer.
+// Requires that n_bits is positive and smaller than BROTLI_MAX_NUM_BIT_READ.
+static BROTLI_INLINE uint32_t BrotliReadBits(
+    BrotliBitReader* const br, int n_bits) {
+  BrotliFillBitWindow(br);
+  const uint32_t val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+  br->bit_pos_ += n_bits;
+  return val;
+}

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/dec/context.h
+++ b/dec/context.h
@ -12,34 +12,154 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-// Lookup tables to map the previous one to three bytes to a context id.
+// Lookup table to map the previous two bytes to a context id.
+//
+// There are four different context modeling modes defined here:
+//   CONTEXT_LSB6: context id is the least significant 6 bits of the last byte,
+//   CONTEXT_MSB6: context id is the most significant 6 bits of the last byte,
+//   CONTEXT_UTF8: second-order context model tuned for UTF8-encoded text,
+//   CONTEXT_SIGNED: second-order context model tuned for signed integers.
+//
+// The context id for the UTF8 context model is calculated as follows. If p1
+// and p2 are the previous two bytes, we calculate the context as
+//
+//   context = kContextLookup[p1] | kContextLookup[p2 + 256].
+//
+// If the previous two bytes are ASCII characters (i.e. < 128), this will be
+// equivalent to
+//
+//   context = 4 * context1(p1) + context2(p2),
+//
+// where context1 is based on the previous byte in the following way:
+//
+//   0  : non-ASCII control
+//   1  : \t, \n, \r
+//   2  : space
+//   3  : other punctuation
+//   4  : " '
+//   5  : %
+//   6  : ( < [ {
+//   7  : ) > ] }
+//   8  : , ; :
+//   9  : .
+//   10 : =
+//   11 : number
+//   12 : upper-case vowel
+//   13 : upper-case consonant
+//   14 : lower-case vowel
+//   15 : lower-case consonant
+//
+// and context2 is based on the second last byte:
+//
+//   0 : control, space
+//   1 : punctuation
+//   2 : upper-case letter, number
+//   3 : lower-case letter
+//
+// If the last byte is ASCII, and the second last byte is not (in a valid UTF8
+// stream it will be a continuation byte, value between 128 and 191), the
+// context is the same as if the second last byte was an ASCII control or space.
+//
+// If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
+// be a continuation byte and the context id is 2 or 3 depending on the LSB of
+// the last byte and to a lesser extent on the second last byte if it is ASCII.
+//
+// If the last byte is a UTF8 continuation byte, the second last byte can be:
+//   - continuation byte: the next byte is probably ASCII or lead byte (assuming
+//     4-byte UTF8 characters are rare) and the context id is 0 or 1.
+//   - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1
+//   - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3
+//
+// The possible value combinations of the previous two bytes, the range of
+// context ids and the type of the next byte is summarized in the table below:
+//
+// |--------\-----------------------------------------------------------------|
+// |         \                         Last byte                              |
+// | Second   \---------------------------------------------------------------|
+// | last byte \    ASCII            |   cont. byte        |   lead byte      |
+// |            \   (0-127)          |   (128-191)         |   (192-)         |
+// |=============|===================|=====================|==================|
+// |  ASCII      | next: ASCII/lead  |  not valid          |  next: cont.     |
+// |  (0-127)    | context: 4 - 63   |                     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  cont. byte | next: ASCII/lead  |  next: ASCII/lead   |  next: cont.     |
+// |  (128-191)  | context: 4 - 63   |  context: 0 - 1     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: ASCII/lead   |  not valid       |
+// |  (192-207)  |                   |  context: 0 - 1     |                  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: cont.        |  not valid       |
+// |  (208-)     |                   |  context: 2 - 3     |                  |
+// |-------------|-------------------|---------------------|------------------|
+//
+// The context id for the signed context mode is calculated as:
+//
+//   context = (kContextLookup[512 + p1] << 3) | kContextLookup[512 + p2].
+//
+// For any context modeling modes, the context ids can be calculated by |-ing
+// together two lookups from one table using context model dependent offsets:
+//
+//   context = kContextLookup[offset1 + p1] | kContextLookup[offset2 + p2].
+//
+// where offset1 and offset2 are dependent on the context mode.

 #ifndef BROTLI_DEC_CONTEXT_H_
 #define BROTLI_DEC_CONTEXT_H_

-
 #include "./types.h"

-static const int kSigned2BitContextLookup[] = {
-  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
+enum ContextType {
+  CONTEXT_LSB6         = 0,
+  CONTEXT_MSB6         = 1,
+  CONTEXT_UTF8         = 2,
+  CONTEXT_SIGNED       = 3
 };

-static const int kSigned3BitContextLookup[] = {
+// Common context lookup table for all context modes.
+static const uint8_t kContextLookup[1792] = {
+  // CONTEXT_UTF8, last byte.
+  //
+  // ASCII range.
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+   8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
+  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
+  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
+  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
+  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
+  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,
+  // UTF8 continuation byte range.
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  // UTF8 lead byte range.
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  // CONTEXT_UTF8 second last byte.
+  //
+  // ASCII range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
+  // UTF8 continuation byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // UTF8 lead byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  // CONTEXT_SIGNED, second last byte.
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@ -56,69 +176,85 @@ static const int kSigned3BitContextLookup[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  // CONTEXT_SIGNED, last byte, same as the above values shifted by 3 bits.
+   0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+  48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 56,
+  // CONTEXT_LSB6, last byte.
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  // CONTEXT_MSB6, last byte.
+   0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+   4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
+   8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11,
+  12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
+  16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
+  20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
+  24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27,
+  28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31,
+  32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
+  36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
+  40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
+  44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
+  48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51,
+  52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55,
+  56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59,
+  60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63,
+  // CONTEXT_{M,L}SB6, second last byte.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };

-static const int kSigned4BitContextLookup[] = {
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 15,
+static const int kContextLookupOffsets[8] = {
+  // CONTEXT_LSB6
+  1024, 1536,
+  // CONTEXT_MSB6
+  1280, 1536,
+  // CONTEXT_UTF8
+  0, 256,
+  // CONTEXT_SIGNED
+  768, 512,
 };

-enum ContextType {
-  CONTEXT_FULL        = 0,
-  CONTEXT_MSB7        = 1,
-  CONTEXT_MSB6        = 2,
-  CONTEXT_MSB5        = 3,
-  CONTEXT_MSB4        = 4,
-  CONTEXT_MSB3        = 5,
-  CONTEXT_MSB2        = 6,
-  CONTEXT_MSB1        = 7,
-  CONTEXT_IS_ZERO     = 8,
-  CONTEXT_SIGNED_2BIT = 9,
-  CONTEXT_SIGNED_3BIT = 10,
-  CONTEXT_SIGNED_4BIT = 11,
-  CONTEXT_SIGNED_MIXED_3BYTE = 12
-};
-
-static const int kContextSize[] = {
-  256, 128, 64, 32, 16, 8, 4, 2, 2, 4, 8, 16, 64,
-};
-
-static BROTLI_INLINE int NumContexts(int mode) {
-  return kContextSize[mode];
-}
-
-static BROTLI_INLINE uint8_t Context(uint8_t prev_byte, uint8_t prev_byte2,
-                                     uint8_t prev_byte3, int mode) {
-  switch (mode) {
-    case CONTEXT_IS_ZERO:
-      return prev_byte == 0 ? 0 : 1;
-    case CONTEXT_SIGNED_2BIT:
-      return kSigned2BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_3BIT:
-      return kSigned3BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_4BIT:
-      return kSigned4BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_MIXED_3BYTE:
-      return ((kSigned3BitContextLookup[prev_byte] << 3) +
-              (kSigned2BitContextLookup[prev_byte2] << 1) +
-              (prev_byte3 == 0 ? 0 : 1));
-    default:
-      return prev_byte >> mode;
-  }
-}
-
 #endif  // BROTLI_DEC_CONTEXT_H_
--- a/dec/decode.c
+++ b/dec/decode.c
--- a/dec/decode.h
+++ b/dec/decode.h
@ -17,6 +17,7 @@
 #ifndef BROTLI_DEC_DECODE_H_
 #define BROTLI_DEC_DECODE_H_

+#include "./streams.h"
 #include "./types.h"

 #if defined(__cplusplus) || defined(c_plusplus)
@ -39,6 +40,10 @@ int BrotliDecompressBuffer(size_t encoded_size,
                           size_t* decoded_size,
                           uint8_t* decoded_buffer);

+// Same as above, but uses the specified input and output callbacks instead of
+// reading from and writing to pre-allocated memory buffers.
+int BrotliDecompress(BrotliInput input, BrotliOutput output);
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/dec/huffman.c
+++ b/dec/huffman.c
@ -24,10 +24,6 @@
 extern "C" {
 #endif

-// Uncomment the following to use look-up table for ReverseBits()
-// (might be faster on some platform)
-// #define USE_LUT_REVERSE_BITS
-
 #define NON_EXISTENT_SYMBOL (-1)
 #define MAX_ALLOWED_CODE_LENGTH      15

@ -55,7 +51,6 @@ static void AssignChildren(HuffmanTree* const tree,

 static int TreeInit(HuffmanTree* const tree, int num_leaves) {
  assert(tree != NULL);
-  tree->fixed_bit_length_ = 0;
  if (num_leaves == 0) return 0;
  // We allocate maximum possible nodes in the tree at once.
  // Note that a Huffman tree is a full binary tree; and in a full binary tree
@ -84,7 +79,7 @@ void BrotliHuffmanTreeRelease(HuffmanTree* const tree) {
 // Utility: converts Huffman code lengths to corresponding Huffman codes.
 // 'huff_codes' should be pre-allocated.
 // Returns false in case of error (memory allocation, invalid codes).
-static int HuffmanCodeLengthsToCodes(const int* const code_lengths,
+static int HuffmanCodeLengthsToCodes(const uint8_t* const code_lengths,
                                     int code_lengths_size,
                                     int* const huff_codes) {
  int symbol;
@ -133,35 +128,21 @@ static int HuffmanCodeLengthsToCodes(const int* const code_lengths,
  return 1;
 }

-#ifndef USE_LUT_REVERSE_BITS
-
-static int ReverseBitsShort(int bits, int num_bits) {
-  int retval = 0;
-  int i;
-  assert(num_bits <= 8);   // Not a hard requirement, just for coherency.
-  for (i = 0; i < num_bits; ++i) {
-    retval <<= 1;
-    retval |= bits & 1;
-    bits >>= 1;
-  }
-  return retval;
-}
-
-#else
-
-static const uint8_t kReversedBits[16] = {  // Pre-reversed 4-bit values.
-  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
-  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
+static const uint8_t kReverse7[128] = {
+  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
+  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
+  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
+  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
+  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
+  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
+  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
+  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
 };

 static int ReverseBitsShort(int bits, int num_bits) {
-  const uint8_t v = (kReversedBits[bits & 0xf] << 4) | kReversedBits[bits >> 4];
-  assert(num_bits <= 8);
-  return v >> (8 - num_bits);
+  return kReverse7[bits] >> (7 - num_bits);
 }

-#endif
-
 static int TreeAddSymbol(HuffmanTree* const tree,
                         int symbol, int code, int code_length) {
  int step = HUFF_LUT_BITS;
@ -170,13 +151,14 @@ static int TreeAddSymbol(HuffmanTree* const tree,
  const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
  assert(symbol == (int16_t)symbol);
  if (code_length <= HUFF_LUT_BITS) {
-    int i;
+    int i = 1 << (HUFF_LUT_BITS - code_length);
    base_code = ReverseBitsShort(code, code_length);
-    for (i = 0; i < (1 << (HUFF_LUT_BITS - code_length)); ++i) {
+    do {
+      --i;
      const int idx = base_code | (i << code_length);
      tree->lut_symbol_[idx] = (int16_t)symbol;
      tree->lut_bits_[idx] = code_length;
-    }
+    } while (i > 0);
  } else {
    base_code = ReverseBitsShort((code >> (code_length - HUFF_LUT_BITS)),
                                 HUFF_LUT_BITS);
@ -206,7 +188,7 @@ static int TreeAddSymbol(HuffmanTree* const tree,
 }

 int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
+                                   const uint8_t* const code_lengths,
                                   int code_lengths_size) {
  int symbol;
  int num_symbols = 0;
@ -264,41 +246,6 @@ int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
  }
 }

-int BrotliHuffmanTreeBuildExplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
-                                   const int* const codes,
-                                   const int* const symbols,
-                                   int max_symbol,
-                                   int num_symbols) {
-  int ok = 0;
-  int i;
-
-  assert(tree != NULL);
-  assert(code_lengths != NULL);
-  assert(codes != NULL);
-  assert(symbols != NULL);
-
-  // Initialize the tree. Will fail if num_symbols = 0.
-  if (!TreeInit(tree, num_symbols)) return 0;
-
-  // Add symbols one-by-one.
-  for (i = 0; i < num_symbols; ++i) {
-    if (codes[i] != NON_EXISTENT_SYMBOL) {
-      if (symbols[i] < 0 || symbols[i] >= max_symbol) {
-        goto End;
-      }
-      if (!TreeAddSymbol(tree, symbols[i], codes[i], code_lengths[i])) {
-        goto End;
-      }
-    }
-  }
-  ok = 1;
- End:
-  ok = ok && IsFull(tree);
-  if (!ok) BrotliHuffmanTreeRelease(tree);
-  return ok;
-}
-
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/dec/huffman.h
+++ b/dec/huffman.h
@ -43,7 +43,6 @@ struct HuffmanTree {
  HuffmanTreeNode* root_;   // all the nodes, starting at root.
  int max_nodes_;           // max number of nodes
  int num_nodes_;           // number of currently occupied nodes
-  int fixed_bit_length_;     // If non-zero, uses fixed length coding
 };

 // Returns true if the given node is not a leaf of the Huffman tree.
@ -65,19 +64,9 @@ void BrotliHuffmanTreeRelease(HuffmanTree* const tree);
 // Builds Huffman tree assuming code lengths are implicitly in symbol order.
 // Returns false in case of error (invalid tree or memory error).
 int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
+                                   const uint8_t* const code_lengths,
                                   int code_lengths_size);

-// Build a Huffman tree with explicitly given lists of code lengths, codes
-// and symbols. Verifies that all symbols added are smaller than max_symbol.
-// Returns false in case of an invalid symbol, invalid tree or memory error.
-int BrotliHuffmanTreeBuildExplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
-                                   const int* const codes,
-                                   const int* const symbols,
-                                   int max_symbol,
-                                   int num_symbols);
-
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/dec/streams.c
+++ b/dec/streams.c
@ -0,0 +1,106 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Functions for streaming input and output.
+
+#include <string.h>
+#include <unistd.h>
+#include "./streams.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Input callback for a BrotliMemInput: copies up to count bytes from the
+// wrapped memory region into buf and advances the read position.
+// Returns the number of bytes copied (fewer than count when less than count
+// bytes remain, 0 at the end of the region), or -1 if the stored position
+// lies beyond the end of the region.
+int BrotliMemInputFunction(void* data, uint8_t* buf, size_t count) {
+  BrotliMemInput* input = (BrotliMemInput*)data;
+  size_t remaining;
+  if (input->pos > input->length) {
+    return -1;
+  }
+  // Clamp against the remaining byte count rather than testing pos + count:
+  // that sum can wrap around for a very large count, which would skip the
+  // clamp and let memcpy read past the end of the buffer.
+  remaining = input->length - input->pos;
+  if (count > remaining) {
+    count = remaining;
+  }
+  memcpy(buf, input->buffer + input->pos, count);
+  input->pos += count;
+  return (int)count;
+}
+
+// Points mem_input at the length bytes starting at buffer, rewinds it to
+// position 0, and returns a BrotliInput whose callback reads from it.
+BrotliInput BrotliInitMemInput(const uint8_t* buffer, size_t length,
+                               BrotliMemInput* mem_input) {
+  BrotliInput result;
+  result.cb_ = &BrotliMemInputFunction;
+  result.data_ = mem_input;
+
+  mem_input->pos = 0;
+  mem_input->length = length;
+  mem_input->buffer = buffer;
+  return result;
+}
+
+// Output callback for a BrotliMemOutput: appends count bytes from buf to the
+// wrapped buffer and advances the write position.
+// Returns count on success, or -1 (writing nothing) if the bytes do not fit
+// in the space left in the buffer.
+int BrotliMemOutputFunction(void* data, const uint8_t* buf, size_t count) {
+  BrotliMemOutput* output = (BrotliMemOutput*)data;
+  // Compare count with the free space rather than testing pos + count: that
+  // sum can wrap around for a very large count, which would pass the check
+  // and let memcpy write past the end of the buffer.
+  if (output->pos > output->length ||
+      count > output->length - output->pos) {
+    return -1;
+  }
+  memcpy(output->buffer + output->pos, buf, count);
+  output->pos += count;
+  return (int)count;
+}
+
+// Points mem_output at the length bytes starting at buffer, rewinds it to
+// position 0, and returns a BrotliOutput whose callback writes into it.
+BrotliOutput BrotliInitMemOutput(uint8_t* buffer, size_t length,
+                                 BrotliMemOutput* mem_output) {
+  BrotliOutput result;
+  result.cb_ = &BrotliMemOutputFunction;
+  result.data_ = mem_output;
+
+  mem_output->pos = 0;
+  mem_output->length = length;
+  mem_output->buffer = buffer;
+  return result;
+}
+
+// Input callback that reads up to count bytes from standard input into buf
+// with read() and returns its result converted to int; data is unused.
+int BrotliStdinInputFunction(void* data, uint8_t* buf, size_t count) {
+  const ssize_t num_read = read(STDIN_FILENO, buf, count);
+  (void)data;
+  return (int)num_read;
+}
+
+// Returns an input callback that reads from standard input; it carries no
+// associated data. Declared with (void) so the definition is a real
+// prototype instead of an unprototyped parameter list.
+BrotliInput BrotliStdinInput(void) {
+  BrotliInput in;
+  in.cb_ = &BrotliStdinInputFunction;
+  in.data_ = NULL;
+  return in;
+}
+
+// Output callback that writes count bytes from buf to standard output with
+// write() and returns its result converted to int; data is unused.
+int BrotliStdoutOutputFunction(void* data, const uint8_t* buf, size_t count) {
+  const ssize_t num_written = write(STDOUT_FILENO, buf, count);
+  (void)data;
+  return (int)num_written;
+}
+
+// Returns an output callback that writes to standard output; it carries no
+// associated data. Declared with (void) so the definition is a real
+// prototype instead of an unprototyped parameter list.
+BrotliOutput BrotliStdoutOutput(void) {
+  BrotliOutput out;
+  out.cb_ = &BrotliStdoutOutputFunction;
+  out.data_ = NULL;
+  return out;
+}
+
+// Output callback that writes count bytes from buf to the stdio stream
+// passed as data (a FILE*). Returns count on success or -1 if fewer than
+// count bytes could be written.
+int BrotliFileOutputFunction(void* data, const uint8_t* buf, size_t count) {
+  const size_t written = fwrite(buf, 1, count, (FILE*)data);
+  // fwrite reports failure through a short item count, never a negative
+  // value, so translate that into the -1 error return used by output
+  // callbacks.
+  if (written != count) {
+    return -1;
+  }
+  return (int)written;
+}
+
+// Returns an output callback that writes to the stdio stream f.
+BrotliOutput BrotliFileOutput(FILE* f) {
+  BrotliOutput result;
+  result.data_ = f;
+  result.cb_ = &BrotliFileOutputFunction;
+  return result;
+}
+
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/dec/streams.h
+++ b/dec/streams.h
@ -0,0 +1,102 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Functions for streaming input and output.
+
+#ifndef BROTLI_DEC_STREAMS_H_
+#define BROTLI_DEC_STREAMS_H_
+
+#include <stdio.h>
+#include "./types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Function pointer type used to read up to len bytes into buf. Returns the
+// number of bytes read (which may be fewer than len) or -1 on error.
+typedef int (*BrotliInputFunction)(void* data, uint8_t* buf, size_t len);
+
+// Input source: a read callback paired with the opaque data passed to it.
+typedef struct {
+  BrotliInputFunction cb_;  // Read callback; invoked by BrotliRead().
+  void* data_;              // Handed to cb_ as its first (data) argument.
+} BrotliInput;
+
+// Asks the in callback to read len bytes into buf; returns its result.
+static BROTLI_INLINE int BrotliRead(BrotliInput in, uint8_t* buf, size_t len) {
+  return in.cb_(in.data_, buf, len);
+}
+
+// Function pointer type used to write len bytes from buf. Returns the
+// number of bytes written or -1 on error.
+typedef int (*BrotliOutputFunction)(void* data, const uint8_t* buf, size_t len);
+
+// Output sink: a write callback paired with the opaque data passed to it.
+typedef struct {
+  BrotliOutputFunction cb_;  // Write callback; invoked by BrotliWrite().
+  void* data_;               // Handed to cb_ as its first (data) argument.
+} BrotliOutput;
+
+// Asks the out callback to write len bytes from buf; returns its result.
+static BROTLI_INLINE int BrotliWrite(BrotliOutput out,
+                                     const uint8_t* buf, size_t len) {
+  return out.cb_(out.data_, buf, len);
+}
+
+// Read-only memory region with a current read position.
+typedef struct {
+  const uint8_t* buffer;  // Start of the region to read from.
+  size_t length;          // Size of the region in bytes.
+  size_t pos;             // Offset of the next byte to read.
+} BrotliMemInput;
+
+// Input callback where *data is a BrotliMemInput struct.
+int BrotliMemInputFunction(void* data, uint8_t* buf, size_t count);
+
+// Returns an input callback that wraps the given memory region.
+BrotliInput BrotliInitMemInput(const uint8_t* buffer, size_t length,
+                               BrotliMemInput* mem_input);
+
+// Writable output buffer with a current write position.
+typedef struct {
+  uint8_t* buffer;  // Start of the buffer to write into.
+  size_t length;    // Capacity of the buffer in bytes.
+  size_t pos;       // Offset at which the next byte is written.
+} BrotliMemOutput;
+
+// Output callback where *data is a BrotliMemOutput struct.
+int BrotliMemOutputFunction(void* data, const uint8_t* buf, size_t count);
+
+// Returns an output callback that wraps the given memory region.
+BrotliOutput BrotliInitMemOutput(uint8_t* buffer, size_t length,
+                                 BrotliMemOutput* mem_output);
+
+// Input callback that reads from standard input.
+int BrotliStdinInputFunction(void* data, uint8_t* buf, size_t count);
+BrotliInput BrotliStdinInput();
+
+// Output callback that writes to standard output.
+int BrotliStdoutOutputFunction(void* data, const uint8_t* buf, size_t count);
+BrotliOutput BrotliStdoutOutput();
+
+// Output callback that writes to a file.
+int BrotliFileOutputFunction(void* data, const uint8_t* buf, size_t count);
+BrotliOutput BrotliFileOutput(FILE* f);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // BROTLI_DEC_STREAMS_H_
--- a/enc/backward_references.cc
+++ b/enc/backward_references.cc
@ -20,60 +20,64 @@
 #include <vector>

 #include "./command.h"
-#include "./hash.h"
-#include "./literal_cost.h"

 namespace brotli {

-void CreateBackwardReferences(const uint8_t* data,
-                              int length,
+void CreateBackwardReferences(size_t num_bytes,
+                              size_t position,
+                              const uint8_t* ringbuffer,
+                              const float* literal_cost,
+                              size_t ringbuffer_mask,
+                              const size_t max_backward_limit,
+                              Hasher* hasher,
                              std::vector<Command>* commands) {
-  HashLongestMatch<13,11> *hasher = new HashLongestMatch<13,11>;
-  float *literal_cost = new float[length];
-  EstimateBitCostsForLiterals(length, data, literal_cost);
-  hasher->SetLiteralCost(literal_cost);
-
  // Length heuristic that seems to help probably by better selection
  // of lazy matches of similar lengths.
  int insert_length = 0;
-  size_t i = 0;
+  size_t i = position & ringbuffer_mask;
+  const int i_diff = position - i;
+  const size_t i_end = i + num_bytes;

  double average_cost = 0.0;
-  for (int i = 0; i < length; ++i) {
-    average_cost += literal_cost[i];
+  for (int k = position; k < position + num_bytes; ++k) {
+    average_cost += literal_cost[k & ringbuffer_mask];
  }
-  average_cost /= length;
+  average_cost /= num_bytes;
  hasher->set_average_cost(average_cost);

-  while (i + 2 < length) {
+  while (i + 2 < i_end) {
    size_t best_len = 0;
    size_t best_dist = 0;
    double best_score = 0;
-    const size_t max_distance = std::min(i, 1UL << 24);
+    const size_t max_distance = std::min(i + i_diff, max_backward_limit);
    hasher->set_insert_length(insert_length);
    bool match_found = hasher->FindLongestMatch(
-        data, i, length - i, max_distance,
+        ringbuffer, literal_cost, ringbuffer_mask,
+        i + i_diff, i_end - i, max_distance,
        &best_len, &best_dist, &best_score);
    if (match_found) {
      // Found a match. Let's look for something even better ahead.
      int delayed_backward_references_in_row = 0;
-      while (i + 4 < length &&
+      while (i + 4 < i_end &&
             delayed_backward_references_in_row < 4) {
        size_t best_len_2 = 0;
        size_t best_dist_2 = 0;
        double best_score_2 = 0;
-        hasher->Store(data + i, i);
+        hasher->Store(ringbuffer + i, i + i_diff);
        match_found = hasher->FindLongestMatch(
-            data, i + 1, length - i - 1, max_distance,
+            ringbuffer, literal_cost, ringbuffer_mask,
+            i + i_diff + 1, i_end - i - 1, max_distance,
            &best_len_2, &best_dist_2, &best_score_2);
        double cost_diff_lazy = 0;
        if (best_len >= 4) {
-          cost_diff_lazy += hasher->literal_cost(i + 4) - average_cost;
+          cost_diff_lazy +=
+              literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
        }
        {
          const int tail_length = best_len_2 - best_len + 1;
          for (int k = 0; k < tail_length; ++k) {
-            cost_diff_lazy -= hasher->literal_cost(i + best_len + k) -
+            cost_diff_lazy -=
+                literal_cost[(i + best_len + k) & ringbuffer_mask] -
                average_cost;
          }
        }
@ -84,7 +88,7 @@ void CreateBackwardReferences(const uint8_t* data,
        }
        // Add bias to slightly avoid lazy matching.
        cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
-        cost_diff_lazy += 0.04 * hasher->literal_cost(i);
+        cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];

        if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
          // Ok, let's just write one byte for now and start a match from the
@ -109,18 +113,18 @@ void CreateBackwardReferences(const uint8_t* data,
      insert_length = 0;
      ++i;
      for (int j = 1; j < best_len; ++j) {
-        if (i + 2 < length) {
-          hasher->Store(data + i, i);
+        if (i + 2 < i_end) {
+          hasher->Store(ringbuffer + i, i + i_diff);
        }
        ++i;
      }
    } else {
      ++insert_length;
-      hasher->Store(data + i, i);
+      hasher->Store(ringbuffer + i, i + i_diff);
      ++i;
    }
  }
-  insert_length += (length - i);
+  insert_length += (i_end - i);

  if (insert_length > 0) {
    Command cmd;
@ -129,9 +133,6 @@ void CreateBackwardReferences(const uint8_t* data,
    cmd.copy_distance_ = 0;
    commands->push_back(cmd);
  }
-
-  delete[] literal_cost;
-  delete hasher;
 }

 }  // namespace brotli
--- a/enc/backward_references.h
+++ b/enc/backward_references.h
@ -20,12 +20,18 @@
 #include <stdint.h>
 #include <vector>

+#include "./hash.h"
 #include "./command.h"

 namespace brotli {

-void CreateBackwardReferences(const uint8_t* data,
-                              int length,
+void CreateBackwardReferences(size_t num_bytes,
+                              size_t position,
+                              const uint8_t* ringbuffer,
+                              const float* literal_cost,
+                              size_t ringbuffer_mask,
+                              const size_t max_backward_limit,
+                              Hasher* hasher,
                              std::vector<Command>* commands);

 }  // namespace brotli
--- a/enc/bit_cost.h
+++ b/enc/bit_cost.h
@ -122,26 +122,31 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {
 template<int kSize>
 double PopulationCost(const Histogram<kSize>& histogram) {
  if (histogram.total_count_ == 0) {
-    return 4;
+    return 11;
  }
-  int symbols[2] = { 0 };
  int count = 0;
-  for (int i = 0; i < kSize && count < 3; ++i) {
+  for (int i = 0; i < kSize && count < 5; ++i) {
    if (histogram.data_[i] > 0) {
-      if (count < 2) symbols[count] = i;
      ++count;
    }
  }
-  if (count <= 2 && symbols[0] < 256 && symbols[1] < 256) {
-    return ((symbols[0] <= 1 ? 4 : 11) +
-            (count == 2 ? 8 + histogram.total_count_ : 0));
+  if (count == 1) {
+    return 11;
+  }
+  if (count == 2) {
+    return 19 + histogram.total_count_;
  }
  uint8_t depth[kSize] = { 0 };
  CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
-  int bits = HuffmanBitCost(depth, kSize);
+  int bits = 0;
  for (int i = 0; i < kSize; ++i) {
    bits += histogram.data_[i] * depth[i];
  }
+  if (count == 3) {
+    bits += 27;
+  } else {
+    bits += HuffmanBitCost(depth, kSize);
+  }
  return bits;
 }

--- a/enc/context.h
+++ b/enc/context.h
@ -21,25 +21,124 @@

 namespace brotli {

-static const int kSigned2BitContextLookup[] = {
+// Second-order context lookup table for UTF8 byte streams.
+//
+// If p1 and p2 are the previous two bytes, we calculate the context as
+//
+//   context = kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256].
+//
+// If the previous two bytes are ASCII characters (i.e. < 128), this will be
+// equivalent to
+//
+//   context = 4 * context1(p1) + context2(p2),
+//
+// where context1 is based on the previous byte in the following way:
+//
+//   0  : non-ASCII control
+//   1  : \t, \n, \r
+//   2  : space
+//   3  : other punctuation
+//   4  : " '
+//   5  : %
+//   6  : ( < [ {
+//   7  : ) > ] }
+//   8  : , ; :
+//   9  : .
+//   10 : =
+//   11 : number
+//   12 : upper-case vowel
+//   13 : upper-case consonant
+//   14 : lower-case vowel
+//   15 : lower-case consonant
+//
+// and context2 is based on the second last byte:
+//
+//   0 : control, space
+//   1 : punctuation
+//   2 : upper-case letter, number
+//   3 : lower-case letter
+//
+// If the last byte is ASCII, and the second last byte is not (in a valid UTF8
+// stream it will be a continuation byte, value between 128 and 191), the
+// context is the same as if the second last byte was an ASCII control or space.
+//
+// If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
+// be a continuation byte and the context id is 2 or 3 depending on the LSB of
+// the last byte and to a lesser extent on the second last byte if it is ASCII.
+//
+// If the last byte is a UTF8 continuation byte, the second last byte can be:
+//   - continuation byte: the next byte is probably ASCII or lead byte (assuming
+//     4-byte UTF8 characters are rare) and the context id is 0 or 1.
+//   - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1
+//   - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3
+//
+// The possible value combinations of the previous two bytes, the range of
+// context ids and the type of the next byte is summarized in the table below:
+//
+// |--------\-----------------------------------------------------------------|
+// |         \                         Last byte                              |
+// | Second   \---------------------------------------------------------------|
+// | last byte \    ASCII            |   cont. byte        |   lead byte      |
+// |            \   (0-127)          |   (128-191)         |   (192-)         |
+// |=============|===================|=====================|==================|
+// |  ASCII      | next: ASCII/lead  |  not valid          |  next: cont.     |
+// |  (0-127)    | context: 4 - 63   |                     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  cont. byte | next: ASCII/lead  |  next: ASCII/lead   |  next: cont.     |
+// |  (128-191)  | context: 4 - 63   |  context: 0 - 1     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: ASCII/lead   |  not valid       |
+// |  (192-207)  |                   |  context: 0 - 1     |                  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: cont.        |  not valid       |
+// |  (208-)     |                   |  context: 2 - 3     |                  |
+// |-------------|-------------------|---------------------|------------------|
+static const uint8_t kUTF8ContextLookup[512] = {
+  // Last byte.
+  //
+  // ASCII range.
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+   8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
+  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
+  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
+  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
+  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
+  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,
+  // UTF8 continuation byte range.
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  // UTF8 lead byte range.
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  // Second last byte.
+  //
+  // ASCII range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
+  // UTF8 continuation byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // UTF8 lead byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
 };

+// Context lookup table for small signed integers.
 static const int kSigned3BitContextLookup[] = {
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@ -59,69 +158,25 @@ static const int kSigned3BitContextLookup[] = {
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
 };

-static const int kSigned4BitContextLookup[] = {
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 15,
-};
-
 enum ContextType {
-  CONTEXT_NONE        = 0,
-  CONTEXT_FULL        = 1,
-  CONTEXT_MSB7        = 2,
-  CONTEXT_MSB6        = 3,
-  CONTEXT_MSB5        = 4,
-  CONTEXT_MSB4        = 5,
-  CONTEXT_MSB3        = 6,
-  CONTEXT_MSB2        = 7,
-  CONTEXT_MSB1        = 8,
-  CONTEXT_IS_ZERO     = 9,
-  CONTEXT_SIGNED_2BIT = 10,
-  CONTEXT_SIGNED_3BIT = 11,
-  CONTEXT_SIGNED_4BIT = 12,
-  CONTEXT_SIGNED_MIXED_3BYTE = 13,
+  CONTEXT_LSB6         = 0,
+  CONTEXT_MSB6         = 1,
+  CONTEXT_UTF8         = 2,
+  CONTEXT_SIGNED       = 3
 };

-static const int kContextSize[] = {
-  1, 256, 128, 64, 32, 16, 8, 4, 2, 2, 4, 8, 16, 64,
-};
-
-static inline int NumContexts(int mode) {
-  return kContextSize[mode];
-}
-
-static inline uint8_t Context(uint8_t prev_byte, uint8_t prev_byte2,
-                              uint8_t prev_byte3, int mode) {
+static inline uint8_t Context(uint8_t p1, uint8_t p2, int mode) {
  switch (mode) {
-    case CONTEXT_NONE:
-      return 0;
-    case CONTEXT_IS_ZERO:
-      return prev_byte == 0 ? 0 : 1;
-    case CONTEXT_SIGNED_2BIT:
-      return kSigned2BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_3BIT:
-      return kSigned3BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_4BIT:
-      return kSigned4BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_MIXED_3BYTE:
-      return ((kSigned3BitContextLookup[prev_byte] << 3) +
-              (kSigned2BitContextLookup[prev_byte2] << 1) +
-              (prev_byte3 == 0 ? 0 : 1));
+    case CONTEXT_LSB6:
+      return p1 & 0x3f;
+    case CONTEXT_MSB6:
+      return p1 >> 2;
+    case CONTEXT_UTF8:
+      return kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256];
+    case CONTEXT_SIGNED:
+      return (kSigned3BitContextLookup[p1] << 3) + kSigned3BitContextLookup[p2];
    default:
-      return prev_byte >> (mode - 1);
+      return 0;
  }
 }

--- a/enc/encode.cc
+++ b/enc/encode.cc
@ -26,7 +26,9 @@
 #include "./context.h"
 #include "./entropy_encode.h"
 #include "./fast_log.h"
+#include "./hash.h"
 #include "./histogram.h"
+#include "./literal_cost.h"
 #include "./prefix.h"
 #include "./write_bits.h"

@ -41,31 +43,39 @@ double Entropy(const std::vector<Histogram<kSize> >& histograms) {
  return retval;
 }

+// Returns the sum of PopulationCost() over all histograms in the vector.
+template<int kSize>
+double TotalBitCost(const std::vector<Histogram<kSize> >& histograms) {
+  double retval = 0;
+  // Index with size_t so the loop bound matches the unsigned type returned
+  // by size() instead of comparing it against a signed int.
+  for (size_t i = 0; i < histograms.size(); ++i) {
+    retval += PopulationCost(histograms[i]);
+  }
+  return retval;
+}
+
 void EncodeSize(size_t len, int* storage_ix, uint8_t* storage) {
  std::vector<uint8_t> len_bytes;
-  while (len > 0) {
+  do {
    len_bytes.push_back(len & 0xff);
    len >>= 8;
-  };
+  } while (len > 0);
  WriteBits(3, len_bytes.size(), storage_ix, storage);
  for (int i = 0; i < len_bytes.size(); ++i) {
    WriteBits(8, len_bytes[i], storage_ix, storage);
  }
 }

-void EncodeMetaBlockLength(int input_size_bits,
-                           size_t meta_block_size,
-                           bool is_last_meta_block,
+void EncodeMetaBlockLength(size_t meta_block_size,
                           int* storage_ix, uint8_t* storage) {
-  WriteBits(1, is_last_meta_block, storage_ix, storage);
-  if (is_last_meta_block) return;
-  while (input_size_bits > 0) {
-    WriteBits(8, meta_block_size & 0xff, storage_ix, storage);
-    meta_block_size >>= 8;
-    input_size_bits -= 8;
+  WriteBits(1, 0, storage_ix, storage);
+  int num_bits = Log2Floor(meta_block_size) + 1;
+  WriteBits(3, (num_bits + 3) >> 2, storage_ix, storage);
+  while (num_bits > 0) {
+    WriteBits(4, meta_block_size & 0xf, storage_ix, storage);
+    meta_block_size >>= 4;
+    num_bits -= 4;
  }
-  if (input_size_bits > 0) {
-    WriteBits(input_size_bits, meta_block_size, storage_ix, storage);
+  if (num_bits > 0) {
+    WriteBits(num_bits, meta_block_size, storage_ix, storage);
  }
 }

@ -82,7 +92,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    const uint8_t* code_length_bitdepth,
    int* storage_ix, uint8_t* storage) {
  static const uint8_t kStorageOrder[kCodeLengthCodes] = {
-    17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    1, 2, 3, 4, 0, 17, 18, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
  };
  // Throw away trailing zeros:
  int codes_to_store = kCodeLengthCodes;
@ -92,8 +102,16 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    }
  }
  WriteBits(4, codes_to_store - 4, storage_ix, storage);
-  for (int i = 0; i < codes_to_store; ++i) {
-    WriteBits(3, code_length_bitdepth[kStorageOrder[i]], storage_ix, storage);
+  const int skip_two_first =
+      code_length_bitdepth[kStorageOrder[0]] == 0 &&
+      code_length_bitdepth[kStorageOrder[1]] == 0;
+  WriteBits(1, skip_two_first, storage_ix, storage);
+
+  for (int i = skip_two_first * 2; i < codes_to_store; ++i) {
+    uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
+    uint8_t bits[] = { 0, 7, 3, 1, 2, 15 };
+    int v = code_length_bitdepth[kStorageOrder[i]];
+    WriteBits(len[v], bits[v], storage_ix, storage);
  }
 }

@ -124,30 +142,49 @@ void StoreHuffmanTreeToBitMask(
 template<int kSize>
 void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
                      int* storage_ix, uint8_t* storage) {
-  const int kMaxBits = 8;
-  const int kMaxSymbol = 1 << kMaxBits;
-
+  const uint8_t *depth = &code.depth_[0];
+  int max_bits_counter = alphabet_size - 1;
+  int max_bits = 0;
+  while (max_bits_counter) {
+    max_bits_counter >>= 1;
+    ++max_bits;
+  }
  if (code.count_ == 0) {   // emit minimal tree for empty cases
-    // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
-    WriteBits(4, 0x01, storage_ix, storage);
+    // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0
+    WriteBits(3 + max_bits, 0x01, storage_ix, storage);
    return;
  }
-  if (code.count_ <= 2 &&
-      code.symbols_[0] < kMaxSymbol &&
-      code.symbols_[1] < kMaxSymbol) {
-    // Small tree marker to encode 1 or 2 symbols.
-    WriteBits(1, 1, storage_ix, storage);
-    WriteBits(1, code.count_ - 1, storage_ix, storage);
-    if (code.symbols_[0] <= 1) {
-      // Code bit for small (1 bit) symbol value.
-      WriteBits(1, 0, storage_ix, storage);
-      WriteBits(1, code.symbols_[0], storage_ix, storage);
-    } else {
-      WriteBits(1, 1, storage_ix, storage);
-      WriteBits(8, code.symbols_[0], storage_ix, storage);
+  if (code.count_ <= 4) {
+    int symbols[4];
+    // Quadratic sort.
+    int k, j;
+    for (k = 0; k < code.count_; ++k) {
+      symbols[k] = code.symbols_[k];
    }
-    if (code.count_ == 2) {
-      WriteBits(8, code.symbols_[1], storage_ix, storage);
+    for (k = 0; k < code.count_; ++k) {
+      for (j = k + 1; j < code.count_; ++j) {
+        if (depth[symbols[j]] < depth[symbols[k]]) {
+          int t = symbols[k];
+          symbols[k] = symbols[j];
+          symbols[j] = t;
+        }
+      }
+    }
+    // Small tree marker to encode 1-4 symbols.
+    WriteBits(1, 1, storage_ix, storage);
+    WriteBits(2, code.count_ - 1, storage_ix, storage);
+    for (int i = 0; i < code.count_; ++i) {
+      WriteBits(max_bits, symbols[i], storage_ix, storage);
+    }
+    if (code.count_ == 4) {
+      if (depth[symbols[0]] == 2 &&
+          depth[symbols[1]] == 2 &&
+          depth[symbols[2]] == 2 &&
+          depth[symbols[3]] == 2) {
+        WriteBits(1, 0, storage_ix, storage);
+      } else {
+        WriteBits(1, 1, storage_ix, storage);
+      }
    }
    return;
  }
@ -156,7 +193,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
  uint8_t huffman_tree[kSize];
  uint8_t huffman_tree_extra_bits[kSize];
  int huffman_tree_size = 0;
-  WriteHuffmanTree(&code.depth_[0],
+  WriteHuffmanTree(depth,
                   alphabet_size,
                   &huffman_tree[0],
                   &huffman_tree_extra_bits[0],
@ -167,7 +204,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
    huffman_tree_histogram.Add(huffman_tree[i]);
  }
  EntropyCode<kCodeLengthCodes> huffman_tree_entropy;
-  BuildEntropyCode(huffman_tree_histogram, 7, kCodeLengthCodes,
+  BuildEntropyCode(huffman_tree_histogram, 5, kCodeLengthCodes,
                   &huffman_tree_entropy);
  Histogram<kCodeLengthCodes> trimmed_histogram = huffman_tree_histogram;
  uint8_t* last_code = &huffman_tree[huffman_tree_size - 1];
@ -178,7 +215,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
  bool write_length = false;
  if (trimmed_size > 1 && trimmed_size < huffman_tree_size) {
    EntropyCode<kCodeLengthCodes> trimmed_entropy;
-    BuildEntropyCode(trimmed_histogram, 7, kCodeLengthCodes, &trimmed_entropy);
+    BuildEntropyCode(trimmed_histogram, 5, kCodeLengthCodes, &trimmed_entropy);
    int huffman_bit_cost = HuffmanTreeBitCost(huffman_tree_histogram,
                                              huffman_tree_entropy);
    int trimmed_bit_cost = HuffmanTreeBitCost(trimmed_histogram,
@ -247,16 +284,15 @@ void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy,
  }
 }

-
-void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
+void ComputeDistanceShortCodes(std::vector<Command>* cmds,
+                               int* dist_ringbuffer,
+                               size_t* ringbuffer_idx) {
  static const int kIndexOffset[16] = {
    3, 2, 1, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2
  };
  static const int kValueOffset[16] = {
    0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
  };
-  int dist_ringbuffer[4] = { 4, 11, 15, 16 };
-  int ringbuffer_idx = 0;
  for (int i = 0; i < cmds->size(); ++i) {
    int cur_dist = (*cmds)[i].copy_distance_;
    if (cur_dist == 0) break;
@ -268,7 +304,7 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
        // with them.
        continue;
      }
-      int comp = (dist_ringbuffer[(ringbuffer_idx + kIndexOffset[k]) & 3] +
+      int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] +
                  kValueOffset[k]);
      if (cur_dist == comp) {
        dist_code = k + 1;
@ -276,8 +312,8 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
      }
    }
    if (dist_code > 1) {
-      dist_ringbuffer[ringbuffer_idx & 3] = cur_dist;
-      ++ringbuffer_idx;
+      dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist;
+      ++(*ringbuffer_idx);
    }
    (*cmds)[i].distance_code_ = dist_code;
  }
@ -414,19 +450,8 @@ int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
 }

 void EncodeContextMap(const std::vector<int>& context_map,
-                      int context_mode,
-                      int context_mode_bits,
                      int num_clusters,
                      int* storage_ix, uint8_t* storage) {
-  if (context_mode == 0) {
-    WriteBits(1, 0, storage_ix, storage);  // no context
-    return;
-  }
-
-  WriteBits(1, 1, storage_ix, storage);  // have context
-  if (context_mode_bits > 0) {
-    WriteBits(context_mode_bits, context_mode - 1, storage_ix, storage);
-  }
  WriteBits(8, num_clusters - 1, storage_ix, storage);

  if (num_clusters == 1 || num_clusters == context_map.size()) {
@ -560,7 +585,6 @@ struct EncodingParams {
  int num_direct_distance_codes;
  int distance_postfix_bits;
  int literal_context_mode;
-  int distance_context_mode;
 };

 struct MetaBlock {
@ -569,6 +593,7 @@ struct MetaBlock {
  BlockSplit literal_split;
  BlockSplit command_split;
  BlockSplit distance_split;
+  std::vector<int> literal_context_modes;
  std::vector<int> literal_context_map;
  std::vector<int> distance_context_map;
  std::vector<HistogramLiteral> literal_histograms;
@ -578,8 +603,9 @@ struct MetaBlock {

 void BuildMetaBlock(const EncodingParams& params,
                    const std::vector<Command>& cmds,
-                    const uint8_t* input_buffer,
-                    size_t pos,
+                    const uint8_t* ringbuffer,
+                    const size_t pos,
+                    const size_t mask,
                    MetaBlock* mb) {
  mb->cmds = cmds;
  mb->params = params;
@ -587,7 +613,7 @@ void BuildMetaBlock(const EncodingParams& params,
                         mb->params.num_direct_distance_codes,
                         mb->params.distance_postfix_bits);
  SplitBlock(mb->cmds,
-             input_buffer + pos,
+             &ringbuffer[pos & mask],
             &mb->literal_split,
             &mb->command_split,
             &mb->distance_split);
@ -595,16 +621,14 @@ void BuildMetaBlock(const EncodingParams& params,
  ComputeBlockTypeShortCodes(&mb->command_split);
  ComputeBlockTypeShortCodes(&mb->distance_split);

-  int num_literal_contexts_per_block_type =
-      NumContexts(mb->params.literal_context_mode);
+  mb->literal_context_modes.resize(mb->literal_split.num_types_,
+                                   mb->params.literal_context_mode);
+
+
  int num_literal_contexts =
-      mb->literal_split.num_types_ *
-      num_literal_contexts_per_block_type;
-  int num_distance_contexts_per_block_type =
-      (mb->params.distance_context_mode > 0 ? 4 : 1);
+      mb->literal_split.num_types_ << kLiteralContextBits;
  int num_distance_contexts =
-      mb->distance_split.num_types_ *
-      num_distance_contexts_per_block_type;
+      mb->distance_split.num_types_ << kDistanceContextBits;
  std::vector<HistogramLiteral> literal_histograms(num_literal_contexts);
  mb->command_histograms.resize(mb->command_split.num_types_);
  std::vector<HistogramDistance> distance_histograms(num_distance_contexts);
@ -612,10 +636,10 @@ void BuildMetaBlock(const EncodingParams& params,
                  mb->literal_split,
                  mb->command_split,
                  mb->distance_split,
-                  input_buffer,
+                  ringbuffer,
                  pos,
-                  mb->params.literal_context_mode,
-                  mb->params.distance_context_mode,
+                  mask,
+                  mb->literal_context_modes,
                  &literal_histograms,
                  &mb->command_histograms,
                  &distance_histograms);
@ -625,24 +649,20 @@ void BuildMetaBlock(const EncodingParams& params,
  static const int kMaxNumberOfHistograms = 240;

  mb->literal_histograms = literal_histograms;
-  if (mb->params.literal_context_mode > 0) {
-    ClusterHistograms(literal_histograms,
-                      num_literal_contexts_per_block_type,
-                      mb->literal_split.num_types_,
-                      kMaxNumberOfHistograms,
-                      &mb->literal_histograms,
-                      &mb->literal_context_map);
-  }
+  ClusterHistograms(literal_histograms,
+                    1 << kLiteralContextBits,
+                    mb->literal_split.num_types_,
+                    kMaxNumberOfHistograms,
+                    &mb->literal_histograms,
+                    &mb->literal_context_map);

  mb->distance_histograms = distance_histograms;
-  if (mb->params.distance_context_mode > 0) {
-    ClusterHistograms(distance_histograms,
-                      num_distance_contexts_per_block_type,
-                      mb->distance_split.num_types_,
-                      kMaxNumberOfHistograms,
-                      &mb->distance_histograms,
-                      &mb->distance_context_map);
-  }
+  ClusterHistograms(distance_histograms,
+                    1 << kDistanceContextBits,
+                    mb->distance_split.num_types_,
+                    kMaxNumberOfHistograms,
+                    &mb->distance_histograms,
+                    &mb->distance_context_map);
 }

 size_t MetaBlockLength(const std::vector<Command>& cmds) {
@ -655,14 +675,13 @@ size_t MetaBlockLength(const std::vector<Command>& cmds) {
 }

 void StoreMetaBlock(const MetaBlock& mb,
-                    const uint8_t* input_buffer,
-                    int input_size_bits,
-                    bool is_last,
+                    const uint8_t* ringbuffer,
+                    const size_t mask,
                    size_t* pos,
                    int* storage_ix, uint8_t* storage) {
  size_t length = MetaBlockLength(mb.cmds);
  const size_t end_pos = *pos + length;
-  EncodeMetaBlockLength(input_size_bits, length - 1, is_last,
+  EncodeMetaBlockLength(length - 1,
                        storage_ix, storage);
  BlockSplitCode literal_split_code;
  BlockSplitCode command_split_code;
@ -680,10 +699,11 @@ void StoreMetaBlock(const MetaBlock& mb,
  int num_distance_codes =
      kNumDistanceShortCodes + mb.params.num_direct_distance_codes +
      (48 << mb.params.distance_postfix_bits);
-  EncodeContextMap(mb.literal_context_map, mb.params.literal_context_mode, 4,
-                   mb.literal_histograms.size(), storage_ix, storage);
-  EncodeContextMap(mb.distance_context_map, mb.params.distance_context_mode, 0,
-                   mb.distance_histograms.size(), storage_ix, storage);
+  for (int i = 0; i < mb.literal_split.num_types_; ++i) {
+    WriteBits(2, mb.literal_context_modes[i], storage_ix, storage);
+  }
+  EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(), storage_ix, storage);
+  EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(), storage_ix, storage);
  std::vector<EntropyCodeLiteral> literal_codes;
  std::vector<EntropyCodeCommand> command_codes;
  std::vector<EntropyCodeDistance> distance_codes;
@ -705,27 +725,22 @@ void StoreMetaBlock(const MetaBlock& mb,
    for (int j = 0; j < cmd.insert_length_; ++j) {
      MoveAndEncode(literal_split_code, &literal_it, storage_ix, storage);
      int histogram_idx = literal_it.type_;
-      if (mb.params.literal_context_mode > 0) {
-        uint8_t prev_byte = *pos > 0 ? input_buffer[*pos - 1] : 0;
-        uint8_t prev_byte2 = *pos > 1 ? input_buffer[*pos - 2] : 0;
-        uint8_t prev_byte3 = *pos > 2 ? input_buffer[*pos - 3] : 0;
-        int context = (literal_it.type_ *
-                       NumContexts(mb.params.literal_context_mode) +
-                       Context(prev_byte, prev_byte2, prev_byte3,
-                               mb.params.literal_context_mode));
-        histogram_idx = mb.literal_context_map[context];
-      }
-      EntropyEncode(input_buffer[(*pos)++],
+      uint8_t prev_byte = *pos > 0 ? ringbuffer[(*pos - 1) & mask] : 0;
+      uint8_t prev_byte2 = *pos > 1 ? ringbuffer[(*pos - 2) & mask] : 0;
+      int context = ((literal_it.type_ << kLiteralContextBits) +
+                     Context(prev_byte, prev_byte2,
+                             mb.literal_context_modes[literal_it.type_]));
+      histogram_idx = mb.literal_context_map[context];
+      EntropyEncode(ringbuffer[*pos & mask],
                    literal_codes[histogram_idx], storage_ix, storage);
+      ++(*pos);
    }
    if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) {
      MoveAndEncode(distance_split_code, &distance_it, storage_ix, storage);
      int histogram_index = distance_it.type_;
-      if (mb.params.distance_context_mode > 0) {
-        int context = distance_it.type_ << 2;
-        context += (cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2;
-        histogram_index = mb.distance_context_map[context];
-      }
+      int context = (distance_it.type_ << 2) +
+          ((cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2);
+      histogram_index = mb.distance_context_map[context];
      EncodeCopyDistance(cmd, distance_codes[histogram_index],
                         storage_ix, storage);
    }
@ -733,45 +748,123 @@ void StoreMetaBlock(const MetaBlock& mb,
  }
 }

+// Window size parameter of the stream: WriteStreamHeader stores it as
+// kWindowBits - 17, and it bounds kMaxBackwardDistance below.
+static const int kWindowBits = 22;
+// To make decoding faster, we allow the decoder to write 16 bytes ahead in
+// its ringbuffer, therefore the encoder has to decrease max distance by this
+// amount.
+static const int kDecoderRingBufferWriteAheadSlack = 16;
+// Largest backward distance handed to CreateBackwardReferences.
+static const int kMaxBackwardDistance =
+    (1 << kWindowBits) - kDecoderRingBufferWriteAheadSlack;
+
+// Log2 of the largest input chunk BrotliCompressBuffer passes to a single
+// WriteMetaBlock call. Also used as the tail size of the input ring buffer
+// and to size the compressor's output storage (2 << kMetaBlockSizeBits bytes).
+static const int kMetaBlockSizeBits = 21;
+// Log2 of the size of the input ring buffer and of the literal cost buffer.
+static const int kRingBufferBits = 23;
+// Mask that turns an absolute input position into a ring buffer index.
+static const int kRingBufferMask = (1 << kRingBufferBits) - 1;
+
+// Allocates the hasher and the output storage and puts the encoder into its
+// start-of-stream state.
+BrotliCompressor::BrotliCompressor()
+    : hasher_(new Hasher),
+      dist_ringbuffer_idx_(0),
+      input_pos_(0),
+      ringbuffer_(kRingBufferBits, kMetaBlockSizeBits),
+      literal_cost_(1 << kRingBufferBits),
+      storage_ix_(0),
+      storage_(new uint8_t[2 << kMetaBlockSizeBits]) {
+  // Distances the distance ring buffer holds before any copy was emitted.
+  static const int kInitialDistances[4] = { 4, 11, 15, 16 };
+  for (int slot = 0; slot < 4; ++slot) {
+    dist_ringbuffer_[slot] = kInitialDistances[slot];
+  }
+  // Only the first output byte is cleared here; storage_ix_ starts at bit 0.
+  storage_[0] = 0;
+}
+
+// Frees the hasher object and the output storage array held by this
+// compressor.
+BrotliCompressor::~BrotliCompressor() {
+  delete hasher_;
+  delete[] storage_;
+}
+
+// Appends the stream header to the internal storage_ bit buffer: a 3-bit
+// zero field (no input size is stored), then a 1-bit flag followed by the
+// 3-bit value kWindowBits - 17 describing the window size. Nothing is copied
+// to a caller buffer here; the bits leave storage_ with the next meta-block.
+void BrotliCompressor::WriteStreamHeader() {
+  // Don't encode input size.
+  WriteBits(3, 0, &storage_ix_, storage_);
+  // Encode window size.
+  WriteBits(1, 1, &storage_ix_, storage_);
+  WriteBits(3, kWindowBits - 17, &storage_ix_, storage_);
+}
+
+// Compresses input_buffer[0, input_size) as one meta-block.
+//
+// The input is appended to the ring buffer, literal costs are estimated,
+// backward references are searched, and the resulting meta-block is written
+// to the internal storage_ bit buffer. All complete bytes of storage_ are then
+// copied to encoded_buffer and their count is stored in *encoded_size; the
+// bits of the last, unfinished byte stay in storage_ for the next call.
+//
+// NOTE(review): the incoming value of *encoded_size is never read, so the
+// memcpy below is not bounded by the capacity of encoded_buffer. Callers have
+// to provide a buffer that is large enough.
+void BrotliCompressor::WriteMetaBlock(const size_t input_size,
+                                      const uint8_t* input_buffer,
+                                      size_t* encoded_size,
+                                      uint8_t* encoded_buffer) {
+  ringbuffer_.Write(input_buffer, input_size);
+  EstimateBitCostsForLiterals(input_pos_, input_size,
+                              kRingBufferMask, ringbuffer_.start(),
+                              &literal_cost_[0]);
+  std::vector<Command> commands;
+  CreateBackwardReferences(input_size, input_pos_,
+                           ringbuffer_.start(),
+                           &literal_cost_[0],
+                           kRingBufferMask, kMaxBackwardDistance,
+                           hasher_,
+                           &commands);
+  // The distance ring buffer is a member, so its state carries over from one
+  // meta-block to the next.
+  ComputeDistanceShortCodes(&commands, dist_ringbuffer_,
+                            &dist_ringbuffer_idx_);
+  EncodingParams params;
+  params.num_direct_distance_codes = 12;
+  params.distance_postfix_bits = 1;
+  params.literal_context_mode = CONTEXT_SIGNED;
+  MetaBlock mb;
+  BuildMetaBlock(params, commands, ringbuffer_.start(), input_pos_,
+                 kRingBufferMask, &mb);
+  // input_pos_ is passed by pointer and updated by StoreMetaBlock.
+  StoreMetaBlock(mb, ringbuffer_.start(), kRingBufferMask,
+                 &input_pos_, &storage_ix_, storage_);
+  // Hand out the whole bytes only.
+  size_t output_size = storage_ix_ >> 3;
+  memcpy(encoded_buffer, storage_, output_size);
+  *encoded_size = output_size;
+  // storage_ix_ is now below 8: move the unfinished byte to the front of
+  // storage_ so that the next write continues in it.
+  storage_ix_ -= output_size << 3;
+  storage_[storage_ix_ >> 3] = storage_[output_size];
+}
+
+// Terminates the stream: appends a single set bit to the internal bit buffer
+// and copies the buffer, rounded up to whole bytes, to encoded_buffer.
+// *encoded_size receives the number of bytes copied.
+void BrotliCompressor::FinishStream(
+    size_t* encoded_size, uint8_t* encoded_buffer) {
+  WriteBits(1, 1, &storage_ix_, storage_);
+  // Round the bit position up so that the last partial byte is included.
+  const size_t num_bytes = (storage_ix_ + 7) >> 3;
+  *encoded_size = num_bytes;
+  memcpy(encoded_buffer, storage_, num_bytes);
+}
+
+
 int BrotliCompressBuffer(size_t input_size,
                         const uint8_t* input_buffer,
                         size_t* encoded_size,
                         uint8_t* encoded_buffer) {
-  int storage_ix = 0;
-  uint8_t* storage = encoded_buffer;
-  WriteBitsPrepareStorage(storage_ix, storage);
-  EncodeSize(input_size, &storage_ix, storage);
-
  if (input_size == 0) {
-    *encoded_size = (storage_ix + 7) >> 3;
+    encoded_buffer[0] = 1;
+    encoded_buffer[1] = 0;
+    *encoded_size = 2;
    return 1;
  }
-  int input_size_bits = Log2Ceiling(input_size);

-  std::vector<Command> all_commands;
-  CreateBackwardReferences(input_buffer, input_size, &all_commands);
-  ComputeDistanceShortCodes(&all_commands);
+  BrotliCompressor compressor;
+  compressor.WriteStreamHeader();

-  std::vector<std::vector<Command> > meta_block_commands;
-  SplitBlockByTotalLength(all_commands, input_size, 2 << 20,
-                          &meta_block_commands);
+  const int max_block_size = 1 << kMetaBlockSizeBits;
+  size_t max_output_size = *encoded_size;
+  const uint8_t* input_end = input_buffer + input_size;
+  *encoded_size = 0;

-  size_t pos = 0;
-  for (int block_idx = 0; block_idx < meta_block_commands.size(); ++block_idx) {
-    const std::vector<Command>& commands = meta_block_commands[block_idx];
-    bool is_last_meta_block = (block_idx + 1 == meta_block_commands.size());
-    EncodingParams params;
-    params.num_direct_distance_codes = 12;
-    params.distance_postfix_bits = 1;
-    params.literal_context_mode = CONTEXT_SIGNED_MIXED_3BYTE;
-    params.distance_context_mode = 1;
-    MetaBlock mb;
-    BuildMetaBlock(params, commands, input_buffer, pos, &mb);
-    StoreMetaBlock(mb, input_buffer, input_size_bits, is_last_meta_block,
-                   &pos, &storage_ix, storage);
+  while (input_buffer < input_end) {
+    int block_size = max_block_size;
+    if (block_size >= input_end - input_buffer) {
+      block_size = input_end - input_buffer;
+    }
+    size_t output_size = max_output_size;
+    compressor.WriteMetaBlock(block_size, input_buffer,
+                              &output_size, &encoded_buffer[*encoded_size]);
+    input_buffer += block_size;
+    *encoded_size += output_size;
+    max_output_size -= output_size;
  }

-  *encoded_size = (storage_ix + 7) >> 3;
+  size_t output_size = max_output_size;
+  compressor.FinishStream(&output_size, &encoded_buffer[*encoded_size]);
+  *encoded_size += output_size;
+
  return 1;
 }

--- a/enc/encode.h
+++ b/enc/encode.h
@ -20,9 +20,45 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string>
+#include <vector>
+#include "./hash.h"
+#include "./ringbuffer.h"

 namespace brotli {

+// Streaming encoder: the stream header, each meta-block and the end of the
+// stream are produced by separate calls, in that order.
+class BrotliCompressor {
+ public:
+  BrotliCompressor();
+  ~BrotliCompressor();
+
+  // Writes the stream header into the internal output buffer.
+  void WriteStreamHeader();
+
+  // Encodes the data in input_buffer as a meta-block, writes it to
+  // encoded_buffer and sets *encoded_size to the number of bytes that were
+  // written.
+  void WriteMetaBlock(const size_t input_size,
+                      const uint8_t* input_buffer,
+                      size_t* encoded_size,
+                      uint8_t* encoded_buffer);
+
+  // Writes a zero-length meta-block with end-of-input bit set to the
+  // internal output buffer, copies the output buffer to encoded_buffer and
+  // sets *encoded_size to the number of bytes written.
+  void FinishStream(size_t* encoded_size, uint8_t* encoded_buffer);
+
+ private:
+  // hasher_ and storage_ are raw pointers that the destructor deletes, so a
+  // member-wise copy would free them twice. Copying is therefore disabled:
+  // these two are declared but intentionally never defined.
+  BrotliCompressor(const BrotliCompressor&);
+  BrotliCompressor& operator=(const BrotliCompressor&);
+
+  // Match finder used for the backward reference search; owned.
+  Hasher* hasher_;
+  // Ring buffer of recent distances and its write index; both persist across
+  // meta-blocks.
+  int dist_ringbuffer_[4];
+  size_t dist_ringbuffer_idx_;
+  // Position in the input stream up to which data has been encoded.
+  size_t input_pos_;
+  // Sliding window over the input data.
+  RingBuffer ringbuffer_;
+  // Estimated literal bit costs, indexed like ringbuffer_.
+  std::vector<float> literal_cost_;
+  // Bit position of the next write into storage_.
+  int storage_ix_;
+  // Output bit buffer; owned.
+  uint8_t* storage_;
+};
+
 // Compresses the data in input_buffer into encoded_buffer, and sets
 // *encoded_size to the compressed length.
 // Returns 0 if there was an error and 1 otherwise.
--- a/enc/entropy_encode.cc
+++ b/enc/entropy_encode.cc
@ -43,6 +43,9 @@ HuffmanTree::HuffmanTree() {}

 // Sort the root nodes, least popular first.
 bool SortHuffmanTree(const HuffmanTree &v0, const HuffmanTree &v1) {
+  // Nodes with equal counts are ordered by descending
+  // index_right_or_value_, so the comparison never leaves such a pair
+  // unordered.
+  if (v0.total_count_ == v1.total_count_) {
+    return v0.index_right_or_value_ > v1.index_right_or_value_;
+  }
  return v0.total_count_ < v1.total_count_;
 }

@ -276,7 +279,7 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
  }
  // 3) Let's replace those population counts that lead to more rle codes.
  stride = 0;
-  limit = counts[0];
+  limit = (counts[0] + counts[1] + counts[2]) / 3 + 1;
  sum = 0;
  for (i = 0; i < length + 1; ++i) {
    if (i == length || good_for_rle[i] ||
@ -301,11 +304,10 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
      }
      stride = 0;
      sum = 0;
-      if (i < length - 3) {
+      if (i < length - 2) {
        // All interesting strides have a count of at least 4,
        // at least when non-zeros.
-        limit = (counts[i] + counts[i + 1] +
-                 counts[i + 2] + counts[i + 3] + 2) / 4;
+        limit = (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 1;
      } else if (i < length) {
        limit = counts[i];
      } else {
@ -329,7 +331,7 @@ void WriteHuffmanTree(const uint8_t* depth, const int length,
                      uint8_t* tree,
                      uint8_t* extra_bits_data,
                      int* huffman_tree_size) {
-  int previous_value = 0;
+  int previous_value = 8;
  for (uint32_t i = 0; i < length;) {
    const int value = depth[i];
    int reps = 1;
--- a/enc/entropy_encode.h
+++ b/enc/entropy_encode.h
@ -66,8 +66,8 @@ struct EntropyCode {
  uint16_t bits_[kSize];
  // How many non-zero depth.
  int count_;
-  // First two symbols with non-zero depth.
-  int symbols_[2];
+  // First four symbols with non-zero depth.
+  int symbols_[4];
 };

 template<int kSize>
@ -82,7 +82,7 @@ void BuildEntropyCode(const Histogram<kSize>& histogram,
  if (histogram.total_count_ == 0) return;
  for (int i = 0; i < kSize; ++i) {
    if (histogram.data_[i] > 0) {
-      if (code->count_ < 2) code->symbols_[code->count_] = i;
+      if (code->count_ < 4) code->symbols_[code->count_] = i;
      ++code->count_;
    }
  }
--- a/enc/hash.h
+++ b/enc/hash.h
@ -103,8 +103,7 @@ template <int kBucketBits, int kBlockBits>
 class HashLongestMatch {
 public:
  HashLongestMatch()
-      : literal_cost_(NULL),
-        last_distance1_(4),
+      : last_distance1_(4),
        last_distance2_(11),
        last_distance3_(15),
        last_distance4_(16),
@ -115,10 +114,6 @@ class HashLongestMatch {
  void Reset() {
    std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
  }
-  void SetLiteralCost(float *cost) {
-    literal_cost_ = cost;
-  }
-  double literal_cost(int i) const { return literal_cost_[i]; }

  // Look at 3 bytes at data.
  // Compute a hash from these, and store the value of ix at that position.
@ -146,25 +141,27 @@ class HashLongestMatch {
  // into best_distance_out.
  // Write the score of the best match into best_score_out.
  bool FindLongestMatch(const uint8_t * __restrict data,
+                        const float * __restrict literal_cost,
+                        const size_t ring_buffer_mask,
                        const uint32_t cur_ix,
                        uint32_t max_length,
                        const uint32_t max_backward,
                        size_t * __restrict best_len_out,
                        size_t * __restrict best_distance_out,
                        double * __restrict best_score_out) {
-    const double start_cost4 = literal_cost_ == NULL ? 20 :
-        literal_cost_[cur_ix] +
-        literal_cost_[cur_ix + 1] +
-        literal_cost_[cur_ix + 2] +
-        literal_cost_[cur_ix + 3];
-
-    const double start_cost3 = literal_cost_ == NULL ? 15 :
-        literal_cost_[cur_ix] +
-        literal_cost_[cur_ix + 1] +
-        literal_cost_[cur_ix + 2] + 0.3;
-    double start_cost2 = literal_cost_ == NULL ? 10 :
-        literal_cost_[cur_ix] +
-        literal_cost_[cur_ix + 1] + 1.2;
+    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
+    const double start_cost4 = literal_cost == NULL ? 20 :
+        literal_cost[cur_ix_masked] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
+        literal_cost[(cur_ix + 2) & ring_buffer_mask] +
+        literal_cost[(cur_ix + 3) & ring_buffer_mask];
+    const double start_cost3 = literal_cost == NULL ? 15 :
+        literal_cost[cur_ix_masked] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
+        literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
+    double start_cost2 = literal_cost == NULL ? 10 :
+        literal_cost[cur_ix_masked] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
    bool match_found = false;
    // Don't accept a short copy from far away.
    double best_score = 8.25;
@ -177,7 +174,7 @@ class HashLongestMatch {
    size_t best_ix = 1;
    // Try last distance first.
    for (int i = 0; i < 16; ++i) {
-      int prev_ix = cur_ix;
+      size_t prev_ix = cur_ix;
      switch(i) {
        case 0: prev_ix -= last_distance1_; break;
        case 1: prev_ix -= last_distance2_; break;
@ -205,11 +202,13 @@ class HashLongestMatch {
      if (PREDICT_FALSE(backward > max_backward)) {
        continue;
      }
-      if (data[cur_ix + best_len] != data[prev_ix + best_len]) {
+      prev_ix &= ring_buffer_mask;
+      if (data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
-          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix], max_length);
+          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
+                                   max_length);
      if (len >= 3 || (len == 2 && i < 2)) {
        // Comparing for >= 2 does not change the semantics, but just saves for
        // a few unnecessary binary logarithms in backward reference score,
@ -234,7 +233,7 @@ class HashLongestMatch {
        }
      }
    }
-    const uint32_t key = Hash3Bytes(&data[cur_ix], kBucketBits);
+    const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits);
    const uint32_t * __restrict const bucket = &buckets_[key][0];
    const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    int stop = int(cur_ix) - 64;
@ -247,8 +246,9 @@ class HashLongestMatch {
      if (PREDICT_FALSE(backward > max_backward)) {
        break;
      }
-      if (data[cur_ix] != data[prev_ix] ||
-          data[cur_ix + 1] != data[prev_ix + 1]) {
+      prev_ix &= ring_buffer_mask;
+      if (data[cur_ix_masked] != data[prev_ix] ||
+          data[cur_ix_masked + 1] != data[prev_ix + 1]) {
        continue;
      }
      int len = 2;
@ -269,11 +269,13 @@ class HashLongestMatch {
      if (PREDICT_FALSE(backward > max_backward)) {
        break;
      }
-      if (data[cur_ix + best_len] != data[prev_ix + best_len]) {
+      prev_ix &= ring_buffer_mask;
+      if (data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
-          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix], max_length);
+          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
+                                   max_length);
      if (len >= 3) {
        // Comparing for >= 3 does not change the semantics, but just saves for
        // a few unnecessary binary logarithms in backward reference score,
@ -333,10 +335,6 @@ class HashLongestMatch {
  // Buckets containing kBlockSize of backward references.
  uint32_t buckets_[kBucketSize][kBlockSize];

-  // Model of how much the ith literal costs to encode using
-  // the entropy model.
-  float *literal_cost_;
-
  int last_distance1_;
  int last_distance2_;
  int last_distance3_;
@ -349,6 +347,8 @@ class HashLongestMatch {
  double average_cost_;
 };

+typedef HashLongestMatch<13, 11> Hasher;
+
 }  // namespace brotli

 #endif  // BROTLI_ENC_HASH_H_
--- a/enc/histogram.cc
+++ b/enc/histogram.cc
@ -31,10 +31,10 @@ void BuildHistograms(
    const BlockSplit& literal_split,
    const BlockSplit& insert_and_copy_split,
    const BlockSplit& dist_split,
-    const uint8_t* input_buffer,
+    const uint8_t* ringbuffer,
    size_t pos,
-    int context_mode,
-    int distance_context_mode,
+    size_t mask,
+    const std::vector<int>& context_modes,
    std::vector<HistogramLiteral>* literal_histograms,
    std::vector<HistogramCommand>* insert_and_copy_histograms,
    std::vector<HistogramDistance>* copy_dist_histograms) {
@ -48,25 +48,47 @@ void BuildHistograms(
        cmd.command_prefix_);
    for (int j = 0; j < cmd.insert_length_; ++j) {
      literal_it.Next();
-      uint8_t prev_byte = pos > 0 ? input_buffer[pos - 1] : 0;
-      uint8_t prev_byte2 = pos > 1 ? input_buffer[pos - 2] : 0;
-      uint8_t prev_byte3 = pos > 2 ? input_buffer[pos - 3] : 0;
-      int context = (literal_it.type_ * NumContexts(context_mode) +
-                     Context(prev_byte, prev_byte2, prev_byte3, context_mode));
-      (*literal_histograms)[context].Add(input_buffer[pos]);
+      uint8_t prev_byte = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
+      uint8_t prev_byte2 = pos > 1 ? ringbuffer[(pos - 2) & mask] : 0;
+      int context = (literal_it.type_ << kLiteralContextBits) +
+          Context(prev_byte, prev_byte2, context_modes[literal_it.type_]);
+      (*literal_histograms)[context].Add(ringbuffer[pos & mask]);
      ++pos;
    }
    pos += cmd.copy_length_;
    if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
      dist_it.Next();
-      int context = dist_it.type_;
-      if (distance_context_mode > 0) {
-        context <<= 2;
-        context += (cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2;
-      }
+      int context = (dist_it.type_ << kDistanceContextBits) +
+          ((cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2);
      (*copy_dist_histograms)[context].Add(cmd.distance_prefix_);
    }
  }
 }

+// Collects literal statistics for a single literal block type.
+//
+// Walks over all commands starting at input position pos in the ring buffer
+// (ringbuffer, mask). Every literal that belongs to a block of type
+// block_type is added to (*histograms)[context], where context is computed
+// from the two preceding bytes with context_mode. Literals of other block
+// types and the copied bytes are only skipped over.
+void BuildLiteralHistogramsForBlockType(
+    const std::vector<Command>& cmds,
+    const BlockSplit& literal_split,
+    const uint8_t* ringbuffer,
+    size_t pos,
+    size_t mask,
+    int block_type,
+    int context_mode,
+    std::vector<HistogramLiteral>* histograms) {
+  BlockSplitIterator block_it(literal_split);
+  for (size_t cmd_idx = 0; cmd_idx < cmds.size(); ++cmd_idx) {
+    const Command& command = cmds[cmd_idx];
+    // pos advances once per literal, whether or not the literal is counted.
+    for (int k = 0; k < command.insert_length_; ++k, ++pos) {
+      block_it.Next();
+      if (block_it.type_ != block_type) {
+        continue;
+      }
+      // At the very start of the input the missing history reads as zero.
+      const uint8_t last = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
+      const uint8_t second_last = pos > 1 ? ringbuffer[(pos - 2) & mask] : 0;
+      const int ctx = Context(last, second_last, context_mode);
+      HistogramLiteral& target = (*histograms)[ctx];
+      target.Add(ringbuffer[pos & mask]);
+    }
+    pos += command.copy_length_;
+  }
+}
+
 }  // namespace brotli
--- a/enc/histogram.h
+++ b/enc/histogram.h
@ -79,19 +79,32 @@ typedef Histogram<kNumCommandPrefixes> HistogramCommand;
 typedef Histogram<kNumDistancePrefixes> HistogramDistance;
 typedef Histogram<kNumBlockLenPrefixes> HistogramBlockLength;

+// Number of low bits of a histogram index that hold the context id; the
+// block type is stored above them: index = (block_type << bits) + context.
+static const int kLiteralContextBits = 6;
+static const int kDistanceContextBits = 2;
+
 void BuildHistograms(
    const std::vector<Command>& cmds,
    const BlockSplit& literal_split,
    const BlockSplit& insert_and_copy_split,
    const BlockSplit& dist_split,
-    const uint8_t* input_buffer,
+    const uint8_t* ringbuffer,
    size_t pos,
-    int context_mode,
-    int distance_context_mode,
+    size_t mask,
+    const std::vector<int>& context_modes,
    std::vector<HistogramLiteral>* literal_histograms,
    std::vector<HistogramCommand>* insert_and_copy_histograms,
    std::vector<HistogramDistance>* copy_dist_histograms);

+// Adds the literals of cmds that fall into literal blocks of type block_type
+// to *histograms, indexed by the context computed from the two preceding
+// bytes with context_mode. The input is read from the ring buffer
+// (ringbuffer, mask) starting at position pos.
+void BuildLiteralHistogramsForBlockType(
+    const std::vector<Command>& cmds,
+    const BlockSplit& literal_split,
+    const uint8_t* ringbuffer,
+    size_t pos,
+    size_t mask,
+    int block_type,
+    int context_mode,
+    std::vector<HistogramLiteral>* histograms);
+
 }  // namespace brotli

 #endif  // BROTLI_ENC_HISTOGRAM_H_
--- a/enc/literal_cost.cc
+++ b/enc/literal_cost.cc
@ -22,37 +22,39 @@

 namespace brotli {

-void EstimateBitCostsForLiterals(size_t len, const uint8_t *data, float *cost) {
+void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
+                                 const uint8_t *data, float *cost) {
+  // For every i in [0, len) this stores in cost[(pos + i) & mask] an estimate
+  // of the bits needed for the byte data[(pos + i) & mask]. The estimate comes
+  // from a byte histogram over a window that slides along with i and reaches
+  // about window_half bytes to each side, without leaving [pos, pos + len).
  int histogram[256] = { 0 };
  int window_half = 2000;
  int in_window = std::min(static_cast<size_t>(window_half), len);

  // Bootstrap histogram.
  for (int i = 0; i < in_window; ++i) {
-    ++histogram[data[i]];
+    ++histogram[data[(pos + i) & mask]];
  }

  // Compute bit costs with sliding window.
  for (int i = 0; i < len; ++i) {
    if (i - window_half >= 0) {
      // Remove a byte in the past.
-      --histogram[data[i - window_half]];
+      --histogram[data[(pos + i - window_half) & mask]];
      --in_window;
    }
    if (i + window_half < len) {
      // Add a byte in the future.
-      ++histogram[data[i + window_half]];
+      ++histogram[data[(pos + i + window_half) & mask]];
      ++in_window;
    }
-    int histo = histogram[data[i]];
+    // data and cost are addressed with the same masked index.
+    int masked_pos = (pos + i) & mask;
+    int histo = histogram[data[masked_pos]];
    if (histo == 0) {
      histo = 1;
    }
-    cost[i] = log2(static_cast<double>(in_window) / histo);
-    cost[i] += 0.03;
-    if (cost[i] < 1.0) {
-      cost[i] *= 0.5;
-      cost[i] += 0.5;
+    cost[masked_pos] = log2(static_cast<double>(in_window) / histo);
+    cost[masked_pos] += 0.03;
+    // Estimates below one bit are moved halfway towards one bit.
+    if (cost[masked_pos] < 1.0) {
+      cost[masked_pos] *= 0.5;
+      cost[masked_pos] += 0.5;
    }
  }
 }
--- a/enc/literal_cost.h
+++ b/enc/literal_cost.h
@ -22,9 +22,11 @@

 namespace brotli {

-// Input: length of data, and the bytes.
-// Output: estimate of how many bits the literal will take entropy coded.
-void EstimateBitCostsForLiterals(size_t len, const uint8_t *data, float *cost);
+// Estimates how many bits the literals in the interval [pos, pos + len) in the
+// ringbuffer (data, mask) will take entropy coded and writes these estimates
+// to the ringbuffer (cost, mask).
+void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
+                                 const uint8_t *data, float *cost);

 }  // namespace brotli

--- a/enc/ringbuffer.h
+++ b/enc/ringbuffer.h
@ -0,0 +1,89 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Sliding window over the input data.
+
+#ifndef BROTLI_ENC_RINGBUFFER_H_
+#define BROTLI_ENC_RINGBUFFER_H_
+
+// A RingBuffer(window_bits, tail_bits) contains `1 << window_bits' bytes of
+// data in a circular manner: writing a byte writes it to
+// `position() % (1 << window_bits)'. For convenience, the RingBuffer array
+// contains another copy of the first `1 << tail_bits' bytes:
+// buffer_[i] == buffer_[i + (1 << window_bits)] if i < (1 << tail_bits).
+class RingBuffer {
+ public:
+  // Allocates the window plus the tail, followed by two slack bytes that are
+  // set to zero. The window and tail contents start out uninitialized.
+  RingBuffer(int window_bits, int tail_bits)
+      : window_bits_(window_bits), tail_bits_(tail_bits), pos_(0) {
+    static const int kSlackForThreeByteHashingEverywhere = 2;
+    const int buflen = (1 << window_bits_) + (1 << tail_bits_);
+    buffer_ = new uint8_t[buflen + kSlackForThreeByteHashingEverywhere];
+    for (int i = 0; i < kSlackForThreeByteHashingEverywhere; ++i) {
+      buffer_[buflen + i] = 0;
+    }
+  }
+  ~RingBuffer() {
+    delete [] buffer_;
+  }
+
+  // Push bytes into the ring buffer and advance the position by n.
+  // The split path below handles a single wrap-around only, so n is expected
+  // to be at most `1 << window_bits'.
+  void Write(const uint8_t *bytes, size_t n) {
+    const size_t masked_pos = pos_ & (window_size() - 1);
+    // Mirror the write into the tail copy if it starts inside the first
+    // `1 << tail_bits' bytes of the window.
+    WriteTail(bytes, n);
+    if (masked_pos + n <= window_size()) {
+      // A single write fits.
+      memcpy(&buffer_[masked_pos], bytes, n);
+    } else {
+      // Split into two writes.
+      const size_t head_len = window_size() - masked_pos;
+      // Copy into the end of the buffer, including the tail buffer.
+      memcpy(&buffer_[masked_pos], bytes,
+             std::min(n, head_len + tail_size()));
+      // Copy the wrapped-around part into the beginning of the buffer.
+      memcpy(&buffer_[0], bytes + head_len, n - head_len);
+    }
+    pos_ += n;
+  }
+
+  // Logical cursor position in the ring buffer: total bytes written so far.
+  size_t position() const { return pos_; }
+
+  // Pointer to the first byte of the underlying array.
+  uint8_t *start() { return &buffer_[0]; }
+  const uint8_t *start() const { return &buffer_[0]; }
+
+ private:
+  // Not copyable: a copy would share buffer_ and delete it twice.
+  // Declared but intentionally left undefined.
+  RingBuffer(const RingBuffer&);
+  void operator=(const RingBuffer&);
+
+  // Sizes as size_t so that they combine with positions without mixing
+  // signed and unsigned operands.
+  size_t window_size() const { return static_cast<size_t>(1) << window_bits_; }
+  size_t tail_size() const { return static_cast<size_t>(1) << tail_bits_; }
+
+  // If the write starts within the first `1 << tail_bits' bytes of the window,
+  // copies the bytes that fall into that range to the same offset in the tail.
+  void WriteTail(const uint8_t *bytes, size_t n) {
+    const size_t masked_pos = pos_ & (window_size() - 1);
+    if (masked_pos < tail_size()) {
+      // Just fill the tail buffer with the beginning data.
+      const size_t p = window_size() + masked_pos;
+      memcpy(&buffer_[p], bytes, std::min(n, tail_size() - masked_pos));
+    }
+  }
+
+  // Size of the ringbuffer is (1 << window_bits) + (1 << tail_bits).
+  const int window_bits_;
+  const int tail_bits_;
+
+  // Position to write in the ring buffer.
+  size_t pos_;
+  // The actual ring buffer containing the data and the copy of the beginning
+  // as a tail.
+  uint8_t *buffer_;
+};
+
+#endif  // BROTLI_ENC_RINGBUFFER_H_