From b43df8f699484501e0972703e7d453b817de702d Mon Sep 17 00:00:00 2001 From: Zoltan Szabadka Date: Fri, 12 Jun 2015 15:43:54 +0200 Subject: [PATCH] Brotli custom LZ77 dictionary support. Adds functions to prepend such a dictionary to the encoder and decoder, and adjusts their internal parameters so that they behave as if the dictionary were an earlier part of the input. This dictionary is just a prefilled LZ77 window; it is not related to the built-in transformable brotli dictionary. --- dec/bit_reader.h | 13 ------------- dec/decode.c | 28 +++++++++++++++++++++++++--- dec/decode.h | 13 +++++++++++++ dec/state.c | 3 +++ dec/state.h | 4 ++++ enc/encode.cc | 18 ++++++++++++++++++ enc/encode.h | 13 +++++++++++++ enc/hash.h | 25 +++++++++++++++++++++++++ 8 files changed, 101 insertions(+), 16 deletions(-) diff --git a/dec/bit_reader.h b/dec/bit_reader.h index 4e3b7fb..eacf16b 100644 --- a/dec/bit_reader.h +++ b/dec/bit_reader.h @@ -97,19 +97,6 @@ static BROTLI_INLINE uint32_t BrotliPrefetchBits(BrotliBitReader* const br) { return (uint32_t)(br->val_ >> br->bit_pos_); } -/* For jumping over a number of bits in the bit stream when accessed with */ -/* BrotliPrefetchBits and BrotliFillBitWindow. */ -static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br, - uint32_t val) { -#ifdef BROTLI_DECODE_DEBUG - uint32_t n_bits = val - br->bit_pos_; - const uint32_t bval = (uint32_t)(br->val_ >> br->bit_pos_) & BitMask(n_bits); - printf("[BrotliReadBits] %010d %2d val: %6x\n", - (br->pos_ << 3) + br->bit_pos_ - 64, n_bits, bval); -#endif - br->bit_pos_ = val; -} - /* * Reload up to 32 bits byte-by-byte. * This function works on both little and big endian. 
diff --git a/dec/decode.c b/dec/decode.c index dbeaf0f..9dd0869 100644 --- a/dec/decode.c +++ b/dec/decode.c @@ -1101,11 +1101,16 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output, if (BrotliDecompressedSize(BROTLI_READ_SIZE, br->buf_, &known_size) == BROTLI_RESULT_SUCCESS) { while (s->ringbuffer_size >= known_size * 2 - && s->ringbuffer_size > 0) { + && s->ringbuffer_size > 1) { s->ringbuffer_size /= 2; } } + /* But make it fit the custom dictionary if there is one. */ + while (s->ringbuffer_size < s->custom_dict_size) { + s->ringbuffer_size *= 2; + } + s->ringbuffer_mask = s->ringbuffer_size - 1; s->ringbuffer = (uint8_t*)malloc((size_t)(s->ringbuffer_size + kRingBufferWriteAheadSlack + @@ -1115,6 +1120,17 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output, break; } s->ringbuffer_end = s->ringbuffer + s->ringbuffer_size; + + if (s->custom_dict) { + memcpy(&s->ringbuffer[(-s->custom_dict_size) & s->ringbuffer_mask], + s->custom_dict, (size_t)s->custom_dict_size); + if (s->custom_dict_size > 0) { + s->prev_byte1 = s->custom_dict[s->custom_dict_size - 1]; + } + if (s->custom_dict_size > 1) { + s->prev_byte2 = s->custom_dict[s->custom_dict_size - 2]; + } + } } if (s->is_metadata) { @@ -1455,9 +1471,9 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output, } BROTLI_LOG_UINT(s->distance); - if (pos < s->max_backward_distance && + if (pos + s->custom_dict_size < s->max_backward_distance && s->max_distance != s->max_backward_distance) { - s->max_distance = pos; + s->max_distance = pos + s->custom_dict_size; } else { s->max_distance = s->max_backward_distance; } @@ -1702,6 +1718,12 @@ BrotliResult BrotliDecompressStreaming(BrotliInput input, BrotliOutput output, return result; } +void BrotliSetCustomDictionary( + size_t size, const uint8_t* dict, BrotliState* s) { + s->custom_dict = dict; + s->custom_dict_size = (int) size; +} + #if defined(__cplusplus) || defined(c_plusplus) } /* 
extern "C" */ #endif diff --git a/dec/decode.h b/dec/decode.h index 834a7b5..e09de97 100644 --- a/dec/decode.h +++ b/dec/decode.h @@ -123,6 +123,19 @@ BrotliResult BrotliDecompressBufferStreaming(size_t* available_in, size_t* total_out, BrotliState* s); +/* Fills the new state with a dictionary for LZ77, warming up the ringbuffer, + e.g. for custom static dictionaries for data formats. + Not to be confused with the built-in transformable dictionary of Brotli. + The dictionary must exist in memory until decoding is done and is owned by + the caller. To use: + -initialize state with BrotliStateInit + -use BrotliSetCustomDictionary + -use BrotliDecompressBufferStreaming + -clean up with BrotliStateCleanup +*/ +void BrotliSetCustomDictionary( + size_t size, const uint8_t* dict, BrotliState* s); + #if defined(__cplusplus) || defined(c_plusplus) } /* extern "C" */ #endif diff --git a/dec/state.c b/dec/state.c index 7ad64be..c45fdc4 100644 --- a/dec/state.c +++ b/dec/state.c @@ -46,6 +46,9 @@ void BrotliStateInit(BrotliState* s) { s->code_lengths = NULL; s->context_map_table = NULL; + + s->custom_dict = NULL; + s->custom_dict_size = 0; } void BrotliStateCleanup(BrotliState* s) { diff --git a/dec/state.h b/dec/state.h index e5d6b5a..9bd9f74 100644 --- a/dec/state.h +++ b/dec/state.h @@ -167,6 +167,10 @@ typedef struct { int context_index; int max_run_length_prefix; HuffmanCode* context_map_table; + + /* For custom dictionaries */ + const uint8_t* custom_dict; + int custom_dict_size; } BrotliState; void BrotliStateInit(BrotliState* s); diff --git a/enc/encode.cc b/enc/encode.cc index a595365..fa62c1b 100644 --- a/enc/encode.cc +++ b/enc/encode.cc @@ -282,6 +282,17 @@ void BrotliCompressor::CopyInputToRingBuffer(const size_t input_size, } } +void BrotliCompressor::BrotliSetCustomDictionary( + const size_t size, const uint8_t* dict) { + CopyInputToRingBuffer(size, dict); + last_flush_pos_ = size; + last_processed_pos_ = size; + if (size > 0) prev_byte_ = dict[size - 1]; + if 
(size > 1) prev_byte2_ = dict[size - 2]; + + hashers_->PrependCustomDictionary(hash_type_, size, dict); +} + bool BrotliCompressor::WriteBrotliData(const bool is_last, const bool force_flush, size_t* out_size, @@ -641,11 +652,18 @@ bool BrotliInIsFinished(BrotliIn* r) { } int BrotliCompress(BrotliParams params, BrotliIn* in, BrotliOut* out) { + return BrotliCompressWithCustomDictionary(0, nullptr, params, in, out); +} + +int BrotliCompressWithCustomDictionary(size_t dictsize, const uint8_t* dict, + BrotliParams params, + BrotliIn* in, BrotliOut* out) { size_t in_bytes = 0; size_t out_bytes = 0; uint8_t* output; bool final_block = false; BrotliCompressor compressor(params); + if (dictsize != 0) compressor.BrotliSetCustomDictionary(dictsize, dict); while (!final_block) { in_bytes = CopyOneBlockToRingBuffer(in, &compressor); final_block = in_bytes == 0 || BrotliInIsFinished(in); diff --git a/enc/encode.h b/enc/encode.h index d52c0c1..5d4d550 100644 --- a/enc/encode.h +++ b/enc/encode.h @@ -128,6 +128,13 @@ class BrotliCompressor { bool WriteBrotliData(const bool is_last, const bool force_flush, size_t* out_size, uint8_t** output); + // Fills the new state with a dictionary for LZ77, warming up the ringbuffer, + // e.g. for custom static dictionaries for data formats. + // Not to be confused with the built-in transformable dictionary of Brotli. + // To decode, use BrotliSetCustomDictionary of the decoder with the same + // dictionary. + void BrotliSetCustomDictionary(size_t size, const uint8_t* dict); + // No-op, but we keep it here for API backward-compatibility. void WriteStreamHeader() {} @@ -180,6 +187,12 @@ int BrotliCompressBuffer(BrotliParams params, // of reading from and writing to pre-allocated memory buffers. int BrotliCompress(BrotliParams params, BrotliIn* in, BrotliOut* out); +// Before compressing the data, sets a custom LZ77 dictionary with +// BrotliCompressor::BrotliSetCustomDictionary. 
+int BrotliCompressWithCustomDictionary(size_t dictsize, const uint8_t* dict, + BrotliParams params, + BrotliIn* in, BrotliOut* out); + } // namespace brotli #endif // BROTLI_ENC_ENCODE_H_ diff --git a/enc/hash.h b/enc/hash.h index 6df709a..724aa9c 100644 --- a/enc/hash.h +++ b/enc/hash.h @@ -612,6 +612,31 @@ struct Hashers { if (hash_h10.get() != NULL) hash_h10->SetStaticDictionary(dict); } + template + void WarmupHash(const size_t size, const uint8_t* dict, Hasher* hasher) { + for (size_t i = 0; i < size; i++) { + hasher->Store(dict, i); + } + } + + // Custom LZ77 window. + void PrependCustomDictionary( + int type, const size_t size, const uint8_t* dict) { + switch (type) { + case 1: WarmupHash(size, dict, hash_h1.get()); break; + case 2: WarmupHash(size, dict, hash_h2.get()); break; + case 3: WarmupHash(size, dict, hash_h3.get()); break; + case 4: WarmupHash(size, dict, hash_h4.get()); break; + case 5: WarmupHash(size, dict, hash_h5.get()); break; + case 6: WarmupHash(size, dict, hash_h6.get()); break; + case 7: WarmupHash(size, dict, hash_h7.get()); break; + case 8: WarmupHash(size, dict, hash_h8.get()); break; + case 9: WarmupHash(size, dict, hash_h9.get()); break; + case 10: WarmupHash(size, dict, hash_h10.get()); break; + default: break; + } + } + std::unique_ptr

hash_h1; std::unique_ptr

hash_h2; std::unique_ptr

hash_h3;