From 19d86fb9a60aa7034d4981b69a5b656f5b90017e Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Wed, 4 Aug 2021 14:42:02 +0200 Subject: [PATCH] Merge-in SharedDictionary feature (#916) Co-authored-by: Eugene Kliuchnikov --- c/common/shared_dictionary.c | 515 +++++++++++++++ c/common/shared_dictionary_internal.h | 74 +++ c/dec/decode.c | 194 +++++- c/dec/state.c | 10 +- c/dec/state.h | 19 +- c/enc/backward_references.c | 61 +- c/enc/backward_references_hq.c | 98 ++- c/enc/backward_references_inc.h | 36 +- c/enc/compound_dictionary.c | 200 ++++++ c/enc/compound_dictionary.h | 60 ++ c/enc/encode.c | 163 ++++- c/enc/encoder_dict.c | 597 +++++++++++++++++- c/enc/encoder_dict.h | 113 +++- c/enc/hash.h | 200 ++++++ c/enc/params.h | 3 +- c/enc/static_dict.c | 56 +- c/include/brotli/decode.h | 26 +- c/include/brotli/encode.h | 46 ++ c/include/brotli/shared_dictionary.h | 97 +++ c/tools/brotli.c | 97 ++- c/tools/brotli.md | 3 + java/org/brotli/wrapper/dec/BUILD | 14 + .../wrapper/dec/BrotliDecoderChannel.java | 5 + .../brotli/wrapper/dec/BrotliInputStream.java | 5 + .../brotli/wrapper/dec/CornerCasesTest.java | 33 + java/org/brotli/wrapper/dec/Decoder.java | 6 + java/org/brotli/wrapper/dec/DecoderJNI.java | 14 + java/org/brotli/wrapper/dec/decoder_jni.cc | 50 ++ java/org/brotli/wrapper/enc/BUILD | 7 + .../wrapper/enc/BrotliEncoderChannel.java | 6 + .../wrapper/enc/BrotliOutputStream.java | 5 + java/org/brotli/wrapper/enc/Encoder.java | 23 + java/org/brotli/wrapper/enc/EncoderJNI.java | 58 ++ .../enc/UseCompoundDictionaryTest.java | 114 ++++ java/org/brotli/wrapper/enc/encoder_jni.cc | 87 +++ scripts/sources.lst | 5 + setup.py | 4 + 37 files changed, 3066 insertions(+), 38 deletions(-) create mode 100644 c/common/shared_dictionary.c create mode 100644 c/common/shared_dictionary_internal.h create mode 100644 c/enc/compound_dictionary.c create mode 100644 c/enc/compound_dictionary.h create mode 100644 c/include/brotli/shared_dictionary.h create mode 100644 java/org/brotli/wrapper/dec/CornerCasesTest.java create mode 100644 java/org/brotli/wrapper/enc/UseCompoundDictionaryTest.java diff --git a/c/common/shared_dictionary.c b/c/common/shared_dictionary.c new file mode 100644 index 0000000..c7d05c5 --- /dev/null +++ b/c/common/shared_dictionary.c @@ -0,0 +1,515 @@ +/* Copyright 2017 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/* Shared Dictionary definition and utilities. */ + +#include + +#include +#include /* malloc, free */ +#include + +#include "./dictionary.h" +#include "./platform.h" +#include "./shared_dictionary_internal.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#define BROTLI_NUM_ENCODED_LENGTHS (SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH \ + - SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH + 1) + +/* Max allowed by spec */ +#define BROTLI_MAX_SIZE_BITS 15u + +/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */ +static BROTLI_BOOL ReadBool(const uint8_t* encoded, size_t size, size_t* pos, + BROTLI_BOOL* result) { + uint8_t value; + size_t position = *pos; + if (position >= size) return BROTLI_FALSE; /* past file end */ + value = encoded[position++]; + if (value > 1) return BROTLI_FALSE; /* invalid bool */ + *result = TO_BROTLI_BOOL(value); + *pos = position; + return BROTLI_TRUE; /* success */ +} + +/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */ +static BROTLI_BOOL ReadUint8(const uint8_t* encoded, size_t size, size_t* pos, + uint8_t* result) { + size_t position = *pos; + if (position + sizeof(uint8_t) > size) return BROTLI_FALSE; + *result = encoded[position++]; + *pos = position; + return BROTLI_TRUE; +} + +/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */ +static BROTLI_BOOL ReadUint16(const uint8_t* encoded, size_t size, size_t* pos, + uint16_t* result) { + size_t position = *pos; + if (position + sizeof(uint16_t) > size) return BROTLI_FALSE; + *result = BROTLI_UNALIGNED_LOAD16LE(&encoded[position]); + position += 2; + *pos = position; + return BROTLI_TRUE; +} + +/* Reads a varint into a uint32_t, and returns error if it's too large */ +/* Returns BROTLI_TRUE on success, BROTLI_FALSE on failure. */ +static BROTLI_BOOL ReadVarint32(const uint8_t* encoded, size_t size, + size_t* pos, uint32_t* result) { + int num = 0; + uint8_t byte; + *result = 0; + for (;;) { + if (*pos >= size) return BROTLI_FALSE; + byte = encoded[(*pos)++]; + if (num == 4 && byte > 15) return BROTLI_FALSE; + *result |= (uint32_t)(byte & 127) << (num * 7); + if (byte < 128) return BROTLI_TRUE; + num++; + } +} + +/* Returns the total length of word list. */ +static size_t BrotliSizeBitsToOffsets(const uint8_t* size_bits_by_length, + uint32_t* offsets_by_length) { + uint32_t pos = 0; + uint32_t i; + for (i = 0; i <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH; i++) { + offsets_by_length[i] = pos; + if (size_bits_by_length[i] != 0) { + pos += i << size_bits_by_length[i]; + } + } + return pos; +} + +static BROTLI_BOOL ParseWordList(size_t size, const uint8_t* encoded, + size_t* pos, BrotliDictionary* out) { + size_t offset; + size_t i; + size_t position = *pos; + if (position + BROTLI_NUM_ENCODED_LENGTHS > size) { + return BROTLI_FALSE; + } + + memset(out->size_bits_by_length, 0, SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH); + memcpy(out->size_bits_by_length + SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH, + &encoded[position], BROTLI_NUM_ENCODED_LENGTHS); + for (i = SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH; + i <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH; i++) { + if (out->size_bits_by_length[i] > BROTLI_MAX_SIZE_BITS) { + return BROTLI_FALSE; + } + } + position += BROTLI_NUM_ENCODED_LENGTHS; + offset = BrotliSizeBitsToOffsets( + out->size_bits_by_length, out->offsets_by_length); + + out->data = &encoded[position]; + out->data_size = offset; + position += offset; + if (position > size) return BROTLI_FALSE; + *pos = position; + return BROTLI_TRUE; +} + +/* Computes the cutOffTransforms of a BrotliTransforms which already has the + transforms data correctly filled in. */ +static void ComputeCutoffTransforms(BrotliTransforms* transforms) { + uint32_t i; + for (i = 0; i < BROTLI_TRANSFORMS_MAX_CUT_OFF + 1; i++) { + transforms->cutOffTransforms[i] = -1; + } + for (i = 0; i < transforms->num_transforms; i++) { + const uint8_t* prefix = BROTLI_TRANSFORM_PREFIX(transforms, i); + uint8_t type = BROTLI_TRANSFORM_TYPE(transforms, i); + const uint8_t* suffix = BROTLI_TRANSFORM_SUFFIX(transforms, i); + if (type <= BROTLI_TRANSFORM_OMIT_LAST_9 && *prefix == 0 && *suffix == 0 && + transforms->cutOffTransforms[type] == -1) { + transforms->cutOffTransforms[type] = (int16_t)i; + } + } +} + +static BROTLI_BOOL ParsePrefixSuffixTable(size_t size, const uint8_t* encoded, + size_t* pos, BrotliTransforms* out, uint16_t* out_table, + size_t* out_table_size) { + size_t position = *pos; + size_t offset = 0; + size_t stringlet_count = 0; /* NUM_PREFIX_SUFFIX */ + size_t data_length = 0; + + /* PREFIX_SUFFIX_LENGTH */ + if (!ReadUint16(encoded, size, &position, &out->prefix_suffix_size)) { + return BROTLI_FALSE; + } + data_length = out->prefix_suffix_size; + + /* Must at least have space for null terminator. */ + if (data_length < 1) return BROTLI_FALSE; + out->prefix_suffix = &encoded[position]; + if (position + data_length >= size) return BROTLI_FALSE; + while (BROTLI_TRUE) { + /* STRING_LENGTH */ + size_t stringlet_len = encoded[position + offset]; + out_table[stringlet_count] = (uint16_t)offset; + stringlet_count++; + offset++; + if (stringlet_len == 0) { + if (offset == data_length) { + break; + } else { + return BROTLI_FALSE; + } + } + if (stringlet_count > 255) return BROTLI_FALSE; + offset += stringlet_len; + if (offset >= data_length) return BROTLI_FALSE; + } + + position += data_length; + *pos = position; + *out_table_size = (uint16_t)stringlet_count; + return BROTLI_TRUE; +} + +static BROTLI_BOOL ParseTransformsList(size_t size, const uint8_t* encoded, + size_t* pos, BrotliTransforms* out, uint16_t* prefix_suffix_table, + size_t* prefix_suffix_count) { + uint32_t i; + BROTLI_BOOL has_params = BROTLI_FALSE; + BROTLI_BOOL prefix_suffix_ok = BROTLI_FALSE; + size_t position = *pos; + size_t stringlet_cnt = 0; + if (position >= size) return BROTLI_FALSE; + + prefix_suffix_ok = ParsePrefixSuffixTable( + size, encoded, &position, out, prefix_suffix_table, &stringlet_cnt); + if (!prefix_suffix_ok) return BROTLI_FALSE; + out->prefix_suffix_map = prefix_suffix_table; + *prefix_suffix_count = stringlet_cnt; + + out->num_transforms = encoded[position++]; + out->transforms = &encoded[position]; + position += (size_t)out->num_transforms * 3; + if (position > size) return BROTLI_FALSE; + /* Check for errors and read extra parameters. */ + for (i = 0; i < out->num_transforms; i++) { + uint8_t prefix_id = BROTLI_TRANSFORM_PREFIX_ID(out, i); + uint8_t type = BROTLI_TRANSFORM_TYPE(out, i); + uint8_t suffix_id = BROTLI_TRANSFORM_SUFFIX_ID(out, i); + if (prefix_id >= stringlet_cnt) return BROTLI_FALSE; + if (type >= BROTLI_NUM_TRANSFORM_TYPES) return BROTLI_FALSE; + if (suffix_id >= stringlet_cnt) return BROTLI_FALSE; + if (type == BROTLI_TRANSFORM_SHIFT_FIRST || + type == BROTLI_TRANSFORM_SHIFT_ALL) { + has_params = BROTLI_TRUE; + } + } + if (has_params) { + out->params = &encoded[position]; + position += (size_t)out->num_transforms * 2; + if (position > size) return BROTLI_FALSE; + for (i = 0; i < out->num_transforms; i++) { + uint8_t type = BROTLI_TRANSFORM_TYPE(out, i); + if (type != BROTLI_TRANSFORM_SHIFT_FIRST && + type != BROTLI_TRANSFORM_SHIFT_ALL) { + if (out->params[i * 2] != 0 || out->params[i * 2 + 1] != 0) { + return BROTLI_FALSE; + } + } + } + } else { + out->params = NULL; + } + ComputeCutoffTransforms(out); + *pos = position; + return BROTLI_TRUE; +} + +static BROTLI_BOOL DryParseDictionary(const uint8_t* encoded, + size_t size, uint32_t* num_prefix, BROTLI_BOOL* is_custom_static_dict) { + size_t pos = 0; + uint32_t chunk_size = 0; + uint8_t num_word_lists; + uint8_t num_transform_lists; + *is_custom_static_dict = BROTLI_FALSE; + *num_prefix = 0; + + /* Skip magic header bytes. */ + pos += 2; + + /* LZ77_DICTIONARY_LENGTH */ + if (!ReadVarint32(encoded, size, &pos, &chunk_size)) return BROTLI_FALSE; + if (chunk_size != 0) { + /* This limitation is not specified but the 32-bit Brotli decoder for now */ + if (chunk_size > 1073741823) return BROTLI_FALSE; + *num_prefix = 1; + if (pos + chunk_size > size) return BROTLI_FALSE; + pos += chunk_size; + } + + if (!ReadUint8(encoded, size, &pos, &num_word_lists)) { + return BROTLI_FALSE; + } + if (!ReadUint8(encoded, size, &pos, &num_transform_lists)) { + return BROTLI_FALSE; + } + + if (num_word_lists > 0 || num_transform_lists > 0) { + *is_custom_static_dict = BROTLI_TRUE; + } + + return BROTLI_TRUE; +} + +static BROTLI_BOOL ParseDictionary(const uint8_t* encoded, size_t size, + BrotliSharedDictionary* dict) { + uint32_t i; + size_t pos = 0; + uint32_t chunk_size = 0; + size_t total_prefix_suffix_count = 0; + size_t trasform_list_start[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS]; + uint16_t temporary_prefix_suffix_table[256]; + + /* Skip magic header bytes. */ + pos += 2; + + /* LZ77_DICTIONARY_LENGTH */ + if (!ReadVarint32(encoded, size, &pos, &chunk_size)) return BROTLI_FALSE; + if (chunk_size != 0) { + if (pos + chunk_size > size) return BROTLI_FALSE; + dict->prefix_size[dict->num_prefix] = chunk_size; + dict->prefix[dict->num_prefix] = &encoded[pos]; + dict->num_prefix++; + /* LZ77_DICTIONARY_LENGTH bytes. */ + pos += chunk_size; + } + + /* NUM_WORD_LISTS */ + if (!ReadUint8(encoded, size, &pos, &dict->num_word_lists)) { + return BROTLI_FALSE; + } + if (dict->num_word_lists > SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS) { + return BROTLI_FALSE; + } + + if (dict->num_word_lists != 0) { + dict->words_instances = (BrotliDictionary*)dict->alloc_func( + dict->memory_manager_opaque, + dict->num_word_lists * sizeof(*dict->words_instances)); + if (!dict->words_instances) return BROTLI_FALSE; /* OOM */ + } + for (i = 0; i < dict->num_word_lists; i++) { + if (!ParseWordList(size, encoded, &pos, &dict->words_instances[i])) { + return BROTLI_FALSE; + } + } + + /* NUM_TRANSFORM_LISTS */ + if (!ReadUint8(encoded, size, &pos, &dict->num_transform_lists)) { + return BROTLI_FALSE; + } + if (dict->num_transform_lists > SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS) { + return BROTLI_FALSE; + } + + if (dict->num_transform_lists != 0) { + dict->transforms_instances = (BrotliTransforms*)dict->alloc_func( + dict->memory_manager_opaque, + dict->num_transform_lists * sizeof(*dict->transforms_instances)); + if (!dict->transforms_instances) return BROTLI_FALSE; /* OOM */ + } + for (i = 0; i < dict->num_transform_lists; i++) { + BROTLI_BOOL ok = BROTLI_FALSE; + size_t prefix_suffix_count = 0; + trasform_list_start[i] = pos; + dict->transforms_instances[i].prefix_suffix_map = + temporary_prefix_suffix_table; + ok = ParseTransformsList( + size, encoded, &pos, &dict->transforms_instances[i], + temporary_prefix_suffix_table, &prefix_suffix_count); + if (!ok) return BROTLI_FALSE; + total_prefix_suffix_count += prefix_suffix_count; + } + if (total_prefix_suffix_count != 0) { + dict->prefix_suffix_maps = (uint16_t*)dict->alloc_func( + dict->memory_manager_opaque, + total_prefix_suffix_count * sizeof(*dict->prefix_suffix_maps)); + if (!dict->prefix_suffix_maps) return BROTLI_FALSE; /* OOM */ + } + total_prefix_suffix_count = 0; + for (i = 0; i < dict->num_transform_lists; i++) { + size_t prefix_suffix_count = 0; + size_t position = trasform_list_start[i]; + uint16_t* prefix_suffix_map = + &dict->prefix_suffix_maps[total_prefix_suffix_count]; + BROTLI_BOOL ok = ParsePrefixSuffixTable( + size, encoded, &position, &dict->transforms_instances[i], + prefix_suffix_map, &prefix_suffix_count); + if (!ok) return BROTLI_FALSE; + dict->transforms_instances[i].prefix_suffix_map = prefix_suffix_map; + total_prefix_suffix_count += prefix_suffix_count; + } + + if (dict->num_word_lists != 0 || dict->num_transform_lists != 0) { + if (!ReadUint8(encoded, size, &pos, &dict->num_dictionaries)) { + return BROTLI_FALSE; + } + if (dict->num_dictionaries == 0 || + dict->num_dictionaries > SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS) { + return BROTLI_FALSE; + } + for (i = 0; i < dict->num_dictionaries; i++) { + uint8_t words_index; + uint8_t transforms_index; + if (!ReadUint8(encoded, size, &pos, &words_index)) { + return BROTLI_FALSE; + } + if (words_index > dict->num_word_lists) return BROTLI_FALSE; + if (!ReadUint8(encoded, size, &pos, &transforms_index)) { + return BROTLI_FALSE; + } + if (transforms_index > dict->num_transform_lists) return BROTLI_FALSE; + dict->words[i] = words_index == dict->num_word_lists ? + BrotliGetDictionary() : &dict->words_instances[words_index]; + dict->transforms[i] = transforms_index == dict->num_transform_lists ? + BrotliGetTransforms(): &dict->transforms_instances[transforms_index]; + } + /* CONTEXT_ENABLED */ + if (!ReadBool(encoded, size, &pos, &dict->context_based)) { + return BROTLI_FALSE; + } + + /* CONTEXT_MAP */ + if (dict->context_based) { + for (i = 0; i < SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS; i++) { + if (!ReadUint8(encoded, size, &pos, &dict->context_map[i])) { + return BROTLI_FALSE; + } + if (dict->context_map[i] >= dict->num_dictionaries) { + return BROTLI_FALSE; + } + } + } + } else { + dict->context_based = BROTLI_FALSE; + dict->num_dictionaries = 1; + dict->words[0] = BrotliGetDictionary(); + dict->transforms[0] = BrotliGetTransforms(); + } + + return BROTLI_TRUE; +} + +/* Decodes shared dictionary and verifies correctness. + Returns BROTLI_TRUE if dictionary is valid, BROTLI_FALSE otherwise. + The BrotliSharedDictionary must already have been initialized. If the + BrotliSharedDictionary already contains data, compound dictionaries + will be appended, but an error will be returned if it already has + custom words or transforms. + TODO: link to RFC for shared brotli once published. */ +static BROTLI_BOOL DecodeSharedDictionary( + const uint8_t* encoded, size_t size, BrotliSharedDictionary* dict) { + uint32_t num_prefix = 0; + BROTLI_BOOL is_custom_static_dict = BROTLI_FALSE; + BROTLI_BOOL has_custom_static_dict = + dict->num_word_lists > 0 || dict->num_transform_lists > 0; + + /* Check magic header bytes. */ + if (size < 2) return BROTLI_FALSE; + if (encoded[0] != 0x91 || encoded[1] != 0) return BROTLI_FALSE; + + if (!DryParseDictionary(encoded, size, &num_prefix, &is_custom_static_dict)) { + return BROTLI_FALSE; + } + + if (num_prefix + dict->num_prefix > SHARED_BROTLI_MAX_COMPOUND_DICTS) { + return BROTLI_FALSE; + } + + /* Cannot combine different static dictionaries, only prefix dictionaries */ + if (has_custom_static_dict && is_custom_static_dict) return BROTLI_FALSE; + + return ParseDictionary(encoded, size, dict); +} + +void BrotliSharedDictionaryDestroyInstance( + BrotliSharedDictionary* dict) { + if (!dict) { + return; + } else { + brotli_free_func free_func = dict->free_func; + void* opaque = dict->memory_manager_opaque; + /* Cleanup. */ + free_func(opaque, dict->words_instances); + free_func(opaque, dict->transforms_instances); + free_func(opaque, dict->prefix_suffix_maps); + /* Self-destruction. */ + free_func(opaque, dict); + } +} + +BROTLI_BOOL BrotliSharedDictionaryAttach( + BrotliSharedDictionary* dict, BrotliSharedDictionaryType type, + size_t data_size, const uint8_t* data) { + if (!dict) { + return BROTLI_FALSE; + } + if (type == BROTLI_SHARED_DICTIONARY_SERIALIZED) { + return DecodeSharedDictionary(data, data_size, dict); + } else if (type == BROTLI_SHARED_DICTIONARY_RAW) { + if (dict->num_prefix >= SHARED_BROTLI_MAX_COMPOUND_DICTS) { + return BROTLI_FALSE; + } + dict->prefix_size[dict->num_prefix] = data_size; + dict->prefix[dict->num_prefix] = data; + dict->num_prefix++; + return BROTLI_TRUE; + } else { + return BROTLI_FALSE; + } +} + +BrotliSharedDictionary* BrotliSharedDictionaryCreateInstance( + brotli_alloc_func alloc_func, brotli_free_func free_func, void* opaque) { + BrotliSharedDictionary* dict = 0; + if (!alloc_func && !free_func) { + dict = (BrotliSharedDictionary*)malloc(sizeof(BrotliSharedDictionary)); + } else if (alloc_func && free_func) { + dict = (BrotliSharedDictionary*)alloc_func( + opaque, sizeof(BrotliSharedDictionary)); + } + if (dict == 0) { + return 0; + } + + /* TODO: explicitly initialize all the fields? */ + memset(dict, 0, sizeof(BrotliSharedDictionary)); + + dict->context_based = BROTLI_FALSE; + dict->num_dictionaries = 1; + dict->num_word_lists = 0; + dict->num_transform_lists = 0; + + dict->words[0] = BrotliGetDictionary(); + dict->transforms[0] = BrotliGetTransforms(); + + dict->alloc_func = alloc_func ? alloc_func : BrotliDefaultAllocFunc; + dict->free_func = free_func ? free_func : BrotliDefaultFreeFunc; + dict->memory_manager_opaque = opaque; + + return dict; +} + +#if defined(__cplusplus) || defined(c_plusplus) +} /* extern "C" */ +#endif diff --git a/c/common/shared_dictionary_internal.h b/c/common/shared_dictionary_internal.h new file mode 100644 index 0000000..a2b76ee --- /dev/null +++ b/c/common/shared_dictionary_internal.h @@ -0,0 +1,74 @@ +/* Copyright 2017 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/* (Transparent) Shared Dictionary definition. */ + +#ifndef BROTLI_COMMON_SHARED_DICTIONARY_INTERNAL_H_ +#define BROTLI_COMMON_SHARED_DICTIONARY_INTERNAL_H_ + +#include "./dictionary.h" +#include +#include "./transform.h" +#include + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +struct BrotliSharedDictionaryStruct { + /* LZ77 prefixes (compound dictionary). */ + uint32_t num_prefix; /* max SHARED_BROTLI_MAX_COMPOUND_DICTS */ + size_t prefix_size[SHARED_BROTLI_MAX_COMPOUND_DICTS]; + const uint8_t* prefix[SHARED_BROTLI_MAX_COMPOUND_DICTS]; + + /* If set, the context map is used to select word and transform list from 64 + contexts, if not set, the context map is not used and only words[0] and + transforms[0] are to be used. */ + BROTLI_BOOL context_based; + + uint8_t context_map[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS]; + + /* Amount of word_list+transform_list combinations. */ + uint8_t num_dictionaries; + + /* Must use num_dictionaries values. */ + const BrotliDictionary* words[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS]; + + /* Must use num_dictionaries values. */ + const BrotliTransforms* transforms[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS]; + + /* Amount of custom word lists. May be 0 if only Brotli's built-in is used */ + uint8_t num_word_lists; + + /* Contents of the custom words lists. Must be NULL if num_word_lists is 0. */ + BrotliDictionary* words_instances; + + /* Amount of custom transform lists. May be 0 if only Brotli's built-in is + used */ + uint8_t num_transform_lists; + + /* Contents of the custom transform lists. Must be NULL if num_transform_lists + is 0. */ + BrotliTransforms* transforms_instances; + + /* Concatenated prefix_suffix_maps of the custom transform lists. Must be NULL + if num_transform_lists is 0. */ + uint16_t* prefix_suffix_maps; + + /* Memory management */ + brotli_alloc_func alloc_func; + brotli_free_func free_func; + void* memory_manager_opaque; +}; + +typedef struct BrotliSharedDictionaryStruct BrotliSharedDictionaryInternal; +#define BrotliSharedDictionary BrotliSharedDictionaryInternal + +#if defined(__cplusplus) || defined(c_plusplus) +} /* extern "C" */ +#endif + +#endif /* BROTLI_COMMON_SHARED_DICTIONARY_INTERNAL_H_ */ diff --git a/c/dec/decode.c b/c/dec/decode.c index 7eee968..12cd502 100644 --- a/c/dec/decode.c +++ b/c/dec/decode.c @@ -13,6 +13,7 @@ #include "../common/context.h" #include "../common/dictionary.h" #include "../common/platform.h" +#include "../common/shared_dictionary_internal.h" #include "../common/transform.h" #include "../common/version.h" #include "./bit_reader.h" @@ -42,8 +43,8 @@ extern "C" { /* We need the slack region for the following reasons: - doing up to two 16-byte copies for fast backward copying - inserting transformed dictionary word: - 5 prefix + 24 base + 8 suffix */ -static const uint32_t kRingBufferWriteAheadSlack = 42; + 255 prefix + 32 base + 255 suffix */ +static const uint32_t kRingBufferWriteAheadSlack = 542; static const uint8_t kCodeLengthCodeOrder[BROTLI_CODE_LENGTH_CODES] = { 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -1403,6 +1404,114 @@ static BrotliDecoderErrorCode BROTLI_NOINLINE CopyUncompressedBlockToOutput( BROTLI_DCHECK(0); /* Unreachable */ } +static BROTLI_BOOL AttachCompoundDictionary( + BrotliDecoderState* state, const uint8_t* data, size_t size) { + BrotliDecoderCompoundDictionary* addon = state->compound_dictionary; + if (state->state != BROTLI_STATE_UNINITED) return BROTLI_FALSE; + if (!addon) { + addon = (BrotliDecoderCompoundDictionary*)BROTLI_DECODER_ALLOC( + state, sizeof(BrotliDecoderCompoundDictionary)); + if (!addon) return BROTLI_FALSE; + addon->num_chunks = 0; + addon->total_size = 0; + addon->br_length = 0; + addon->br_copied = 0; + addon->block_bits = -1; + addon->chunk_offsets[0] = 0; + state->compound_dictionary = addon; + } + if (addon->num_chunks == 15) return BROTLI_FALSE; + addon->chunks[addon->num_chunks] = data; + addon->num_chunks++; + addon->total_size += (int)size; + addon->chunk_offsets[addon->num_chunks] = addon->total_size; + return BROTLI_TRUE; +} + +static void EnsureCoumpoundDictionaryInitialized(BrotliDecoderState* state) { + BrotliDecoderCompoundDictionary* addon = state->compound_dictionary; + /* 256 = (1 << 8) slots in block map. */ + int block_bits = 8; + int cursor = 0; + int index = 0; + if (addon->block_bits != -1) return; + while (((addon->total_size - 1) >> block_bits) != 0) block_bits++; + block_bits -= 8; + addon->block_bits = block_bits; + while (cursor < addon->total_size) { + while (addon->chunk_offsets[index + 1] < cursor) index++; + addon->block_map[cursor >> block_bits] = (uint8_t)index; + cursor += 1 << block_bits; + } +} + +static BROTLI_BOOL InitializeCompoundDictionaryCopy(BrotliDecoderState* s, + int address, int length) { + BrotliDecoderCompoundDictionary* addon = s->compound_dictionary; + int index; + EnsureCoumpoundDictionaryInitialized(s); + index = addon->block_map[address >> addon->block_bits]; + while (address >= addon->chunk_offsets[index + 1]) index++; + if (addon->total_size < address + length) return BROTLI_FALSE; + /* Update the recent distances cache. */ + s->dist_rb[s->dist_rb_idx & 3] = s->distance_code; + ++s->dist_rb_idx; + s->meta_block_remaining_len -= length; + addon->br_index = index; + addon->br_offset = address - addon->chunk_offsets[index]; + addon->br_length = length; + addon->br_copied = 0; + return BROTLI_TRUE; +} + +static int GetCompoundDictionarySize(BrotliDecoderState* s) { + return s->compound_dictionary ? s->compound_dictionary->total_size : 0; +} + +static int CopyFromCompoundDictionary(BrotliDecoderState* s, int pos) { + BrotliDecoderCompoundDictionary* addon = s->compound_dictionary; + int orig_pos = pos; + while (addon->br_length != addon->br_copied) { + uint8_t* copy_dst = &s->ringbuffer[pos]; + const uint8_t* copy_src = + addon->chunks[addon->br_index] + addon->br_offset; + int space = s->ringbuffer_size - pos; + int rem_chunk_length = (addon->chunk_offsets[addon->br_index + 1] - + addon->chunk_offsets[addon->br_index]) - addon->br_offset; + int length = addon->br_length - addon->br_copied; + if (length > rem_chunk_length) length = rem_chunk_length; + if (length > space) length = space; + memcpy(copy_dst, copy_src, (size_t)length); + pos += length; + addon->br_offset += length; + addon->br_copied += length; + if (length == rem_chunk_length) { + addon->br_index++; + addon->br_offset = 0; + } + if (pos == s->ringbuffer_size) break; + } + return pos - orig_pos; +} + +BROTLI_BOOL BrotliDecoderAttachDictionary(BrotliDecoderState* state, + BrotliSharedDictionaryType type, size_t data_size, const uint8_t* data) { + uint32_t i; + uint32_t num_prefix_before = state->dictionary->num_prefix; + if (state->state != BROTLI_STATE_UNINITED) return BROTLI_FALSE; + if (!BrotliSharedDictionaryAttach(state->dictionary, type, data_size, data)) { + return BROTLI_FALSE; + } + for (i = num_prefix_before; i < state->dictionary->num_prefix; i++) { + if (!AttachCompoundDictionary( + state, state->dictionary->prefix[i], + state->dictionary->prefix_size[i])) { + return BROTLI_FALSE; + } + } + return BROTLI_TRUE; +} + /* Calculates the smallest feasible ring buffer. If we know the data size is small, do not allocate more ring buffer @@ -1737,6 +1846,7 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal( int i = s->loop_counter; BrotliDecoderErrorCode result = BROTLI_DECODER_SUCCESS; BrotliBitReader* br = &s->br; + int compound_dictionary_size = GetCompoundDictionarySize(s); if (!CheckInputAmount(safe, br, 28)) { result = BROTLI_DECODER_NEEDS_MORE_INPUT; @@ -1903,20 +2013,75 @@ CommandPostDecodeLiterals: pos, s->distance_code, i, s->meta_block_remaining_len)); return BROTLI_FAILURE(BROTLI_DECODER_ERROR_FORMAT_DISTANCE); } - if (i >= BROTLI_MIN_DICTIONARY_WORD_LENGTH && - i <= BROTLI_MAX_DICTIONARY_WORD_LENGTH) { - int address = s->distance_code - s->max_distance - 1; - const BrotliDictionary* words = s->dictionary; - const BrotliTransforms* transforms = s->transforms; - int offset = (int)s->dictionary->offsets_by_length[i]; - uint32_t shift = s->dictionary->size_bits_by_length[i]; - + if (s->distance_code - s->max_distance - 1 < compound_dictionary_size) { + int address = compound_dictionary_size - + (s->distance_code - s->max_distance); + if (!InitializeCompoundDictionaryCopy(s, address, i)) { + return BROTLI_FAILURE(BROTLI_DECODER_ERROR_COMPOUND_DICTIONARY); + } + pos += CopyFromCompoundDictionary(s, pos); + if (pos >= s->ringbuffer_size) { + s->state = BROTLI_STATE_COMMAND_POST_WRITE_1; + goto saveStateAndReturn; + } + } else if (i >= SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH && + i <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH) { + uint8_t p1 = s->ringbuffer[(pos - 1) & s->ringbuffer_mask]; + uint8_t p2 = s->ringbuffer[(pos - 2) & s->ringbuffer_mask]; + uint8_t dict_id = s->dictionary->context_based ? + s->dictionary->context_map[BROTLI_CONTEXT(p1, p2, s->context_lookup)] + : 0; + const BrotliDictionary* words = s->dictionary->words[dict_id]; + const BrotliTransforms* transforms = s->dictionary->transforms[dict_id]; + int offset = (int)words->offsets_by_length[i]; + uint32_t shift = words->size_bits_by_length[i]; + int address = + s->distance_code - s->max_distance - 1 - compound_dictionary_size; int mask = (int)BitMask(shift); int word_idx = address & mask; int transform_idx = address >> shift; /* Compensate double distance-ring-buffer roll. */ s->dist_rb_idx += s->distance_context; offset += word_idx * i; + /* If the distance is out of bound, select a next static dictionary if + there exist multiple. */ + if ((transform_idx >= (int)transforms->num_transforms || + words->size_bits_by_length[i] == 0) && + s->dictionary->num_dictionaries > 1) { + uint8_t dict_id2; + int dist_remaining = address - + (int)(((1u << shift) & ~1u)) * (int)transforms->num_transforms; + for (dict_id2 = 0; dict_id2 < s->dictionary->num_dictionaries; + dict_id2++) { + const BrotliDictionary* words2 = s->dictionary->words[dict_id2]; + if (dict_id2 != dict_id && words2->size_bits_by_length[i] != 0) { + const BrotliTransforms* transforms2 = + s->dictionary->transforms[dict_id2]; + uint32_t shift2 = words2->size_bits_by_length[i]; + int num = (int)((1u << shift2) & ~1u) * + (int)transforms2->num_transforms; + if (dist_remaining < num) { + dict_id = dict_id2; + words = words2; + transforms = transforms2; + address = dist_remaining; + shift = shift2; + mask = (int)BitMask(shift); + word_idx = address & mask; + transform_idx = address >> shift; + offset = (int)words->offsets_by_length[i] + word_idx * i; + break; + } + dist_remaining -= num; + } + } + } + if (BROTLI_PREDICT_FALSE(words->size_bits_by_length[i] == 0)) { + BROTLI_LOG(("Invalid backward reference. pos: %d distance: %d " + "len: %d bytes left: %d\n", + pos, s->distance_code, i, s->meta_block_remaining_len)); + return BROTLI_FAILURE(BROTLI_DECODER_ERROR_FORMAT_DICTIONARY); + } if (BROTLI_PREDICT_FALSE(!words->data)) { return BROTLI_FAILURE(BROTLI_DECODER_ERROR_DICTIONARY_NOT_SET); } @@ -1933,6 +2098,10 @@ CommandPostDecodeLiterals: BROTLI_LOG(("[ProcessCommandsInternal] dictionary word: [%.*s]," " transform_idx = %d, transformed: [%.*s]\n", i, word, transform_idx, len, &s->ringbuffer[pos])); + if (len == 0 && s->distance_code <= 120) { + BROTLI_LOG(("Invalid length-0 dictionary word after transform\n")); + return BROTLI_FAILURE(BROTLI_DECODER_ERROR_FORMAT_TRANSFORM); + } } pos += len; s->meta_block_remaining_len -= len; @@ -2483,6 +2652,11 @@ BrotliDecoderResult BrotliDecoderDecompressStream( s->max_distance = s->max_backward_distance; } if (s->state == BROTLI_STATE_COMMAND_POST_WRITE_1) { + BrotliDecoderCompoundDictionary* addon = s->compound_dictionary; + if (addon && (addon->br_length != addon->br_copied)) { + s->pos += CopyFromCompoundDictionary(s, s->pos); + if (s->pos >= s->ringbuffer_size) continue; + } if (s->meta_block_remaining_len == 0) { /* Next metablock, if any. */ s->state = BROTLI_STATE_METABLOCK_DONE; diff --git a/c/dec/state.c b/c/dec/state.c index f847836..66d6820 100644 --- a/c/dec/state.c +++ b/c/dec/state.c @@ -8,6 +8,7 @@ #include /* free, malloc */ +#include "../common/dictionary.h" #include #include "./huffman.h" @@ -81,8 +82,10 @@ BROTLI_BOOL BrotliDecoderStateInit(BrotliDecoderState* s, s->mtf_upper_bound = 63; - s->dictionary = BrotliGetDictionary(); - s->transforms = BrotliGetTransforms(); + s->compound_dictionary = NULL; + s->dictionary = + BrotliSharedDictionaryCreateInstance(alloc_func, free_func, opaque); + if (!s->dictionary) return BROTLI_FALSE; return BROTLI_TRUE; } @@ -129,6 +132,9 @@ void BrotliDecoderStateCleanupAfterMetablock(BrotliDecoderState* s) { void BrotliDecoderStateCleanup(BrotliDecoderState* s) { BrotliDecoderStateCleanupAfterMetablock(s); + BROTLI_DECODER_FREE(s, s->compound_dictionary); + BrotliSharedDictionaryDestroyInstance(s->dictionary); + s->dictionary = NULL; BROTLI_DECODER_FREE(s, s->ringbuffer); BROTLI_DECODER_FREE(s, s->block_type_trees); } diff --git a/c/dec/state.h b/c/dec/state.h index 54dab69..7127581 100644 --- a/c/dec/state.h +++ b/c/dec/state.h @@ -12,6 +12,7 @@ #include "../common/constants.h" #include "../common/dictionary.h" #include "../common/platform.h" +#include #include "../common/transform.h" #include #include "./bit_reader.h" @@ -189,6 +190,20 @@ typedef enum { BROTLI_STATE_READ_BLOCK_LENGTH_SUFFIX } BrotliRunningReadBlockLengthState; +/* BrotliDecoderState addon, used for Compound Dictionary functionality. */ +typedef struct BrotliDecoderCompoundDictionary { + int num_chunks; + int total_size; + int br_index; + int br_offset; + int br_length; + int br_copied; + const uint8_t* chunks[16]; + int chunk_offsets[16]; + int block_bits; + uint8_t block_map[256]; +} BrotliDecoderCompoundDictionary; + typedef struct BrotliMetablockHeaderArena { BrotliRunningTreeGroupState substate_tree_group; BrotliRunningContextMapState substate_context_map; @@ -327,8 +342,8 @@ struct BrotliDecoderStateStruct { uint8_t* context_map; uint8_t* context_modes; - const BrotliDictionary* dictionary; - const BrotliTransforms* transforms; + BrotliSharedDictionary* dictionary; + BrotliDecoderCompoundDictionary* compound_dictionary; uint32_t trivial_literal_contexts[8]; /* 256 bits */ diff --git a/c/enc/backward_references.c b/c/enc/backward_references.c index a07a617..1ae1146 100644 --- a/c/enc/backward_references.c +++ b/c/enc/backward_references.c @@ -9,12 +9,13 @@ #include "./backward_references.h" #include "../common/constants.h" -#include "../common/context.h" #include "../common/dictionary.h" #include "../common/platform.h" #include #include "./command.h" +#include "./compound_dictionary.h" #include "./dictionary_hash.h" +#include "./encoder_dict.h" #include "./memory.h" #include "./quality.h" @@ -52,6 +53,7 @@ static BROTLI_INLINE size_t ComputeDistanceCode(size_t distance, #define EXPORT_FN(X) EXPAND_CAT(X, EXPAND_CAT(PREFIX(), HASHER())) #define PREFIX() N +#define ENABLE_COMPOUND_DICTIONARY 0 #define HASHER() H2 /* NOLINTNEXTLINE(build/include) */ @@ -113,6 +115,41 @@ static BROTLI_INLINE size_t ComputeDistanceCode(size_t distance, #include "./backward_references_inc.h" #undef HASHER +#undef ENABLE_COMPOUND_DICTIONARY +#undef PREFIX +#define PREFIX() D +#define ENABLE_COMPOUND_DICTIONARY 1 + +#define HASHER() H5 +/* NOLINTNEXTLINE(build/include) */ +#include "./backward_references_inc.h" +#undef HASHER +#define HASHER() H6 +/* NOLINTNEXTLINE(build/include) */ +#include "./backward_references_inc.h" +#undef HASHER +#define HASHER() H40 +/* NOLINTNEXTLINE(build/include) */ +#include "./backward_references_inc.h" +#undef HASHER +#define HASHER() H41 +/* NOLINTNEXTLINE(build/include) */ +#include "./backward_references_inc.h" +#undef HASHER +#define HASHER() H42 +/* NOLINTNEXTLINE(build/include) */ +#include "./backward_references_inc.h" +#undef HASHER +#define HASHER() H55 +/* NOLINTNEXTLINE(build/include) */ +#include "./backward_references_inc.h" +#undef HASHER +#define HASHER() H65 +/* NOLINTNEXTLINE(build/include) */ +#include "./backward_references_inc.h" +#undef HASHER + +#undef ENABLE_COMPOUND_DICTIONARY #undef PREFIX #undef EXPORT_FN @@ -125,6 +162,28 @@ void BrotliCreateBackwardReferences(size_t num_bytes, ContextLut literal_context_lut, const BrotliEncoderParams* params, Hasher* hasher, int* dist_cache, size_t* last_insert_len, Command* commands, size_t* num_commands, size_t* num_literals) { + if (params->dictionary.compound.num_chunks != 0) { + switch (params->hasher.type) { +#define CASE_(N) \ + case N: \ + CreateBackwardReferencesDH ## N(num_bytes, \ + position, ringbuffer, ringbuffer_mask, \ + literal_context_lut, params, hasher, dist_cache, \ + last_insert_len, commands, num_commands, num_literals); \ + return; + CASE_(5) + CASE_(6) + CASE_(40) + CASE_(41) + CASE_(42) + CASE_(55) + CASE_(65) +#undef CASE_ + default: + break; + } + } + switch (params->hasher.type) { #define CASE_(N) \ case N: \ diff --git a/c/enc/backward_references_hq.c b/c/enc/backward_references_hq.c index 61b7ff1..5e17c84 100644 --- a/c/enc/backward_references_hq.c +++ b/c/enc/backward_references_hq.c @@ -11,10 +11,11 @@ #include /* memcpy, memset */ #include "../common/constants.h" -#include "../common/context.h" #include "../common/platform.h" #include #include "./command.h" +#include "./compound_dictionary.h" +#include "./encoder_dict.h" #include "./fast_log.h" #include "./find_match_length.h" #include "./literal_cost.h" @@ -430,7 +431,8 @@ static size_t UpdateNodes( size_t min_len; size_t result = 0; size_t k; - size_t gap = 0; + const CompoundDictionary* addon = ¶ms->dictionary.compound; + size_t gap = addon->total_size; EvaluateNode(block_start + stream_offset, pos, max_backward_limit, gap, starting_dist_cache, model, queue, nodes); @@ -484,6 +486,24 @@ static size_t UpdateNodes( len = FindMatchLengthWithLimit(&ringbuffer[prev_ix], &ringbuffer[cur_ix_masked], max_len); + } else if (backward > dictionary_start) { + size_t d = 0; + size_t offset; + size_t limit; + const uint8_t* source; + offset = dictionary_start + 1 + addon->total_size - 1; + while (offset >= backward + addon->chunk_offsets[d + 1]) d++; + source = addon->chunk_source[d]; + offset = offset - addon->chunk_offsets[d] - backward; + limit = addon->chunk_offsets[d + 1] - addon->chunk_offsets[d] - offset; + limit = limit > max_len ? max_len : limit; + if (best_len >= limit || + continuation != source[offset + best_len]) { + continue; + } + len = FindMatchLengthWithLimit(&source[offset], + &ringbuffer[cur_ix_masked], + limit); } else { /* "Gray" area. It is addressable by decoder, but this encoder instance does not have that data -> should not touch it. */ @@ -589,7 +609,7 @@ void BrotliZopfliCreateCommands(const size_t num_bytes, size_t pos = 0; uint32_t offset = nodes[0].u.next; size_t i; - size_t gap = 0; + size_t gap = params->dictionary.compound.total_size; for (i = 0; offset != BROTLI_UINT32_MAX; i++) { const ZopfliNode* next = &nodes[pos + offset]; size_t copy_length = ZopfliNodeCopyLength(next); @@ -665,6 +685,23 @@ static size_t ZopfliIterate(size_t num_bytes, size_t position, return ComputeShortestPathFromNodes(num_bytes, nodes); } +static void MergeMatches(BackwardMatch* dst, + BackwardMatch* src1, size_t len1, BackwardMatch* src2, size_t len2) { + while (len1 > 0 && len2 > 0) { + size_t l1 = BackwardMatchLength(src1); + size_t l2 = BackwardMatchLength(src2); + if (l1 < l2 || ((l1 == l2) && (src1->distance < src2->distance))) { + *dst++ = *src1++; + len1--; + } else { + *dst++ = *src2++; + len2--; + } + } + while (len1-- > 0) *dst++ = *src1++; + while (len2-- > 0) *dst++ = *src2++; +} + /* REQUIRES: nodes != NULL and len(nodes) >= num_bytes + 1 */ size_t BrotliZopfliComputeShortestPath(MemoryManager* m, size_t num_bytes, size_t position, const uint8_t* ringbuffer, size_t ringbuffer_mask, @@ -679,10 +716,11 @@ size_t BrotliZopfliComputeShortestPath(MemoryManager* m, size_t num_bytes, const size_t store_end = num_bytes >= StoreLookaheadH10() ? position + num_bytes - StoreLookaheadH10() + 1 : position; size_t i; - size_t gap = 0; - size_t lz_matches_offset = 0; + const CompoundDictionary* addon = ¶ms->dictionary.compound; + size_t gap = addon->total_size; + size_t lz_matches_offset = + (addon->num_chunks != 0) ? (MAX_NUM_MATCHES_H10 + 128) : 0; ZopfliCostModel* model = BROTLI_ALLOC(m, ZopfliCostModel, 1); - BROTLI_UNUSED(literal_context_lut); if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(model) || BROTLI_IS_NULL(matches)) { return 0; } @@ -700,10 +738,28 @@ size_t BrotliZopfliComputeShortestPath(MemoryManager* m, size_t num_bytes, pos + stream_offset, max_backward_limit); size_t skip; size_t num_matches; + int dict_id = 0; + if (params->dictionary.contextual.context_based) { + uint8_t p1 = pos >= 1 ? + ringbuffer[(size_t)(pos - 1) & ringbuffer_mask] : 0; + uint8_t p2 = pos >= 2 ? + ringbuffer[(size_t)(pos - 2) & ringbuffer_mask] : 0; + dict_id = params->dictionary.contextual.context_map[ + BROTLI_CONTEXT(p1, p2, literal_context_lut)]; + } num_matches = FindAllMatchesH10(&hasher->privat._H10, - ¶ms->dictionary, + params->dictionary.contextual.dict[dict_id], ringbuffer, ringbuffer_mask, pos, num_bytes - i, max_distance, dictionary_start + gap, params, &matches[lz_matches_offset]); + if (addon->num_chunks != 0) { + size_t cd_matches = LookupAllCompoundDictionaryMatches(addon, + ringbuffer, ringbuffer_mask, pos, 3, num_bytes - i, + dictionary_start, params->dist.max_distance, + &matches[lz_matches_offset - 64], 64); + MergeMatches(matches, &matches[lz_matches_offset - 64], cd_matches, + &matches[lz_matches_offset], num_matches); + num_matches += cd_matches; + } if (num_matches > 0 && BackwardMatchLength(&matches[num_matches - 1]) > max_zopfli_len) { matches[0] = matches[num_matches - 1]; @@ -774,9 +830,10 @@ void BrotliCreateHqZopfliBackwardReferences(MemoryManager* m, size_t num_bytes, ZopfliCostModel* model = BROTLI_ALLOC(m, ZopfliCostModel, 1); ZopfliNode* nodes; BackwardMatch* matches = BROTLI_ALLOC(m, BackwardMatch, matches_size); - size_t gap = 0; - size_t shadow_matches = 0; - BROTLI_UNUSED(literal_context_lut); + const CompoundDictionary* addon = ¶ms->dictionary.compound; + size_t gap = addon->total_size; + size_t shadow_matches = + (addon->num_chunks != 0) ? (MAX_NUM_MATCHES_H10 + 128) : 0; if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(model) || BROTLI_IS_NULL(num_matches) || BROTLI_IS_NULL(matches)) { return; @@ -790,15 +847,34 @@ void BrotliCreateHqZopfliBackwardReferences(MemoryManager* m, size_t num_bytes, size_t num_found_matches; size_t cur_match_end; size_t j; + int dict_id = 0; + if (params->dictionary.contextual.context_based) { + uint8_t p1 = pos >= 1 ? + ringbuffer[(size_t)(pos - 1) & ringbuffer_mask] : 0; + uint8_t p2 = pos >= 2 ? + ringbuffer[(size_t)(pos - 2) & ringbuffer_mask] : 0; + dict_id = params->dictionary.contextual.context_map[ + BROTLI_CONTEXT(p1, p2, literal_context_lut)]; + } /* Ensure that we have enough free slots. */ BROTLI_ENSURE_CAPACITY(m, BackwardMatch, matches, matches_size, cur_match_pos + MAX_NUM_MATCHES_H10 + shadow_matches); if (BROTLI_IS_OOM(m)) return; num_found_matches = FindAllMatchesH10(&hasher->privat._H10, - ¶ms->dictionary, + params->dictionary.contextual.dict[dict_id], ringbuffer, ringbuffer_mask, pos, max_length, max_distance, dictionary_start + gap, params, &matches[cur_match_pos + shadow_matches]); + if (addon->num_chunks != 0) { + size_t cd_matches = LookupAllCompoundDictionaryMatches(addon, + ringbuffer, ringbuffer_mask, pos, 3, max_length, + dictionary_start, params->dist.max_distance, + &matches[cur_match_pos + shadow_matches - 64], 64); + MergeMatches(&matches[cur_match_pos], + &matches[cur_match_pos + shadow_matches - 64], cd_matches, + &matches[cur_match_pos + shadow_matches], num_found_matches); + num_found_matches += cd_matches; + } cur_match_end = cur_match_pos + num_found_matches; for (j = cur_match_pos; j + 1 < cur_match_end; ++j) { BROTLI_DCHECK(BackwardMatchLength(&matches[j]) <= diff --git a/c/enc/backward_references_inc.h b/c/enc/backward_references_inc.h index 766bf91..752c12e 100644 --- a/c/enc/backward_references_inc.h +++ b/c/enc/backward_references_inc.h @@ -28,13 +28,11 @@ static BROTLI_NOINLINE void EXPORT_FN(CreateBackwardReferences)( const size_t random_heuristics_window_size = LiteralSpreeLengthForSparseSearch(params); size_t apply_random_heuristics = position + random_heuristics_window_size; - const size_t gap = 0; + const size_t gap = params->dictionary.compound.total_size; /* Minimum score to accept a backward reference. */ const score_t kMinScore = BROTLI_SCORE_BASE + 100; - BROTLI_UNUSED(literal_context_lut); - FN(PrepareDistanceCache)(privat, dist_cache); while (position + FN(HashTypeLength)() < pos_end) { @@ -43,13 +41,29 @@ static BROTLI_NOINLINE void EXPORT_FN(CreateBackwardReferences)( size_t dictionary_start = BROTLI_MIN(size_t, position + position_offset, max_backward_limit); HasherSearchResult sr; + int dict_id = 0; + uint8_t p1 = 0; + uint8_t p2 = 0; + if (params->dictionary.contextual.context_based) { + p1 = position >= 1 ? + ringbuffer[(size_t)(position - 1) & ringbuffer_mask] : 0; + p2 = position >= 2 ? + ringbuffer[(size_t)(position - 2) & ringbuffer_mask] : 0; + dict_id = params->dictionary.contextual.context_map[ + BROTLI_CONTEXT(p1, p2, literal_context_lut)]; + } sr.len = 0; sr.len_code_delta = 0; sr.distance = 0; sr.score = kMinScore; - FN(FindLongestMatch)(privat, ¶ms->dictionary, + FN(FindLongestMatch)(privat, params->dictionary.contextual.dict[dict_id], ringbuffer, ringbuffer_mask, dist_cache, position, max_length, max_distance, dictionary_start + gap, params->dist.max_distance, &sr); + if (ENABLE_COMPOUND_DICTIONARY) { + LookupCompoundDictionaryMatch(¶ms->dictionary.compound, ringbuffer, + ringbuffer_mask, dist_cache, position, max_length, + dictionary_start, params->dist.max_distance, &sr); + } if (sr.score > kMinScore) { /* Found a match. Let's look for something even better ahead. */ int delayed_backward_references_in_row = 0; @@ -65,11 +79,23 @@ static BROTLI_NOINLINE void EXPORT_FN(CreateBackwardReferences)( max_distance = BROTLI_MIN(size_t, position + 1, max_backward_limit); dictionary_start = BROTLI_MIN(size_t, position + 1 + position_offset, max_backward_limit); + if (params->dictionary.contextual.context_based) { + p2 = p1; + p1 = ringbuffer[position & ringbuffer_mask]; + dict_id = params->dictionary.contextual.context_map[ + BROTLI_CONTEXT(p1, p2, literal_context_lut)]; + } FN(FindLongestMatch)(privat, - ¶ms->dictionary, + params->dictionary.contextual.dict[dict_id], ringbuffer, ringbuffer_mask, dist_cache, position + 1, max_length, max_distance, dictionary_start + gap, params->dist.max_distance, &sr2); + if (ENABLE_COMPOUND_DICTIONARY) { + LookupCompoundDictionaryMatch( + ¶ms->dictionary.compound, ringbuffer, + ringbuffer_mask, dist_cache, position + 1, max_length, + dictionary_start, params->dist.max_distance, &sr2); + } if (sr2.score >= sr.score + cost_diff_lazy) { /* Ok, let's just write one byte for now and start a match from the next byte. */ diff --git a/c/enc/compound_dictionary.c b/c/enc/compound_dictionary.c new file mode 100644 index 0000000..f2a3f37 --- /dev/null +++ b/c/enc/compound_dictionary.c @@ -0,0 +1,200 @@ +/* Copyright 2017 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +#include "./compound_dictionary.h" + +#include "../common/platform.h" +#include +#include "./memory.h" +#include "./quality.h" + +static PreparedDictionary* CreatePreparedDictionaryWithParams(MemoryManager* m, + const uint8_t* source, size_t source_size, uint32_t bucket_bits, + uint32_t slot_bits, uint32_t hash_bits, uint16_t bucket_limit) { + /* Step 1: create "bloated" hasher. */ + uint32_t num_slots = 1u << slot_bits; + uint32_t num_buckets = 1u << bucket_bits; + uint32_t hash_shift = 64u - bucket_bits; + uint64_t hash_mask = (~((uint64_t)0U)) >> (64 - hash_bits); + uint32_t slot_mask = num_slots - 1; + size_t alloc_size = (sizeof(uint32_t) << slot_bits) + + (sizeof(uint32_t) << slot_bits) + + (sizeof(uint16_t) << bucket_bits) + + (sizeof(uint32_t) << bucket_bits) + + (sizeof(uint32_t) * source_size); + uint8_t* flat = NULL; + PreparedDictionary* result = NULL; + uint16_t* num = NULL; + uint32_t* bucket_heads = NULL; + uint32_t* next_bucket = NULL; + uint32_t* slot_offsets = NULL; + uint16_t* heads = NULL; + uint32_t* items = NULL; + uint8_t* source_copy = NULL; + uint32_t i; + uint32_t* slot_size = NULL; + uint32_t* slot_limit = NULL; + uint32_t total_items = 0; + if (slot_bits > 16) return NULL; + if (slot_bits > bucket_bits) return NULL; + if (bucket_bits - slot_bits >= 16) return NULL; + + flat = BROTLI_ALLOC(m, uint8_t, alloc_size); + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(flat)) return NULL; + + slot_size = (uint32_t*)flat; + slot_limit = (uint32_t*)(&slot_size[num_slots]); + num = (uint16_t*)(&slot_limit[num_slots]); + bucket_heads = (uint32_t*)(&num[num_buckets]); + next_bucket = (uint32_t*)(&bucket_heads[num_buckets]); + memset(num, 0, num_buckets * sizeof(num[0])); + + /* TODO: apply custom "store" order. */ + for (i = 0; i + 7 < source_size; ++i) { + const uint64_t h = (BROTLI_UNALIGNED_LOAD64LE(&source[i]) & hash_mask) * + kPreparedDictionaryHashMul64Long; + const uint32_t key = (uint32_t)(h >> hash_shift); + uint16_t count = num[key]; + next_bucket[i] = (count == 0) ? ((uint32_t)(-1)) : bucket_heads[key]; + bucket_heads[key] = i; + count++; + if (count > bucket_limit) count = bucket_limit; + num[key] = count; + } + + /* Step 2: find slot limits. */ + for (i = 0; i < num_slots; ++i) { + BROTLI_BOOL overflow = BROTLI_FALSE; + slot_limit[i] = bucket_limit; + while (BROTLI_TRUE) { + uint32_t limit = slot_limit[i]; + size_t j; + uint32_t count = 0; + overflow = BROTLI_FALSE; + for (j = i; j < num_buckets; j += num_slots) { + uint32_t size = num[j]; + /* Last chain may span behind 64K limit; overflow happens only if + we are about to use 0xFFFF+ as item offset. */ + if (count >= 0xFFFF) { + overflow = BROTLI_TRUE; + break; + } + if (size > limit) size = limit; + count += size; + } + if (!overflow) { + slot_size[i] = count; + total_items += count; + break; + } + slot_limit[i]--; + } + } + + /* Step 3: transfer data to "slim" hasher. */ + alloc_size = sizeof(PreparedDictionary) + (sizeof(uint32_t) << slot_bits) + + (sizeof(uint16_t) << bucket_bits) + (sizeof(uint32_t) * total_items) + + source_size; + + result = (PreparedDictionary*)BROTLI_ALLOC(m, uint8_t, alloc_size); + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(result)) { + BROTLI_FREE(m, flat); + return NULL; + } + slot_offsets = (uint32_t*)(&result[1]); + heads = (uint16_t*)(&slot_offsets[num_slots]); + items = (uint32_t*)(&heads[num_buckets]); + source_copy = (uint8_t*)(&items[total_items]); + + result->magic = kPreparedDictionaryMagic; + result->source_offset = total_items; + result->source_size = (uint32_t)source_size; + result->hash_bits = hash_bits; + result->bucket_bits = bucket_bits; + result->slot_bits = slot_bits; + + total_items = 0; + for (i = 0; i < num_slots; ++i) { + slot_offsets[i] = total_items; + total_items += slot_size[i]; + slot_size[i] = 0; + } + for (i = 0; i < num_buckets; ++i) { + uint32_t slot = i & slot_mask; + uint32_t count = num[i]; + uint32_t pos; + size_t j; + size_t cursor = slot_size[slot]; + if (count > slot_limit[slot]) count = slot_limit[slot]; + if (count == 0) { + heads[i] = 0xFFFF; + continue; + } + heads[i] = (uint16_t)cursor; + cursor += slot_offsets[slot]; + slot_size[slot] += count; + pos = bucket_heads[i]; + for (j = 0; j < count; j++) { + items[cursor++] = pos; + pos = next_bucket[pos]; + } + items[cursor - 1] |= 0x80000000; + } + + BROTLI_FREE(m, flat); + memcpy(source_copy, source, source_size); + return result; +} + +PreparedDictionary* CreatePreparedDictionary(MemoryManager* m, + const uint8_t* source, size_t source_size) { + uint32_t bucket_bits = 17; + uint32_t slot_bits = 7; + uint32_t hash_bits = 40; + uint16_t bucket_limit = 32; + size_t volume = 16u << bucket_bits; + /* Tune parameters to fit dictionary size. */ + while (volume < source_size && bucket_bits < 22) { + bucket_bits++; + slot_bits++; + volume <<= 1; + } + return CreatePreparedDictionaryWithParams(m, + source, source_size, bucket_bits, slot_bits, hash_bits, bucket_limit); +} + +void DestroyPreparedDictionary(MemoryManager* m, + PreparedDictionary* dictionary) { + if (!dictionary) return; + BROTLI_FREE(m, dictionary); +} + +BROTLI_BOOL AttachPreparedDictionary( + CompoundDictionary* compound, const PreparedDictionary* dictionary) { + size_t length = 0; + size_t index = 0; + + if (compound->num_chunks == SHARED_BROTLI_MAX_COMPOUND_DICTS) { + return BROTLI_FALSE; + } + + if (!dictionary) return BROTLI_FALSE; + + length = dictionary->source_size; + index = compound->num_chunks; + compound->total_size += length; + compound->chunks[index] = dictionary; + compound->chunk_offsets[index + 1] = compound->total_size; + { + uint32_t* slot_offsets = (uint32_t*)(&dictionary[1]); + uint16_t* heads = (uint16_t*)(&slot_offsets[1u << dictionary->slot_bits]); + uint32_t* items = (uint32_t*)(&heads[1u << dictionary->bucket_bits]); + compound->chunk_source[index] = + (const uint8_t*)(&items[dictionary->source_offset]); + } + compound->num_chunks++; + return BROTLI_TRUE; +} diff --git a/c/enc/compound_dictionary.h b/c/enc/compound_dictionary.h new file mode 100644 index 0000000..5e1dfe2 --- /dev/null +++ b/c/enc/compound_dictionary.h @@ -0,0 +1,60 @@ +/* Copyright 2017 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +#ifndef BROTLI_ENC_PREPARED_DICTIONARY_H_ +#define BROTLI_ENC_PREPARED_DICTIONARY_H_ + +#include "../common/platform.h" +#include "../common/constants.h" +#include +#include +#include "./memory.h" + +static const uint32_t kPreparedDictionaryMagic = 0xDEBCEDE0; +static const uint64_t kPreparedDictionaryHashMul64Long = + BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u); + +typedef struct PreparedDictionary { + uint32_t magic; + uint32_t source_offset; + uint32_t source_size; + uint32_t hash_bits; + uint32_t bucket_bits; + uint32_t slot_bits; + + /* --- Dynamic size members --- */ + + /* uint32_t slot_offsets[1 << slot_bits]; */ + /* uint16_t heads[1 << bucket_bits]; */ + /* uint32_t items[variable]; */ + + /* uint8_t source[source_size] */ +} PreparedDictionary; + +BROTLI_INTERNAL PreparedDictionary* CreatePreparedDictionary(MemoryManager* m, + const uint8_t* source, size_t source_size); + +BROTLI_INTERNAL void DestroyPreparedDictionary(MemoryManager* m, + PreparedDictionary* dictionary); + +typedef struct CompoundDictionary { + /* LZ77 prefix, compound dictionary */ + size_t num_chunks; + size_t total_size; + /* Client instances. */ + const PreparedDictionary* chunks[SHARED_BROTLI_MAX_COMPOUND_DICTS + 1]; + const uint8_t* chunk_source[SHARED_BROTLI_MAX_COMPOUND_DICTS + 1]; + size_t chunk_offsets[SHARED_BROTLI_MAX_COMPOUND_DICTS + 1]; + + size_t num_prepared_instances_; + /* Owned instances. */ + PreparedDictionary* prepared_instances_[SHARED_BROTLI_MAX_COMPOUND_DICTS + 1]; +} CompoundDictionary; + +BROTLI_INTERNAL BROTLI_BOOL AttachPreparedDictionary( + CompoundDictionary* compound, const PreparedDictionary* dictionary); + +#endif /* BROTLI_ENC_PREPARED_DICTIONARY */ diff --git a/c/enc/encode.c b/c/enc/encode.c index 23b1d3b..4a68c3c 100644 --- a/c/enc/encode.c +++ b/c/enc/encode.c @@ -21,6 +21,7 @@ #include "./brotli_bit_stream.h" #include "./compress_fragment.h" #include "./compress_fragment_two_pass.h" +#include "./dictionary_hash.h" #include "./encoder_dict.h" #include "./entropy_encode.h" #include "./fast_log.h" @@ -746,7 +747,7 @@ static void BrotliEncoderInitParams(BrotliEncoderParams* params) { params->stream_offset = 0; params->size_hint = 0; params->disable_literal_context_modeling = BROTLI_FALSE; - BrotliInitEncoderDictionary(¶ms->dictionary); + BrotliInitSharedEncoderDictionary(¶ms->dictionary); params->dist.distance_postfix_bits = 0; params->dist.num_direct_distance_codes = 0; params->dist.alphabet_size_max = @@ -755,6 +756,11 @@ static void BrotliEncoderInitParams(BrotliEncoderParams* params) { params->dist.max_distance = BROTLI_MAX_DISTANCE; } +static void BrotliEncoderCleanupParams(MemoryManager* m, + BrotliEncoderParams* params) { + BrotliCleanupSharedEncoderDictionary(m, ¶ms->dictionary); +} + static void BrotliEncoderInitState(BrotliEncoderState* s) { BrotliEncoderInitParams(&s->params); s->input_pos_ = 0; @@ -825,6 +831,7 @@ static void BrotliEncoderCleanupState(BrotliEncoderState* s) { BROTLI_FREE(m, s->two_pass_arena_); BROTLI_FREE(m, s->command_buf_); BROTLI_FREE(m, s->literal_buf_); + BrotliEncoderCleanupParams(m, &s->params); } /* Deinitializes and frees BrotliEncoderState instance. */ @@ -922,6 +929,8 @@ static void ExtendLastCommand(BrotliEncoderState* s, uint32_t* bytes, uint64_t cmd_dist = (uint64_t)s->dist_cache_[0]; uint32_t distance_code = CommandRestoreDistanceCode(last_command, &s->params.dist); + const CompoundDictionary* dict = &s->params.dictionary.compound; + size_t compound_dictionary_size = dict->total_size; if (distance_code < BROTLI_NUM_DISTANCE_SHORT_CODES || distance_code - (BROTLI_NUM_DISTANCE_SHORT_CODES - 1) == cmd_dist) { if (cmd_dist <= max_distance) { @@ -932,6 +941,38 @@ static void ExtendLastCommand(BrotliEncoderState* s, uint32_t* bytes, (*wrapped_last_processed_pos)++; } } else { + if ((cmd_dist - max_distance - 1) < compound_dictionary_size && + last_copy_len < cmd_dist - max_distance) { + size_t address = + compound_dictionary_size - (size_t)(cmd_dist - max_distance) + + (size_t)last_copy_len; + size_t br_index = 0; + size_t br_offset; + const uint8_t* chunk; + size_t chunk_length; + while (address >= dict->chunk_offsets[br_index + 1]) br_index++; + br_offset = address - dict->chunk_offsets[br_index]; + chunk = dict->chunk_source[br_index]; + chunk_length = + dict->chunk_offsets[br_index + 1] - dict->chunk_offsets[br_index]; + while (*bytes != 0 && data[*wrapped_last_processed_pos & mask] == + chunk[br_offset]) { + last_command->copy_len_++; + (*bytes)--; + (*wrapped_last_processed_pos)++; + if (++br_offset == chunk_length) { + br_index++; + br_offset = 0; + if (br_index != dict->num_chunks) { + chunk = dict->chunk_source[br_index]; + chunk_length = dict->chunk_offsets[br_index + 1] - + dict->chunk_offsets[br_index]; + } else { + break; + } + } + } + } } /* The copy length is at most the metablock size, and thus expressible. */ GetLengthCode(last_command->insert_len_, @@ -969,6 +1010,7 @@ static BROTLI_BOOL EncodeData( data = s->ringbuffer_.buffer_; mask = s->ringbuffer_.mask_; + if (s->params.quality > s->params.dictionary.max_quality) return BROTLI_FALSE; /* Adding more blocks after "last" block is forbidden. */ if (s->is_last_block_emitted_) return BROTLI_FALSE; if (is_last) s->is_last_block_emitted_ = BROTLI_TRUE; @@ -1421,6 +1463,7 @@ static BROTLI_BOOL BrotliCompressBufferQuality10( *encoded_size = total_out_size; DestroyHasher(m, hasher); BROTLI_FREE(m, hasher); + BrotliEncoderCleanupParams(m, params); BROTLI_FREE(m, params); BrotliBootstrapFree(m, m); return ok; @@ -1930,6 +1973,124 @@ uint32_t BrotliEncoderVersion(void) { return BROTLI_VERSION; } +BrotliEncoderPreparedDictionary* BrotliEncoderPrepareDictionary( + BrotliSharedDictionaryType type, size_t size, const uint8_t* data, + int quality, + brotli_alloc_func alloc_func, brotli_free_func free_func, void* opaque) { + ManagedDictionary* managed_dictionary = NULL; + if (type != BROTLI_SHARED_DICTIONARY_RAW && + type != BROTLI_SHARED_DICTIONARY_SERIALIZED) { + return NULL; + } + managed_dictionary = + BrotliCreateManagedDictionary(alloc_func, free_func, opaque); + if (managed_dictionary == NULL) { + return NULL; + } + if (type == BROTLI_SHARED_DICTIONARY_RAW) { + managed_dictionary->dictionary = (uint32_t*)CreatePreparedDictionary( + &managed_dictionary->memory_manager_, data, size); + } else { + SharedEncoderDictionary* dict = (SharedEncoderDictionary*)BrotliAllocate( + &managed_dictionary->memory_manager_, sizeof(SharedEncoderDictionary)); + managed_dictionary->dictionary = (uint32_t*)dict; + if (dict != NULL) { + BROTLI_BOOL ok = BrotliInitCustomSharedEncoderDictionary( + &managed_dictionary->memory_manager_, data, size, quality, dict); + if (!ok) { + BrotliFree(&managed_dictionary->memory_manager_, dict); + managed_dictionary->dictionary = NULL; + } + } + } + if (managed_dictionary->dictionary == NULL) { + BrotliDestroyManagedDictionary(managed_dictionary); + return NULL; + } + return (BrotliEncoderPreparedDictionary*)managed_dictionary; +} + +void BrotliEncoderDestroyPreparedDictionary( + BrotliEncoderPreparedDictionary* dictionary) { + ManagedDictionary* dict = (ManagedDictionary*)dictionary; + if (!dictionary) return; + /* First field of dictionary structs. */ + /* Only managed dictionaries are eligible for destruction by this method. */ + if (dict->magic != kManagedDictionaryMagic) { + return; + } + if (dict->dictionary == NULL) { + /* This should never ever happen. */ + } else if (*dict->dictionary == kPreparedDictionaryMagic) { + DestroyPreparedDictionary( + &dict->memory_manager_, (PreparedDictionary*)dict->dictionary); + } else if (*dict->dictionary == kSharedDictionaryMagic) { + BrotliCleanupSharedEncoderDictionary(&dict->memory_manager_, + (SharedEncoderDictionary*)dict->dictionary); + BrotliFree(&dict->memory_manager_, dict->dictionary); + } else { + /* This should never ever happen. */ + } + dict->dictionary = NULL; + BrotliDestroyManagedDictionary(dict); +} + +BROTLI_BOOL BrotliEncoderAttachPreparedDictionary(BrotliEncoderState* state, + const BrotliEncoderPreparedDictionary* dictionary) { + /* First field of dictionary structs */ + const BrotliEncoderPreparedDictionary* dict = dictionary; + uint32_t magic = *((const uint32_t*)dict); + SharedEncoderDictionary* current = NULL; + if (magic == kManagedDictionaryMagic) { + /* Unwrap managed dictionary. */ + ManagedDictionary* managed_dictionary = (ManagedDictionary*)dict; + magic = *managed_dictionary->dictionary; + dict = (BrotliEncoderPreparedDictionary*)managed_dictionary->dictionary; + } + current = &state->params.dictionary; + if (magic == kPreparedDictionaryMagic) { + const PreparedDictionary* prepared = (const PreparedDictionary*)dict; + if (!AttachPreparedDictionary(¤t->compound, prepared)) { + return BROTLI_FALSE; + } + } else if (magic == kSharedDictionaryMagic) { + const SharedEncoderDictionary* attached = + (const SharedEncoderDictionary*)dict; + BROTLI_BOOL was_default = !current->contextual.context_based && + current->contextual.num_dictionaries == 1 && + current->contextual.dict[0]->hash_table_words == + kStaticDictionaryHashWords && + current->contextual.dict[0]->hash_table_lengths == + kStaticDictionaryHashLengths; + BROTLI_BOOL new_default = !attached->contextual.context_based && + attached->contextual.num_dictionaries == 1 && + attached->contextual.dict[0]->hash_table_words == + kStaticDictionaryHashWords && + attached->contextual.dict[0]->hash_table_lengths == + kStaticDictionaryHashLengths; + size_t i; + if (state->is_initialized_) return BROTLI_FALSE; + current->max_quality = + BROTLI_MIN(int, current->max_quality, attached->max_quality); + for (i = 0; i < attached->compound.num_chunks; i++) { + if (!AttachPreparedDictionary(¤t->compound, + attached->compound.chunks[i])) { + return BROTLI_FALSE; + } + } + if (!new_default) { + if (!was_default) return BROTLI_FALSE; + /* Copy by value, but then set num_instances_ to 0 because their memory + is managed by attached, not by current */ + current->contextual = attached->contextual; + current->contextual.num_instances_ = 0; + } + } else { + return BROTLI_FALSE; + } + return BROTLI_TRUE; +} + #if defined(__cplusplus) || defined(c_plusplus) } /* extern "C" */ #endif diff --git a/c/enc/encoder_dict.c b/c/enc/encoder_dict.c index c9e963b..098727c 100644 --- a/c/enc/encoder_dict.c +++ b/c/enc/encoder_dict.c @@ -6,16 +6,43 @@ #include "./encoder_dict.h" +#include /* malloc, free */ + #include "../common/dictionary.h" +#include "../common/platform.h" +#include "../common/shared_dictionary_internal.h" #include "../common/transform.h" +#include "./compound_dictionary.h" #include "./dictionary_hash.h" +#include "./memory.h" +#include "./quality.h" #include "./hash.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -void BrotliInitEncoderDictionary(BrotliEncoderDictionary* dict) { +#define NUM_HASH_BITS 15u +#define NUM_HASH_BUCKETS (1u << NUM_HASH_BITS) + +static void BrotliTrieInit(BrotliTrie* trie) { + trie->pool_capacity = 0; + trie->pool_size = 0; + trie->pool = 0; + + /* Set up the root node */ + trie->root.single = 0; + trie->root.len_ = 0; + trie->root.idx_ = 0; + trie->root.sub = 0; +} + +static void BrotliTrieFree(MemoryManager* m, BrotliTrie* trie) { + BrotliFree(m, trie->pool); +} + +/* Initializes to RFC 7932 static dictionary / transforms. */ +static void InitEncoderDictionary(BrotliEncoderDictionary* dict) { dict->words = BrotliGetDictionary(); dict->num_transforms = (uint32_t)BrotliGetTransforms()->num_transforms; @@ -26,6 +53,574 @@ void BrotliInitEncoderDictionary(BrotliEncoderDictionary* dict) { dict->cutoffTransformsCount = kCutoffTransformsCount; dict->cutoffTransforms = kCutoffTransforms; + + dict->parent = 0; + + dict->hash_table_data_words_ = 0; + dict->hash_table_data_lengths_ = 0; + dict->buckets_alloc_size_ = 0; + dict->buckets_data_ = 0; + dict->dict_words_alloc_size_ = 0; + dict->dict_words_data_ = 0; + dict->words_instance_ = 0; + dict->has_words_heavy = BROTLI_FALSE; + BrotliTrieInit(&dict->trie); +} + +static void BrotliDestroyEncoderDictionary(MemoryManager* m, + BrotliEncoderDictionary* dict) { + BrotliFree(m, dict->hash_table_data_words_); + BrotliFree(m, dict->hash_table_data_lengths_); + BrotliFree(m, dict->buckets_data_); + BrotliFree(m, dict->dict_words_data_); + BrotliFree(m, dict->words_instance_); + BrotliTrieFree(m, &dict->trie); +} + +/* Word length must be at least 4 bytes */ +static uint32_t Hash(const uint8_t* data, int bits) { + uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32; + /* The higher bits contain more mixture from the multiplication, + so we take our results from there. */ + return h >> (32 - bits); +} + +/* Theoretical max possible word size after transform */ +#define kTransformedBufferSize \ + (256 + 256 + SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH) + +/* To be safe buffer must have at least kTransformedBufferSize */ +static void TransformedDictionaryWord(uint32_t word_idx, int len, int transform, + const BrotliTransforms* transforms, + const BrotliEncoderDictionary* dict, + uint8_t* buffer, size_t* size) { + const uint8_t* dict_word = &dict->words->data[ + dict->words->offsets_by_length[len] + (uint32_t)len * word_idx]; + *size = (size_t)BrotliTransformDictionaryWord(buffer, dict_word, len, + transforms, transform); +} + +static DictWord MakeDictWord(uint8_t len, uint8_t transform, uint16_t idx) { + DictWord result; + result.len = len; + result.transform = transform; + result.idx = idx; + return result; +} + +static uint32_t BrotliTrieAlloc(MemoryManager* m, size_t num, BrotliTrie* trie, + BrotliTrieNode** keep) { + uint32_t result; + uint32_t keep_index = 0; + if (keep && *keep != &trie->root) { + /* Optional node to keep, since address may change after re-allocating */ + keep_index = (uint32_t)(*keep - trie->pool); + } + if (trie->pool_size == 0) { + /* Have a dummy node in the front. We do not want the result to be 0, it + must be at least 1, 0 represents "null pointer" */ + trie->pool_size = 1; + } + BROTLI_ENSURE_CAPACITY(m, BrotliTrieNode, trie->pool, trie->pool_capacity, + trie->pool_size + num); + if (BROTLI_IS_OOM(m)) return 0; + /* Init the new nodes to empty */ + memset(trie->pool + trie->pool_size, 0, sizeof(*trie->pool) * num); + result = (uint32_t)trie->pool_size; + trie->pool_size += num; + if (keep && *keep != &trie->root) { + *keep = trie->pool + keep_index; + } + return result; +} + +/** + * len and idx: payload for last node + * word, size: the string + * index: position in the string + */ +static BROTLI_BOOL BrotliTrieNodeAdd(MemoryManager* m, uint8_t len, + uint32_t idx, const uint8_t* word, size_t size, int index, + BrotliTrieNode* node, BrotliTrie* trie) { + BrotliTrieNode* child = 0; + uint8_t c; + if ((size_t)index == size) { + if (!node->len_ || idx < node->idx_) { + node->len_ = len; + node->idx_ = idx; + } + return BROTLI_TRUE; + } + c = word[index]; + if (node->single && c != node->c) { + BrotliTrieNode old = trie->pool[node->sub]; + uint32_t new_nodes = BrotliTrieAlloc(m, 32, trie, &node); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + node->single = 0; + node->sub = new_nodes; + trie->pool[node->sub + (node->c >> 4)].sub = new_nodes + 16; + trie->pool[trie->pool[node->sub + (node->c >> 4)].sub + (node->c & 15)] = + old; + } + if (!node->sub) { + uint32_t new_node = BrotliTrieAlloc(m, 1, trie, &node); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + node->single = 1; + node->c = c; + node->sub = new_node; + } + if (node->single) { + child = &trie->pool[node->sub]; + } else { + if (!trie->pool[node->sub + (c >> 4)].sub) { + uint32_t new_nodes = BrotliTrieAlloc(m, 16, trie, &node); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + trie->pool[node->sub + (c >> 4)].sub = new_nodes; + } + child = &trie->pool[trie->pool[node->sub + (c >> 4)].sub + (c & 15)]; + } + return BrotliTrieNodeAdd(m, len, idx, word, size, index + 1, child, trie); +} + +static BROTLI_BOOL BrotliTrieAdd(MemoryManager* m, uint8_t len, uint32_t idx, + const uint8_t* word, size_t size, BrotliTrie* trie) { + return BrotliTrieNodeAdd(m, len, idx, word, size, 0, &trie->root, trie); +} + +const BrotliTrieNode* BrotliTrieSub(const BrotliTrie* trie, + const BrotliTrieNode* node, uint8_t c) { + BrotliTrieNode* temp_node; + if (node->single) { + if (node->c == c) return &trie->pool[node->sub]; + return 0; + } + if (!node->sub) return 0; + temp_node = &trie->pool[node->sub + (c >> 4)]; + if (!temp_node->sub) return 0; + return &trie->pool[temp_node->sub + (c & 15)]; +} + +static const BrotliTrieNode* BrotliTrieFind(const BrotliTrie* trie, + const uint8_t* word, size_t size) { + const BrotliTrieNode* node = &trie->root; + size_t i; + for (i = 0; i < size; i++) { + node = BrotliTrieSub(trie, node, word[i]); + if (!node) return 0; + } + return node; +} + +static BROTLI_BOOL BuildDictionaryLut(MemoryManager* m, + const BrotliTransforms* transforms, + BrotliEncoderDictionary* dict) { + uint32_t i; + DictWord* dict_words; + uint16_t* buckets; + DictWord** words_by_hash; + size_t* words_by_hash_size; + size_t* words_by_hash_capacity; + BrotliTrie dedup; + uint8_t word[kTransformedBufferSize]; + size_t word_size; + size_t total = 0; + uint8_t l; + uint16_t idx; + + BrotliTrieInit(&dedup); + + words_by_hash = (DictWord**)BrotliAllocate(m, + sizeof(*words_by_hash) * NUM_HASH_BUCKETS); + words_by_hash_size = (size_t*)BrotliAllocate(m, + sizeof(*words_by_hash_size) * NUM_HASH_BUCKETS); + words_by_hash_capacity = (size_t*)BrotliAllocate(m, + sizeof(*words_by_hash_capacity) * NUM_HASH_BUCKETS); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + memset(words_by_hash, 0, sizeof(*words_by_hash) * NUM_HASH_BUCKETS); + memset(words_by_hash_size, 0, sizeof(*words_by_hash_size) * NUM_HASH_BUCKETS); + memset(words_by_hash_capacity, 0, + sizeof(*words_by_hash_capacity) * NUM_HASH_BUCKETS); + + if (transforms->num_transforms > 0) { + for (l = SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH; + l <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH; ++l) { + uint16_t n = dict->words->size_bits_by_length[l] ? + (uint16_t)(1 << dict->words->size_bits_by_length[l]) : 0u; + for (idx = 0; idx < n; ++idx) { + uint32_t key; + /* First transform (usually identity) */ + TransformedDictionaryWord(idx, l, 0, transforms, dict, word, + &word_size); + /* Cannot hash words smaller than 4 bytes */ + if (word_size < 4) { + /* Break instead of continue, all next words of this length will have + same length after transform */ + break; + } + if (!BrotliTrieAdd(m, 0, idx, word, word_size, &dedup)) { + return BROTLI_FALSE; + } + key = Hash(word, NUM_HASH_BITS); + BROTLI_ENSURE_CAPACITY_APPEND(m, DictWord, words_by_hash[key], + words_by_hash_capacity[key], words_by_hash_size[key], + MakeDictWord(l, 0, idx)); + ++total; + } + } + } + + /* These LUT transforms only supported if no custom transforms. This is + ok, we will use the heavy trie instead. */ + if (transforms == BrotliGetTransforms()) { + for (l = SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH; + l <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH; ++l) { + uint16_t n = dict->words->size_bits_by_length[l] ? + (uint16_t)(1 << dict->words->size_bits_by_length[l]) : 0u; + for (idx = 0; idx < n; ++idx) { + int k; + BROTLI_BOOL is_ascii = BROTLI_TRUE; + size_t offset = dict->words->offsets_by_length[l] + (size_t)l * idx; + const uint8_t* data = &dict->words->data[offset]; + for (k = 0; k < l; ++k) { + if (data[k] >= 128) is_ascii = BROTLI_FALSE; + } + if (data[0] < 128) { + int transform = 9; /* {empty, uppercase first, empty} */ + uint32_t ix = idx + (uint32_t)transform * n; + const BrotliTrieNode* it; + TransformedDictionaryWord(idx, l, transform, transforms, + dict, word, &word_size); + it = BrotliTrieFind(&dedup, word, word_size); + if (!it || it->idx_ > ix) { + uint32_t key = Hash(word, NUM_HASH_BITS); + if (!BrotliTrieAdd(m, 0, ix, word, word_size, &dedup)) { + return BROTLI_FALSE; + } + BROTLI_ENSURE_CAPACITY_APPEND(m, DictWord, words_by_hash[key], + words_by_hash_capacity[key], words_by_hash_size[key], + MakeDictWord(l, BROTLI_TRANSFORM_UPPERCASE_FIRST, idx)); + ++total; + } + } + if (is_ascii) { + int transform = 44; /* {empty, uppercase all, empty} */ + uint32_t ix = idx + (uint32_t)transform * n; + const BrotliTrieNode* it; + TransformedDictionaryWord(idx, l, transform, transforms, + dict, word, &word_size); + it = BrotliTrieFind(&dedup, word, word_size); + if (!it || it->idx_ > ix) { + uint32_t key = Hash(word, NUM_HASH_BITS); + if (!BrotliTrieAdd(m, 0, ix, word, word_size, &dedup)) { + return BROTLI_FALSE; + } + BROTLI_ENSURE_CAPACITY_APPEND(m, DictWord, words_by_hash[key], + words_by_hash_capacity[key], words_by_hash_size[key], + MakeDictWord(l, BROTLI_TRANSFORM_UPPERCASE_ALL, idx)); + ++total; + } + } + } + } + } + + dict_words = (DictWord*)BrotliAllocate(m, + sizeof(*dict->dict_words) * (total + 1)); + buckets = (uint16_t*)BrotliAllocate(m, + sizeof(*dict->buckets) * NUM_HASH_BUCKETS); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + dict->dict_words_alloc_size_ = total + 1; + dict->dict_words = dict->dict_words_data_ = dict_words; + dict->buckets_alloc_size_ = NUM_HASH_BUCKETS; + dict->buckets = dict->buckets_data_ = buckets; + + /* Unused; makes offsets start from 1. */ + dict_words[0] = MakeDictWord(0, 0, 0); + total = 1; + for (i = 0; i < NUM_HASH_BUCKETS; ++i) { + size_t num_words = words_by_hash_size[i]; + if (num_words > 0) { + buckets[i] = (uint16_t)(total); + memcpy(&dict_words[total], &words_by_hash[i][0], + sizeof(dict_words[0]) * num_words); + total += num_words; + dict_words[total - 1].len |= 0x80; + } else { + buckets[i] = 0; + } + } + + for (i = 0; i < NUM_HASH_BUCKETS; ++i) { + BrotliFree(m, words_by_hash[i]); + } + BrotliFree(m, words_by_hash); + BrotliFree(m, words_by_hash_size); + BrotliFree(m, words_by_hash_capacity); + BrotliTrieFree(m, &dedup); + + return BROTLI_TRUE; +} + +static void BuildDictionaryHashTable(uint16_t* hash_table_words, + uint8_t* hash_table_lengths, const BrotliDictionary* dict) { + int j, len; + /* The order of the loops is such that in case of collision, words with + shorter length are preferred, and in case of same length, words with + smaller index. There is only a single word per bucket. */ + /* TODO: consider adding optional user-supplied frequency_map to use + for preferred words instead, this can make the encoder better for + quality 9 and below without affecting the decoder */ + memset(hash_table_words, 0, sizeof(kStaticDictionaryHashWords)); + memset(hash_table_lengths, 0, sizeof(kStaticDictionaryHashLengths)); + for (len = SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH; + len >= SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH; --len) { + const size_t num_words = dict->size_bits_by_length[len] ? + (1u << dict->size_bits_by_length[len]) : 0; + for (j = (int)num_words - 1; j >= 0; --j) { + size_t offset = dict->offsets_by_length[len] + + (size_t)len * (size_t)j; + const uint8_t* word = &dict->data[offset]; + const uint32_t key = Hash(word, 14); + int idx = (int)(key << 1) + (len < 8 ? 1 : 0); + BROTLI_DCHECK(idx < (int)NUM_HASH_BUCKETS); + hash_table_words[idx] = (uint16_t)j; + hash_table_lengths[idx] = (uint8_t)len; + } + } +} + +static BROTLI_BOOL GenerateWordsHeavy(MemoryManager* m, + const BrotliTransforms* transforms, + BrotliEncoderDictionary* dict) { + int i, j, l; + for (j = (int)transforms->num_transforms - 1; j >= 0 ; --j) { + for (l = 0; l < 32; l++) { + int num = (int)((1u << dict->words->size_bits_by_length[l]) & ~1u); + for (i = 0; i < num; i++) { + uint8_t transformed[kTransformedBufferSize]; + size_t size; + TransformedDictionaryWord( + (uint32_t)i, l, j, transforms, dict, transformed, &size); + if (size < 4) continue; + if (!BrotliTrieAdd(m, (uint8_t)l, (uint32_t)(i + num * j), + transformed, size, &dict->trie)) { + return BROTLI_FALSE; + } + } + } + } + return BROTLI_TRUE; +} + +/* Computes cutoffTransformsCount (in count) and cutoffTransforms (in data) for + the custom transforms, where possible within the limits of the + cutoffTransforms encoding. The fast encoder uses this to do fast lookup for + transforms that remove the N last characters (OmitLast). */ +static void ComputeCutoffTransforms( + const BrotliTransforms* transforms, + uint32_t* count, uint64_t* data) { + int i; + /* The encoding in a 64-bit integer of transform N in the data is: (N << 2) + + ((cutoffTransforms >> (N * 6)) & 0x3F), so for example the identity + transform code must be 0-63, for N=1 the transform code must be 4-67, ..., + for N=9 it must be 36-99. + TODO: consider a simple flexible uint8_t[10] instead of the uint64_t + for the cutoff transforms, so that shared dictionaries can have the + OmitLast transforms anywhere without loss. */ + *count = 0; + *data = 0; + for (i = 0; i < BROTLI_TRANSFORMS_MAX_CUT_OFF + 1; i++) { + int idx = transforms->cutOffTransforms[i]; + if (idx == -1) break; /* Not found */ + if (idx < (i << 2)) break; /* Too small for the encoding */ + if (idx >= (i << 2) + 64) break; /* Too large for the encoding */ + (*count)++; + *data |= (uint64_t)(((uint64_t)idx - + ((uint64_t)i << 2u)) << ((uint64_t)i * 6u)); + } +} + +static BROTLI_BOOL ComputeDictionary(MemoryManager* m, int quality, + const BrotliTransforms* transforms, + BrotliEncoderDictionary* current) { + int default_words = current->words == BrotliGetDictionary(); + int default_transforms = transforms == BrotliGetTransforms(); + + if (default_words && default_transforms) { + /* hashes are already set to Brotli defaults */ + return BROTLI_TRUE; + } + + current->hash_table_data_words_ = (uint16_t*)BrotliAllocate( + m, sizeof(kStaticDictionaryHashWords)); + current->hash_table_data_lengths_ = (uint8_t*)BrotliAllocate( + m, sizeof(kStaticDictionaryHashLengths)); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + current->hash_table_words = current->hash_table_data_words_; + current->hash_table_lengths = current->hash_table_data_lengths_; + + BuildDictionaryHashTable(current->hash_table_data_words_, + current->hash_table_data_lengths_, current->words); + + ComputeCutoffTransforms(transforms, + ¤t->cutoffTransformsCount, ¤t->cutoffTransforms); + + /* Only compute the data for slow encoder if the requested quality is high + enough to need it */ + if (quality >= ZOPFLIFICATION_QUALITY) { + if (!BuildDictionaryLut(m, transforms, current)) return BROTLI_FALSE; + + /* For the built-in Brotli transforms, there is a hard-coded function to + handle all transforms, but for custom transforms, we use the following + large hammer instead */ + current->has_words_heavy = !default_transforms; + if (current->has_words_heavy) { + if (!GenerateWordsHeavy(m, transforms, current)) return BROTLI_FALSE; + } + } + + return BROTLI_TRUE; +} + +void BrotliInitSharedEncoderDictionary(SharedEncoderDictionary* dict) { + dict->magic = kSharedDictionaryMagic; + + dict->compound.num_chunks = 0; + dict->compound.total_size = 0; + dict->compound.chunk_offsets[0] = 0; + dict->compound.num_prepared_instances_ = 0; + + dict->contextual.context_based = 0; + dict->contextual.num_dictionaries = 1; + dict->contextual.instances_ = 0; + dict->contextual.num_instances_ = 1; /* The instance_ field */ + dict->contextual.dict[0] = &dict->contextual.instance_; + InitEncoderDictionary(&dict->contextual.instance_); + dict->contextual.instance_.parent = &dict->contextual; + + dict->max_quality = BROTLI_MAX_QUALITY; +} + +/* TODO: make sure that tooling will warn user if not all the cutoff + transforms are available (for low-quality encoder). */ +static BROTLI_BOOL InitCustomSharedEncoderDictionary( + MemoryManager* m, const BrotliSharedDictionary* decoded_dict, + int quality, SharedEncoderDictionary* dict) { + ContextualEncoderDictionary* contextual; + CompoundDictionary* compound; + BrotliEncoderDictionary* instances; + int i; + BrotliInitSharedEncoderDictionary(dict); + + contextual = &dict->contextual; + compound = &dict->compound; + + for (i = 0; i < (int)decoded_dict->num_prefix; i++) { + PreparedDictionary* prepared = CreatePreparedDictionary(m, + decoded_dict->prefix[i], decoded_dict->prefix_size[i]); + AttachPreparedDictionary(compound, prepared); + /* remember for cleanup */ + compound->prepared_instances_[ + compound->num_prepared_instances_++] = prepared; + } + + dict->max_quality = quality; + contextual->context_based = decoded_dict->context_based; + if (decoded_dict->context_based) { + memcpy(contextual->context_map, decoded_dict->context_map, + SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS); + } + + contextual->num_dictionaries = decoded_dict->num_dictionaries; + contextual->num_instances_ = decoded_dict->num_dictionaries; + if (contextual->num_instances_ == 1) { + instances = &contextual->instance_; + } else { + contextual->instances_ = (BrotliEncoderDictionary*) + BrotliAllocate(m, sizeof(*contextual->instances_) * + contextual->num_instances_); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + instances = contextual->instances_; + } + for (i = 0; i < (int)contextual->num_instances_; i++) { + BrotliEncoderDictionary* current = &instances[i]; + InitEncoderDictionary(current); + current->parent = &dict->contextual; + if (decoded_dict->words[i] == BrotliGetDictionary()) { + current->words = BrotliGetDictionary(); + } else { + current->words_instance_ = (BrotliDictionary*)BrotliAllocate( + m, sizeof(BrotliDictionary)); + if (BROTLI_IS_OOM(m)) return BROTLI_FALSE; + *current->words_instance_ = *decoded_dict->words[i]; + current->words = current->words_instance_; + } + current->num_transforms = + (uint32_t)decoded_dict->transforms[i]->num_transforms; + if (!ComputeDictionary( + m, quality, decoded_dict->transforms[i], current)) { + return BROTLI_FALSE; + } + + contextual->dict[i] = current; + } + + return BROTLI_TRUE; /* success */ +} + +BROTLI_BOOL BrotliInitCustomSharedEncoderDictionary( + MemoryManager* m, const uint8_t* encoded_dict, size_t size, + int quality, SharedEncoderDictionary* dict) { + BROTLI_BOOL success = BROTLI_FALSE; + BrotliSharedDictionary* decoded_dict = BrotliSharedDictionaryCreateInstance( + m->alloc_func, m->free_func, m->opaque); + if (!decoded_dict) { /* OOM */ + return BROTLI_FALSE; + } + success = BrotliSharedDictionaryAttach( + decoded_dict, BROTLI_SHARED_DICTIONARY_SERIALIZED, size, encoded_dict); + if (success) { + success = InitCustomSharedEncoderDictionary(m, + decoded_dict, quality, dict); + } + BrotliSharedDictionaryDestroyInstance(decoded_dict); + return success; +} + +void BrotliCleanupSharedEncoderDictionary(MemoryManager* m, + SharedEncoderDictionary* dict) { + size_t i; + for (i = 0; i < dict->compound.num_prepared_instances_; i++) { + DestroyPreparedDictionary(m, + (PreparedDictionary*)dict->compound.prepared_instances_[i]); + } + if (dict->contextual.num_instances_ == 1) { + BrotliDestroyEncoderDictionary(m, &dict->contextual.instance_); + } else if (dict->contextual.num_instances_ > 1) { + for (i = 0; i < dict->contextual.num_instances_; i++) { + BrotliDestroyEncoderDictionary(m, &dict->contextual.instances_[i]); + } + BrotliFree(m, dict->contextual.instances_); + } +} + +ManagedDictionary* BrotliCreateManagedDictionary( + brotli_alloc_func alloc_func, brotli_free_func free_func, void* opaque) { + ManagedDictionary* result = (ManagedDictionary*)BrotliBootstrapAlloc( + sizeof(ManagedDictionary), alloc_func, free_func, opaque); + if (result == NULL) return NULL; + + result->magic = kManagedDictionaryMagic; + BrotliInitMemoryManager( + &result->memory_manager_, alloc_func, free_func, opaque); + result->dictionary = NULL; + + return result; +} + +void BrotliDestroyManagedDictionary(ManagedDictionary* dictionary) { + if (!dictionary) return; + BrotliBootstrapFree(dictionary, &dictionary->memory_manager_); } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/c/enc/encoder_dict.h b/c/enc/encoder_dict.h index a1c329f..dfeb796 100644 --- a/c/enc/encoder_dict.h +++ b/c/enc/encoder_dict.h @@ -9,13 +9,50 @@ #include "../common/dictionary.h" #include "../common/platform.h" +#include #include +#include "./compound_dictionary.h" +#include "./memory.h" #include "./static_dict_lut.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif +/* +Dictionary hierarchy for Encoder: +-SharedEncoderDictionary +--CompoundDictionary +---PreparedDictionary [up to 15x] + = prefix dictionary with precomputed hashes +--ContextualEncoderDictionary +---BrotliEncoderDictionary [up to 64x] + = for each context, precomputed static dictionary with words + transforms + +Dictionary hiearchy from common: similar, but without precomputed hashes +-BrotliSharedDictionary +--BrotliDictionary [up to 64x] +--BrotliTransforms [up to 64x] +--const uint8_t* prefix [up to 15x]: compound dictionaries +*/ + +typedef struct BrotliTrieNode { + uint8_t single; /* if 1, sub is a single node for c instead of 256 */ + uint8_t c; + uint8_t len_; /* untransformed length */ + uint32_t idx_; /* word index + num words * transform index */ + uint32_t sub; /* index of sub node(s) in the pool */ +} BrotliTrieNode; + +typedef struct BrotliTrie { + BrotliTrieNode* pool; + size_t pool_capacity; + size_t pool_size; + BrotliTrieNode root; +} BrotliTrie; + +BROTLI_INTERNAL const BrotliTrieNode* BrotliTrieSub(const BrotliTrie* trie, + const BrotliTrieNode* node, uint8_t c); /* Dictionary data (words and transforms) for 1 possible context */ typedef struct BrotliEncoderDictionary { const BrotliDictionary* words; @@ -32,9 +69,83 @@ typedef struct BrotliEncoderDictionary { /* from static_dict_lut.h, for slow encoder */ const uint16_t* buckets; const DictWord* dict_words; + /* Heavy version, for use by slow encoder when there are custom transforms. + Contains every possible transformed dictionary word in a trie. It encodes + about as fast as the non-heavy encoder but consumes a lot of memory and + takes time to build. */ + BrotliTrie trie; + BROTLI_BOOL has_words_heavy; + + /* Reference to other dictionaries. */ + const struct ContextualEncoderDictionary* parent; + + /* Allocated memory, used only when not using the Brotli defaults */ + uint16_t* hash_table_data_words_; + uint8_t* hash_table_data_lengths_; + size_t buckets_alloc_size_; + uint16_t* buckets_data_; + size_t dict_words_alloc_size_; + DictWord* dict_words_data_; + BrotliDictionary* words_instance_; } BrotliEncoderDictionary; -BROTLI_INTERNAL void BrotliInitEncoderDictionary(BrotliEncoderDictionary* dict); +/* Dictionary data for all 64 contexts */ +typedef struct ContextualEncoderDictionary { + BROTLI_BOOL context_based; + uint8_t num_dictionaries; + uint8_t context_map[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS]; + const BrotliEncoderDictionary* dict[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS]; + + /* If num_instances_ is 1, instance_ is used, else dynamic allocation with + instances_ is used. */ + size_t num_instances_; + BrotliEncoderDictionary instance_; + BrotliEncoderDictionary* instances_; +} ContextualEncoderDictionary; + +static const uint32_t kSharedDictionaryMagic = 0xDEBCEDE1; +static const uint32_t kManagedDictionaryMagic = 0xDEBCEDE2; + +typedef struct SharedEncoderDictionary { + /* Magic value to distinguish this struct from PreparedDictionary for + certain external usages. */ + uint32_t magic; + + /* LZ77 prefix, compound dictionary */ + CompoundDictionary compound; + + /* Custom static dictionary (optionally context-based) */ + ContextualEncoderDictionary contextual; + + /* The maximum quality the dictionary was computed for */ + int max_quality; +} SharedEncoderDictionary; + +typedef struct ManagedDictionary { + uint32_t magic; + MemoryManager memory_manager_; + uint32_t* dictionary; +} ManagedDictionary; + +/* Initializes to the brotli built-in dictionary */ +BROTLI_INTERNAL void BrotliInitSharedEncoderDictionary( + SharedEncoderDictionary* dict); + +/* Initializes to shared dictionary that will be parsed from + encoded_dict. Requires that you keep the encoded_dict buffer + around, parts of data will point to it. */ +BROTLI_INTERNAL BROTLI_BOOL BrotliInitCustomSharedEncoderDictionary( + MemoryManager* m, const uint8_t* encoded_dict, size_t size, + int quality, SharedEncoderDictionary* dict); + +BROTLI_INTERNAL void BrotliCleanupSharedEncoderDictionary( + MemoryManager* m, SharedEncoderDictionary* dict); + +BROTLI_INTERNAL ManagedDictionary* BrotliCreateManagedDictionary( + brotli_alloc_func alloc_func, brotli_free_func free_func, void* opaque); + +BROTLI_INTERNAL void BrotliDestroyManagedDictionary( + ManagedDictionary* dictionary); #if defined(__cplusplus) || defined(c_plusplus) } /* extern "C" */ diff --git a/c/enc/hash.h b/c/enc/hash.h index 114b621..7a54be2 100644 --- a/c/enc/hash.h +++ b/c/enc/hash.h @@ -504,6 +504,206 @@ static BROTLI_INLINE void InitOrStitchToPreviousBlock( } } +/* NB: when seamless dictionary-ring-buffer copies are implemented, don't forget + to add proper guards for non-zero-BROTLI_PARAM_STREAM_OFFSET. */ +static BROTLI_INLINE void FindCompoundDictionaryMatch( + const PreparedDictionary* self, const uint8_t* BROTLI_RESTRICT data, + const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache, + const size_t cur_ix, const size_t max_length, const size_t distance_offset, + const size_t max_distance, HasherSearchResult* BROTLI_RESTRICT out) { + const uint32_t source_offset = self->source_offset; + const uint32_t source_size = self->source_size; + const size_t boundary = distance_offset - source_size; + const uint32_t hash_bits = self->hash_bits; + const uint32_t bucket_bits = self->bucket_bits; + const uint32_t slot_bits = self->slot_bits; + + const uint32_t hash_shift = 64u - bucket_bits; + const uint32_t slot_mask = (~((uint32_t)0U)) >> (32 - slot_bits); + const uint64_t hash_mask = (~((uint64_t)0U)) >> (64 - hash_bits); + + const uint32_t* slot_offsets = (uint32_t*)(&self[1]); + const uint16_t* heads = (uint16_t*)(&slot_offsets[1u << slot_bits]); + const uint32_t* items = (uint32_t*)(&heads[1u << bucket_bits]); + const uint8_t* source = (uint8_t*)(&items[source_offset]); + + const size_t cur_ix_masked = cur_ix & ring_buffer_mask; + score_t best_score = out->score; + size_t best_len = out->len; + size_t i; + const uint64_t h = + (BROTLI_UNALIGNED_LOAD64LE(&data[cur_ix_masked]) & hash_mask) * + kPreparedDictionaryHashMul64Long; + const uint32_t key = (uint32_t)(h >> hash_shift); + const uint32_t slot = key & slot_mask; + const uint32_t head = heads[key]; + const uint32_t* BROTLI_RESTRICT chain = &items[slot_offsets[slot] + head]; + uint32_t item = (head == 0xFFFF) ? 1 : 0; + for (i = 0; i < 4; ++i) { + const size_t distance = (size_t)distance_cache[i]; + size_t offset; + size_t limit; + size_t len; + if (distance <= boundary || distance > distance_offset) continue; + offset = distance_offset - distance; + limit = source_size - offset; + limit = limit > max_length ? max_length : limit; + len = FindMatchLengthWithLimit(&source[offset], &data[cur_ix_masked], + limit); + if (len >= 2) { + score_t score = BackwardReferenceScoreUsingLastDistance(len); + if (best_score < score) { + if (i != 0) score -= BackwardReferencePenaltyUsingLastDistance(i); + if (best_score < score) { + best_score = score; + if (len > best_len) best_len = len; + out->len = len; + out->len_code_delta = 0; + out->distance = distance; + out->score = best_score; + } + } + } + } + while (item == 0) { + size_t offset; + size_t distance; + size_t limit; + item = *chain; + chain++; + offset = item & 0x7FFFFFFF; + item &= 0x80000000; + distance = distance_offset - offset; + limit = source_size - offset; + limit = (limit > max_length) ? max_length : limit; + if (distance > max_distance) continue; + if (cur_ix_masked + best_len > ring_buffer_mask || + best_len >= limit || + data[cur_ix_masked + best_len] != source[offset + best_len]) { + continue; + } + { + const size_t len = FindMatchLengthWithLimit(&source[offset], + &data[cur_ix_masked], + limit); + if (len >= 4) { + score_t score = BackwardReferenceScore(len, distance); + if (best_score < score) { + best_score = score; + best_len = len; + out->len = best_len; + out->len_code_delta = 0; + out->distance = distance; + out->score = best_score; + } + } + } + } +} + +/* NB: when seamless dictionary-ring-buffer copies are implemented, don't forget + to add proper guards for non-zero-BROTLI_PARAM_STREAM_OFFSET. */ +static BROTLI_INLINE size_t FindAllCompoundDictionaryMatches( + const PreparedDictionary* self, const uint8_t* BROTLI_RESTRICT data, + const size_t ring_buffer_mask, const size_t cur_ix, const size_t min_length, + const size_t max_length, const size_t distance_offset, + const size_t max_distance, BackwardMatch* matches, size_t match_limit) { + const uint32_t source_offset = self->source_offset; + const uint32_t source_size = self->source_size; + const uint32_t hash_bits = self->hash_bits; + const uint32_t bucket_bits = self->bucket_bits; + const uint32_t slot_bits = self->slot_bits; + + const uint32_t hash_shift = 64u - bucket_bits; + const uint32_t slot_mask = (~((uint32_t)0U)) >> (32 - slot_bits); + const uint64_t hash_mask = (~((uint64_t)0U)) >> (64 - hash_bits); + + const uint32_t* slot_offsets = (uint32_t*)(&self[1]); + const uint16_t* heads = (uint16_t*)(&slot_offsets[1u << slot_bits]); + const uint32_t* items = (uint32_t*)(&heads[1u << bucket_bits]); + const uint8_t* source = (uint8_t*)(&items[source_offset]); + + const size_t cur_ix_masked = cur_ix & ring_buffer_mask; + size_t best_len = min_length; + const uint64_t h = + (BROTLI_UNALIGNED_LOAD64LE(&data[cur_ix_masked]) & hash_mask) * + kPreparedDictionaryHashMul64Long; + const uint32_t key = (uint32_t)(h >> hash_shift); + const uint32_t slot = key & slot_mask; + const uint32_t head = heads[key]; + const uint32_t* BROTLI_RESTRICT chain = &items[slot_offsets[slot] + head]; + uint32_t item = (head == 0xFFFF) ? 1 : 0; + size_t found = 0; + while (item == 0) { + size_t offset; + size_t distance; + size_t limit; + size_t len; + item = *chain; + chain++; + offset = item & 0x7FFFFFFF; + item &= 0x80000000; + distance = distance_offset - offset; + limit = source_size - offset; + limit = (limit > max_length) ? max_length : limit; + if (distance > max_distance) continue; + if (cur_ix_masked + best_len > ring_buffer_mask || + best_len >= limit || + data[cur_ix_masked + best_len] != source[offset + best_len]) { + continue; + } + len = FindMatchLengthWithLimit( + &source[offset], &data[cur_ix_masked], limit); + if (len > best_len) { + best_len = len; + InitBackwardMatch(matches++, distance, len); + found++; + if (found == match_limit) break; + } + } + return found; +} + +static BROTLI_INLINE void LookupCompoundDictionaryMatch( + const CompoundDictionary* addon, const uint8_t* BROTLI_RESTRICT data, + const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache, + const size_t cur_ix, const size_t max_length, + const size_t max_ring_buffer_distance, const size_t max_distance, + HasherSearchResult* sr) { + size_t base_offset = max_ring_buffer_distance + 1 + addon->total_size - 1; + size_t d; + for (d = 0; d < addon->num_chunks; ++d) { + /* Only one prepared dictionary type is currently supported. */ + FindCompoundDictionaryMatch( + (const PreparedDictionary*)addon->chunks[d], data, ring_buffer_mask, + distance_cache, cur_ix, max_length, + base_offset - addon->chunk_offsets[d], max_distance, sr); + } +} + +static BROTLI_INLINE size_t LookupAllCompoundDictionaryMatches( + const CompoundDictionary* addon, const uint8_t* BROTLI_RESTRICT data, + const size_t ring_buffer_mask, const size_t cur_ix, size_t min_length, + const size_t max_length, const size_t max_ring_buffer_distance, + const size_t max_distance, BackwardMatch* matches, + size_t match_limit) { + size_t base_offset = max_ring_buffer_distance + 1 + addon->total_size - 1; + size_t d; + size_t total_found = 0; + for (d = 0; d < addon->num_chunks; ++d) { + /* Only one prepared dictionary type is currently supported. */ + total_found += FindAllCompoundDictionaryMatches( + (const PreparedDictionary*)addon->chunks[d], data, ring_buffer_mask, + cur_ix, min_length, max_length, base_offset - addon->chunk_offsets[d], + max_distance, matches + total_found, match_limit - total_found); + if (total_found == match_limit) break; + if (total_found > 0) { + min_length = BackwardMatchLength(&matches[total_found - 1]); + } + } + return total_found; +} + #if defined(__cplusplus) || defined(c_plusplus) } /* extern "C" */ #endif diff --git a/c/enc/params.h b/c/enc/params.h index 54a7f00..c4c0642 100644 --- a/c/enc/params.h +++ b/c/enc/params.h @@ -40,7 +40,8 @@ typedef struct BrotliEncoderParams { BROTLI_BOOL large_window; BrotliHasherParams hasher; BrotliDistanceParams dist; - BrotliEncoderDictionary dictionary; + /* TODO(eustas): rename to BrotliShared... */ + SharedEncoderDictionary dictionary; } BrotliEncoderParams; #endif /* BROTLI_ENC_PARAMS_H_ */ diff --git a/c/enc/static_dict.c b/c/enc/static_dict.c index 7299ab7..c0e0ec4 100644 --- a/c/enc/static_dict.c +++ b/c/enc/static_dict.c @@ -74,10 +74,25 @@ static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary, } } -BROTLI_BOOL BrotliFindAllStaticDictionaryMatches( +/* Finds matches for a single static dictionary */ +static BROTLI_BOOL BrotliFindAllStaticDictionaryMatchesFor( const BrotliEncoderDictionary* dictionary, const uint8_t* data, size_t min_length, size_t max_length, uint32_t* matches) { BROTLI_BOOL has_found_match = BROTLI_FALSE; + if (dictionary->has_words_heavy) { + const BrotliTrieNode* node = &dictionary->trie.root; + size_t l = 0; + while (node && l < max_length) { + uint8_t c; + if (l >= min_length && node->len_) { + AddMatch(node->idx_, l, node->len_, matches); + has_found_match = BROTLI_TRUE; + } + c = data[l++]; + node = BrotliTrieSub(&dictionary->trie, node, c); + } + return has_found_match; + } { size_t offset = dictionary->buckets[Hash(data)]; BROTLI_BOOL end = !offset; @@ -481,6 +496,45 @@ BROTLI_BOOL BrotliFindAllStaticDictionaryMatches( return has_found_match; } +/* Finds matches for one or more dictionaries, if multiple are present + in the contextual dictionary */ +BROTLI_BOOL BrotliFindAllStaticDictionaryMatches( + const BrotliEncoderDictionary* dictionary, const uint8_t* data, + size_t min_length, size_t max_length, uint32_t* matches) { + BROTLI_BOOL has_found_match = + BrotliFindAllStaticDictionaryMatchesFor( + dictionary, data, min_length, max_length, matches); + + if (!!dictionary->parent && dictionary->parent->num_dictionaries > 1) { + uint32_t matches2[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1]; + int l; + const BrotliEncoderDictionary* dictionary2 = dictionary->parent->dict[0]; + if (dictionary2 == dictionary) { + dictionary2 = dictionary->parent->dict[1]; + } + + for (l = 0; l < BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1; l++) { + matches2[l] = kInvalidMatch; + } + + has_found_match |= BrotliFindAllStaticDictionaryMatchesFor( + dictionary2, data, min_length, max_length, matches2); + + for (l = 0; l < BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1; l++) { + if (matches2[l] != kInvalidMatch) { + uint32_t dist = (uint32_t)(matches2[l] >> 5); + uint32_t len_code = matches2[l] & 31; + uint32_t skipdist = (uint32_t)((uint32_t)(1 << dictionary->words-> + size_bits_by_length[len_code]) & ~1u) * + (uint32_t)dictionary->num_transforms; + /* TODO: check for dist overflow */ + dist += skipdist; + AddMatch(dist, (size_t)l, len_code, matches); + } + } + } + return has_found_match; +} #if defined(__cplusplus) || defined(c_plusplus) } /* extern "C" */ #endif diff --git a/c/include/brotli/decode.h b/c/include/brotli/decode.h index 0f5c8f9..98b3c7b 100644 --- a/c/include/brotli/decode.h +++ b/c/include/brotli/decode.h @@ -13,6 +13,7 @@ #define BROTLI_DEC_DECODE_H_ #include +#include #include #if defined(__cplusplus) || defined(c_plusplus) @@ -85,8 +86,9 @@ typedef enum { BROTLI_ERROR_CODE(_ERROR_FORMAT_, PADDING_2, -15) SEPARATOR \ BROTLI_ERROR_CODE(_ERROR_FORMAT_, DISTANCE, -16) SEPARATOR \ \ - /* -17..-18 codes are reserved */ \ + /* -17 code is reserved */ \ \ + BROTLI_ERROR_CODE(_ERROR_, COMPOUND_DICTIONARY, -18) SEPARATOR \ BROTLI_ERROR_CODE(_ERROR_, DICTIONARY_NOT_SET, -19) SEPARATOR \ BROTLI_ERROR_CODE(_ERROR_, INVALID_ARGUMENTS, -20) SEPARATOR \ \ @@ -154,6 +156,28 @@ typedef enum BrotliDecoderParameter { BROTLI_DEC_API BROTLI_BOOL BrotliDecoderSetParameter( BrotliDecoderState* state, BrotliDecoderParameter param, uint32_t value); +/** + * Adds LZ77 prefix dictionary, adds or replaces built-in static dictionary and + * transforms. + * + * Attached dictionary ownership is not transferred. + * Data provided to this method should be kept accessible until + * decoding is finished and decoder instance is destroyed. + * + * @note Dictionaries could NOT be attached after actual decoding is started. + * + * @param state decoder instance + * @param type dictionary data format + * @param data_size length of memory region pointed by @p data + * @param data dictionary data in format corresponding to @p type + * @returns ::BROTLI_FALSE if dictionary is corrupted, + * or dictionary count limit is reached + * @returns ::BROTLI_TRUE if dictionary is accepted / attached + */ +BROTLI_DEC_API BROTLI_BOOL BrotliDecoderAttachDictionary( + BrotliDecoderState* state, BrotliSharedDictionaryType type, + size_t data_size, const uint8_t data[BROTLI_ARRAY_PARAM(data_size)]); + /** * Creates an instance of ::BrotliDecoderState and initializes it. * diff --git a/c/include/brotli/encode.h b/c/include/brotli/encode.h index b2774cb..72668e1 100644 --- a/c/include/brotli/encode.h +++ b/c/include/brotli/encode.h @@ -13,6 +13,7 @@ #define BROTLI_ENC_ENCODE_H_ #include +#include #include #if defined(__cplusplus) || defined(c_plusplus) @@ -269,6 +270,51 @@ BROTLI_ENC_API BrotliEncoderState* BrotliEncoderCreateInstance( */ BROTLI_ENC_API void BrotliEncoderDestroyInstance(BrotliEncoderState* state); +/* Opaque type for pointer to different possible internal structures containing + dictionary prepared for the encoder */ +typedef struct BrotliEncoderPreparedDictionaryStruct + BrotliEncoderPreparedDictionary; + +/** + * Prepares a shared dictionary from the given file format for the encoder. + * + * @p alloc_func and @p free_func @b MUST be both zero or both non-zero. In the + * case they are both zero, default memory allocators are used. @p opaque is + * passed to @p alloc_func and @p free_func when they are called. @p free_func + * has to return without doing anything when asked to free a NULL pointer. + * + * @param type type of dictionary stored in data + * @param data_size size of @p data buffer + * @param data pointer to the dictionary data + * @param quality the maximum Brotli quality to prepare the dictionary for, + * use BROTLI_MAX_QUALITY by default + * @param alloc_func custom memory allocation function + * @param free_func custom memory free function + * @param opaque custom memory manager handle + */ +BROTLI_ENC_API BrotliEncoderPreparedDictionary* +BrotliEncoderPrepareDictionary(BrotliSharedDictionaryType type, + size_t data_size, const uint8_t data[BROTLI_ARRAY_PARAM(data_size)], + int quality, + brotli_alloc_func alloc_func, brotli_free_func free_func, void* opaque); + +BROTLI_ENC_API void BrotliEncoderDestroyPreparedDictionary( + BrotliEncoderPreparedDictionary* dictionary); + +/** + * Attaches a prepared dictionary of any type to the encoder. Can be used + * multiple times to attach multiple dictionaries. The dictionary type was + * determined by BrotliEncoderPrepareDictionary. Multiple raw prefix + * dictionaries and/or max 1 serialized dictionary with custom words can be + * attached. + * + * @returns ::BROTLI_FALSE in case of error + * @returns ::BROTLI_TRUE otherwise + */ +BROTLI_ENC_API BROTLI_BOOL BrotliEncoderAttachPreparedDictionary( + BrotliEncoderState* state, + const BrotliEncoderPreparedDictionary* dictionary); + /** * Calculates the output size bound for the given @p input_size. * diff --git a/c/include/brotli/shared_dictionary.h b/c/include/brotli/shared_dictionary.h new file mode 100644 index 0000000..ceb6cf1 --- /dev/null +++ b/c/include/brotli/shared_dictionary.h @@ -0,0 +1,97 @@ +/* Copyright 2017 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/* (Opaque) Shared Dictionary definition and utilities. */ + +#ifndef BROTLI_COMMON_SHARED_DICTIONARY_H_ +#define BROTLI_COMMON_SHARED_DICTIONARY_H_ + +#include +#include + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#define SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH 4 +#define SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH 31 +#define SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS 64 +#define SHARED_BROTLI_MAX_COMPOUND_DICTS 15 + +/** + * Opaque structure that holds shared dictionary data. + * + * Allocated and initialized with ::BrotliSharedDictionaryCreateInstance. + * Cleaned up and deallocated with ::BrotliSharedDictionaryDestroyInstance. + */ +typedef struct BrotliSharedDictionaryStruct BrotliSharedDictionary; + +/** + * Input data type for ::BrotliSharedDictionaryAttach. + */ +typedef enum BrotliSharedDictionaryType { + /** Raw LZ77 prefix dictionary. */ + BROTLI_SHARED_DICTIONARY_RAW = 0, + /** Serialized shared dictionary. */ + BROTLI_SHARED_DICTIONARY_SERIALIZED = 1 +} BrotliSharedDictionaryType; + +/** + * Creates an instance of ::BrotliSharedDictionary. + * + * Fresh instance has default word dictionary and transforms + * and no LZ77 prefix dictionary. + * + * @p alloc_func and @p free_func @b MUST be both zero or both non-zero. In the + * case they are both zero, default memory allocators are used. @p opaque is + * passed to @p alloc_func and @p free_func when they are called. @p free_func + * has to return without doing anything when asked to free a NULL pointer. + * + * @param alloc_func custom memory allocation function + * @param free_func custom memory free function + * @param opaque custom memory manager handle + * @returns @c 0 if instance can not be allocated or initialized + * @returns pointer to initialized ::BrotliSharedDictionary otherwise + */ +BROTLI_COMMON_API BrotliSharedDictionary* BrotliSharedDictionaryCreateInstance( + brotli_alloc_func alloc_func, brotli_free_func free_func, void* opaque); + +/** + * Deinitializes and frees ::BrotliSharedDictionary instance. + * + * @param dict shared dictionary instance to be cleaned up and deallocated + */ +BROTLI_COMMON_API void BrotliSharedDictionaryDestroyInstance( + BrotliSharedDictionary* dict); + +/** + * Attaches dictionary to a given instance of ::BrotliSharedDictionary. + * + * Dictionary to be attached is represented in a serialized format as a region + * of memory. + * + * Provided data it partially referenced by a resulting (compound) dictionary, + * and should be kept untouched, while at least one compound dictionary uses it. + * This way memory overhead is kept minimal by the cost of additional resource + * management. + * + * @param dict dictionary to extend + * @param type type of dictionary to attach + * @param data_size size of @p data + * @param data serialized dictionary of type @p type, with at least @p data_size + * addressable bytes + * @returns ::BROTLI_TRUE if provided dictionary is successfully attached + * @returns ::BROTLI_FALSE otherwise + */ +BROTLI_COMMON_API BROTLI_BOOL BrotliSharedDictionaryAttach( + BrotliSharedDictionary* dict, BrotliSharedDictionaryType type, + size_t data_size, const uint8_t data[BROTLI_ARRAY_PARAM(data_size)]); + +#if defined(__cplusplus) || defined(c_plusplus) +} /* extern "C" */ +#endif + +#endif /* BROTLI_COMMON_SHARED_DICTIONARY_H_ */ diff --git a/c/tools/brotli.c b/c/tools/brotli.c index 77752b2..049371c 100644 --- a/c/tools/brotli.c +++ b/c/tools/brotli.c @@ -100,6 +100,7 @@ typedef struct { BROTLI_BOOL decompress; BROTLI_BOOL large_window; const char* output_path; + const char* dictionary_path; const char* suffix; int not_input_indices[MAX_OPTIONS]; size_t longest_path_len; @@ -108,6 +109,9 @@ typedef struct { /* Inner state */ int argc; char** argv; + uint8_t* dictionary; + size_t dictionary_size; + BrotliEncoderPreparedDictionary* prepared_dictionary; char* modified_path; /* Storage for path with appended / cut suffix */ int iterator; int ignore; @@ -359,6 +363,12 @@ static Command ParseParams(Context* params) { params->lgwin, BROTLI_MIN_WINDOW_BITS); return COMMAND_INVALID; } + } else if (c == 'D') { + if (params->dictionary_path) { + fprintf(stderr, "dictionary path already set\n"); + return COMMAND_INVALID; + } + params->dictionary_path = argv[i]; } else if (c == 'S') { if (suffix_set) { fprintf(stderr, "suffix already set\n"); @@ -446,7 +456,13 @@ static Command ParseParams(Context* params) { } key_len = (size_t)(value - arg); value++; - if (strncmp("lgwin", arg, key_len) == 0) { + if (strncmp("dictionary", arg, key_len) == 0) { + if (params->dictionary_path) { + fprintf(stderr, "dictionary path already set\n"); + return COMMAND_INVALID; + } + params->dictionary_path = value; + } else if (strncmp("lgwin", arg, key_len) == 0) { if (lgwin_set) { fprintf(stderr, "lgwin parameter already set\n"); return COMMAND_INVALID; @@ -575,6 +591,8 @@ static void PrintHelp(const char* name, BROTLI_BOOL error) { " decodable with regular brotli decoders\n", BROTLI_MIN_WINDOW_BITS, BROTLI_LARGE_MAX_WINDOW_BITS); fprintf(media, +" -D FILE, --dictionary=FILE use FILE as raw (LZ77) dictionary\n"); + fprintf(media, " -S SUF, --suffix=SUF output file suffix (default:'%s')\n", DEFAULT_SUFFIX); fprintf(media, @@ -678,6 +696,69 @@ static void CopyStat(const char* input_path, const char* output_path) { } } +/* Result ownership is passed to caller. + |*dictionary_size| is set to resulting buffer size. */ +static BROTLI_BOOL ReadDictionary(Context* context, Command command) { + static const int kMaxDictionarySize = + BROTLI_MAX_DISTANCE - BROTLI_MAX_BACKWARD_LIMIT(24); + FILE* f; + int64_t file_size_64; + uint8_t* buffer; + size_t bytes_read; + + if (context->dictionary_path == NULL) return BROTLI_TRUE; + f = fopen(context->dictionary_path, "rb"); + if (f == NULL) { + fprintf(stderr, "failed to open dictionary file [%s]: %s\n", + PrintablePath(context->dictionary_path), strerror(errno)); + return BROTLI_FALSE; + } + + file_size_64 = FileSize(context->dictionary_path); + if (file_size_64 == -1) { + fprintf(stderr, "could not get size of dictionary file [%s]", + PrintablePath(context->dictionary_path)); + fclose(f); + return BROTLI_FALSE; + } + + if (file_size_64 > kMaxDictionarySize) { + fprintf(stderr, "dictionary [%s] is larger than maximum allowed: %d\n", + PrintablePath(context->dictionary_path), kMaxDictionarySize); + fclose(f); + return BROTLI_FALSE; + } + context->dictionary_size = (size_t)file_size_64; + + buffer = (uint8_t*)malloc(context->dictionary_size); + if (!buffer) { + fprintf(stderr, "could not read dictionary: out of memory\n"); + fclose(f); + return BROTLI_FALSE; + } + bytes_read = fread(buffer, sizeof(uint8_t), context->dictionary_size, f); + if (bytes_read != context->dictionary_size) { + free(buffer); + fprintf(stderr, "failed to read dictionary [%s]: %s\n", + PrintablePath(context->dictionary_path), strerror(errno)); + fclose(f); + return BROTLI_FALSE; + } + fclose(f); + context->dictionary = buffer; + if (command == COMMAND_COMPRESS) { + context->prepared_dictionary = BrotliEncoderPrepareDictionary( + BROTLI_SHARED_DICTIONARY_RAW, context->dictionary_size, + context->dictionary, BROTLI_MAX_QUALITY, NULL, NULL, NULL); + if (context->prepared_dictionary == NULL) { + fprintf(stderr, "failed to prepare dictionary [%s]\n", + PrintablePath(context->dictionary_path)); + return BROTLI_FALSE; + } + } + return BROTLI_TRUE; +} + static BROTLI_BOOL NextFile(Context* context) { const char* arg; size_t arg_len; @@ -933,6 +1014,10 @@ static BROTLI_BOOL DecompressFiles(Context* context) { fragmentation (new builds decode streams that old builds don't), it is better from used experience perspective. */ BrotliDecoderSetParameter(s, BROTLI_DECODER_PARAM_LARGE_WINDOW, 1u); + if (context->dictionary) { + BrotliDecoderAttachDictionary(s, BROTLI_SHARED_DICTIONARY_RAW, + context->dictionary_size, context->dictionary); + } is_ok = OpenFiles(context); if (is_ok && !context->current_input_path && !context->force_overwrite && isatty(STDIN_FILENO)) { @@ -1020,6 +1105,9 @@ static BROTLI_BOOL CompressFiles(Context* context) { (uint32_t)context->input_file_length : (1u << 30); BrotliEncoderSetParameter(s, BROTLI_PARAM_SIZE_HINT, size_hint); } + if (context->dictionary) { + BrotliEncoderAttachPreparedDictionary(s, context->prepared_dictionary); + } is_ok = OpenFiles(context); if (is_ok && !context->current_output_path && !context->force_overwrite && isatty(STDOUT_FILENO)) { @@ -1051,6 +1139,7 @@ int main(int argc, char** argv) { context.decompress = BROTLI_FALSE; context.large_window = BROTLI_FALSE; context.output_path = NULL; + context.dictionary_path = NULL; context.suffix = DEFAULT_SUFFIX; for (i = 0; i < MAX_OPTIONS; ++i) context.not_input_indices[i] = 0; context.longest_path_len = 1; @@ -1058,6 +1147,9 @@ int main(int argc, char** argv) { context.argc = argc; context.argv = argv; + context.dictionary = NULL; + context.dictionary_size = 0; + context.prepared_dictionary = NULL; context.modified_path = NULL; context.iterator = 0; context.ignore = 0; @@ -1072,6 +1164,7 @@ int main(int argc, char** argv) { if (command == COMMAND_COMPRESS || command == COMMAND_DECOMPRESS || command == COMMAND_TEST_INTEGRITY) { + if (!ReadDictionary(&context, command)) is_ok = BROTLI_FALSE; if (is_ok) { size_t modified_path_len = context.longest_path_len + strlen(context.suffix) + 1; @@ -1116,6 +1209,8 @@ int main(int argc, char** argv) { if (context.iterator_error) is_ok = BROTLI_FALSE; + BrotliEncoderDestroyPreparedDictionary(context.prepared_dictionary); + free(context.dictionary); free(context.modified_path); free(context.buffer); diff --git a/c/tools/brotli.md b/c/tools/brotli.md index c029869..86253b8 100644 --- a/c/tools/brotli.md +++ b/c/tools/brotli.md @@ -84,6 +84,9 @@ OPTIONS `(2**NUM - 16)`; 0 lets compressor decide over the optimal value; bigger windows size improve density; decoder might require up to window size memory to operate +* `-D FILE`, `--dictionary=FILE`: + use FILE as raw (LZ77) dictionary; same dictionary MUST be used both for + compression and decompression * `-S SUF`, `--suffix=SUF`: output file suffix (default: `.br`) * `-V`, `--version`: diff --git a/java/org/brotli/wrapper/dec/BUILD b/java/org/brotli/wrapper/dec/BUILD index 4fd51d0..6190504 100644 --- a/java/org/brotli/wrapper/dec/BUILD +++ b/java/org/brotli/wrapper/dec/BUILD @@ -24,6 +24,7 @@ java_library( ":dec", "//org/brotli/integration:brotli_jni_test_base", "//org/brotli/integration:bundle_helper", + "//org/brotli/wrapper/enc", "@maven//:junit_junit", ], ) @@ -82,3 +83,16 @@ java_test( test_class = "org.brotli.wrapper.dec.DecoderTest", runtime_deps = [":test_lib"], ) + +java_test( + name = "CornerCasesTest", + size = "large", + data = [ + ":brotli_jni", # Bazel JNI workaround + ], + jvm_flags = [ + "-DBROTLI_JNI_LIBRARY=$(location :brotli_jni)", + ], + test_class = "org.brotli.wrapper.dec.CornerCasesTest", + runtime_deps = [":test_lib"], +) diff --git a/java/org/brotli/wrapper/dec/BrotliDecoderChannel.java b/java/org/brotli/wrapper/dec/BrotliDecoderChannel.java index 6dfe8f2..30f6f1d 100644 --- a/java/org/brotli/wrapper/dec/BrotliDecoderChannel.java +++ b/java/org/brotli/wrapper/dec/BrotliDecoderChannel.java @@ -35,6 +35,11 @@ public class BrotliDecoderChannel extends Decoder implements ReadableByteChannel this(source, DEFAULT_BUFFER_SIZE); } + @Override + public void attachDictionary(ByteBuffer dictionary) throws IOException { + super.attachDictionary(dictionary); + } + @Override public boolean isOpen() { synchronized (mutex) { diff --git a/java/org/brotli/wrapper/dec/BrotliInputStream.java b/java/org/brotli/wrapper/dec/BrotliInputStream.java index 6e2e6e5..13a8880 100644 --- a/java/org/brotli/wrapper/dec/BrotliInputStream.java +++ b/java/org/brotli/wrapper/dec/BrotliInputStream.java @@ -8,6 +8,7 @@ package org.brotli.wrapper.dec; import java.io.IOException; import java.io.InputStream; +import java.nio.ByteBuffer; import java.nio.channels.Channels; /** @@ -34,6 +35,10 @@ public class BrotliInputStream extends InputStream { this(source, DEFAULT_BUFFER_SIZE); } + public void attachDictionary(ByteBuffer dictionary) throws IOException { + decoder.attachDictionary(dictionary); + } + public void enableEagerOutput() { decoder.enableEagerOutput(); } diff --git a/java/org/brotli/wrapper/dec/CornerCasesTest.java b/java/org/brotli/wrapper/dec/CornerCasesTest.java new file mode 100644 index 0000000..ab02f23 --- /dev/null +++ b/java/org/brotli/wrapper/dec/CornerCasesTest.java @@ -0,0 +1,33 @@ +/* Copyright 2021 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +package org.brotli.wrapper.dec; + +import static org.junit.Assert.assertEquals; + +import org.brotli.integration.BrotliJniTestBase; +import org.brotli.wrapper.enc.Encoder; +import java.io.IOException; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link org.brotli.wrapper.enc.Encoder}. */ +@RunWith(JUnit4.class) +public class CornerCasesTest extends BrotliJniTestBase { + @Test + public void testPowerOfTwoInput() throws IOException { + // 24 == max window bits to ensure ring-buffer size is not greater than input. + int len = 1 << 24; + byte[] data = new byte[len]; + for (int i = 0; i < len; ++i) { + data[i] = (byte) Integer.bitCount(i); + } + byte[] encoded = Encoder.compress(data); + byte[] decoded = Decoder.decompress(encoded); + assertEquals(len, decoded.length); + } +} diff --git a/java/org/brotli/wrapper/dec/Decoder.java b/java/org/brotli/wrapper/dec/Decoder.java index 018317d..de69d3a 100644 --- a/java/org/brotli/wrapper/dec/Decoder.java +++ b/java/org/brotli/wrapper/dec/Decoder.java @@ -50,6 +50,12 @@ public class Decoder { throw new IOException(message); } + void attachDictionary(ByteBuffer dictionary) throws IOException { + if (!decoder.attachDictionary(dictionary)) { + fail("failed to attach dictionary"); + } + } + public void enableEagerOutput() { this.eager = true; } diff --git a/java/org/brotli/wrapper/dec/DecoderJNI.java b/java/org/brotli/wrapper/dec/DecoderJNI.java index 2319b1e..fc5225d 100644 --- a/java/org/brotli/wrapper/dec/DecoderJNI.java +++ b/java/org/brotli/wrapper/dec/DecoderJNI.java @@ -17,6 +17,7 @@ public class DecoderJNI { private static native void nativePush(long[] context, int length); private static native ByteBuffer nativePull(long[] context); private static native void nativeDestroy(long[] context); + private static native boolean nativeAttachDictionary(long[] context, ByteBuffer dictionary); public enum Status { ERROR, @@ -40,6 +41,19 @@ public class DecoderJNI { } } + public boolean attachDictionary(ByteBuffer dictionary) { + if (!dictionary.isDirect()) { + throw new IllegalArgumentException("only direct buffers allowed"); + } + if (context[0] == 0) { + throw new IllegalStateException("brotli decoder is already destroyed"); + } + if (!fresh) { + throw new IllegalStateException("decoding is already started"); + } + return nativeAttachDictionary(context, dictionary); + } + public void push(int length) { if (length < 0) { throw new IllegalArgumentException("negative block length"); diff --git a/java/org/brotli/wrapper/dec/decoder_jni.cc b/java/org/brotli/wrapper/dec/decoder_jni.cc index 268a10b..e10ffe3 100644 --- a/java/org/brotli/wrapper/dec/decoder_jni.cc +++ b/java/org/brotli/wrapper/dec/decoder_jni.cc @@ -15,6 +15,9 @@ namespace { typedef struct DecoderHandle { BrotliDecoderState* state; + jobject dictionary_refs[15]; + size_t dictionary_count; + uint8_t* input_start; size_t input_offset; size_t input_length; @@ -54,6 +57,10 @@ Java_org_brotli_wrapper_dec_DecoderJNI_nativeCreate( ok = !!handle; if (ok) { + for (int i = 0; i < 15; ++i) { + handle->dictionary_refs[i] = nullptr; + } + handle->dictionary_count = 0; handle->input_offset = 0; handle->input_length = 0; handle->input_start = nullptr; @@ -193,10 +200,53 @@ Java_org_brotli_wrapper_dec_DecoderJNI_nativeDestroy( env->GetLongArrayRegion(ctx, 0, 3, context); DecoderHandle* handle = getHandle(reinterpret_cast(context[0])); BrotliDecoderDestroyInstance(handle->state); + for (size_t i = 0; i < handle->dictionary_count; ++i) { + env->DeleteGlobalRef(handle->dictionary_refs[i]); + } delete[] handle->input_start; delete handle; } +JNIEXPORT jboolean JNICALL +Java_org_brotli_wrapper_dec_DecoderJNI_nativeAttachDictionary( + JNIEnv* env, jobject /*jobj*/, jlongArray ctx, jobject dictionary) { + jlong context[3]; + env->GetLongArrayRegion(ctx, 0, 3, context); + DecoderHandle* handle = getHandle(reinterpret_cast(context[0])); + jobject ref = nullptr; + uint8_t* address = nullptr; + jlong capacity = 0; + + bool ok = true; + if (ok && !dictionary) { + ok = false; + } + if (ok && handle->dictionary_count >= 15) { + ok = false; + } + if (ok) { + ref = env->NewGlobalRef(dictionary); + ok = !!ref; + } + if (ok) { + handle->dictionary_refs[handle->dictionary_count] = ref; + handle->dictionary_count++; + address = static_cast(env->GetDirectBufferAddress(ref)); + ok = !!address; + } + if (ok) { + capacity = env->GetDirectBufferCapacity(ref); + ok = (capacity > 0) && (capacity < (1 << 30)); + } + if (ok) { + size_t size = static_cast(capacity); + ok = !!BrotliDecoderAttachDictionary(handle->state, + BROTLI_SHARED_DICTIONARY_RAW, size, address); + } + + return static_cast(ok); +} + #ifdef __cplusplus } #endif diff --git a/java/org/brotli/wrapper/enc/BUILD b/java/org/brotli/wrapper/enc/BUILD index 22154d2..0f8d2e1 100644 --- a/java/org/brotli/wrapper/enc/BUILD +++ b/java/org/brotli/wrapper/enc/BUILD @@ -18,6 +18,10 @@ java_library( ["*.java"], exclude = ["*Test*.java"], ), + deps = [ + "//org/brotli/common:shared_dictionary", + "//org/brotli/enc:prepared_dictionary", + ], resource_jars = ["//:license"], ) @@ -27,8 +31,11 @@ java_library( srcs = glob(["*Test*.java"]), deps = [ ":enc", + "//org/brotli/common:shared_dictionary", + "//org/brotli/enc:prepared_dictionary", "//org/brotli/integration:brotli_jni_test_base", "//org/brotli/integration:bundle_helper", + "//org/brotli/wrapper/common", "//org/brotli/wrapper/dec", "@maven//:junit_junit", ], diff --git a/java/org/brotli/wrapper/enc/BrotliEncoderChannel.java b/java/org/brotli/wrapper/enc/BrotliEncoderChannel.java index 047c356..2ee2ba6 100644 --- a/java/org/brotli/wrapper/enc/BrotliEncoderChannel.java +++ b/java/org/brotli/wrapper/enc/BrotliEncoderChannel.java @@ -6,6 +6,7 @@ package org.brotli.wrapper.enc; +import org.brotli.enc.PreparedDictionary; import java.io.IOException; import java.nio.Buffer; import java.nio.ByteBuffer; @@ -42,6 +43,11 @@ public class BrotliEncoderChannel extends Encoder implements WritableByteChannel this(destination, new Encoder.Parameters()); } + @Override + public void attachDictionary(PreparedDictionary dictionary) throws IOException { + super.attachDictionary(dictionary); + } + @Override public boolean isOpen() { synchronized (mutex) { diff --git a/java/org/brotli/wrapper/enc/BrotliOutputStream.java b/java/org/brotli/wrapper/enc/BrotliOutputStream.java index 5bd3957..09cbd5a 100644 --- a/java/org/brotli/wrapper/enc/BrotliOutputStream.java +++ b/java/org/brotli/wrapper/enc/BrotliOutputStream.java @@ -6,6 +6,7 @@ package org.brotli.wrapper.enc; +import org.brotli.enc.PreparedDictionary; import java.io.IOException; import java.io.OutputStream; import java.nio.channels.Channels; @@ -40,6 +41,10 @@ public class BrotliOutputStream extends OutputStream { this(destination, new Encoder.Parameters()); } + public void attachDictionary(PreparedDictionary dictionary) throws IOException { + encoder.attachDictionary(dictionary); + } + @Override public void close() throws IOException { encoder.close(); diff --git a/java/org/brotli/wrapper/enc/Encoder.java b/java/org/brotli/wrapper/enc/Encoder.java index af579a3..2aa9890 100644 --- a/java/org/brotli/wrapper/enc/Encoder.java +++ b/java/org/brotli/wrapper/enc/Encoder.java @@ -6,17 +6,20 @@ package org.brotli.wrapper.enc; +import org.brotli.enc.PreparedDictionary; import java.io.IOException; import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.channels.WritableByteChannel; import java.util.ArrayList; +import java.util.List; /** * Base class for OutputStream / Channel implementations. */ public class Encoder { private final WritableByteChannel destination; + private final List dictionaries; private final EncoderJNI.Wrapper encoder; private ByteBuffer buffer; final ByteBuffer inputBuffer; @@ -111,6 +114,7 @@ public class Encoder { if (destination == null) { throw new NullPointerException("destination can not be null"); } + this.dictionaries = new ArrayList(); this.destination = destination; this.encoder = new EncoderJNI.Wrapper(inputBufferSize, params.quality, params.lgwin, params.mode); this.inputBuffer = this.encoder.getInputBuffer(); @@ -125,6 +129,14 @@ public class Encoder { throw new IOException(message); } + public void attachDictionary(PreparedDictionary dictionary) throws IOException { + if (!encoder.attachDictionary(dictionary.getData())) { + fail("failed to attach dictionary"); + } + // Reference to native prepared dictionary wrapper should be held till the end of encoding. + dictionaries.add(dictionary); + } + /** * @param force repeat pushing until all output is consumed * @return true if all encoder output is consumed @@ -239,4 +251,15 @@ public class Encoder { public static byte[] compress(byte[] data) throws IOException { return compress(data, new Parameters()); } + + /** + * Prepares raw or serialized dictionary for being used by encoder. + * + * @param dictionary raw / serialized dictionary data; MUST be direct + * @param sharedDictionaryType dictionary data type + */ + public static PreparedDictionary prepareDictionary(ByteBuffer dictionary, + int sharedDictionaryType) { + return EncoderJNI.prepareDictionary(dictionary, sharedDictionaryType); + } } diff --git a/java/org/brotli/wrapper/enc/EncoderJNI.java b/java/org/brotli/wrapper/enc/EncoderJNI.java index d0c5fac..1cbd7c1 100644 --- a/java/org/brotli/wrapper/enc/EncoderJNI.java +++ b/java/org/brotli/wrapper/enc/EncoderJNI.java @@ -6,6 +6,7 @@ package org.brotli.wrapper.enc; +import org.brotli.enc.PreparedDictionary; import java.io.IOException; import java.nio.ByteBuffer; @@ -17,6 +18,9 @@ class EncoderJNI { private static native void nativePush(long[] context, int length); private static native ByteBuffer nativePull(long[] context); private static native void nativeDestroy(long[] context); + private static native boolean nativeAttachDictionary(long[] context, ByteBuffer dictionary); + private static native ByteBuffer nativePrepareDictionary(ByteBuffer dictionary, long type); + private static native void nativeDestroyDictionary(ByteBuffer dictionary); enum Operation { PROCESS, @@ -24,6 +28,47 @@ class EncoderJNI { FINISH } + private static class PreparedDictionaryImpl implements PreparedDictionary { + private ByteBuffer data; + + private PreparedDictionaryImpl(ByteBuffer data) { + this.data = data; + } + + @Override + public ByteBuffer getData() { + return data; + } + + @Override + protected void finalize() throws Throwable { + try { + ByteBuffer data = this.data; + this.data = null; + nativeDestroyDictionary(data); + } finally { + super.finalize(); + } + } + } + + /** + * Prepares raw or serialized dictionary for being used by encoder. + * + * @param dictionary raw / serialized dictionary data; MUST be direct + * @param sharedDictionaryType dictionary data type + */ + static PreparedDictionary prepareDictionary(ByteBuffer dictionary, int sharedDictionaryType) { + if (!dictionary.isDirect()) { + throw new IllegalArgumentException("only direct buffers allowed"); + } + ByteBuffer dictionaryData = nativePrepareDictionary(dictionary, sharedDictionaryType); + if (dictionaryData == null) { + throw new IllegalStateException("OOM"); + } + return new PreparedDictionaryImpl(dictionaryData); + } + static class Wrapper { protected final long[] context = new long[5]; private final ByteBuffer inputBuffer; @@ -48,6 +93,19 @@ class EncoderJNI { this.context[4] = 0; } + boolean attachDictionary(ByteBuffer dictionary) { + if (!dictionary.isDirect()) { + throw new IllegalArgumentException("only direct buffers allowed"); + } + if (context[0] == 0) { + throw new IllegalStateException("brotli decoder is already destroyed"); + } + if (!fresh) { + throw new IllegalStateException("decoding is already started"); + } + return nativeAttachDictionary(context, dictionary); + } + void push(Operation op, int length) { if (length < 0) { throw new IllegalArgumentException("negative block length"); diff --git a/java/org/brotli/wrapper/enc/UseCompoundDictionaryTest.java b/java/org/brotli/wrapper/enc/UseCompoundDictionaryTest.java new file mode 100644 index 0000000..cbfd0d9 --- /dev/null +++ b/java/org/brotli/wrapper/enc/UseCompoundDictionaryTest.java @@ -0,0 +1,114 @@ +package org.brotli.wrapper.enc; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.brotli.common.SharedDictionaryType; +import org.brotli.enc.PreparedDictionary; +import org.brotli.enc.PreparedDictionaryGenerator; +import org.brotli.integration.BrotliJniTestBase; +import org.brotli.integration.BundleHelper; +import org.brotli.wrapper.common.BrotliCommon; +import org.brotli.wrapper.dec.BrotliInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.util.List; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import org.junit.runner.RunWith; +import org.junit.runners.AllTests; + +/** Tests for compression / decompression aided with LZ77 dictionary. */ +@RunWith(AllTests.class) +public class UseCompoundDictionaryTest extends BrotliJniTestBase { + + static InputStream getBundle() throws IOException { + return new FileInputStream(System.getProperty("TEST_BUNDLE")); + } + + /** Creates a test suite. */ + public static TestSuite suite() throws IOException { + TestSuite suite = new TestSuite(); + InputStream bundle = getBundle(); + try { + List entries = BundleHelper.listEntries(bundle); + for (String entry : entries) { + suite.addTest(new UseCompoundDictionaryTestCase(entry)); + } + } finally { + bundle.close(); + } + return suite; + } + + /** Test case with a unique name. */ + static class UseCompoundDictionaryTestCase extends TestCase { + final String entryName; + UseCompoundDictionaryTestCase(String entryName) { + super("UseCompoundDictionaryTest." + entryName); + this.entryName = entryName; + } + + @Override + protected void runTest() throws Throwable { + UseCompoundDictionaryTest.run(entryName); + } + } + + private static PreparedDictionary prepareRawDictionary(String entryName, ByteBuffer data) { + if (entryName.endsWith("E.coli.txt")) { + // Default prepared dictionary parameters doesn't work well for DNA data. + // Should work well with 8-byte hash. + return PreparedDictionaryGenerator.generate(data, 17, 3, 64, 5); + } else { + return Encoder.prepareDictionary(data, SharedDictionaryType.RAW); + } + } + + private static void run(String entryName) throws Throwable { + InputStream bundle = getBundle(); + byte[] original; + try { + original = BundleHelper.readEntry(bundle, entryName); + } finally { + bundle.close(); + } + + if (original == null) { + throw new RuntimeException("Can't read bundle entry: " + entryName); + } + + ByteBuffer dictionary = BrotliCommon.makeNative(original); + PreparedDictionary preparedDictionary = prepareRawDictionary(entryName, dictionary); + + ByteArrayOutputStream dst = new ByteArrayOutputStream(); + BrotliOutputStream encoder = + new BrotliOutputStream(dst, new Encoder.Parameters().setQuality(9).setWindow(23), 1 << 23); + encoder.attachDictionary(preparedDictionary); + try { + encoder.write(original); + } finally { + encoder.close(); + } + + byte[] compressed = dst.toByteArray(); + + // Just copy self from LZ77 dictionary -> ultimate compression ratio. + assertTrue(compressed.length < 80 + original.length / 65536); + + BrotliInputStream decoder = + new BrotliInputStream(new ByteArrayInputStream(compressed), 1 << 23); + decoder.attachDictionary(dictionary); + try { + long originalCrc = BundleHelper.fingerprintStream(new ByteArrayInputStream(original)); + long crc = BundleHelper.fingerprintStream(decoder); + assertEquals(originalCrc, crc); + } finally { + decoder.close(); + } + } +} \ No newline at end of file diff --git a/java/org/brotli/wrapper/enc/encoder_jni.cc b/java/org/brotli/wrapper/enc/encoder_jni.cc index e69f719..00fbfd8 100644 --- a/java/org/brotli/wrapper/enc/encoder_jni.cc +++ b/java/org/brotli/wrapper/enc/encoder_jni.cc @@ -15,6 +15,9 @@ namespace { typedef struct EncoderHandle { BrotliEncoderState* state; + jobject dictionary_refs[15]; + size_t dictionary_count; + uint8_t* input_start; size_t input_offset; size_t input_last; @@ -53,6 +56,10 @@ Java_org_brotli_wrapper_enc_EncoderJNI_nativeCreate( ok = !!handle; if (ok) { + for (int i = 0; i < 15; ++i) { + handle->dictionary_refs[i] = nullptr; + } + handle->dictionary_count = 0; handle->input_offset = 0; handle->input_last = 0; handle->input_start = nullptr; @@ -190,10 +197,90 @@ Java_org_brotli_wrapper_enc_EncoderJNI_nativeDestroy( env->GetLongArrayRegion(ctx, 0, 2, context); EncoderHandle* handle = getHandle(reinterpret_cast(context[0])); BrotliEncoderDestroyInstance(handle->state); + for (size_t i = 0; i < handle->dictionary_count; ++i) { + env->DeleteGlobalRef(handle->dictionary_refs[i]); + } delete[] handle->input_start; delete handle; } +JNIEXPORT jboolean JNICALL +Java_org_brotli_wrapper_enc_EncoderJNI_nativeAttachDictionary( + JNIEnv* env, jobject /*jobj*/, jlongArray ctx, jobject dictionary) { + jlong context[2]; + env->GetLongArrayRegion(ctx, 0, 2, context); + EncoderHandle* handle = getHandle(reinterpret_cast(context[0])); + jobject ref = nullptr; + uint8_t* address = nullptr; + + bool ok = true; + if (ok && !dictionary) { + ok = false; + } + if (ok && handle->dictionary_count >= 15) { + ok = false; + } + if (ok) { + ref = env->NewGlobalRef(dictionary); + ok = !!ref; + } + if (ok) { + handle->dictionary_refs[handle->dictionary_count] = ref; + handle->dictionary_count++; + address = static_cast(env->GetDirectBufferAddress(ref)); + ok = !!address; + } + if (ok) { + ok = !!BrotliEncoderAttachPreparedDictionary(handle->state, + reinterpret_cast(address)); + } + + return static_cast(ok); +} + +JNIEXPORT void JNICALL +Java_org_brotli_wrapper_enc_EncoderJNI_nativeDestroyDictionary( + JNIEnv* env, jobject /*jobj*/, jobject dictionary) { + if (!dictionary) { + return; + } + uint8_t* address = + static_cast(env->GetDirectBufferAddress(dictionary)); + if (!address) { + return; + } + BrotliEncoderDestroyPreparedDictionary( + reinterpret_cast(address)); +} + +JNIEXPORT jobject JNICALL +Java_org_brotli_wrapper_enc_EncoderJNI_nativePrepareDictionary( + JNIEnv* env, jobject /*jobj*/, jobject dictionary, jlong type) { + if (!dictionary) { + return nullptr; + } + uint8_t* address = + static_cast(env->GetDirectBufferAddress(dictionary)); + if (!address) { + return nullptr; + } + jlong capacity = env->GetDirectBufferCapacity(dictionary); + if ((capacity <= 0) || (capacity >= (1 << 30))) { + return nullptr; + } + BrotliSharedDictionaryType dictionary_type = + static_cast(type); + size_t size = static_cast(capacity); + BrotliEncoderPreparedDictionary* prepared_dictionary = + BrotliEncoderPrepareDictionary(dictionary_type, size, address, + BROTLI_MAX_QUALITY, nullptr, nullptr, nullptr); + if (!prepared_dictionary) { + return nullptr; + } + /* Size is 4 - just enough to check magic bytes. */ + return env->NewDirectByteBuffer(prepared_dictionary, 4); +} + #ifdef __cplusplus } #endif diff --git a/scripts/sources.lst b/scripts/sources.lst index 19a6d00..18d3867 100644 --- a/scripts/sources.lst +++ b/scripts/sources.lst @@ -9,6 +9,7 @@ BROTLI_COMMON_C = \ c/common/context.c \ c/common/dictionary.c \ c/common/platform.c \ + c/common/shared_dictionary.c \ c/common/transform.c BROTLI_COMMON_H = \ @@ -16,6 +17,7 @@ BROTLI_COMMON_H = \ c/common/context.h \ c/common/dictionary.h \ c/common/platform.h \ + c/common/shared_dictionary_internal.h \ c/common/transform.h \ c/common/version.h @@ -39,6 +41,7 @@ BROTLI_ENC_C = \ c/enc/brotli_bit_stream.c \ c/enc/cluster.c \ c/enc/command.c \ + c/enc/compound_dictionary.c \ c/enc/compress_fragment.c \ c/enc/compress_fragment_two_pass.c \ c/enc/dictionary_hash.c \ @@ -66,6 +69,7 @@ BROTLI_ENC_H = \ c/enc/cluster.h \ c/enc/cluster_inc.h \ c/enc/command.h \ + c/enc/compound_dictionary.h \ c/enc/compress_fragment.h \ c/enc/compress_fragment_two_pass.h \ c/enc/dictionary_hash.h \ @@ -101,4 +105,5 @@ BROTLI_INCLUDE = \ c/include/brotli/decode.h \ c/include/brotli/encode.h \ c/include/brotli/port.h \ + c/include/brotli/shared_dictionary.h \ c/include/brotli/types.h diff --git a/setup.py b/setup.py index 25626ec..3afb035 100644 --- a/setup.py +++ b/setup.py @@ -185,6 +185,7 @@ EXT_MODULES = [ 'c/common/context.c', 'c/common/dictionary.c', 'c/common/platform.c', + 'c/common/shared_dictionary.c', 'c/common/transform.c', 'c/dec/bit_reader.c', 'c/dec/decode.c', @@ -197,6 +198,7 @@ EXT_MODULES = [ 'c/enc/brotli_bit_stream.c', 'c/enc/cluster.c', 'c/enc/command.c', + 'c/enc/compound_dictionary.c', 'c/enc/compress_fragment.c', 'c/enc/compress_fragment_two_pass.c', 'c/enc/dictionary_hash.c', @@ -216,6 +218,7 @@ EXT_MODULES = [ 'c/common/context.h', 'c/common/dictionary.h', 'c/common/platform.h', + 'c/common/shared_dictionary_internal.h', 'c/common/transform.h', 'c/common/version.h', 'c/dec/bit_reader.h', @@ -234,6 +237,7 @@ EXT_MODULES = [ 'c/enc/cluster.h', 'c/enc/cluster_inc.h', 'c/enc/command.h', + 'c/enc/compound_dictionary.h', 'c/enc/compress_fragment.h', 'c/enc/compress_fragment_two_pass.h', 'c/enc/dictionary_hash.h',