mirror of
https://github.com/google/brotli.git
synced 2024-11-09 13:40:06 +00:00
Use a static hash table to look up dictionary words and transforms.
This is used for quality 11, for qualities <= 9 we already have a simpler hash table. The static data size is 252 kB, and this removes the need to initialize a huge hash map at startup, which was the reason why transforms had to be disabled by default. In comparison, the static dictionary itself is 120 kB. This supports every transform, except the kOmitFirstN.
This commit is contained in:
parent
0fd2df4f4d
commit
66098830a2
@ -2,7 +2,7 @@
|
||||
|
||||
include ../shared.mk
|
||||
|
||||
OBJS = backward_references.o block_splitter.o brotli_bit_stream.o encode.o encode_parallel.o entropy_encode.o histogram.o literal_cost.o metablock.o streams.o
|
||||
OBJS = backward_references.o block_splitter.o brotli_bit_stream.o encode.o encode_parallel.o entropy_encode.o histogram.o literal_cost.o metablock.o static_dict.o streams.o
|
||||
|
||||
all : $(OBJS)
|
||||
|
||||
|
@ -161,8 +161,6 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
|
||||
std::max(kMinInputBlockBits, params_.lgblock));
|
||||
}
|
||||
if (params_.quality <= 9) {
|
||||
params_.enable_dictionary = false;
|
||||
params_.enable_transforms = false;
|
||||
params_.greedy_block_split = true;
|
||||
params_.enable_context_modeling = false;
|
||||
}
|
||||
@ -210,26 +208,11 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
|
||||
// Initialize hashers.
|
||||
hash_type_ = std::min(9, params_.quality);
|
||||
hashers_->Init(hash_type_);
|
||||
if ((params_.mode == BrotliParams::MODE_GENERIC ||
|
||||
params_.mode == BrotliParams::MODE_TEXT) &&
|
||||
params_.enable_dictionary) {
|
||||
StoreDictionaryWordHashes(params_.enable_transforms);
|
||||
}
|
||||
}
|
||||
|
||||
BrotliCompressor::~BrotliCompressor() {
|
||||
}
|
||||
|
||||
StaticDictionary* BrotliCompressor::static_dictionary_ = NULL;
|
||||
|
||||
void BrotliCompressor::StoreDictionaryWordHashes(bool enable_transforms) {
|
||||
if (static_dictionary_ == NULL) {
|
||||
static_dictionary_ = new StaticDictionary;
|
||||
static_dictionary_->Fill(enable_transforms);
|
||||
}
|
||||
hashers_->SetStaticDictionary(static_dictionary_);
|
||||
}
|
||||
|
||||
void BrotliCompressor::CopyInputToRingBuffer(const size_t input_size,
|
||||
const uint8_t* input_buffer) {
|
||||
ringbuffer_->Write(input_buffer, input_size);
|
||||
|
@ -139,9 +139,6 @@ class BrotliCompressor {
|
||||
void WriteStreamHeader() {}
|
||||
|
||||
private:
|
||||
// Initializes the hasher with the hashes of dictionary words.
|
||||
void StoreDictionaryWordHashes(bool enable_transforms);
|
||||
|
||||
uint8_t* GetBrotliStorage(size_t size);
|
||||
|
||||
bool WriteMetaBlockInternal(const bool is_last,
|
||||
@ -172,7 +169,6 @@ class BrotliCompressor {
|
||||
uint8_t prev_byte2_;
|
||||
int storage_size_;
|
||||
std::unique_ptr<uint8_t[]> storage_;
|
||||
static StaticDictionary *static_dictionary_;
|
||||
};
|
||||
|
||||
// Compresses the data in input_buffer into encoded_buffer, and sets
|
||||
|
@ -124,7 +124,6 @@ bool WriteMetaBlockParallel(const BrotliParams& params,
|
||||
const uint8_t* input_buffer,
|
||||
const size_t prefix_size,
|
||||
const uint8_t* prefix_buffer,
|
||||
const StaticDictionary* static_dict,
|
||||
const bool is_first,
|
||||
const bool is_last,
|
||||
size_t* encoded_size,
|
||||
@ -169,7 +168,6 @@ bool WriteMetaBlockParallel(const BrotliParams& params,
|
||||
int hash_type = std::min(9, params.quality);
|
||||
std::unique_ptr<Hashers> hashers(new Hashers());
|
||||
hashers->Init(hash_type);
|
||||
hashers->SetStaticDictionary(static_dict);
|
||||
|
||||
// Compute backward references.
|
||||
int last_insert_len = 0;
|
||||
@ -318,8 +316,6 @@ int BrotliCompressBufferParallel(BrotliParams params,
|
||||
size_t max_input_block_size = 1 << params.lgblock;
|
||||
|
||||
std::vector<std::vector<uint8_t> > compressed_pieces;
|
||||
StaticDictionary dict;
|
||||
dict.Fill(params.enable_transforms);
|
||||
|
||||
// Compress block-by-block independently.
|
||||
for (size_t pos = 0; pos < input_size; ) {
|
||||
@ -331,7 +327,6 @@ int BrotliCompressBufferParallel(BrotliParams params,
|
||||
&input_buffer[pos],
|
||||
pos,
|
||||
input_buffer,
|
||||
&dict,
|
||||
pos == 0,
|
||||
pos + input_block_size == input_size,
|
||||
&out_size,
|
||||
|
53
enc/hash.h
53
enc/hash.h
@ -305,7 +305,7 @@ template <int kBucketBits,
|
||||
int kNumLastDistancesToCheck>
|
||||
class HashLongestMatch {
|
||||
public:
|
||||
HashLongestMatch() : static_dict_(NULL) {
|
||||
HashLongestMatch() {
|
||||
Reset();
|
||||
}
|
||||
|
||||
@ -315,28 +315,6 @@ class HashLongestMatch {
|
||||
num_dict_matches_ = 0;
|
||||
}
|
||||
|
||||
void CopyTo(HashLongestMatch* target) {
|
||||
bool has_data = false;
|
||||
for (int i = 0; i < kBucketSize; i++) {
|
||||
target->num_[i] = num_[i];
|
||||
has_data = (has_data || num_[i]);
|
||||
}
|
||||
if (has_data) {
|
||||
for (int i = 0; i < kBucketSize; i++) {
|
||||
for (int j = 0; j < kBlockSize; j++) {
|
||||
target->buckets_[i][j] = buckets_[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
target->num_dict_lookups_ = num_dict_lookups_;
|
||||
target->num_dict_matches_ = num_dict_matches_;
|
||||
target->static_dict_ = static_dict_;
|
||||
}
|
||||
|
||||
void SetStaticDictionary(const StaticDictionary *dict) {
|
||||
static_dict_ = dict;
|
||||
}
|
||||
|
||||
// Look at 3 bytes at data.
|
||||
// Compute a hash from these, and store the value of ix at that position.
|
||||
inline void Store(const uint8_t *data, const int ix) {
|
||||
@ -558,18 +536,16 @@ class HashLongestMatch {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (static_dict_ != NULL) {
|
||||
// We decide based on first 4 bytes how many bytes to test for.
|
||||
uint32_t prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
|
||||
int maxlen = static_dict_->GetLength(prefix);
|
||||
for (int len = std::min<size_t>(maxlen, max_length);
|
||||
len > best_len && len >= 4; --len) {
|
||||
std::string snippet((const char *)&data[cur_ix_masked], len);
|
||||
int copy_len_code;
|
||||
int word_id;
|
||||
if (static_dict_->Get(snippet, ©_len_code, &word_id)) {
|
||||
const size_t backward = max_backward + word_id + 1;
|
||||
*matches++ = BackwardMatch(backward, len, copy_len_code);
|
||||
std::vector<int> dict_matches(kMaxDictionaryMatchLen + 1, kInvalidMatch);
|
||||
int minlen = std::max<int>(4, best_len + 1);
|
||||
if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen,
|
||||
&dict_matches[0])) {
|
||||
int maxlen = std::min<int>(kMaxDictionaryMatchLen, max_length);
|
||||
for (int l = minlen; l <= maxlen; ++l) {
|
||||
int dict_id = dict_matches[l];
|
||||
if (dict_id < kInvalidMatch) {
|
||||
*matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l,
|
||||
dict_id & 31);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -595,8 +571,6 @@ class HashLongestMatch {
|
||||
|
||||
size_t num_dict_lookups_;
|
||||
size_t num_dict_matches_;
|
||||
|
||||
const StaticDictionary *static_dict_;
|
||||
};
|
||||
|
||||
struct Hashers {
|
||||
@ -628,11 +602,6 @@ struct Hashers {
|
||||
}
|
||||
}
|
||||
|
||||
// Brotli's built in static transformable dictionary.
|
||||
void SetStaticDictionary(const StaticDictionary *dict) {
|
||||
if (hash_h9.get() != NULL) hash_h9->SetStaticDictionary(dict);
|
||||
}
|
||||
|
||||
template<typename Hasher>
|
||||
void WarmupHash(const size_t size, const uint8_t* dict, Hasher* hasher) {
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
|
420
enc/static_dict.cc
Normal file
420
enc/static_dict.cc
Normal file
@ -0,0 +1,420 @@
|
||||
#include "./static_dict.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "./dictionary.h"
|
||||
#include "./find_match_length.h"
|
||||
#include "./static_dict_lut.h"
|
||||
#include "./transform.h"
|
||||
|
||||
namespace brotli {
|
||||
|
||||
inline uint32_t Hash(const uint8_t *data) {
|
||||
uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32;
|
||||
// The higher bits contain more mixture from the multiplication,
|
||||
// so we take our results from there.
|
||||
return h >> (32 - kDictNumBits);
|
||||
}
|
||||
|
||||
// Records a match of length |len| at |distance|, encoded as
// (distance << 5) + len_code. Only the smallest (best) encoded value is
// kept for each length, so callers must pre-fill |matches| with a
// sentinel larger than any valid entry (kInvalidMatch).
inline void AddMatch(int distance, int len, int len_code, int* matches) {
  const int candidate = (distance << 5) + len_code;
  if (candidate < matches[len]) {
    matches[len] = candidate;
  }
}
|
||||
|
||||
inline int DictMatchLength(const uint8_t* data, int id, int len) {
|
||||
const int offset = kBrotliDictionaryOffsetsByLength[len] + len * id;
|
||||
return FindMatchLengthWithLimit(&kBrotliDictionary[offset], data, len);
|
||||
}
|
||||
|
||||
// Returns true if the first w.len bytes of |data| are exactly the dictionary
// word w.idx of length w.len under transform w.transform:
//   0  -> identity, 10 -> uppercase-first, anything else -> uppercase-all.
// Uppercasing is done with an ASCII XOR-32 trick ('a' ^ 32 == 'A').
inline bool IsMatch(DictWord w, const uint8_t* data) {
  const int offset = kBrotliDictionaryOffsetsByLength[w.len] + w.len * w.idx;
  const uint8_t* dict = &kBrotliDictionary[offset];
  if (w.transform == 0) {
    // Match against base dictionary word.
    return FindMatchLengthWithLimit(dict, data, w.len) == w.len;
  } else if (w.transform == 10) {
    // Match against uppercase first transform.
    // Note that there are only ASCII uppercase words in the lookup table.
    return (dict[0] >= 'a' && dict[0] <= 'z' &&
            (dict[0] ^ 32) == data[0] &&
            FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1) ==
            w.len - 1);
  } else {
    // Match against uppercase all transform.
    // Note that there are only ASCII uppercase words in the lookup table.
    for (int i = 0; i < w.len; ++i) {
      if (dict[i] >= 'a' && dict[i] <= 'z') {
        // Lowercase dictionary byte must appear uppercased in the input.
        if ((dict[i] ^ 32) != data[i]) return false;
      } else {
        // Non-letter bytes must match verbatim.
        if (dict[i] != data[i]) return false;
      }
    }
    return true;
  }
}
|
||||
|
||||
// Looks up every transformed static-dictionary word that matches a prefix of
// |data| and records each hit via AddMatch as the minimum
// (distance << 5) + len_code per match length. Returns whether any match was
// found. The caller must pre-fill |matches| with kInvalidMatch (see the
// declaration in static_dict.h).
//
// A hash bucket packs the entry count into its low 8 bits and the offset
// into kStaticDictionaryWords into the remaining high bits. The numeric
// constants multiplied by n below are transform ids into kTransforms; n is
// the number of words of the given length, so (id + transform * n) encodes
// both the word and the transform in a single distance.
bool FindAllStaticDictionaryMatches(const uint8_t* data,
                                    int min_length,
                                    int* matches) {
  bool found_match = false;
  uint32_t key = Hash(data);
  uint32_t bucket = kStaticDictionaryBuckets[key];
  if (bucket != 0) {
    int num = bucket & 0xff;    // entry count
    int offset = bucket >> 8;   // start index into kStaticDictionaryWords
    for (int i = 0; i < num; ++i) {
      const DictWord w = kStaticDictionaryWords[offset + i];
      const int l = w.len;
      const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
      const int id = w.idx;
      if (w.transform == 0) {
        const int matchlen = DictMatchLength(data, id, l);
        // Transform "" + kIdentity + ""
        if (matchlen == l) {
          AddMatch(id, l, l, matches);
          found_match = true;
        }
        // Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing "
        if (matchlen >= l - 1) {
          AddMatch(id + 12 * n, l - 1, l, matches);
          if (data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
              data[l + 2] == ' ') {
            AddMatch(id + 49 * n, l + 3, l, matches);
          }
          found_match = true;
        }
        // Transform "" + kOmitLastN + "" (N = 2 .. 9)
        int minlen = std::max<int>(min_length, l - 9);
        int maxlen = std::min<int>(matchlen, l - 2);
        for (int len = minlen; len <= maxlen; ++len) {
          AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches);
          found_match = true;
        }
        // Suffix transforms below require the whole word to match.
        if (matchlen < l) {
          continue;
        }
        const uint8_t* s = &data[l];
        // Transforms "" + kIdentity + <suffix>
        if (s[0] == ' ') {
          AddMatch(id + n, l + 1, l, matches);
          if (s[1] == 'a') {
            if (s[2] == ' ') {
              AddMatch(id + 28 * n, l + 3, l, matches);
            } else if (s[2] == 's') {
              if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
            } else if (s[2] == 't') {
              if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
            } else if (s[2] == 'n') {
              if (s[3] == 'd' && s[4] == ' ') {
                AddMatch(id + 10 * n, l + 5, l, matches);
              }
            }
          } else if (s[1] == 'b') {
            if (s[2] == 'y' && s[3] == ' ') {
              AddMatch(id + 38 * n, l + 4, l, matches);
            }
          } else if (s[1] == 'i') {
            if (s[2] == 'n') {
              if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
            } else if (s[2] == 's') {
              if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
            }
          } else if (s[1] == 'f') {
            if (s[2] == 'o') {
              if (s[3] == 'r' && s[4] == ' ') {
                AddMatch(id + 25 * n, l + 5, l, matches);
              }
            } else if (s[2] == 'r') {
              if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
                AddMatch(id + 37 * n, l + 6, l, matches);
              }
            }
          } else if (s[1] == 'o') {
            if (s[2] == 'f') {
              if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
            } else if (s[2] == 'n') {
              if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
            }
          } else if (s[1] == 'n') {
            if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
              AddMatch(id + 80 * n, l + 5, l, matches);
            }
          } else if (s[1] == 't') {
            if (s[2] == 'h') {
              if (s[3] == 'e') {
                if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
              } else if (s[3] == 'a') {
                if (s[4] == 't' && s[5] == ' ') {
                  AddMatch(id + 29 * n, l + 6, l, matches);
                }
              }
            } else if (s[2] == 'o') {
              if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
            }
          } else if (s[1] == 'w') {
            if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
              AddMatch(id + 35 * n, l + 6, l, matches);
            }
          }
        } else if (s[0] == '"') {
          AddMatch(id + 19 * n, l + 1, l, matches);
          if (s[1] == '>') {
            AddMatch(id + 21 * n, l + 2, l, matches);
          }
        } else if (s[0] == '.') {
          AddMatch(id + 20 * n, l + 1, l, matches);
          if (s[1] == ' ') {
            AddMatch(id + 31 * n, l + 2, l, matches);
            if (s[2] == 'T' && s[3] == 'h') {
              if (s[4] == 'e') {
                if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
              } else if (s[4] == 'i') {
                if (s[5] == 's' && s[6] == ' ') {
                  AddMatch(id + 75 * n, l + 7, l, matches);
                }
              }
            }
          }
        } else if (s[0] == ',') {
          AddMatch(id + 76 * n, l + 1, l, matches);
          if (s[1] == ' ') {
            AddMatch(id + 14 * n, l + 2, l, matches);
          }
        } else if (s[0] == '\n') {
          AddMatch(id + 22 * n, l + 1, l, matches);
          if (s[1] == '\t') {
            AddMatch(id + 50 * n, l + 2, l, matches);
          }
        } else if (s[0] == ']') {
          AddMatch(id + 24 * n, l + 1, l, matches);
        } else if (s[0] == '\'') {
          AddMatch(id + 36 * n, l + 1, l, matches);
        } else if (s[0] == ':') {
          AddMatch(id + 51 * n, l + 1, l, matches);
        } else if (s[0] == '(') {
          AddMatch(id + 57 * n, l + 1, l, matches);
        } else if (s[0] == '=') {
          if (s[1] == '"') {
            AddMatch(id + 70 * n, l + 2, l, matches);
          } else if (s[1] == '\'') {
            AddMatch(id + 86 * n, l + 2, l, matches);
          }
        } else if (s[0] == 'a') {
          if (s[1] == 'l' && s[2] == ' ') {
            AddMatch(id + 84 * n, l + 3, l, matches);
          }
        } else if (s[0] == 'e') {
          if (s[1] == 'd') {
            if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
          } else if (s[1] == 'r') {
            if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
          } else if (s[1] == 's') {
            if (s[2] == 't' && s[3] == ' ') {
              AddMatch(id + 95 * n, l + 4, l, matches);
            }
          }
        } else if (s[0] == 'f') {
          if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
            AddMatch(id + 90 * n, l + 4, l, matches);
          }
        } else if (s[0] == 'i') {
          if (s[1] == 'v') {
            if (s[2] == 'e' && s[3] == ' ') {
              AddMatch(id + 92 * n, l + 4, l, matches);
            }
          } else if (s[1] == 'z') {
            if (s[2] == 'e' && s[3] == ' ') {
              AddMatch(id + 100 * n, l + 4, l, matches);
            }
          }
        } else if (s[0] == 'l') {
          if (s[1] == 'e') {
            if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
              AddMatch(id + 93 * n, l + 5, l, matches);
            }
          } else if (s[1] == 'y') {
            if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
          }
        } else if (s[0] == 'o') {
          if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
            AddMatch(id + 106 * n, l + 4, l, matches);
          }
        }
      } else {
        // Set t=0 for kUppercaseFirst and t=1 for kUppercaseAll transform.
        const int t = w.transform - 10;
        if (!IsMatch(w, data)) {
          continue;
        }
        // Transform "" + kUppercase{First,All} + ""
        AddMatch(id + (t ? 44 : 9) * n, l, l, matches);
        found_match = true;
        // Transforms "" + kUppercase{First,All} + <suffix>
        const uint8_t* s = &data[l];
        if (s[0] == ' ') {
          AddMatch(id + (t ? 68 : 4) * n, l + 1, l, matches);
        } else if (s[0] == '"') {
          AddMatch(id + (t ? 87 : 66) * n, l + 1, l, matches);
          if (s[1] == '>') {
            AddMatch(id + (t ? 97 : 69) * n, l + 2, l, matches);
          }
        } else if (s[0] == '.') {
          AddMatch(id + (t ? 101 : 79) * n, l + 1, l, matches);
          if (s[1] == ' ') {
            AddMatch(id + (t ? 114 : 88) * n, l + 2, l, matches);
          }
        } else if (s[0] == ',') {
          AddMatch(id + (t ? 112 : 99) * n, l + 1, l, matches);
          if (s[1] == ' ') {
            AddMatch(id + (t ? 107 : 58) * n, l + 2, l, matches);
          }
        } else if (s[0] == '\'') {
          AddMatch(id + (t ? 94 : 74) * n, l + 1, l, matches);
        } else if (s[0] == '(') {
          AddMatch(id + (t ? 113 : 78) * n, l + 1, l, matches);
        } else if (s[0] == '=') {
          if (s[1] == '"') {
            AddMatch(id + (t ? 105 : 104) * n, l + 2, l, matches);
          } else if (s[1] == '\'') {
            AddMatch(id + (t ? 116 : 108) * n, l + 2, l, matches);
          }
        }
      }
    }
  }
  // Transforms with prefixes " " and "." -- the word itself is looked up
  // starting at data[1].
  if (data[0] == ' ' || data[0] == '.') {
    bool is_space = (data[0] == ' ');
    key = Hash(&data[1]);
    bucket = kStaticDictionaryBuckets[key];
    int num = bucket & 0xff;
    int offset = bucket >> 8;
    for (int i = 0; i < num; ++i) {
      const DictWord w = kStaticDictionaryWords[offset + i];
      const int l = w.len;
      const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
      const int id = w.idx;
      if (w.transform == 0) {
        if (!IsMatch(w, &data[1])) {
          continue;
        }
        // Transforms " " + kIdentity + "" and "." + kIdentity + ""
        AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
        found_match = true;
        // Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix>
        const uint8_t* s = &data[l + 1];
        if (s[0] == ' ') {
          AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
        } else if (s[0] == '(') {
          AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
        } else if (is_space) {
          if (s[0] == ',') {
            AddMatch(id + 103 * n, l + 2, l, matches);
            if (s[1] == ' ') {
              AddMatch(id + 33 * n, l + 3, l, matches);
            }
          } else if (s[0] == '.') {
            AddMatch(id + 71 * n, l + 2, l, matches);
            if (s[1] == ' ') {
              AddMatch(id + 52 * n, l + 3, l, matches);
            }
          } else if (s[0] == '=') {
            if (s[1] == '"') {
              AddMatch(id + 81 * n, l + 3, l, matches);
            } else if (s[1] == '\'') {
              AddMatch(id + 98 * n, l + 3, l, matches);
            }
          }
        }
      } else if (is_space) {
        // Set t=0 for kUppercaseFirst and t=1 for kUppercaseAll transform.
        const int t = w.transform - 10;
        if (!IsMatch(w, &data[1])) {
          continue;
        }
        // Transforms " " + kUppercase{First,All} + ""
        AddMatch(id + (t ? 85 : 30) * n, l + 1, l, matches);
        found_match = true;
        // Transforms " " + kUppercase{First,All} + <suffix>
        const uint8_t* s = &data[l + 1];
        if (s[0] == ' ') {
          AddMatch(id + (t ? 83 : 15) * n, l + 2, l, matches);
        } else if (s[0] == ',') {
          if (t == 0) {
            AddMatch(id + 109 * n, l + 2, l, matches);
          }
          if (s[1] == ' ') {
            AddMatch(id + (t ? 111 : 65) * n, l + 3, l, matches);
          }
        } else if (s[0] == '.') {
          AddMatch(id + (t ? 115 : 96) * n, l + 2, l, matches);
          if (s[1] == ' ') {
            AddMatch(id + (t ? 117 : 91) * n, l + 3, l, matches);
          }
        } else if (s[0] == '=') {
          if (s[1] == '"') {
            AddMatch(id + (t ? 110 : 118) * n, l + 3, l, matches);
          } else if (s[1] == '\'') {
            AddMatch(id + (t ? 119 : 120) * n, l + 3, l, matches);
          }
        }
      }
    }
  }
  // Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0" (the two-byte
  // UTF-8 non-breaking space); word lookup starts at data[2].
  if ((data[1] == ' ' &&
       (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
      (data[0] == '\xc2' && data[1] == '\xa0')) {
    key = Hash(&data[2]);
    bucket = kStaticDictionaryBuckets[key];
    int num = bucket & 0xff;
    int offset = bucket >> 8;
    for (int i = 0; i < num; ++i) {
      const DictWord w = kStaticDictionaryWords[offset + i];
      const int l = w.len;
      const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
      const int id = w.idx;
      if (w.transform == 0 && IsMatch(w, &data[2])) {
        if (data[0] == '\xc2') {
          AddMatch(id + 102 * n, l + 2, l, matches);
          found_match = true;
        } else if (data[l + 2] == ' ') {
          // Prefix-and-suffix transforms "e ..w.. ", "s ..w.. ", ", ..w.. ".
          int t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
          AddMatch(id + t * n, l + 3, l, matches);
          found_match = true;
        }
      }
    }
  }
  // Transforms with prefixes " the " and ".com/"; word lookup starts at
  // data[5].
  if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
       data[3] == 'e' && data[4] == ' ') ||
      (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
       data[3] == 'm' && data[4] == '/')) {
    key = Hash(&data[5]);
    bucket = kStaticDictionaryBuckets[key];
    int num = bucket & 0xff;
    int offset = bucket >> 8;
    for (int i = 0; i < num; ++i) {
      const DictWord w = kStaticDictionaryWords[offset + i];
      const int l = w.len;
      const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
      const int id = w.idx;
      if (w.transform == 0 && IsMatch(w, &data[5])) {
        AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
        found_match = true;
        const uint8_t* s = &data[l + 5];
        if (data[0] == ' ') {
          // " the ..w.. of " and " the ..w.. of the " transforms.
          if (s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
            AddMatch(id + 62 * n, l + 9, l, matches);
            if (s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
              AddMatch(id + 73 * n, l + 13, l, matches);
            }
          }
        }
      }
    }
  }
  return found_match;
}
|
||||
|
||||
} // namespace brotli
|
@ -17,70 +17,22 @@
|
||||
#ifndef BROTLI_ENC_STATIC_DICT_H_
|
||||
#define BROTLI_ENC_STATIC_DICT_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
|
||||
#include "./dictionary.h"
|
||||
#include "./transform.h"
|
||||
#include <stdint.h>
|
||||
|
||||
namespace brotli {
|
||||
|
||||
class StaticDictionary {
|
||||
public:
|
||||
StaticDictionary() {}
|
||||
void Fill(bool enable_transforms) {
|
||||
const int num_transforms = enable_transforms ? kNumTransforms : 1;
|
||||
for (int t = num_transforms - 1; t >= 0; --t) {
|
||||
for (int i = kMaxDictionaryWordLength;
|
||||
i >= kMinDictionaryWordLength; --i) {
|
||||
const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i];
|
||||
for (int j = num_words - 1; j >= 0; --j) {
|
||||
int word_id = t * num_words + j;
|
||||
std::string word = GetTransformedDictionaryWord(i, word_id);
|
||||
if (word.size() >= 4) {
|
||||
Insert(word, i, word_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
void Insert(const std::string &str, int len, int dist) {
|
||||
int ix = (dist << 6) + len;
|
||||
std::unordered_map<std::string, int>::const_iterator it = map_.find(str);
|
||||
if (it != map_.end() && ix >= it->second) {
|
||||
return;
|
||||
}
|
||||
map_[str] = ix;
|
||||
uint32_t v = 0;
|
||||
for (int i = 0; i < 4 && i < str.size(); ++i) {
|
||||
v += static_cast<uint32_t>(str[i]) << (8 * i);
|
||||
}
|
||||
if (prefix_map_[v] < str.size()) {
|
||||
prefix_map_[v] = str.size();
|
||||
}
|
||||
}
|
||||
int GetLength(uint32_t v) const {
|
||||
std::unordered_map<uint32_t, int>::const_iterator it = prefix_map_.find(v);
|
||||
if (it == prefix_map_.end()) {
|
||||
return 0;
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
bool Get(const std::string &str, int *len, int *dist) const {
|
||||
std::unordered_map<std::string, int>::const_iterator it = map_.find(str);
|
||||
if (it == map_.end()) {
|
||||
return false;
|
||||
}
|
||||
int v = it->second;
|
||||
*len = v & 63;
|
||||
*dist = v >> 6;
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
std::unordered_map<std::string, int> map_;
|
||||
std::unordered_map<uint32_t, int> prefix_map_;
|
||||
};
|
||||
static const int kMaxDictionaryMatchLen = 37;
|
||||
static const int kInvalidMatch = 0xfffffff;
|
||||
|
||||
// Matches data against static dictionary words, and for each length l,
|
||||
// for which a match is found, updates matches[l] to be the minimum possible
|
||||
// (distance << 5) + len_code.
|
||||
// Prerequisites:
|
||||
// matches array is at least kMaxDictionaryMatchLen + 1 long
|
||||
// all elements are initialized to kInvalidMatch
|
||||
bool FindAllStaticDictionaryMatches(const uint8_t* data,
|
||||
int min_length,
|
||||
int* matches);
|
||||
|
||||
} // namespace brotli
|
||||
|
||||
|
12063
enc/static_dict_lut.h
Normal file
12063
enc/static_dict_lut.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -179,6 +179,14 @@ static const Transform kTransforms[] = {
|
||||
|
||||
static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
|
||||
|
||||
static const int kOmitFirstNTransforms[10] = {
|
||||
0, 3, 11, 26, 34, 39, 40, 55, 0, 54
|
||||
};
|
||||
|
||||
static const int kOmitLastNTransforms[10] = {
|
||||
0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
|
||||
};
|
||||
|
||||
static int ToUpperCase(uint8_t *p, int len) {
|
||||
if (len == 1 || p[0] < 0xc0) {
|
||||
if (p[0] >= 'a' && p[0] <= 'z') {
|
||||
|
2
setup.py
2
setup.py
@ -102,6 +102,7 @@ brotli = Extension("brotli",
|
||||
"enc/histogram.cc",
|
||||
"enc/literal_cost.cc",
|
||||
"enc/metablock.cc",
|
||||
"enc/static_dict.cc",
|
||||
"enc/streams.cc",
|
||||
"dec/bit_reader.c",
|
||||
"dec/decode.c",
|
||||
@ -132,6 +133,7 @@ brotli = Extension("brotli",
|
||||
"enc/prefix.h",
|
||||
"enc/ringbuffer.h",
|
||||
"enc/static_dict.h",
|
||||
"enc/static_dict_lut.h",
|
||||
"enc/streams.h",
|
||||
"enc/transform.h",
|
||||
"enc/write_bits.h",
|
||||
|
Loading…
Reference in New Issue
Block a user