/* Copyright 2010 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/
|
|
|
|
|
2013-10-23 11:06:13 +00:00
|
|
|
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
|
|
|
|
|
|
|
|
#ifndef BROTLI_ENC_HASH_H_
|
|
|
|
#define BROTLI_ENC_HASH_H_
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <algorithm>
|
2016-01-07 15:27:49 +00:00
|
|
|
#include <cstring>
|
|
|
|
#include <vector>
|
2013-10-23 11:06:13 +00:00
|
|
|
|
2015-04-23 13:52:32 +00:00
|
|
|
#include "./dictionary_hash.h"
|
2013-10-23 11:06:13 +00:00
|
|
|
#include "./fast_log.h"
|
|
|
|
#include "./find_match_length.h"
|
|
|
|
#include "./port.h"
|
2014-10-28 12:25:22 +00:00
|
|
|
#include "./prefix.h"
|
2014-03-20 13:32:35 +00:00
|
|
|
#include "./static_dict.h"
|
2015-04-23 13:52:32 +00:00
|
|
|
#include "./transform.h"
|
2015-10-01 10:08:14 +00:00
|
|
|
#include "./types.h"
|
2013-10-23 11:06:13 +00:00
|
|
|
|
|
|
|
namespace brotli {
|
|
|
|
|
2016-01-07 15:27:49 +00:00
|
|
|
// The two tables below are indexed by distance short code (0..15) and are
// used together: short code i stands for the backward distance
//   distance_cache[kDistanceCacheIndex[i]] + kDistanceCacheOffset[i].

// Which entry of the distance cache short code i is based on.
static const uint32_t kDistanceCacheIndex[] = {
  0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
};

// Signed offset that short code i adds to the selected cache entry.
static const int kDistanceCacheOffset[] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};
|
|
|
|
|
2016-01-07 15:27:49 +00:00
|
|
|
// Number of entries in kCutoffTransforms.
static const uint32_t kCutoffTransformsCount = 10;

// Transform id used when only a prefix of a static dictionary word matches:
// the table is indexed by the number of trailing bytes of the word that are
// cut off (0..9). See the word transform list in RFC 7932.
static const uint8_t kCutoffTransforms[] = {
  0, 12, 27, 23, 42, 63, 56, 48, 59, 64
};
|
2015-08-10 11:13:58 +00:00
|
|
|
|
2013-10-23 11:06:13 +00:00
|
|
|
// kHashMul32 multiplier has these properties:
// * The multiplier must be odd. Otherwise we may lose the highest bit.
// * No long streaks of 1s or 0s.
// * There is no effort to ensure that it is a prime; being odd is enough
//   for this use.
// * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd;
|
|
|
|
|
2015-05-11 12:11:07 +00:00
|
|
|
template<int kShiftBits>
|
2014-03-20 13:32:35 +00:00
|
|
|
inline uint32_t Hash(const uint8_t *data) {
|
2015-05-11 12:11:07 +00:00
|
|
|
uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
|
|
|
|
// The higher bits contain more mixture from the multiplication,
|
|
|
|
// so we take our results from there.
|
|
|
|
return h >> (32 - kShiftBits);
|
2013-10-23 11:06:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Usually, we always choose the longest backward reference. This function
|
|
|
|
// allows for the exception of that rule.
|
|
|
|
//
|
|
|
|
// If we choose a backward reference that is further away, it will
|
|
|
|
// usually be coded with more bits. We approximate this by assuming
|
|
|
|
// log2(distance). If the distance can be expressed in terms of the
|
|
|
|
// last four distances, we use some heuristic constants to estimate
|
|
|
|
// the bits cost. For the first up to four literals we use the bit
|
|
|
|
// cost of the literals from the literal cost model, after that we
|
|
|
|
// use the average bit cost of the cost model.
|
|
|
|
//
|
|
|
|
// This function is used to sometimes discard a longer backward reference
|
|
|
|
// when it is not much longer and the bit cost for encoding it is more
|
|
|
|
// than the saved literals.
|
2016-01-07 15:27:49 +00:00
|
|
|
//
|
|
|
|
// backward_reference_offset MUST be positive.
|
|
|
|
inline double BackwardReferenceScore(size_t copy_length,
|
|
|
|
size_t backward_reference_offset) {
|
|
|
|
return 5.4 * static_cast<double>(copy_length) -
|
|
|
|
1.20 * Log2FloorNonZero(backward_reference_offset);
|
2014-03-20 13:32:35 +00:00
|
|
|
}
|
|
|
|
|
2016-01-07 15:27:49 +00:00
|
|
|
// Score of a copy whose distance is expressed with a distance short code
// instead of an explicit distance: 5.4 per copied byte minus a fixed,
// heuristic cost for the short code. Higher is better.
//
// distance_short_code MUST be in [0, 15]; it indexes a 16-entry table and
// is not range-checked here.
inline double BackwardReferenceScoreUsingLastDistance(size_t copy_length,
                                                      size_t distance_short_code) {
  static const double kDistanceShortCodeBitCost[16] = {
    -0.6, 0.95, 1.17, 1.27,
    0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
    1.05, 1.05, 1.15, 1.15, 1.25, 1.25
  };
  return 5.4 * static_cast<double>(copy_length) -
      kDistanceShortCodeBitCost[distance_short_code];
}
|
|
|
|
|
2015-06-12 14:25:41 +00:00
|
|
|
// A candidate backward reference: a distance plus a copy length, with an
// optional separate "length code" packed into the same 32-bit word.
//
// length_and_code layout: bits 5 and up hold the length, the low 5 bits hold
// the length code, or 0 when the length code equals the length.
struct BackwardMatch {
  BackwardMatch() : distance(0), length_and_code(0) {}

  // Match whose length code is the same as its length.
  BackwardMatch(size_t dist, size_t len)
      : distance(static_cast<uint32_t>(dist))
      , length_and_code(static_cast<uint32_t>(len << 5)) {}

  // Match with an explicit length code. len_code must fit in 5 bits
  // (0..31), because it is OR-ed into the low 5 bits without masking.
  BackwardMatch(size_t dist, size_t len, size_t len_code)
      : distance(static_cast<uint32_t>(dist))
      , length_and_code(static_cast<uint32_t>(
            (len << 5) | (len == len_code ? 0 : len_code))) {}

  // Number of bytes the match covers.
  size_t length() const {
    return length_and_code >> 5;
  }

  // The stored length code, or length() when none was stored.
  size_t length_code() const {
    size_t code = length_and_code & 31;
    return code ? code : length();
  }

  uint32_t distance;
  uint32_t length_and_code;
};
|
|
|
|
|
2014-10-28 12:25:22 +00:00
|
|
|
// A (forgetful) hash table to the data seen by the compressor, to
|
|
|
|
// help create backward references to previous data.
|
|
|
|
//
|
|
|
|
// This is a hash map of fixed size (kBucketSize). Starting from the
|
|
|
|
// given index, kBucketSweep buckets are used to store values of a key.
|
2015-05-07 15:30:10 +00:00
|
|
|
template <int kBucketBits, int kBucketSweep, bool kUseDictionary>
|
2014-10-28 12:25:22 +00:00
|
|
|
class HashLongestMatchQuickly {
|
|
|
|
public:
|
|
|
|
HashLongestMatchQuickly() {
|
|
|
|
Reset();
|
|
|
|
}
|
|
|
|
void Reset() {
|
|
|
|
// It is not strictly necessary to fill this buffer here, but
|
|
|
|
// not filling will make the results of the compression stochastic
|
|
|
|
// (but correct). This is because random data would cause the
|
|
|
|
// system to find accidentally good backward references here and there.
|
2015-08-28 14:09:23 +00:00
|
|
|
memset(&buckets_[0], 0, sizeof(buckets_));
|
2015-04-23 13:52:32 +00:00
|
|
|
num_dict_lookups_ = 0;
|
|
|
|
num_dict_matches_ = 0;
|
2014-10-28 12:25:22 +00:00
|
|
|
}
|
|
|
|
// Look at 4 bytes at data.
|
|
|
|
// Compute a hash from these, and store the value somewhere within
|
|
|
|
// [ix .. ix+3].
|
2015-10-26 16:08:57 +00:00
|
|
|
inline void Store(const uint8_t *data, const uint32_t ix) {
|
2015-08-28 14:09:23 +00:00
|
|
|
const uint32_t key = HashBytes(data);
|
2014-10-28 12:25:22 +00:00
|
|
|
// Wiggle the value with the bucket sweep range.
|
2015-10-26 16:08:57 +00:00
|
|
|
const uint32_t off = (ix >> 3) % kBucketSweep;
|
2014-10-28 12:25:22 +00:00
|
|
|
buckets_[key + off] = ix;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find a longest backward match of &ring_buffer[cur_ix & ring_buffer_mask]
|
|
|
|
// up to the length of max_length.
|
|
|
|
//
|
|
|
|
// Does not look for matches longer than max_length.
|
|
|
|
// Does not look for matches further away than max_backward.
|
|
|
|
// Writes the best found match length into best_len_out.
|
|
|
|
// Writes the index (&data[index]) of the start of the best match into
|
|
|
|
// best_distance_out.
|
|
|
|
inline bool FindLongestMatch(const uint8_t * __restrict ring_buffer,
|
|
|
|
const size_t ring_buffer_mask,
|
|
|
|
const int* __restrict distance_cache,
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t cur_ix,
|
|
|
|
const size_t max_length,
|
|
|
|
const size_t max_backward,
|
|
|
|
size_t * __restrict best_len_out,
|
|
|
|
size_t * __restrict best_len_code_out,
|
|
|
|
size_t * __restrict best_distance_out,
|
2014-10-28 12:25:22 +00:00
|
|
|
double* __restrict best_score_out) {
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t best_len_in = *best_len_out;
|
2015-10-28 16:44:47 +00:00
|
|
|
const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
|
2014-10-28 12:25:22 +00:00
|
|
|
int compare_char = ring_buffer[cur_ix_masked + best_len_in];
|
|
|
|
double best_score = *best_score_out;
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t best_len = best_len_in;
|
|
|
|
size_t cached_backward = static_cast<size_t>(distance_cache[0]);
|
|
|
|
size_t prev_ix = cur_ix - cached_backward;
|
2014-10-28 12:25:22 +00:00
|
|
|
bool match_found = false;
|
|
|
|
if (prev_ix < cur_ix) {
|
2015-10-28 16:44:47 +00:00
|
|
|
prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (compare_char == ring_buffer[prev_ix + best_len]) {
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
|
|
|
|
&ring_buffer[cur_ix_masked],
|
|
|
|
max_length);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (len >= 4) {
|
2015-06-12 14:25:41 +00:00
|
|
|
best_score = BackwardReferenceScoreUsingLastDistance(len, 0);
|
2014-10-28 12:25:22 +00:00
|
|
|
best_len = len;
|
|
|
|
*best_len_out = len;
|
|
|
|
*best_len_code_out = len;
|
2015-10-01 09:40:05 +00:00
|
|
|
*best_distance_out = cached_backward;
|
2014-10-28 12:25:22 +00:00
|
|
|
*best_score_out = best_score;
|
|
|
|
compare_char = ring_buffer[cur_ix_masked + best_len];
|
|
|
|
if (kBucketSweep == 1) {
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
match_found = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-08-28 14:09:23 +00:00
|
|
|
const uint32_t key = HashBytes(&ring_buffer[cur_ix_masked]);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (kBucketSweep == 1) {
|
|
|
|
// Only one to look for, don't bother to prepare for a loop.
|
|
|
|
prev_ix = buckets_[key];
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t backward = cur_ix - prev_ix;
|
2015-10-28 16:44:47 +00:00
|
|
|
prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (compare_char != ring_buffer[prev_ix + best_len_in]) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
|
|
|
|
return false;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
|
|
|
|
&ring_buffer[cur_ix_masked],
|
|
|
|
max_length);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (len >= 4) {
|
|
|
|
*best_len_out = len;
|
|
|
|
*best_len_code_out = len;
|
|
|
|
*best_distance_out = backward;
|
2015-06-12 14:25:41 +00:00
|
|
|
*best_score_out = BackwardReferenceScore(len, backward);
|
2014-10-28 12:25:22 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
uint32_t *bucket = buckets_ + key;
|
|
|
|
prev_ix = *bucket++;
|
|
|
|
for (int i = 0; i < kBucketSweep; ++i, prev_ix = *bucket++) {
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t backward = cur_ix - prev_ix;
|
2015-10-28 16:44:47 +00:00
|
|
|
prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (compare_char != ring_buffer[prev_ix + best_len]) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
|
|
|
|
continue;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
|
|
|
|
&ring_buffer[cur_ix_masked],
|
|
|
|
max_length);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (len >= 4) {
|
2015-06-12 14:25:41 +00:00
|
|
|
const double score = BackwardReferenceScore(len, backward);
|
2014-10-28 12:25:22 +00:00
|
|
|
if (best_score < score) {
|
|
|
|
best_score = score;
|
|
|
|
best_len = len;
|
|
|
|
*best_len_out = best_len;
|
|
|
|
*best_len_code_out = best_len;
|
|
|
|
*best_distance_out = backward;
|
|
|
|
*best_score_out = score;
|
|
|
|
compare_char = ring_buffer[cur_ix_masked + best_len];
|
|
|
|
match_found = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-05-07 15:30:10 +00:00
|
|
|
}
|
|
|
|
if (kUseDictionary && !match_found &&
|
|
|
|
num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
|
|
|
|
++num_dict_lookups_;
|
2015-10-01 09:40:05 +00:00
|
|
|
const uint32_t dict_key = Hash<14>(&ring_buffer[cur_ix_masked]) << 1;
|
|
|
|
const uint16_t v = kStaticDictionaryHash[dict_key];
|
2015-05-07 15:30:10 +00:00
|
|
|
if (v > 0) {
|
2016-01-07 15:27:49 +00:00
|
|
|
const uint32_t len = v & 31;
|
|
|
|
const uint32_t dist = v >> 5;
|
|
|
|
const size_t offset =
|
|
|
|
kBrotliDictionaryOffsetsByLength[len] + len * dist;
|
2015-05-07 15:30:10 +00:00
|
|
|
if (len <= max_length) {
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t matchlen =
|
2015-05-07 15:30:10 +00:00
|
|
|
FindMatchLengthWithLimit(&ring_buffer[cur_ix_masked],
|
|
|
|
&kBrotliDictionary[offset], len);
|
2016-01-07 15:27:49 +00:00
|
|
|
if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
|
|
|
|
const size_t transform_id = kCutoffTransforms[len - matchlen];
|
|
|
|
const size_t word_id =
|
2015-08-10 11:13:58 +00:00
|
|
|
transform_id * (1 << kBrotliDictionarySizeBitsByLength[len]) +
|
|
|
|
dist;
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t backward = max_backward + word_id + 1;
|
2015-08-10 11:13:58 +00:00
|
|
|
const double score = BackwardReferenceScore(matchlen, backward);
|
2015-05-07 15:30:10 +00:00
|
|
|
if (best_score < score) {
|
|
|
|
++num_dict_matches_;
|
|
|
|
best_score = score;
|
2015-08-10 11:13:58 +00:00
|
|
|
best_len = matchlen;
|
2015-05-07 15:30:10 +00:00
|
|
|
*best_len_out = best_len;
|
2015-08-10 11:13:58 +00:00
|
|
|
*best_len_code_out = len;
|
2015-05-07 15:30:10 +00:00
|
|
|
*best_distance_out = backward;
|
|
|
|
*best_score_out = best_score;
|
|
|
|
return true;
|
2015-04-23 13:52:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2014-10-28 12:25:22 +00:00
|
|
|
}
|
2015-05-07 15:30:10 +00:00
|
|
|
return match_found;
|
2014-10-28 12:25:22 +00:00
|
|
|
}
|
|
|
|
|
2015-08-28 14:09:23 +00:00
|
|
|
enum { kHashLength = 5 };
|
|
|
|
enum { kHashTypeLength = 8 };
|
|
|
|
// HashBytes is the function that chooses the bucket to place
|
|
|
|
// the address in. The HashLongestMatch and HashLongestMatchQuickly
|
|
|
|
// classes have separate, different implementations of hashing.
|
|
|
|
static uint32_t HashBytes(const uint8_t *data) {
|
|
|
|
// Computing a hash based on 5 bytes works much better for
|
|
|
|
// qualities 1 and 3, where the next hash value is likely to replace
|
|
|
|
uint64_t h = (BROTLI_UNALIGNED_LOAD64(data) << 24) * kHashMul32;
|
|
|
|
// The higher bits contain more mixture from the multiplication,
|
|
|
|
// so we take our results from there.
|
2015-10-28 16:44:47 +00:00
|
|
|
return static_cast<uint32_t>(h >> (64 - kBucketBits));
|
2015-08-28 14:09:23 +00:00
|
|
|
}
|
|
|
|
|
2014-10-28 12:25:22 +00:00
|
|
|
private:
|
|
|
|
static const uint32_t kBucketSize = 1 << kBucketBits;
|
|
|
|
uint32_t buckets_[kBucketSize + kBucketSweep];
|
2015-04-23 13:52:32 +00:00
|
|
|
size_t num_dict_lookups_;
|
|
|
|
size_t num_dict_matches_;
|
2014-10-28 12:25:22 +00:00
|
|
|
};
|
|
|
|
|
2015-06-12 14:25:41 +00:00
|
|
|
// The maximum length for which the zopflification uses distinct distances.
static const uint16_t kMaxZopfliLen = 325;
|
2015-06-12 14:25:41 +00:00
|
|
|
|
2013-10-23 11:06:13 +00:00
|
|
|
// A (forgetful) hash table to the data seen by the compressor, to
|
|
|
|
// help create backward references to previous data.
|
|
|
|
//
|
|
|
|
// This is a hash map of fixed size (kBucketSize) to a ring buffer of
|
|
|
|
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
|
|
|
|
// index positions of the given hash key in the compressed data.
|
2014-10-28 12:25:22 +00:00
|
|
|
template <int kBucketBits,
|
|
|
|
int kBlockBits,
|
2015-06-12 14:25:41 +00:00
|
|
|
int kNumLastDistancesToCheck>
|
2013-10-23 11:06:13 +00:00
|
|
|
class HashLongestMatch {
|
|
|
|
public:
|
2015-06-12 14:45:17 +00:00
|
|
|
HashLongestMatch() {
|
2013-10-23 11:06:13 +00:00
|
|
|
Reset();
|
|
|
|
}
|
2015-06-12 14:25:41 +00:00
|
|
|
|
2013-10-23 11:06:13 +00:00
|
|
|
void Reset() {
|
2015-08-28 14:09:23 +00:00
|
|
|
memset(&num_[0], 0, sizeof(num_));
|
2015-04-23 13:52:32 +00:00
|
|
|
num_dict_lookups_ = 0;
|
|
|
|
num_dict_matches_ = 0;
|
2013-10-23 11:06:13 +00:00
|
|
|
}
|
2015-06-12 14:25:41 +00:00
|
|
|
|
2013-10-23 11:06:13 +00:00
|
|
|
// Look at 3 bytes at data.
|
|
|
|
// Compute a hash from these, and store the value of ix at that position.
|
2015-10-26 16:08:57 +00:00
|
|
|
inline void Store(const uint8_t *data, const uint32_t ix) {
|
2015-08-28 14:09:23 +00:00
|
|
|
const uint32_t key = HashBytes(data);
|
2013-10-23 11:06:13 +00:00
|
|
|
const int minor_ix = num_[key] & kBlockMask;
|
|
|
|
buckets_[key][minor_ix] = ix;
|
|
|
|
++num_[key];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find a longest backward match of &data[cur_ix] up to the length of
|
|
|
|
// max_length.
|
|
|
|
//
|
|
|
|
// Does not look for matches longer than max_length.
|
|
|
|
// Does not look for matches further away than max_backward.
|
|
|
|
// Writes the best found match length into best_len_out.
|
|
|
|
// Writes the index (&data[index]) offset from the start of the best match
|
|
|
|
// into best_distance_out.
|
|
|
|
// Write the score of the best match into best_score_out.
|
|
|
|
bool FindLongestMatch(const uint8_t * __restrict data,
|
2013-11-15 18:02:17 +00:00
|
|
|
const size_t ring_buffer_mask,
|
2014-10-28 12:25:22 +00:00
|
|
|
const int* __restrict distance_cache,
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t cur_ix,
|
|
|
|
const size_t max_length,
|
|
|
|
const size_t max_backward,
|
|
|
|
size_t * __restrict best_len_out,
|
|
|
|
size_t * __restrict best_len_code_out,
|
|
|
|
size_t * __restrict best_distance_out,
|
2014-10-28 12:25:22 +00:00
|
|
|
double * __restrict best_score_out) {
|
2014-02-14 14:04:23 +00:00
|
|
|
*best_len_code_out = 0;
|
2013-11-15 18:02:17 +00:00
|
|
|
const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
|
2013-10-23 11:06:13 +00:00
|
|
|
bool match_found = false;
|
|
|
|
// Don't accept a short copy from far away.
|
2014-10-28 12:25:22 +00:00
|
|
|
double best_score = *best_score_out;
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t best_len = *best_len_out;
|
2013-10-23 11:06:13 +00:00
|
|
|
*best_len_out = 0;
|
|
|
|
// Try last distance first.
|
2016-01-07 15:27:49 +00:00
|
|
|
for (size_t i = 0; i < kNumLastDistancesToCheck; ++i) {
|
|
|
|
const size_t idx = kDistanceCacheIndex[i];
|
|
|
|
const size_t backward =
|
|
|
|
static_cast<size_t>(distance_cache[idx] + kDistanceCacheOffset[i]);
|
|
|
|
size_t prev_ix = static_cast<size_t>(cur_ix - backward);
|
2013-10-23 11:06:13 +00:00
|
|
|
if (prev_ix >= cur_ix) {
|
|
|
|
continue;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
if (PREDICT_FALSE(backward > max_backward)) {
|
2013-10-23 11:06:13 +00:00
|
|
|
continue;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
prev_ix &= ring_buffer_mask;
|
2014-10-28 12:25:22 +00:00
|
|
|
|
2014-01-06 15:01:57 +00:00
|
|
|
if (cur_ix_masked + best_len > ring_buffer_mask ||
|
|
|
|
prev_ix + best_len > ring_buffer_mask ||
|
|
|
|
data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
|
2013-10-23 11:06:13 +00:00
|
|
|
continue;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t len = FindMatchLengthWithLimit(&data[prev_ix],
|
|
|
|
&data[cur_ix_masked],
|
|
|
|
max_length);
|
2015-05-11 12:11:07 +00:00
|
|
|
if (len >= 3 || (len == 2 && i < 2)) {
|
2013-10-23 11:06:13 +00:00
|
|
|
// Comparing for >= 2 does not change the semantics, but just saves for
|
|
|
|
// a few unnecessary binary logarithms in backward reference score,
|
|
|
|
// since we are not interested in such short matches.
|
2015-06-12 14:25:41 +00:00
|
|
|
double score = BackwardReferenceScoreUsingLastDistance(len, i);
|
2013-10-23 11:06:13 +00:00
|
|
|
if (best_score < score) {
|
|
|
|
best_score = score;
|
|
|
|
best_len = len;
|
|
|
|
*best_len_out = best_len;
|
2013-11-19 22:32:56 +00:00
|
|
|
*best_len_code_out = best_len;
|
2014-10-28 12:25:22 +00:00
|
|
|
*best_distance_out = backward;
|
2013-10-23 11:06:13 +00:00
|
|
|
*best_score_out = best_score;
|
|
|
|
match_found = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-08-28 14:09:23 +00:00
|
|
|
const uint32_t key = HashBytes(&data[cur_ix_masked]);
|
2015-10-26 16:08:57 +00:00
|
|
|
const uint32_t * __restrict const bucket = &buckets_[key][0];
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
|
|
|
|
for (size_t i = num_[key]; i > down;) {
|
|
|
|
--i;
|
|
|
|
size_t prev_ix = bucket[i & kBlockMask];
|
|
|
|
const size_t backward = cur_ix - prev_ix;
|
2015-10-26 16:08:57 +00:00
|
|
|
if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
|
|
|
|
break;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
prev_ix &= ring_buffer_mask;
|
2015-10-26 16:08:57 +00:00
|
|
|
if (cur_ix_masked + best_len > ring_buffer_mask ||
|
|
|
|
prev_ix + best_len > ring_buffer_mask ||
|
|
|
|
data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
|
|
|
|
continue;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t len = FindMatchLengthWithLimit(&data[prev_ix],
|
|
|
|
&data[cur_ix_masked],
|
|
|
|
max_length);
|
2015-10-26 16:08:57 +00:00
|
|
|
if (len >= 4) {
|
|
|
|
// Comparing for >= 3 does not change the semantics, but just saves
|
|
|
|
// for a few unnecessary binary logarithms in backward reference
|
|
|
|
// score, since we are not interested in such short matches.
|
|
|
|
double score = BackwardReferenceScore(len, backward);
|
|
|
|
if (best_score < score) {
|
|
|
|
best_score = score;
|
|
|
|
best_len = len;
|
|
|
|
*best_len_out = best_len;
|
|
|
|
*best_len_code_out = best_len;
|
|
|
|
*best_distance_out = backward;
|
|
|
|
*best_score_out = best_score;
|
|
|
|
match_found = true;
|
2013-10-23 11:06:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-04-23 13:52:32 +00:00
|
|
|
if (!match_found && num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t dict_key = Hash<14>(&data[cur_ix_masked]) << 1;
|
2015-10-01 09:40:05 +00:00
|
|
|
for (int k = 0; k < 2; ++k, ++dict_key) {
|
2015-04-23 13:52:32 +00:00
|
|
|
++num_dict_lookups_;
|
2015-10-01 09:40:05 +00:00
|
|
|
const uint16_t v = kStaticDictionaryHash[dict_key];
|
2015-04-23 13:52:32 +00:00
|
|
|
if (v > 0) {
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t len = v & 31;
|
|
|
|
const size_t dist = v >> 5;
|
|
|
|
const size_t offset =
|
|
|
|
kBrotliDictionaryOffsetsByLength[len] + len * dist;
|
2015-04-23 13:52:32 +00:00
|
|
|
if (len <= max_length) {
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t matchlen =
|
2015-04-23 13:52:32 +00:00
|
|
|
FindMatchLengthWithLimit(&data[cur_ix_masked],
|
|
|
|
&kBrotliDictionary[offset], len);
|
2016-01-07 15:27:49 +00:00
|
|
|
if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
|
|
|
|
const size_t transform_id = kCutoffTransforms[len - matchlen];
|
|
|
|
const size_t word_id =
|
2015-08-10 11:13:58 +00:00
|
|
|
transform_id * (1 << kBrotliDictionarySizeBitsByLength[len]) +
|
|
|
|
dist;
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t backward = max_backward + word_id + 1;
|
2015-08-10 11:13:58 +00:00
|
|
|
double score = BackwardReferenceScore(matchlen, backward);
|
2015-04-23 13:52:32 +00:00
|
|
|
if (best_score < score) {
|
|
|
|
++num_dict_matches_;
|
|
|
|
best_score = score;
|
2015-08-10 11:13:58 +00:00
|
|
|
best_len = matchlen;
|
2015-04-23 13:52:32 +00:00
|
|
|
*best_len_out = best_len;
|
2015-08-10 11:13:58 +00:00
|
|
|
*best_len_code_out = len;
|
2015-04-23 13:52:32 +00:00
|
|
|
*best_distance_out = backward;
|
|
|
|
*best_score_out = best_score;
|
|
|
|
match_found = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-06-12 14:25:41 +00:00
|
|
|
return match_found;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Similar to FindLongestMatch(), but finds all matches.
|
|
|
|
//
|
|
|
|
// Sets *num_matches to the number of matches found, and stores the found
|
|
|
|
// matches in matches[0] to matches[*num_matches - 1].
|
|
|
|
//
|
|
|
|
// If the longest match is longer than kMaxZopfliLen, returns only this
|
|
|
|
// longest match.
|
|
|
|
//
|
|
|
|
// Requires that at least kMaxZopfliLen space is available in matches.
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t FindAllMatches(const uint8_t* data,
|
|
|
|
const size_t ring_buffer_mask,
|
|
|
|
const size_t cur_ix,
|
|
|
|
const size_t max_length,
|
|
|
|
const size_t max_backward,
|
|
|
|
BackwardMatch* matches) const {
|
2015-06-12 14:25:41 +00:00
|
|
|
BackwardMatch* const orig_matches = matches;
|
|
|
|
const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t best_len = 1;
|
|
|
|
size_t stop = cur_ix - 64;
|
|
|
|
if (cur_ix < 64) { stop = 0; }
|
|
|
|
for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) {
|
2015-06-12 14:25:41 +00:00
|
|
|
size_t prev_ix = i;
|
|
|
|
const size_t backward = cur_ix - prev_ix;
|
|
|
|
if (PREDICT_FALSE(backward > max_backward)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
prev_ix &= ring_buffer_mask;
|
|
|
|
if (data[cur_ix_masked] != data[prev_ix] ||
|
|
|
|
data[cur_ix_masked + 1] != data[prev_ix + 1]) {
|
|
|
|
continue;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t len =
|
2015-06-12 14:25:41 +00:00
|
|
|
FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
|
|
|
|
max_length);
|
|
|
|
if (len > best_len) {
|
|
|
|
best_len = len;
|
|
|
|
if (len > kMaxZopfliLen) {
|
|
|
|
matches = orig_matches;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
*matches++ = BackwardMatch(backward, len);
|
2015-06-12 14:25:41 +00:00
|
|
|
}
|
|
|
|
}
|
2015-08-28 14:09:23 +00:00
|
|
|
const uint32_t key = HashBytes(&data[cur_ix_masked]);
|
2015-10-26 16:08:57 +00:00
|
|
|
const uint32_t * __restrict const bucket = &buckets_[key][0];
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
|
|
|
|
for (size_t i = num_[key]; i > down;) {
|
|
|
|
--i;
|
|
|
|
size_t prev_ix = bucket[i & kBlockMask];
|
|
|
|
const size_t backward = cur_ix - prev_ix;
|
2015-10-26 16:08:57 +00:00
|
|
|
if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
|
|
|
|
break;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
prev_ix &= ring_buffer_mask;
|
2015-10-26 16:08:57 +00:00
|
|
|
if (cur_ix_masked + best_len > ring_buffer_mask ||
|
|
|
|
prev_ix + best_len > ring_buffer_mask ||
|
|
|
|
data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
|
|
|
|
continue;
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
const size_t len =
|
2015-10-26 16:08:57 +00:00
|
|
|
FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
|
|
|
|
max_length);
|
|
|
|
if (len > best_len) {
|
|
|
|
best_len = len;
|
|
|
|
if (len > kMaxZopfliLen) {
|
|
|
|
matches = orig_matches;
|
2015-06-12 14:25:41 +00:00
|
|
|
}
|
2015-10-26 16:08:57 +00:00
|
|
|
*matches++ = BackwardMatch(backward, len);
|
2015-06-12 14:25:41 +00:00
|
|
|
}
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
std::vector<uint32_t> dict_matches(kMaxDictionaryMatchLen + 1,
|
|
|
|
kInvalidMatch);
|
|
|
|
size_t minlen = std::max<size_t>(4, best_len + 1);
|
2015-08-10 11:13:58 +00:00
|
|
|
if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
|
2015-06-12 14:45:17 +00:00
|
|
|
&dict_matches[0])) {
|
2016-01-07 15:27:49 +00:00
|
|
|
size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length);
|
|
|
|
for (size_t l = minlen; l <= maxlen; ++l) {
|
|
|
|
uint32_t dict_id = dict_matches[l];
|
2015-06-12 14:45:17 +00:00
|
|
|
if (dict_id < kInvalidMatch) {
|
|
|
|
*matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l,
|
|
|
|
dict_id & 31);
|
2014-03-20 13:32:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-01-07 15:27:49 +00:00
|
|
|
return static_cast<size_t>(matches - orig_matches);
|
2013-10-23 11:06:13 +00:00
|
|
|
}
|
|
|
|
|
2015-08-28 14:09:23 +00:00
|
|
|
enum { kHashLength = 4 };
|
|
|
|
enum { kHashTypeLength = 4 };
|
|
|
|
|
|
|
|
// HashBytes is the function that chooses the bucket to place
|
|
|
|
// the address in. The HashLongestMatch and HashLongestMatchQuickly
|
|
|
|
// classes have separate, different implementations of hashing.
|
|
|
|
static uint32_t HashBytes(const uint8_t *data) {
|
|
|
|
uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
|
|
|
|
// The higher bits contain more mixture from the multiplication,
|
|
|
|
// so we take our results from there.
|
|
|
|
return h >> (32 - kBucketBits);
|
|
|
|
}
|
|
|
|
|
2013-10-23 11:06:13 +00:00
|
|
|
private:
|
|
|
|
// Number of hash buckets.
|
|
|
|
static const uint32_t kBucketSize = 1 << kBucketBits;
|
|
|
|
|
|
|
|
// Only kBlockSize newest backward references are kept,
|
|
|
|
// and the older are forgotten.
|
|
|
|
static const uint32_t kBlockSize = 1 << kBlockBits;
|
|
|
|
|
|
|
|
// Mask for accessing entries in a block (in a ringbuffer manner).
|
|
|
|
static const uint32_t kBlockMask = (1 << kBlockBits) - 1;
|
|
|
|
|
|
|
|
// Number of entries in a particular bucket.
|
|
|
|
uint16_t num_[kBucketSize];
|
|
|
|
|
|
|
|
// Buckets containing kBlockSize of backward references.
|
2015-10-26 16:08:57 +00:00
|
|
|
uint32_t buckets_[kBucketSize][kBlockSize];
|
2013-10-23 11:06:13 +00:00
|
|
|
|
2015-04-23 13:52:32 +00:00
|
|
|
size_t num_dict_lookups_;
|
|
|
|
size_t num_dict_matches_;
|
2013-10-23 11:06:13 +00:00
|
|
|
};
|
|
|
|
|
2014-03-20 13:32:35 +00:00
|
|
|
struct Hashers {
|
2015-05-07 15:30:10 +00:00
|
|
|
// For kBucketSweep == 1, enabling the dictionary lookup makes compression
|
|
|
|
// a little faster (0.5% - 1%) and it compresses 0.15% better on small text
|
|
|
|
// and html inputs.
|
|
|
|
typedef HashLongestMatchQuickly<16, 1, true> H1;
|
|
|
|
typedef HashLongestMatchQuickly<16, 2, false> H2;
|
|
|
|
typedef HashLongestMatchQuickly<16, 4, false> H3;
|
|
|
|
typedef HashLongestMatchQuickly<17, 4, true> H4;
|
2015-06-12 14:25:41 +00:00
|
|
|
typedef HashLongestMatch<14, 4, 4> H5;
|
|
|
|
typedef HashLongestMatch<14, 5, 4> H6;
|
|
|
|
typedef HashLongestMatch<15, 6, 10> H7;
|
|
|
|
typedef HashLongestMatch<15, 7, 10> H8;
|
|
|
|
typedef HashLongestMatch<15, 8, 16> H9;
|
2014-10-28 12:25:22 +00:00
|
|
|
|
2015-10-01 10:08:14 +00:00
|
|
|
Hashers() : hash_h1(0), hash_h2(0), hash_h3(0), hash_h4(0), hash_h5(0),
|
|
|
|
hash_h6(0), hash_h7(0), hash_h8(0), hash_h9(0) {}
|
|
|
|
|
|
|
|
~Hashers() {
|
|
|
|
delete hash_h1;
|
|
|
|
delete hash_h2;
|
|
|
|
delete hash_h3;
|
|
|
|
delete hash_h4;
|
|
|
|
delete hash_h5;
|
|
|
|
delete hash_h6;
|
|
|
|
delete hash_h7;
|
|
|
|
delete hash_h8;
|
|
|
|
delete hash_h9;
|
|
|
|
}
|
|
|
|
|
2014-10-28 12:25:22 +00:00
|
|
|
void Init(int type) {
|
2014-03-20 13:32:35 +00:00
|
|
|
switch (type) {
|
2015-10-01 10:08:14 +00:00
|
|
|
case 1: hash_h1 = new H1; break;
|
|
|
|
case 2: hash_h2 = new H2; break;
|
|
|
|
case 3: hash_h3 = new H3; break;
|
|
|
|
case 4: hash_h4 = new H4; break;
|
|
|
|
case 5: hash_h5 = new H5; break;
|
|
|
|
case 6: hash_h6 = new H6; break;
|
|
|
|
case 7: hash_h7 = new H7; break;
|
|
|
|
case 8: hash_h8 = new H8; break;
|
|
|
|
case 9: hash_h9 = new H9; break;
|
2014-10-28 12:25:22 +00:00
|
|
|
default: break;
|
2014-03-20 13:32:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-06-12 13:43:54 +00:00
|
|
|
template<typename Hasher>
|
|
|
|
void WarmupHash(const size_t size, const uint8_t* dict, Hasher* hasher) {
|
2015-08-28 14:09:23 +00:00
|
|
|
for (size_t i = 0; i + Hasher::kHashTypeLength - 1 < size; i++) {
|
2015-10-26 16:08:57 +00:00
|
|
|
hasher->Store(&dict[i], static_cast<uint32_t>(i));
|
2015-06-12 13:43:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Custom LZ77 window.
|
|
|
|
void PrependCustomDictionary(
|
|
|
|
int type, const size_t size, const uint8_t* dict) {
|
|
|
|
switch (type) {
|
2015-10-01 10:08:14 +00:00
|
|
|
case 1: WarmupHash(size, dict, hash_h1); break;
|
|
|
|
case 2: WarmupHash(size, dict, hash_h2); break;
|
|
|
|
case 3: WarmupHash(size, dict, hash_h3); break;
|
|
|
|
case 4: WarmupHash(size, dict, hash_h4); break;
|
|
|
|
case 5: WarmupHash(size, dict, hash_h5); break;
|
|
|
|
case 6: WarmupHash(size, dict, hash_h6); break;
|
|
|
|
case 7: WarmupHash(size, dict, hash_h7); break;
|
|
|
|
case 8: WarmupHash(size, dict, hash_h8); break;
|
|
|
|
case 9: WarmupHash(size, dict, hash_h9); break;
|
2015-06-12 13:43:54 +00:00
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-01 15:08:59 +00:00
|
|
|
|
2015-10-01 10:08:14 +00:00
|
|
|
H1* hash_h1;
|
|
|
|
H2* hash_h2;
|
|
|
|
H3* hash_h3;
|
|
|
|
H4* hash_h4;
|
|
|
|
H5* hash_h5;
|
|
|
|
H6* hash_h6;
|
|
|
|
H7* hash_h7;
|
|
|
|
H8* hash_h8;
|
|
|
|
H9* hash_h9;
|
2014-03-20 13:32:35 +00:00
|
|
|
};
|
2013-11-15 18:02:17 +00:00
|
|
|
|
2013-10-23 11:06:13 +00:00
|
|
|
} // namespace brotli
|
|
|
|
|
|
|
|
#endif // BROTLI_ENC_HASH_H_
|