* Update
 * decoder: better behavior after failure
 * encoder: replace "len_x_code" with delta
 * research: add experimental dictionary generator
 * python: test combing
This commit is contained in:
Eugene Kliuchnikov 2017-07-21 10:07:24 +02:00 committed by GitHub
parent 172a378deb
commit 52441069ef
13 changed files with 397 additions and 87 deletions

View File

@ -1942,7 +1942,13 @@ BrotliDecoderResult BrotliDecoderDecompressStream(
if (result != BROTLI_DECODER_SUCCESS) { /* Error, needs more input/output */
if (result == BROTLI_DECODER_NEEDS_MORE_INPUT) {
if (s->ringbuffer != 0) { /* Pro-actively push output. */
WriteRingBuffer(s, available_out, next_out, total_out, BROTLI_TRUE);
BrotliDecoderErrorCode intermediate_result = WriteRingBuffer(s,
available_out, next_out, total_out, BROTLI_TRUE);
/* WriteRingBuffer checks s->meta_block_remaining_len validity. */
if ((int)intermediate_result < 0) {
result = intermediate_result;
break;
}
}
if (s->buffer_length != 0) { /* Used with internal buffer. */
if (br->avail_in == 0) { /* Successfully finished read transaction. */
@ -2327,6 +2333,10 @@ void BrotliDecoderSetCustomDictionary(
}
BROTLI_BOOL BrotliDecoderHasMoreOutput(const BrotliDecoderState* s) {
/* After unrecoverable error remaining output is considered nonsensical. */
if ((int)s->error_code < 0) {
return BROTLI_FALSE;
}
return TO_BROTLI_BOOL(
s->ringbuffer != 0 && UnwrittenBytes(s, BROTLI_FALSE) != 0);
}
@ -2336,17 +2346,20 @@ const uint8_t* BrotliDecoderTakeOutput(BrotliDecoderState* s, size_t* size) {
size_t available_out = *size ? *size : 1u << 24;
size_t requested_out = available_out;
BrotliDecoderErrorCode status;
if (s->ringbuffer == 0) {
if ((s->ringbuffer == 0) || ((int)s->error_code < 0)) {
*size = 0;
return 0;
}
WrapRingBuffer(s);
status = WriteRingBuffer(s, &available_out, &result, 0, BROTLI_TRUE);
/* Either WriteRingBuffer returns those "success" codes... */
if (status == BROTLI_DECODER_SUCCESS ||
status == BROTLI_DECODER_NEEDS_MORE_OUTPUT) {
*size = requested_out - available_out;
} else {
/* This might happen if previous decoder error code was ignored. */
/* ... or stream is broken. Normally this should be caught by
BrotliDecoderDecompressStream, this is just a safeguard. */
if ((int)status < 0) SaveErrorCode(s, status);
*size = 0;
result = 0;
}

View File

@ -549,8 +549,8 @@ void BrotliZopfliCreateCommands(const size_t num_bytes,
BROTLI_BOOL is_dictionary = TO_BROTLI_BOOL(distance > max_distance);
size_t dist_code = ZopfliNodeDistanceCode(next);
InitCommand(
&commands[i], insert_length, copy_length, len_code, dist_code);
InitCommand(&commands[i], insert_length,
copy_length, (int)len_code - (int)copy_length, dist_code);
if (!is_dictionary && dist_code > 0) {
dist_cache[3] = dist_cache[2];

View File

@ -38,29 +38,29 @@ static BROTLI_NOINLINE void FN(CreateBackwardReferences)(
size_t max_distance = BROTLI_MIN(size_t, position, max_backward_limit);
HasherSearchResult sr;
sr.len = 0;
sr.len_x_code = 0;
sr.len_code_delta = 0;
sr.distance = 0;
sr.score = kMinScore;
if (FN(FindLongestMatch)(hasher, dictionary, dictionary_hash,
ringbuffer, ringbuffer_mask, dist_cache,
position, max_length, max_distance, &sr)) {
FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, ringbuffer,
ringbuffer_mask, dist_cache, position,
max_length, max_distance, &sr);
if (sr.score > kMinScore) {
/* Found a match. Let's look for something even better ahead. */
int delayed_backward_references_in_row = 0;
--max_length;
for (;; --max_length) {
const score_t cost_diff_lazy = 175;
BROTLI_BOOL is_match_found;
HasherSearchResult sr2;
sr2.len = params->quality < MIN_QUALITY_FOR_EXTENSIVE_REFERENCE_SEARCH ?
BROTLI_MIN(size_t, sr.len - 1, max_length) : 0;
sr2.len_x_code = 0;
sr2.len_code_delta = 0;
sr2.distance = 0;
sr2.score = kMinScore;
max_distance = BROTLI_MIN(size_t, position + 1, max_backward_limit);
is_match_found = FN(FindLongestMatch)(hasher, dictionary,
dictionary_hash, ringbuffer, ringbuffer_mask, dist_cache,
position + 1, max_length, max_distance, &sr2);
if (is_match_found && sr2.score >= sr.score + cost_diff_lazy) {
FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, ringbuffer,
ringbuffer_mask, dist_cache, position + 1,
max_length, max_distance, &sr2);
if (sr2.score >= sr.score + cost_diff_lazy) {
/* Ok, let's just write one byte for now and start a match from the
next byte. */
++position;
@ -88,7 +88,7 @@ static BROTLI_NOINLINE void FN(CreateBackwardReferences)(
dist_cache[0] = (int)sr.distance;
FN(PrepareDistanceCache)(hasher, dist_cache);
}
InitCommand(commands++, insert_length, sr.len, sr.len ^ sr.len_x_code,
InitCommand(commands++, insert_length, sr.len, sr.len_code_delta,
distance_code);
}
*num_literals += insert_length;

View File

@ -114,17 +114,19 @@ typedef struct Command {
/* distance_code is e.g. 0 for same-as-last short code, or 16 for offset 1. */
static BROTLI_INLINE void InitCommand(Command* self, size_t insertlen,
size_t copylen, size_t copylen_code, size_t distance_code) {
size_t copylen, int copylen_code_delta, size_t distance_code) {
/* Don't rely on signed int representation, use honest casts. */
uint32_t delta = (uint8_t)((int8_t)copylen_code_delta);
self->insert_len_ = (uint32_t)insertlen;
self->copy_len_ = (uint32_t)(copylen | ((copylen_code ^ copylen) << 24));
self->copy_len_ = (uint32_t)(copylen | (delta << 24));
/* The distance prefix and extra bits are stored in this Command as if
npostfix and ndirect were 0, they are only recomputed later after the
clustering if needed. */
PrefixEncodeCopyDistance(
distance_code, 0, 0, &self->dist_prefix_, &self->dist_extra_);
GetLengthCode(
insertlen, copylen_code, TO_BROTLI_BOOL(self->dist_prefix_ == 0),
&self->cmd_prefix_);
insertlen, (size_t)((int)copylen + copylen_code_delta),
TO_BROTLI_BOOL(self->dist_prefix_ == 0), &self->cmd_prefix_);
}
static BROTLI_INLINE void InitInsertCommand(Command* self, size_t insertlen) {
@ -167,7 +169,8 @@ static BROTLI_INLINE uint32_t CommandCopyLen(const Command* self) {
}
static BROTLI_INLINE uint32_t CommandCopyLenCode(const Command* self) {
return (self->copy_len_ & 0xFFFFFF) ^ (self->copy_len_ >> 24);
int32_t delta = (int8_t)((uint8_t)(self->copy_len_ >> 24));
return (uint32_t)((int32_t)(self->copy_len_ & 0xFFFFFF) + delta);
}
#if defined(__cplusplus) || defined(c_plusplus)

View File

@ -62,9 +62,9 @@ static const uint64_t kCutoffTransforms =
typedef struct HasherSearchResult {
size_t len;
size_t len_x_code; /* == len ^ len_code */
size_t distance;
score_t score;
int len_code_delta; /* == len_code - len */
} HasherSearchResult;
/* kHashMul32 multiplier has these properties:
@ -178,22 +178,21 @@ static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
return BROTLI_FALSE;
}
out->len = matchlen;
out->len_x_code = len ^ matchlen;
out->len_code_delta = (int)len - (int)matchlen;
out->distance = backward;
out->score = score;
return BROTLI_TRUE;
}
static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary(
static BROTLI_INLINE void SearchInStaticDictionary(
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
HasherHandle handle, const uint8_t* data, size_t max_length,
size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) {
size_t key;
size_t i;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
HasherCommon* self = GetHasherCommon(handle);
if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
return BROTLI_FALSE;
return;
}
key = Hash14(data) << 1;
for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
@ -204,11 +203,9 @@ static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary(
dictionary, item, data, max_length, max_backward, out);
if (item_matches) {
self->dict_num_matches++;
is_match_found = BROTLI_TRUE;
}
}
}
return is_match_found;
}
typedef struct BackwardMatch {

View File

@ -152,8 +152,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length.
Does not look for matches further away than max_backward.
Writes the best match into |out|.
Returns 1 when match is found, otherwise 0. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
|out|->score is updated only if a better match is found. */
static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle,
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask,
const int* BROTLI_RESTRICT distance_cache,
@ -161,15 +161,15 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
HasherSearchResult* BROTLI_RESTRICT out) {
HashForgetfulChain* self = FN(Self)(handle);
const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
/* Don't accept a short copy from far away. */
score_t min_score = out->score;
score_t best_score = out->score;
size_t best_len = out->len;
size_t i;
const size_t key = FN(HashBytes)(&data[cur_ix_masked]);
const uint8_t tiny_hash = (uint8_t)(key);
out->len = 0;
out->len_x_code = 0;
out->len_code_delta = 0;
/* Try last distance first. */
for (i = 0; i < NUM_LAST_DISTANCES_TO_CHECK; ++i) {
const size_t backward = (size_t)distance_cache[i];
@ -194,7 +194,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len;
out->distance = backward;
out->score = best_score;
is_match_found = BROTLI_TRUE;
}
}
}
@ -234,19 +233,17 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len;
out->distance = backward;
out->score = best_score;
is_match_found = BROTLI_TRUE;
}
}
}
}
FN(Store)(handle, data, ring_buffer_mask, cur_ix);
}
if (!is_match_found) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash,
if (out->score == min_score) {
SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_FALSE);
}
return is_match_found;
}
#undef BANK_SIZE

View File

@ -156,8 +156,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length.
Does not look for matches further away than max_backward.
Writes the best match into |out|.
Returns true when match is found, otherwise false. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
|out|->score is updated only if a better match is found. */
static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle,
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask,
const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix,
@ -168,13 +168,13 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
uint16_t* num = FN(Num)(self);
uint32_t* buckets = FN(Buckets)(self);
const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
/* Don't accept a short copy from far away. */
score_t min_score = out->score;
score_t best_score = out->score;
size_t best_len = out->len;
size_t i;
out->len = 0;
out->len_x_code = 0;
out->len_code_delta = 0;
/* Try last distance first. */
for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) {
const size_t backward = (size_t)distance_cache[i];
@ -209,7 +209,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len;
out->distance = backward;
out->score = best_score;
is_match_found = BROTLI_TRUE;
}
}
}
@ -250,7 +249,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len;
out->distance = backward;
out->score = best_score;
is_match_found = BROTLI_TRUE;
}
}
}
@ -258,12 +256,11 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix;
++num[key];
}
if (!is_match_found) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash,
if (min_score == out->score) {
SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_FALSE);
}
return is_match_found;
}
#undef HashLongestMatch

View File

@ -149,8 +149,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length.
Does not look for matches further away than max_backward.
Writes the best match into |out|.
Returns true when match is found, otherwise false. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
|out|->score is updated only if a better match is found. */
static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle,
const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask,
const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix,
@ -161,13 +161,13 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
uint16_t* num = FN(Num)(self);
uint32_t* buckets = FN(Buckets)(self);
const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
/* Don't accept a short copy from far away. */
score_t min_score = out->score;
score_t best_score = out->score;
size_t best_len = out->len;
size_t i;
out->len = 0;
out->len_x_code = 0;
out->len_code_delta = 0;
/* Try last distance first. */
for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) {
const size_t backward = (size_t)distance_cache[i];
@ -202,7 +202,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len;
out->distance = backward;
out->score = best_score;
is_match_found = BROTLI_TRUE;
}
}
}
@ -242,7 +241,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
out->len = best_len;
out->distance = backward;
out->score = best_score;
is_match_found = BROTLI_TRUE;
}
}
}
@ -250,12 +248,11 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle,
bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix;
++num[key];
}
if (!is_match_found) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash,
if (min_score == out->score) {
SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_FALSE);
}
return is_match_found;
}
#undef HashLongestMatch

View File

@ -123,8 +123,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)(
Does not look for matches longer than max_length.
Does not look for matches further away than max_backward.
Writes the best match into |out|.
Returns true if match is found, otherwise false. */
static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
|out|->score is updated only if a better match is found. */
static BROTLI_INLINE void FN(FindLongestMatch)(
HasherHandle handle, const BrotliDictionary* dictionary,
const uint16_t* dictionary_hash, const uint8_t* BROTLI_RESTRICT data,
const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache,
@ -135,12 +135,12 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
int compare_char = data[cur_ix_masked + best_len_in];
score_t min_score = out->score;
score_t best_score = out->score;
size_t best_len = best_len_in;
size_t cached_backward = (size_t)distance_cache[0];
size_t prev_ix = cur_ix - cached_backward;
BROTLI_BOOL is_match_found = BROTLI_FALSE;
out->len_x_code = 0;
out->len_code_delta = 0;
if (prev_ix < cur_ix) {
prev_ix &= (uint32_t)ring_buffer_mask;
if (compare_char == data[prev_ix + best_len]) {
@ -148,17 +148,18 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
&data[cur_ix_masked],
max_length);
if (len >= 4) {
best_score = BackwardReferenceScoreUsingLastDistance(len);
best_len = len;
out->len = len;
out->distance = cached_backward;
out->score = best_score;
compare_char = data[cur_ix_masked + best_len];
if (BUCKET_SWEEP == 1) {
self->buckets_[key] = (uint32_t)cur_ix;
return BROTLI_TRUE;
} else {
is_match_found = BROTLI_TRUE;
const score_t score = BackwardReferenceScoreUsingLastDistance(len);
if (best_score < score) {
best_score = score;
best_len = len;
out->len = len;
out->distance = cached_backward;
out->score = best_score;
compare_char = data[cur_ix_masked + best_len];
if (BUCKET_SWEEP == 1) {
self->buckets_[key] = (uint32_t)cur_ix;
return;
}
}
}
}
@ -172,19 +173,22 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
backward = cur_ix - prev_ix;
prev_ix &= (uint32_t)ring_buffer_mask;
if (compare_char != data[prev_ix + best_len_in]) {
return BROTLI_FALSE;
return;
}
if (BROTLI_PREDICT_FALSE(backward == 0 || backward > max_backward)) {
return BROTLI_FALSE;
return;
}
len = FindMatchLengthWithLimit(&data[prev_ix],
&data[cur_ix_masked],
max_length);
if (len >= 4) {
out->len = len;
out->distance = backward;
out->score = BackwardReferenceScore(len, backward);
return BROTLI_TRUE;
const score_t score = BackwardReferenceScore(len, backward);
if (best_score < score) {
out->len = len;
out->distance = backward;
out->score = score;
return;
}
}
} else {
uint32_t *bucket = self->buckets_ + key;
@ -212,18 +216,16 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(
out->distance = backward;
out->score = score;
compare_char = data[cur_ix_masked + best_len];
is_match_found = BROTLI_TRUE;
}
}
}
}
if (USE_DICTIONARY && !is_match_found) {
is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash,
if (USE_DICTIONARY && min_score == out->score) {
SearchInStaticDictionary(dictionary, dictionary_hash,
handle, &data[cur_ix_masked], max_length, max_backward, out,
BROTLI_TRUE);
}
self->buckets_[key + ((cur_ix >> 3) % BUCKET_SWEEP)] = (uint32_t)cur_ix;
return is_match_found;
}
#undef HASH_MAP_SIZE

View File

@ -681,12 +681,14 @@ static BROTLI_BOOL CloseFiles(Context* context, BROTLI_BOOL success) {
}
}
if (fclose(context->fin) != 0) {
if (is_ok) {
fprintf(stderr, "fclose failed [%s]: %s\n",
PrintablePath(context->current_input_path), strerror(errno));
if (context->fin) {
if (fclose(context->fin) != 0) {
if (is_ok) {
fprintf(stderr, "fclose failed [%s]: %s\n",
PrintablePath(context->current_input_path), strerror(errno));
}
is_ok = BROTLI_FALSE;
}
is_ok = BROTLI_FALSE;
}
if (success && context->junk_source) {
unlink(context->current_input_path);

View File

@ -10,10 +10,12 @@ import unittest
project_dir = os.path.abspath(os.path.join(__file__, '..', '..', '..'))
src_dir = os.path.join(project_dir, 'python')
test_dir = os.path.join(project_dir, 'tests')
PYTHON = sys.executable or 'python'
BRO = os.path.join(project_dir, 'python', 'bro.py')
BRO = os.path.join(src_dir, 'bro.py')
# Get the platform/version-specific build folder.
# By default, the distutils build base is in the same location as setup.py.
@ -30,7 +32,7 @@ if 'PYTHONPATH' not in TEST_ENV:
else:
TEST_ENV['PYTHONPATH'] = build_dir + os.pathsep + TEST_ENV['PYTHONPATH']
TESTDATA_DIR = os.path.join(project_dir, 'tests', 'testdata')
TESTDATA_DIR = os.path.join(test_dir, 'testdata')
TESTDATA_FILES = [
'empty', # Empty file

273
research/deorummolae.cc Executable file
View File

@ -0,0 +1,273 @@
#include "./deorummolae.h"
#include <array>
#include <vector>
#include "./esaxx/sais.hxx"
/* Used for quick SA-entry to file mapping. Each file is padded to size that
is a multiple of chunk size. */
#define CHUNK_SIZE 64
/* Length of substring that is considered to be covered by dictionary string. */
#define CUT_MATCH 6
/* Minimal dictionary entry size. */
#define MIN_MATCH 24
/* Non tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))
/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
static int popcount(uint64_t u) {
return __builtin_popcountll(u);
}
/* Condense terminators and pad file entries. */
static void rewriteText(std::vector<int>* text) {
int terminator = text->back();
int prev = terminator;
size_t to = 0;
for (size_t from = 0; from < text->size(); ++from) {
int next = text->at(from);
if (next < 256 || prev < 256) {
text->at(to++) = next;
if (next >= 256) terminator = next;
}
prev = next;
}
text->resize(to);
if (text->empty()) text->push_back(terminator);
while (text->size() & CHUNK_MASK) text->push_back(terminator);
}
/* Reenumerate terminators for smaller alphabet. */
static void remapTerminators(std::vector<int>* text, int* next_terminator) {
int prev = -1;
int x = 256;
for (size_t i = 0; i < text->size(); ++i) {
int next = text->at(i);
if (next < 256) { // Char.
// Do nothing.
} else if (prev < 256) { // Terminator after char.
next = x++;
} else { // Terminator after terminator.
next = prev;
}
text->at(i) = next;
prev = next;
}
*next_terminator = x;
}
/* Combine all file entries; create mapping position->file. */
static void buildFullText(std::vector<std::vector<int>>* data,
std::vector<int>* full_text, std::vector<size_t>* file_map,
std::vector<size_t>* file_offset, int* next_terminator) {
file_map->resize(0);
file_offset->resize(0);
full_text->resize(0);
for (size_t i = 0; i < data->size(); ++i) {
file_offset->push_back(full_text->size());
std::vector<int>& file = data->at(i);
rewriteText(&file);
full_text->insert(full_text->end(), file.begin(), file.end());
file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
}
if (false) remapTerminators(full_text, next_terminator);
}
/* Build longest-common-prefix based on suffix array and text.
TODO: borrowed -> unknown efficiency. */
static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
std::vector<int>* lcp, std::vector<int>* invese_sa) {
int size = static_cast<int>(text->size());
lcp->resize(size);
int k = 0;
lcp->at(size - 1) = 0;
for (int i = 0; i < size; ++i) {
if (invese_sa->at(i) == size - 1) {
k = 0;
continue;
}
int j = sa->at(invese_sa->at(i) + 1); // Suffix which follow i-th suffix.
while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
++k;
}
lcp->at(invese_sa->at(i)) = k;
if (k > 0) --k;
}
}
/* Isle is a range in SA with LCP not less than some value.
When we raise the LCP requirement, the isle sunks and smaller isles appear
instead. */
typedef struct {
int lcp;
int l;
int r;
Coverage coverage;
} Isle;
/* Helper routine for `cutMatch`. */
static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
int* next_terminator) {
size_t f = file_map->at(pos / CHUNK_SIZE);
pos -= file_offset->at(f);
std::vector<int>& file = data->at(f);
int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
for (int j = 0; j < l; j++, pos++) {
if (file[pos] >= 256) continue;
if (file[pos + 1] >= 256) {
file[pos] = file[pos + 1];
} else if (pos > 0 && file[pos - 1] >= 256) {
file[pos] = file[pos - 1];
} else {
file[pos] = (*next_terminator)++;
}
}
}
/* Remove substrings of a given match from files.
Substrings are replaced with unique terminators, so next iteration SA would
not allow to cross removed areas. */
static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* invese_sa,
int* next_terminator, std::vector<size_t>* file_map,
std::vector<size_t>* file_offset) {
while (length >= CUT_MATCH) {
int i = index;
while (lcp->at(i) >= length) {
i++;
poisonData(
sa->at(i), length, data, file_map, file_offset, next_terminator);
}
while (true) {
poisonData(
sa->at(index), length, data, file_map, file_offset, next_terminator);
if (index == 0 || lcp->at(index - 1) < length) break;
index--;
}
length--;
index = invese_sa->at(sa->at(index) + 1);
}
}
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
size_t num_samples, const size_t* sample_sizes,
const uint8_t* sample_data) {
{
uint64_t tmp = 0;
if (popcount(tmp - 1u) != 64) {
fprintf(stderr, "64-bit platform is required\n");
return 0;
}
}
/* Could use 256 + '0' for easier debugging. */
int next_terminator = 256;
std::vector<std::vector<int>> data;
size_t offset = 0;
if (num_samples > MAX_FILES) num_samples = MAX_FILES;
for (size_t n = 0; n < num_samples; ++n) {
size_t next_offset = offset + sample_sizes[n];
data.push_back(
std::vector<int>(sample_data + offset, sample_data + next_offset));
offset = next_offset;
data.back().push_back(next_terminator++);
}
/* Most arrays are allocated once, and then just resized to smaller and
smaller sizes. */
std::vector<int> full_text;
std::vector<size_t> file_map;
std::vector<size_t> file_offset;
std::vector<int> sa;
std::vector<int> invese_sa;
std::vector<int> lcp;
std::vector<Isle> isles;
std::vector<char> output_data;
size_t total = 0;
size_t total_cost = 0;
size_t best_cost;
Isle best_isle;
int min_count = num_samples;
while (true) {
size_t max_match = dictionary_size_limit - total;
buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
sa.resize(full_text.size());
saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
next_terminator);
invese_sa.resize(full_text.size());
for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i;
buildLcp(&full_text, &sa, &lcp, &invese_sa);
/* Do not rebuild SA/LCP, just use different selection. */
retry:
best_cost = 0;
best_isle = {0, 0, 0, {{0}}};
isles.resize(0);
isles.push_back(best_isle);
for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
int l = i;
Coverage cov = {{0}};
int f = file_map[sa[i] / CHUNK_SIZE];
cov[f >> 6] = ((uint64_t)1) << (f & 63);
while (lcp[i] < isles.back().lcp) {
Isle& top = isles.back();
top.r = i;
l = top.l;
for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
int count = 0;
for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
int effective_lcp = top.lcp;
/* Restrict (last) dictionary entry length. */
if (effective_lcp > max_match) effective_lcp = max_match;
int cost = count * effective_lcp;
if (cost > best_cost && count >= min_count &&
effective_lcp >= MIN_MATCH) {
best_cost = cost;
best_isle = top;
best_isle.lcp = effective_lcp;
}
isles.pop_back();
for (size_t x = 0; x < cov.size(); ++x) {
isles.back().coverage[x] |= cov[x];
}
}
if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
for (size_t x = 0; x < cov.size(); ++x) {
isles.back().coverage[x] |= cov[x];
}
}
/* When saturated matches do not match length restrictions, lower the
saturation requirements. */
if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
if (min_count >= 8) {
min_count = (min_count * 7) / 8;
goto retry;
}
break;
}
/* Save the entry. */
fprintf(stderr,
"Savings: %zu+%zu, dictionary: %zu+%d\n",
total_cost, best_cost, total, best_isle.lcp);
memcpy(
dictionary + total, full_text.data() + sa[best_isle.l], best_isle.lcp);
total += best_isle.lcp;
total_cost += best_cost;
cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
&invese_sa, &next_terminator, &file_map, &file_offset);
if (total >= dictionary_size_limit) break;
}
return total;
}

27
research/deorummolae.h Executable file
View File

@ -0,0 +1,27 @@
#ifndef BROTLI_RESEARCH_DEORUMMOLAE_H_
#define BROTLI_RESEARCH_DEORUMMOLAE_H_
#include <stddef.h>
#include <stdint.h>
/* log2(maximal number of files). Value 6 provides some speedups. */
#define LOG_MAX_FILES 6
/* Non tunable definitions. */
#define MAX_FILES (1 << LOG_MAX_FILES)
/**
* Generate a dictionary for given samples.
*
* @param dictionary storage for generated dictionary
* @param dictionary_size_limit maximal dictionary size
* @param num_samples number of samples
* @param sample_sizes array with sample sizes
* @param sample_data concatenated samples
* @return generated dictionary size
*/
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
size_t num_samples, const size_t* sample_sizes,
const uint8_t* sample_data);
#endif // BROTLI_RESEARCH_DEORUMMOLAE_H_