#include "./deorummolae.h"

#include <array>
#include <cstdio>
#include <vector>

#include "./esaxx/sais.hxx"

/* Used for quick SA-entry to file mapping. Each file is padded to a size
   that is a multiple of the chunk size. */
#define CHUNK_SIZE 64
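/* Example: with CHUNK_SIZE == 64, the file that owns text position `pos` is
   found as file_map[pos / CHUNK_SIZE] (see `buildFullText` below); the
   padding guarantees that a chunk never spans two files. */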
/* Length of a substring that is considered to be covered by a dictionary
   string. */
#define CUT_MATCH 6
/* Minimal dictionary entry size. */
#define MIN_MATCH 24

/* Non-tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))

/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
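/* File index f occupies bit (f & 63) of word (f >> 6); the isle scan below
   sets it via cov[f >> 6] = ((uint64_t)1) << (f & 63). */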

static int popcount(uint64_t u) {
  return __builtin_popcountll(u);
}

/* Condense terminators and pad file entries. */
static void rewriteText(std::vector<int>* text) {
  int terminator = text->back();
  int prev = terminator;
  size_t to = 0;
  for (size_t from = 0; from < text->size(); ++from) {
    int next = text->at(from);
    if (next < 256 || prev < 256) {
      text->at(to++) = next;
      if (next >= 256) terminator = next;
    }
    prev = next;
  }
  text->resize(to);
  if (text->empty()) text->push_back(terminator);
  while (text->size() & CHUNK_MASK) text->push_back(terminator);
}
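/* Worked example, assuming a hypothetical CHUNK_SIZE of 4: the entry
   {'a', 300, 301, 'b'} condenses to {'a', 300, 'b'} (only the first
   terminator of the run 300, 301 survives), then padding with the last seen
   terminator yields {'a', 300, 'b', 300}. */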

/* Re-enumerate terminators for a smaller alphabet. */
static void remapTerminators(std::vector<int>* text, int* next_terminator) {
  int prev = -1;
  int x = 256;
  for (size_t i = 0; i < text->size(); ++i) {
    int next = text->at(i);
    if (next < 256) {  // Char: keep as-is.
      // Do nothing.
    } else if (prev < 256) {  // Terminator after char: assign a fresh value.
      next = x++;
    } else {  // Terminator after terminator: reuse the previous value.
      next = prev;
    }
    text->at(i) = next;
    prev = next;
  }
  *next_terminator = x;
}

/* Combine all file entries; create the mapping position -> file. */
static void buildFullText(std::vector<std::vector<int>>* data,
    std::vector<int>* full_text, std::vector<size_t>* file_map,
    std::vector<size_t>* file_offset, int* next_terminator) {
  file_map->resize(0);
  file_offset->resize(0);
  full_text->resize(0);
  for (size_t i = 0; i < data->size(); ++i) {
    file_offset->push_back(full_text->size());
    std::vector<int>& file = data->at(i);
    rewriteText(&file);
    full_text->insert(full_text->end(), file.begin(), file.end());
    file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
  }
  /* Terminator remapping is currently disabled. */
  if (false) remapTerminators(full_text, next_terminator);
}
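/* Layout example: two files padded to 128 and 64 entries produce
   file_offset = {0, 128} and file_map = {0, 0, 1}, i.e. one file index per
   CHUNK_SIZE-sized chunk of full_text. */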

/* Build the longest-common-prefix array from the suffix array and the text.
   This is Kasai's O(n) algorithm (borrowed code; not profiled here). */
static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
    std::vector<int>* lcp, std::vector<int>* inverse_sa) {
  int size = static_cast<int>(text->size());
  lcp->resize(size);
  int k = 0;
  lcp->at(size - 1) = 0;
  for (int i = 0; i < size; ++i) {
    if (inverse_sa->at(i) == size - 1) {
      k = 0;
      continue;
    }
    // The suffix that follows the i-th suffix in SA order.
    int j = sa->at(inverse_sa->at(i) + 1);
    while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
      ++k;
    }
    lcp->at(inverse_sa->at(i)) = k;
    if (k > 0) --k;
  }
}
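/* Small example: for the text "abab" the suffix array is {2, 0, 3, 1}
   ("ab", "abab", "b", "bab") and lcp = {2, 0, 1, 0}; entry k holds the
   length of the common prefix of suffixes sa[k] and sa[k + 1]. */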

/* An isle is a range in the SA with LCP not less than some value.
   When the LCP requirement is raised, the isle sinks and smaller isles
   surface in its place. */
typedef struct {
  int lcp;
  int l;
  int r;
  Coverage coverage;
} Isle;
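/* For instance, given a hypothetical LCP array {1, 4, 4, 2, 0}, SA rows 1..3
   form an isle at LCP >= 4; lowering the requirement to 2 widens it to
   rows 1..4. */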

/* Helper routine for `cutMatch`. */
static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
    std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
    int* next_terminator) {
  size_t f = file_map->at(pos / CHUNK_SIZE);
  pos -= static_cast<int>(file_offset->at(f));
  std::vector<int>& file = data->at(f);
  /* On the final pass (length == CUT_MATCH), wipe CUT_MATCH symbols; at
     longer lengths, wiping the first symbol of the occurrence is enough. */
  int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
  for (int j = 0; j < l; j++, pos++) {
    if (file[pos] >= 256) continue;  // Already a terminator.
    /* Prefer reusing an adjacent terminator over minting a fresh one. The
       entry always ends with a terminator, so pos + 1 stays in bounds. */
    if (file[pos + 1] >= 256) {
      file[pos] = file[pos + 1];
    } else if (pos > 0 && file[pos - 1] >= 256) {
      file[pos] = file[pos - 1];
    } else {
      file[pos] = (*next_terminator)++;
    }
  }
}

/* Remove substrings of a given match from the files.
   Substrings are replaced with unique terminators, so the SA built in the
   next iteration cannot produce matches that cross the removed areas. */
static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
    std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* inverse_sa,
    int* next_terminator, std::vector<size_t>* file_map,
    std::vector<size_t>* file_offset) {
  while (length >= CUT_MATCH) {
    /* Poison all occurrences in the isle after the current SA position... */
    int i = index;
    while (lcp->at(i) >= length) {
      i++;
      poisonData(
          sa->at(i), length, data, file_map, file_offset, next_terminator);
    }
    /* ...and at/before it. */
    while (true) {
      poisonData(
          sa->at(index), length, data, file_map, file_offset, next_terminator);
      if (index == 0 || lcp->at(index - 1) < length) break;
      index--;
    }
    length--;
    index = inverse_sa->at(sa->at(index) + 1);
  }
}
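/* Illustration: a selected 10-byte match is poisoned at lengths 10 down to
   CUT_MATCH (6); after each pass the start advances one byte into the match,
   so occurrences of every remaining suffix are wiped as well. */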

size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
    size_t num_samples, const size_t* sample_sizes,
    const uint8_t* sample_data) {
  {
    /* Sanity check: the coverage bitset math assumes 64-bit words. */
    uint64_t tmp = 0;
    if (popcount(tmp - 1u) != 64) {
      fprintf(stderr, "64-bit platform is required\n");
      return 0;
    }
  }

  /* Could use 256 + '0' for easier debugging. */
  int next_terminator = 256;

  std::vector<std::vector<int>> data;

  size_t offset = 0;
  if (num_samples > MAX_FILES) num_samples = MAX_FILES;
  for (size_t n = 0; n < num_samples; ++n) {
    size_t next_offset = offset + sample_sizes[n];
    data.push_back(
        std::vector<int>(sample_data + offset, sample_data + next_offset));
    offset = next_offset;
    /* Every sample gets its own unique terminator. */
    data.back().push_back(next_terminator++);
  }

  /* Most arrays are allocated once, and then just resized to smaller and
     smaller sizes. */
  std::vector<int> full_text;
  std::vector<size_t> file_map;
  std::vector<size_t> file_offset;
  std::vector<int> sa;
  std::vector<int> inverse_sa;
  std::vector<int> lcp;
  std::vector<Isle> isles;
  std::vector<char> output_data;
  size_t total = 0;
  size_t total_cost = 0;
  size_t best_cost;
  Isle best_isle;
  int min_count = static_cast<int>(num_samples);

  while (true) {
    size_t max_match = dictionary_size_limit - total;
    buildFullText(&data, &full_text, &file_map, &file_offset,
        &next_terminator);
    sa.resize(full_text.size());
    saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
        next_terminator);
    inverse_sa.resize(full_text.size());
    for (int i = 0; i < static_cast<int>(full_text.size()); ++i) {
      inverse_sa[sa[i]] = i;
    }
    buildLcp(&full_text, &sa, &lcp, &inverse_sa);

    /* Do not rebuild SA/LCP, just use a different selection. */
retry:
    best_cost = 0;
    best_isle = {0, 0, 0, {{0}}};
    isles.resize(0);
    isles.push_back(best_isle);

    for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
      int l = i;
      Coverage cov = {{0}};
      int f = static_cast<int>(file_map[sa[i] / CHUNK_SIZE]);
      cov[f >> 6] = ((uint64_t)1) << (f & 63);
      /* Close every isle whose LCP exceeds the current one. */
      while (lcp[i] < isles.back().lcp) {
        Isle& top = isles.back();
        top.r = i;
        l = top.l;
        for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
        int count = 0;
        for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
        int effective_lcp = top.lcp;
        /* Restrict the (last) dictionary entry length. */
        if (static_cast<size_t>(effective_lcp) > max_match) {
          effective_lcp = static_cast<int>(max_match);
        }
        /* Cost = number of covered files times the match length. */
        size_t cost = static_cast<size_t>(count) * effective_lcp;
        if (cost > best_cost && count >= min_count &&
            effective_lcp >= MIN_MATCH) {
          best_cost = cost;
          best_isle = top;
          best_isle.lcp = effective_lcp;
        }
        isles.pop_back();
        for (size_t x = 0; x < cov.size(); ++x) {
          isles.back().coverage[x] |= cov[x];
        }
      }
      if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
      for (size_t x = 0; x < cov.size(); ++x) {
        isles.back().coverage[x] |= cov[x];
      }
    }

    /* When the saturated matches do not meet the length restriction, lower
       the saturation requirement. */
    if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
      if (min_count >= 8) {
        min_count = (min_count * 7) / 8;
        fprintf(stderr, "Retry: min_count=%d\n", min_count);
        goto retry;
      }
      break;
    }

    /* Save the entry. */
    fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n",
        total_cost, best_cost, total, best_isle.lcp);
    for (size_t i = 0; i < static_cast<size_t>(best_isle.lcp); ++i) {
      dictionary[total + i] =
          static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
    }
    total += best_isle.lcp;
    total_cost += best_cost;
    /* Poison the chosen match so it is not selected again. */
    cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &inverse_sa,
        &next_terminator, &file_map, &file_offset);
    if (total >= dictionary_size_limit) break;
  }
  return total;
}
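
/* A minimal usage sketch (hypothetical caller, not part of this translation
   unit; `buildDictionary` and the std::string-based sample container are
   assumptions, not part of the DM API): */
#if 0
#include <string>

std::vector<uint8_t> buildDictionary(
    const std::vector<std::string>& samples, size_t limit) {
  std::vector<uint8_t> flat;   /* All samples, concatenated. */
  std::vector<size_t> sizes;   /* One length per sample. */
  for (const std::string& s : samples) {
    flat.insert(flat.end(), s.begin(), s.end());
    sizes.push_back(s.size());
  }
  std::vector<uint8_t> dictionary(limit);
  size_t used = DM_generate(dictionary.data(), limit, samples.size(),
      sizes.data(), flat.data());
  dictionary.resize(used);     /* Keep only the generated bytes. */
  return dictionary;
}
#endif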