2017-10-13 09:25:03 +00:00
|
|
|
#include "./sieve.h"
|
|
|
|
|
2018-02-26 14:04:36 +00:00
|
|
|
/* Pointer to position in (combined corpus) text. */
|
|
|
|
/* 32-bit unsigned; the all-ones value (kNowhere) is reserved for "no position". */
using TextIdx = uint32_t;
|
|
|
|
|
|
|
|
/* Index of sample / generation. */
|
|
|
|
/* 16-bit unsigned; also the type of slot marks and of scan iteration ids. */
using SampleIdx = uint16_t;
|
|
|
|
|
2017-10-13 09:25:03 +00:00
|
|
|
typedef struct Slot {
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx next;
|
|
|
|
TextIdx offset;
|
|
|
|
SampleIdx presence;
|
|
|
|
SampleIdx mark;
|
2017-10-13 09:25:03 +00:00
|
|
|
} Slot;
|
|
|
|
|
2018-02-26 14:04:36 +00:00
|
|
|
/* Sentinel text position meaning "not set": all bits of TextIdx are ones. */
static constexpr TextIdx kNowhere = ~static_cast<TextIdx>(0);
|
|
|
|
|
|
|
|
/*
 * Computes the number of bytes createDictionary would emit for the same
 * arguments, without producing the output.
 *
 * sliceLen    - length of every slice;
 * map         - slot table; `mark` of every visited slot is set to `iteration`;
 * shortcut    - for each text position, index of its slot in `map`;
 * end         - number of text positions to scan;
 * middle      - position at which the presence threshold is raised by one;
 * minPresence - presence threshold used before `middle`;
 * iteration   - id of this scan; slots already marked with it are skipped,
 *               so every slot is considered at most once per scan.
 *
 * Returns the total length of the merged ranges covered by accepted slices.
 */
static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
    TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
  TextIdx rangeBegin = kNowhere;
  TextIdx rangeEnd = kNowhere;
  TextIdx covered = 0;
  SampleIdx threshold = minPresence;
  for (TextIdx pos = 0; pos < end; ++pos) {
    if (pos == middle) {
      ++threshold;
    }
    Slot* slot = &map[shortcut[pos]];
    if (slot->mark == iteration) {
      /* Already handled during this scan. */
      continue;
    }
    slot->mark = iteration;
    if (slot->presence < threshold) {
      continue;
    }
    /* Accepted slice: either extend the pending range or start a new one. */
    const bool disjoint = (rangeEnd == kNowhere) || (rangeEnd < pos);
    if (disjoint) {
      if (rangeBegin != kNowhere) {
        covered += rangeEnd - rangeBegin;
      }
      rangeBegin = pos;
    }
    rangeEnd = pos + sliceLen;
  }
  /* Account for the range that was still pending when the scan ended. */
  if (rangeBegin != kNowhere) {
    covered += rangeEnd - rangeBegin;
  }
  return covered;
}
|
|
|
|
|
2018-02-26 14:04:36 +00:00
|
|
|
static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
|
|
|
|
Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
|
|
|
|
SampleIdx minPresence, SampleIdx iteration) {
|
2017-10-13 09:25:03 +00:00
|
|
|
std::string output;
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx from = kNowhere;
|
|
|
|
TextIdx to = kNowhere;
|
|
|
|
SampleIdx targetPresence = minPresence;
|
|
|
|
for (TextIdx i = 0; i < end; ++i) {
|
2017-10-13 09:25:03 +00:00
|
|
|
if (i == middle) {
|
|
|
|
targetPresence++;
|
|
|
|
}
|
|
|
|
Slot& item = map[shortcut[i]];
|
|
|
|
if (item.mark != iteration) {
|
|
|
|
item.mark = iteration;
|
|
|
|
if (item.presence >= targetPresence) {
|
2018-02-26 14:04:36 +00:00
|
|
|
if ((to == kNowhere) || (to < i)) {
|
|
|
|
if (from != kNowhere) {
|
2017-10-13 09:25:03 +00:00
|
|
|
output.insert(output.end(), &data[from], &data[to]);
|
|
|
|
}
|
|
|
|
from = i;
|
|
|
|
}
|
|
|
|
to = i + sliceLen;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2018-02-26 14:04:36 +00:00
|
|
|
if (from != kNowhere) {
|
2017-10-13 09:25:03 +00:00
|
|
|
output.insert(output.end(), &data[from], &data[to]);
|
|
|
|
}
|
|
|
|
return output;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
|
|
|
|
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
|
|
|
|
/* Parameters aliasing */
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
|
|
|
|
if (targetSize != dictionary_size_limit) {
|
|
|
|
fprintf(stderr, "dictionary_size_limit is too large\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
TextIdx sliceLen = static_cast<TextIdx>(slice_len);
|
|
|
|
if (sliceLen != slice_len) {
|
|
|
|
fprintf(stderr, "slice_len is too large\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
if (sliceLen < 1) {
|
|
|
|
fprintf(stderr, "slice_len is too small\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
|
|
|
|
if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
|
|
|
|
fprintf(stderr, "too many samples\n");
|
|
|
|
return "";
|
|
|
|
}
|
2017-10-13 09:25:03 +00:00
|
|
|
const uint8_t* data = sample_data;
|
|
|
|
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx total = 0;
|
|
|
|
std::vector<TextIdx> offsets;
|
|
|
|
for (SampleIdx i = 0; i < numSamples; ++i) {
|
|
|
|
TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
|
|
|
|
if (delta != sample_sizes[i]) {
|
|
|
|
fprintf(stderr, "sample is too large\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
if (delta == 0) {
|
|
|
|
fprintf(stderr, "empty samples are prohibited\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
if (total + delta <= total) {
|
|
|
|
fprintf(stderr, "corpus is too large\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
total += delta;
|
2017-10-13 09:25:03 +00:00
|
|
|
offsets.push_back(total);
|
|
|
|
}
|
|
|
|
|
2018-02-26 14:04:36 +00:00
|
|
|
if (total * 2 < total) {
|
|
|
|
fprintf(stderr, "corpus is too large\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
|
|
|
if (total < sliceLen) {
|
|
|
|
fprintf(stderr, "slice_len is larger than corpus size\n");
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
2017-10-13 09:25:03 +00:00
|
|
|
/*****************************************************************************
|
|
|
|
* Build coverage map.
|
|
|
|
****************************************************************************/
|
|
|
|
std::vector<Slot> map;
|
2018-02-26 14:04:36 +00:00
|
|
|
std::vector<TextIdx> shortcut;
|
2017-10-13 09:25:03 +00:00
|
|
|
map.push_back({0, 0, 0, 0});
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx end = total - sliceLen;
|
|
|
|
TextIdx hashLen = 11;
|
|
|
|
while (hashLen < 29 && ((1u << hashLen) < end)) {
|
2017-10-13 09:25:03 +00:00
|
|
|
hashLen += 3;
|
|
|
|
}
|
|
|
|
hashLen -= 3;
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx hashMask = (1u << hashLen) - 1u;
|
|
|
|
std::vector<TextIdx> hashHead(1 << hashLen);
|
|
|
|
TextIdx hashSlot = 1;
|
|
|
|
SampleIdx piece = 0;
|
|
|
|
TextIdx hash = 0;
|
|
|
|
TextIdx lShift = 3;
|
|
|
|
TextIdx rShift = hashLen - lShift;
|
|
|
|
for (TextIdx i = 0; i < sliceLen - 1; ++i) {
|
|
|
|
TextIdx v = data[i];
|
2017-10-13 09:25:03 +00:00
|
|
|
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
|
|
|
|
}
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
|
|
|
|
TextIdx rShiftX = hashLen - lShiftX;
|
|
|
|
for (TextIdx i = 0; i < end; ++i) {
|
|
|
|
TextIdx v = data[i + sliceLen - 1];
|
2017-10-13 09:25:03 +00:00
|
|
|
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
|
|
|
|
|
|
|
|
if (offsets[piece] == i) {
|
|
|
|
piece++;
|
|
|
|
}
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx slot = hashHead[hash];
|
2017-10-13 09:25:03 +00:00
|
|
|
while (slot != 0) {
|
|
|
|
Slot& item = map[slot];
|
2018-02-26 14:04:36 +00:00
|
|
|
TextIdx start = item.offset;
|
2017-10-13 09:25:03 +00:00
|
|
|
bool miss = false;
|
2018-02-26 14:04:36 +00:00
|
|
|
for (TextIdx j = 0; j < sliceLen; ++j) {
|
2017-10-13 09:25:03 +00:00
|
|
|
if (data[i + j] != data[start + j]) {
|
|
|
|
miss = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!miss) {
|
|
|
|
if (item.mark != piece) {
|
|
|
|
item.mark = piece;
|
|
|
|
item.presence++;
|
|
|
|
}
|
|
|
|
shortcut.push_back(slot);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
slot = item.next;
|
|
|
|
}
|
|
|
|
if (slot == 0) {
|
|
|
|
map.push_back({hashHead[hash], i, 1, piece});
|
|
|
|
hashHead[hash] = hashSlot;
|
|
|
|
shortcut.push_back(hashSlot);
|
|
|
|
hashSlot++;
|
|
|
|
}
|
|
|
|
v = data[i];
|
|
|
|
hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************************
|
|
|
|
* Build dictionary of specified size.
|
|
|
|
****************************************************************************/
|
2018-02-26 14:04:36 +00:00
|
|
|
SampleIdx a = 1;
|
|
|
|
TextIdx size = dryRun(
|
2017-10-13 09:25:03 +00:00
|
|
|
sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
|
|
|
|
/* Maximal output is smaller than target. */
|
|
|
|
if (size <= targetSize) {
|
|
|
|
return createDictionary(
|
|
|
|
data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
|
|
|
|
}
|
|
|
|
|
2018-02-26 14:04:36 +00:00
|
|
|
SampleIdx b = numSamples;
|
2017-10-13 09:25:03 +00:00
|
|
|
size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
|
|
|
|
if (size == targetSize) {
|
|
|
|
return createDictionary(
|
|
|
|
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
|
|
|
|
}
|
|
|
|
/* Run binary search. */
|
|
|
|
if (size < targetSize) {
|
|
|
|
/* size(a) > targetSize > size(b) && a < m < b */
|
|
|
|
while (a + 1 < b) {
|
2018-02-26 14:04:36 +00:00
|
|
|
SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
|
2017-10-13 09:25:03 +00:00
|
|
|
size = dryRun(
|
|
|
|
sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
|
|
|
|
if (size < targetSize) {
|
|
|
|
b = m;
|
|
|
|
} else if (size > targetSize) {
|
|
|
|
a = m;
|
|
|
|
} else {
|
|
|
|
return createDictionary(
|
|
|
|
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
a = b;
|
|
|
|
}
|
|
|
|
/* size(minPresence) > targetSize > size(minPresence + 1) */
|
2018-02-26 14:04:36 +00:00
|
|
|
SampleIdx minPresence = a;
|
|
|
|
TextIdx c = 0;
|
|
|
|
TextIdx d = end;
|
2017-10-13 09:25:03 +00:00
|
|
|
/* size(a) < targetSize < size(b) && a < m < b */
|
2018-02-26 14:04:36 +00:00
|
|
|
while (c + 1 < d) {
|
|
|
|
TextIdx m = (c + d) / 2;
|
2017-10-13 09:25:03 +00:00
|
|
|
size = dryRun(
|
|
|
|
sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
|
|
|
|
if (size < targetSize) {
|
2018-02-26 14:04:36 +00:00
|
|
|
c = m;
|
2017-10-13 09:25:03 +00:00
|
|
|
} else if (size > targetSize) {
|
2018-02-26 14:04:36 +00:00
|
|
|
d = m;
|
2017-10-13 09:25:03 +00:00
|
|
|
} else {
|
|
|
|
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
|
|
|
|
m, minPresence, ++piece);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool unrestricted = false;
|
|
|
|
if (minPresence <= 2 && !unrestricted) {
|
|
|
|
minPresence = 2;
|
2018-02-26 14:04:36 +00:00
|
|
|
c = end;
|
2017-10-13 09:25:03 +00:00
|
|
|
}
|
2018-02-26 14:04:36 +00:00
|
|
|
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
|
2017-10-13 09:25:03 +00:00
|
|
|
minPresence, ++piece);
|
|
|
|
}
|