Add new (fast) dictionary generator engine. (#616)

Add CLI for dictionary generation.
Add BUILD file for research folder
This commit is contained in:
Eugene Kliuchnikov 2017-10-13 11:25:03 +02:00 committed by GitHub
parent 9c75a2a26a
commit 39ef4bbdcf
6 changed files with 436 additions and 25 deletions

29
research/BUILD Executable file
View File

@ -0,0 +1,29 @@
# Description: brotli research tools.
package(default_visibility = ["//visibility:public"])
licenses(["notice"]) # MIT
cc_library(
name = "dm",
srcs = ["deorummolae.cc"],
hdrs = [
"deorummolae.h",
"esaxx/sais.hxx",
],
)
cc_library(
name = "sieve",
srcs = ["sieve.cc"],
hdrs = ["sieve.h"],
)
cc_binary(
name = "dictionary_generator",
srcs = ["dictionary_generator.cc"],
deps = [
":dm",
":sieve",
],
)

View File

@ -1,7 +1,7 @@
#include "./deorummolae.h"
#include <array>
#include <vector>
#include <cstdio>
#include "./esaxx/sais.hxx"
@ -20,9 +20,7 @@
/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
static int popcount(uint64_t u) {
return __builtin_popcountll(u);
}
static int popcount(uint64_t u) { return __builtin_popcountll(u); }
/* Condense terminators and pad file entries. */
static void rewriteText(std::vector<int>* text) {
@ -155,9 +153,8 @@ static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
}
}
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
size_t num_samples, const size_t* sample_sizes,
const uint8_t* sample_data) {
std::string DM_generate(size_t dictionary_size_limit,
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
{
uint64_t tmp = 0;
if (popcount(tmp - 1u) != 64) {
@ -169,9 +166,11 @@ size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
/* Could use 256 + '0' for easier debugging. */
int next_terminator = 256;
std::string output;
std::vector<std::vector<int>> data;
size_t offset = 0;
size_t num_samples = sample_sizes.size();
if (num_samples > MAX_FILES) num_samples = MAX_FILES;
for (size_t n = 0; n < num_samples; ++n) {
size_t next_offset = offset + sample_sizes[n];
@ -259,18 +258,16 @@ retry:
}
/* Save the entry. */
fprintf(stderr,
"Savings: %zu+%zu, dictionary: %zu+%d\n",
fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n",
total_cost, best_cost, total, best_isle.lcp);
for (size_t i = 0; i < best_isle.lcp; ++i) {
dictionary[total + i] =
static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
}
int* piece = &full_text[sa[best_isle.l]];
output.insert(output.end(), piece, piece + best_isle.lcp);
total += best_isle.lcp;
total_cost += best_cost;
cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
&invese_sa, &next_terminator, &file_map, &file_offset);
cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
&next_terminator, &file_map, &file_offset);
if (total >= dictionary_size_limit) break;
}
return total;
return output;
}

View File

@ -4,6 +4,9 @@
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
/* log2(maximal number of files). Value 6 provides some speedups. */
#define LOG_MAX_FILES 6
@ -13,15 +16,12 @@
/**
* Generate a dictionary for given samples.
*
* @param dictionary storage for generated dictionary
* @param dictionary_size_limit maximal dictionary size
* @param num_samples number of samples
* @param sample_sizes array with sample sizes
* @param sample_sizes vector with sample sizes
* @param sample_data concatenated samples
* @return generated dictionary size
* @return generated dictionary
*/
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
size_t num_samples, const size_t* sample_sizes,
const uint8_t* sample_data);
std::string DM_generate(size_t dictionary_size_limit,
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
#endif // BROTLI_RESEARCH_DEORUMMOLAE_H_

153
research/dictionary_generator.cc Executable file
View File

@ -0,0 +1,153 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <vector>
#include "./deorummolae.h"
#include "./sieve.h"
#define METHOD_DM 0
#define METHOD_SIEVE 1
size_t readInt(const char* str) {
size_t result = 0;
if (str[0] == 0 || str[0] == '0') {
return 0;
}
for (size_t i = 0; i < 13; ++i) {
if (str[i] == 0) {
return result;
}
if (str[i] == 'k' || str[i] == 'K') {
if ((str[i + 1] == 0) && ((result << 10) > result)) {
return result << 10;
}
return 0;
}
if (str[i] == 'm' || str[i] == 'M') {
if ((str[i + 1] == 0) && ((result << 20) > result)) {
return result << 20;
}
return 0;
}
if (str[i] < '0' || str[i] > '9') {
return 0;
}
size_t next = (10 * result) + (str[i] - '0');
if (next <= result) {
return 0;
}
result = next;
}
return 0;
}
static std::string readFile(const std::string& path) {
std::ifstream file(path);
std::string content(
(std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
return content;
}
static void writeFile(const char* file, const std::string& content) {
std::ofstream outfile(file, std::ofstream::binary);
outfile.write(content.c_str(), content.size());
outfile.close();
}
/* Returns "base file name" or its tail, if it contains '/' or '\'. */
static const char* fileName(const char* path) {
const char* separator_position = strrchr(path, '/');
if (separator_position) path = separator_position + 1;
separator_position = strrchr(path, '\\');
if (separator_position) path = separator_position + 1;
return path;
}
static void printHelp(const char* name) {
fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
fprintf(stderr,
"Options:\n"
" --dm use 'deorummolae' engine\n"
" --sieve use 'sieve' engine (default)\n"
" -t# set target dictionary size (limit); default: 16K\n"
" -s# set slize length for 'sieve'; default: 33\n"
"# is a decimal number with optional k/K/m/M suffix.\n\n");
}
int main(int argc, char const* argv[]) {
int dictionaryArg = -1;
int method = METHOD_SIEVE;
int sieveSliceLen = 33;
int targetSize = 16 << 10;
std::vector<uint8_t> data;
std::vector<size_t> sizes;
size_t total = 0;
for (int i = 1; i < argc; ++i) {
if (argv[i] == nullptr) {
continue;
}
if (argv[i][0] == '-') {
if (argv[i][1] == '-') {
if (std::strcmp("--sieve", argv[i]) == 0) {
method = METHOD_SIEVE;
continue;
}
if (std::strcmp("--dm", argv[i]) == 0) {
method = METHOD_DM;
continue;
}
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
}
if (argv[i][1] == 's') {
sieveSliceLen = readInt(&argv[i][2]);
if (sieveSliceLen < 4 || sieveSliceLen > 256) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
}
} else if (argv[i][1] == 't') {
targetSize = readInt(&argv[i][2]);
if (targetSize < 256 || targetSize > (1 << 25)) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
}
} else {
printHelp(fileName(argv[0]));
fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
exit(1);
}
continue;
}
if (dictionaryArg == -1) {
dictionaryArg = i;
continue;
}
std::string content = readFile(argv[i]);
data.insert(data.end(), content.begin(), content.end());
total += content.size();
sizes.push_back(content.size());
}
if (dictionaryArg == -1 || total == 0) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Not enough arguments\n");
exit(1);
}
if (method == METHOD_SIEVE) {
writeFile(argv[dictionaryArg],
sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
} else if (method == METHOD_DM) {
writeFile(argv[dictionaryArg],
DM_generate(targetSize, sizes, data.data()));
} else {
printHelp(fileName(argv[0]));
fprintf(stderr, "Unknown generator\n");
exit(1);
}
return 0;
}

211
research/sieve.cc Executable file
View File

@ -0,0 +1,211 @@
#include "./sieve.h"
typedef struct Slot {
uint32_t next;
uint32_t offset;
uint16_t presence;
uint16_t mark;
} Slot;
static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
size_t middle, uint16_t minPresence, uint16_t iteration) {
int from = -2;
int to = -1;
size_t result = 0;
uint16_t targetPresence = minPresence;
for (uint32_t i = 0; i < end; ++i) {
if (i == middle) {
targetPresence++;
}
Slot& item = map[shortcut[i]];
if (item.mark != iteration) {
item.mark = iteration;
if (item.presence >= targetPresence) {
if (to < i) {
if (from > 0) {
result += to - from;
}
from = i;
}
to = i + sliceLen;
}
}
}
if (from > 0) {
result += to - from;
}
return result;
}
static std::string createDictionary(const uint8_t* data, size_t sliceLen,
Slot* map, uint32_t* shortcut, size_t end, size_t middle,
uint16_t minPresence, uint16_t iteration) {
std::string output;
int from = -2;
int to = -1;
uint16_t targetPresence = minPresence;
for (uint32_t i = 0; i < end; ++i) {
if (i == middle) {
targetPresence++;
}
Slot& item = map[shortcut[i]];
if (item.mark != iteration) {
item.mark = iteration;
if (item.presence >= targetPresence) {
if (to < i) {
if (from > 0) {
output.insert(output.end(), &data[from], &data[to]);
}
from = i;
}
to = i + sliceLen;
}
}
}
if (from > 0) {
output.insert(output.end(), &data[from], &data[to]);
}
return output;
}
std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
/* Parameters aliasing */
size_t targetSize = dictionary_size_limit;
size_t sliceLen = slice_len;
const uint8_t* data = sample_data;
size_t total = 0;
std::vector<size_t> offsets;
for (size_t i = 0; i < sample_sizes.size(); ++i) {
total += sample_sizes[i];
offsets.push_back(total);
}
/*****************************************************************************
* Build coverage map.
****************************************************************************/
std::vector<Slot> map;
std::vector<uint32_t> shortcut;
map.push_back({0, 0, 0, 0});
size_t end = total - sliceLen;
int hashLen = 8;
while ((1 << hashLen) < end) {
hashLen += 3;
}
hashLen -= 3;
uint32_t hashMask = (1u << hashLen) - 1u;
std::vector<uint32_t> hashHead(1 << hashLen);
uint32_t hashSlot = 1;
uint16_t piece = 0;
uint32_t hash = 0;
int lShift = 3;
int rShift = hashLen - lShift;
for (int i = 0; i < sliceLen - 1; ++i) {
uint32_t v = data[i];
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
}
int lShiftX = (lShift * (sliceLen - 1)) % hashLen;
int rShiftX = hashLen - lShiftX;
for (uint32_t i = 0; i < end; ++i) {
uint32_t v = data[i + sliceLen - 1];
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
if (offsets[piece] == i) {
piece++;
}
uint32_t slot = hashHead[hash];
while (slot != 0) {
Slot& item = map[slot];
int start = item.offset;
bool miss = false;
for (size_t j = 0; j < sliceLen; ++j) {
if (data[i + j] != data[start + j]) {
miss = true;
break;
}
}
if (!miss) {
if (item.mark != piece) {
item.mark = piece;
item.presence++;
}
shortcut.push_back(slot);
break;
}
slot = item.next;
}
if (slot == 0) {
map.push_back({hashHead[hash], i, 1, piece});
hashHead[hash] = hashSlot;
shortcut.push_back(hashSlot);
hashSlot++;
}
v = data[i];
hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
}
/*****************************************************************************
* Build dictionary of specified size.
****************************************************************************/
size_t a = 1;
size_t size = dryRun(
sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
/* Maximal output is smaller than target. */
if (size <= targetSize) {
return createDictionary(
data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
}
size_t b = offsets.size();
size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
if (size == targetSize) {
return createDictionary(
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
}
/* Run binary search. */
if (size < targetSize) {
/* size(a) > targetSize > size(b) && a < m < b */
while (a + 1 < b) {
size_t m = (a + b) / 2;
size = dryRun(
sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
if (size < targetSize) {
b = m;
} else if (size > targetSize) {
a = m;
} else {
return createDictionary(
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
}
}
} else {
a = b;
}
/* size(minPresence) > targetSize > size(minPresence + 1) */
size_t minPresence = a;
a = 0;
b = end;
/* size(a) < targetSize < size(b) && a < m < b */
while (a + 1 < b) {
size_t m = (a + b) / 2;
size = dryRun(
sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
if (size < targetSize) {
a = m;
} else if (size > targetSize) {
b = m;
} else {
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
m, minPresence, ++piece);
}
}
bool unrestricted = false;
if (minPresence <= 2 && !unrestricted) {
minPresence = 2;
a = end;
}
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a,
minPresence, ++piece);
}

21
research/sieve.h Executable file
View File

@ -0,0 +1,21 @@
#ifndef BROTLI_RESEARCH_SIEVE_H_
#define BROTLI_RESEARCH_SIEVE_H_
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
/**
* Generate a dictionary for given samples.
*
* @param dictionary_size_limit maximal dictionary size
* @param sample_sizes vector with sample sizes
* @param sample_data concatenated samples
* @return generated dictionary
*/
std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
#endif // BROTLI_RESEARCH_SIEVE_H_