mirror of
https://github.com/google/brotli.git
synced 2025-01-11 17:10:06 +00:00
Add new (fast) dictionary generator engine. (#616)
Add CLI for dictionary generation. Add BUILD file for research folder
This commit is contained in:
parent
9c75a2a26a
commit
39ef4bbdcf
29
research/BUILD
Executable file
29
research/BUILD
Executable file
@ -0,0 +1,29 @@
|
||||
# Description: brotli research tools.
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"]) # MIT
|
||||
|
||||
cc_library(
|
||||
name = "dm",
|
||||
srcs = ["deorummolae.cc"],
|
||||
hdrs = [
|
||||
"deorummolae.h",
|
||||
"esaxx/sais.hxx",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "sieve",
|
||||
srcs = ["sieve.cc"],
|
||||
hdrs = ["sieve.h"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "dictionary_generator",
|
||||
srcs = ["dictionary_generator.cc"],
|
||||
deps = [
|
||||
":dm",
|
||||
":sieve",
|
||||
],
|
||||
)
|
@ -1,7 +1,7 @@
|
||||
#include "./deorummolae.h"
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
||||
#include "./esaxx/sais.hxx"
|
||||
|
||||
@ -20,9 +20,7 @@
|
||||
/* File coverage: every bit set to 1 denotes a file covered by an isle. */
|
||||
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
|
||||
|
||||
static int popcount(uint64_t u) {
|
||||
return __builtin_popcountll(u);
|
||||
}
|
||||
static int popcount(uint64_t u) { return __builtin_popcountll(u); }
|
||||
|
||||
/* Condense terminators and pad file entries. */
|
||||
static void rewriteText(std::vector<int>* text) {
|
||||
@ -155,9 +153,8 @@ static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
|
||||
}
|
||||
}
|
||||
|
||||
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
|
||||
size_t num_samples, const size_t* sample_sizes,
|
||||
const uint8_t* sample_data) {
|
||||
std::string DM_generate(size_t dictionary_size_limit,
|
||||
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
|
||||
{
|
||||
uint64_t tmp = 0;
|
||||
if (popcount(tmp - 1u) != 64) {
|
||||
@ -169,9 +166,11 @@ size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
|
||||
/* Could use 256 + '0' for easier debugging. */
|
||||
int next_terminator = 256;
|
||||
|
||||
std::string output;
|
||||
std::vector<std::vector<int>> data;
|
||||
|
||||
size_t offset = 0;
|
||||
size_t num_samples = sample_sizes.size();
|
||||
if (num_samples > MAX_FILES) num_samples = MAX_FILES;
|
||||
for (size_t n = 0; n < num_samples; ++n) {
|
||||
size_t next_offset = offset + sample_sizes[n];
|
||||
@ -208,7 +207,7 @@ size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
|
||||
buildLcp(&full_text, &sa, &lcp, &invese_sa);
|
||||
|
||||
/* Do not rebuild SA/LCP, just use different selection. */
|
||||
retry:
|
||||
retry:
|
||||
best_cost = 0;
|
||||
best_isle = {0, 0, 0, {{0}}};
|
||||
isles.resize(0);
|
||||
@ -259,18 +258,16 @@ retry:
|
||||
}
|
||||
|
||||
/* Save the entry. */
|
||||
fprintf(stderr,
|
||||
"Savings: %zu+%zu, dictionary: %zu+%d\n",
|
||||
total_cost, best_cost, total, best_isle.lcp);
|
||||
for (size_t i = 0; i < best_isle.lcp; ++i) {
|
||||
dictionary[total + i] =
|
||||
static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
|
||||
}
|
||||
fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n",
|
||||
total_cost, best_cost, total, best_isle.lcp);
|
||||
int* piece = &full_text[sa[best_isle.l]];
|
||||
output.insert(output.end(), piece, piece + best_isle.lcp);
|
||||
total += best_isle.lcp;
|
||||
total_cost += best_cost;
|
||||
cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
|
||||
&invese_sa, &next_terminator, &file_map, &file_offset);
|
||||
cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
|
||||
&next_terminator, &file_map, &file_offset);
|
||||
if (total >= dictionary_size_limit) break;
|
||||
}
|
||||
return total;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
@ -4,6 +4,9 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/* log2(maximal number of files). Value 6 provides some speedups. */
|
||||
#define LOG_MAX_FILES 6
|
||||
|
||||
@ -13,15 +16,12 @@
|
||||
/**
|
||||
* Generate a dictionary for given samples.
|
||||
*
|
||||
* @param dictionary storage for generated dictionary
|
||||
* @param dictionary_size_limit maximal dictionary size
|
||||
* @param num_samples number of samples
|
||||
* @param sample_sizes array with sample sizes
|
||||
* @param sample_sizes vector with sample sizes
|
||||
* @param sample_data concatenated samples
|
||||
* @return generated dictionary size
|
||||
* @return generated dictionary
|
||||
*/
|
||||
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
|
||||
size_t num_samples, const size_t* sample_sizes,
|
||||
const uint8_t* sample_data);
|
||||
std::string DM_generate(size_t dictionary_size_limit,
|
||||
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
|
||||
|
||||
#endif // BROTLI_RESEARCH_DEORUMMOLAE_H_
|
||||
|
153
research/dictionary_generator.cc
Executable file
153
research/dictionary_generator.cc
Executable file
@ -0,0 +1,153 @@
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#include "./deorummolae.h"
|
||||
#include "./sieve.h"
|
||||
|
||||
#define METHOD_DM 0
|
||||
#define METHOD_SIEVE 1
|
||||
|
||||
size_t readInt(const char* str) {
|
||||
size_t result = 0;
|
||||
if (str[0] == 0 || str[0] == '0') {
|
||||
return 0;
|
||||
}
|
||||
for (size_t i = 0; i < 13; ++i) {
|
||||
if (str[i] == 0) {
|
||||
return result;
|
||||
}
|
||||
if (str[i] == 'k' || str[i] == 'K') {
|
||||
if ((str[i + 1] == 0) && ((result << 10) > result)) {
|
||||
return result << 10;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
if (str[i] == 'm' || str[i] == 'M') {
|
||||
if ((str[i + 1] == 0) && ((result << 20) > result)) {
|
||||
return result << 20;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
if (str[i] < '0' || str[i] > '9') {
|
||||
return 0;
|
||||
}
|
||||
size_t next = (10 * result) + (str[i] - '0');
|
||||
if (next <= result) {
|
||||
return 0;
|
||||
}
|
||||
result = next;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static std::string readFile(const std::string& path) {
|
||||
std::ifstream file(path);
|
||||
std::string content(
|
||||
(std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
return content;
|
||||
}
|
||||
|
||||
static void writeFile(const char* file, const std::string& content) {
|
||||
std::ofstream outfile(file, std::ofstream::binary);
|
||||
outfile.write(content.c_str(), content.size());
|
||||
outfile.close();
|
||||
}
|
||||
|
||||
/* Returns "base file name" or its tail, if it contains '/' or '\'. */
|
||||
static const char* fileName(const char* path) {
|
||||
const char* separator_position = strrchr(path, '/');
|
||||
if (separator_position) path = separator_position + 1;
|
||||
separator_position = strrchr(path, '\\');
|
||||
if (separator_position) path = separator_position + 1;
|
||||
return path;
|
||||
}
|
||||
|
||||
static void printHelp(const char* name) {
|
||||
fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
|
||||
fprintf(stderr,
|
||||
"Options:\n"
|
||||
" --dm use 'deorummolae' engine\n"
|
||||
" --sieve use 'sieve' engine (default)\n"
|
||||
" -t# set target dictionary size (limit); default: 16K\n"
|
||||
" -s# set slize length for 'sieve'; default: 33\n"
|
||||
"# is a decimal number with optional k/K/m/M suffix.\n\n");
|
||||
}
|
||||
|
||||
int main(int argc, char const* argv[]) {
|
||||
int dictionaryArg = -1;
|
||||
int method = METHOD_SIEVE;
|
||||
int sieveSliceLen = 33;
|
||||
int targetSize = 16 << 10;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
std::vector<size_t> sizes;
|
||||
size_t total = 0;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (argv[i] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (argv[i][0] == '-') {
|
||||
if (argv[i][1] == '-') {
|
||||
if (std::strcmp("--sieve", argv[i]) == 0) {
|
||||
method = METHOD_SIEVE;
|
||||
continue;
|
||||
}
|
||||
if (std::strcmp("--dm", argv[i]) == 0) {
|
||||
method = METHOD_DM;
|
||||
continue;
|
||||
}
|
||||
printHelp(fileName(argv[0]));
|
||||
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
|
||||
exit(1);
|
||||
}
|
||||
if (argv[i][1] == 's') {
|
||||
sieveSliceLen = readInt(&argv[i][2]);
|
||||
if (sieveSliceLen < 4 || sieveSliceLen > 256) {
|
||||
printHelp(fileName(argv[0]));
|
||||
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
|
||||
exit(1);
|
||||
}
|
||||
} else if (argv[i][1] == 't') {
|
||||
targetSize = readInt(&argv[i][2]);
|
||||
if (targetSize < 256 || targetSize > (1 << 25)) {
|
||||
printHelp(fileName(argv[0]));
|
||||
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
printHelp(fileName(argv[0]));
|
||||
fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
|
||||
exit(1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (dictionaryArg == -1) {
|
||||
dictionaryArg = i;
|
||||
continue;
|
||||
}
|
||||
std::string content = readFile(argv[i]);
|
||||
data.insert(data.end(), content.begin(), content.end());
|
||||
total += content.size();
|
||||
sizes.push_back(content.size());
|
||||
}
|
||||
if (dictionaryArg == -1 || total == 0) {
|
||||
printHelp(fileName(argv[0]));
|
||||
fprintf(stderr, "Not enough arguments\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (method == METHOD_SIEVE) {
|
||||
writeFile(argv[dictionaryArg],
|
||||
sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
|
||||
} else if (method == METHOD_DM) {
|
||||
writeFile(argv[dictionaryArg],
|
||||
DM_generate(targetSize, sizes, data.data()));
|
||||
} else {
|
||||
printHelp(fileName(argv[0]));
|
||||
fprintf(stderr, "Unknown generator\n");
|
||||
exit(1);
|
||||
}
|
||||
return 0;
|
||||
}
|
211
research/sieve.cc
Executable file
211
research/sieve.cc
Executable file
@ -0,0 +1,211 @@
|
||||
#include "./sieve.h"
|
||||
|
||||
typedef struct Slot {
|
||||
uint32_t next;
|
||||
uint32_t offset;
|
||||
uint16_t presence;
|
||||
uint16_t mark;
|
||||
} Slot;
|
||||
|
||||
static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
|
||||
size_t middle, uint16_t minPresence, uint16_t iteration) {
|
||||
int from = -2;
|
||||
int to = -1;
|
||||
size_t result = 0;
|
||||
uint16_t targetPresence = minPresence;
|
||||
for (uint32_t i = 0; i < end; ++i) {
|
||||
if (i == middle) {
|
||||
targetPresence++;
|
||||
}
|
||||
Slot& item = map[shortcut[i]];
|
||||
if (item.mark != iteration) {
|
||||
item.mark = iteration;
|
||||
if (item.presence >= targetPresence) {
|
||||
if (to < i) {
|
||||
if (from > 0) {
|
||||
result += to - from;
|
||||
}
|
||||
from = i;
|
||||
}
|
||||
to = i + sliceLen;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (from > 0) {
|
||||
result += to - from;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string createDictionary(const uint8_t* data, size_t sliceLen,
|
||||
Slot* map, uint32_t* shortcut, size_t end, size_t middle,
|
||||
uint16_t minPresence, uint16_t iteration) {
|
||||
std::string output;
|
||||
int from = -2;
|
||||
int to = -1;
|
||||
uint16_t targetPresence = minPresence;
|
||||
for (uint32_t i = 0; i < end; ++i) {
|
||||
if (i == middle) {
|
||||
targetPresence++;
|
||||
}
|
||||
Slot& item = map[shortcut[i]];
|
||||
if (item.mark != iteration) {
|
||||
item.mark = iteration;
|
||||
if (item.presence >= targetPresence) {
|
||||
if (to < i) {
|
||||
if (from > 0) {
|
||||
output.insert(output.end(), &data[from], &data[to]);
|
||||
}
|
||||
from = i;
|
||||
}
|
||||
to = i + sliceLen;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (from > 0) {
|
||||
output.insert(output.end(), &data[from], &data[to]);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
|
||||
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
|
||||
/* Parameters aliasing */
|
||||
size_t targetSize = dictionary_size_limit;
|
||||
size_t sliceLen = slice_len;
|
||||
const uint8_t* data = sample_data;
|
||||
|
||||
size_t total = 0;
|
||||
std::vector<size_t> offsets;
|
||||
for (size_t i = 0; i < sample_sizes.size(); ++i) {
|
||||
total += sample_sizes[i];
|
||||
offsets.push_back(total);
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
* Build coverage map.
|
||||
****************************************************************************/
|
||||
std::vector<Slot> map;
|
||||
std::vector<uint32_t> shortcut;
|
||||
map.push_back({0, 0, 0, 0});
|
||||
size_t end = total - sliceLen;
|
||||
int hashLen = 8;
|
||||
while ((1 << hashLen) < end) {
|
||||
hashLen += 3;
|
||||
}
|
||||
hashLen -= 3;
|
||||
uint32_t hashMask = (1u << hashLen) - 1u;
|
||||
std::vector<uint32_t> hashHead(1 << hashLen);
|
||||
uint32_t hashSlot = 1;
|
||||
uint16_t piece = 0;
|
||||
uint32_t hash = 0;
|
||||
int lShift = 3;
|
||||
int rShift = hashLen - lShift;
|
||||
for (int i = 0; i < sliceLen - 1; ++i) {
|
||||
uint32_t v = data[i];
|
||||
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
|
||||
}
|
||||
int lShiftX = (lShift * (sliceLen - 1)) % hashLen;
|
||||
int rShiftX = hashLen - lShiftX;
|
||||
for (uint32_t i = 0; i < end; ++i) {
|
||||
uint32_t v = data[i + sliceLen - 1];
|
||||
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
|
||||
|
||||
if (offsets[piece] == i) {
|
||||
piece++;
|
||||
}
|
||||
uint32_t slot = hashHead[hash];
|
||||
while (slot != 0) {
|
||||
Slot& item = map[slot];
|
||||
int start = item.offset;
|
||||
bool miss = false;
|
||||
for (size_t j = 0; j < sliceLen; ++j) {
|
||||
if (data[i + j] != data[start + j]) {
|
||||
miss = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!miss) {
|
||||
if (item.mark != piece) {
|
||||
item.mark = piece;
|
||||
item.presence++;
|
||||
}
|
||||
shortcut.push_back(slot);
|
||||
break;
|
||||
}
|
||||
slot = item.next;
|
||||
}
|
||||
if (slot == 0) {
|
||||
map.push_back({hashHead[hash], i, 1, piece});
|
||||
hashHead[hash] = hashSlot;
|
||||
shortcut.push_back(hashSlot);
|
||||
hashSlot++;
|
||||
}
|
||||
v = data[i];
|
||||
hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
* Build dictionary of specified size.
|
||||
****************************************************************************/
|
||||
size_t a = 1;
|
||||
size_t size = dryRun(
|
||||
sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
|
||||
/* Maximal output is smaller than target. */
|
||||
if (size <= targetSize) {
|
||||
return createDictionary(
|
||||
data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
|
||||
}
|
||||
|
||||
size_t b = offsets.size();
|
||||
size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
|
||||
if (size == targetSize) {
|
||||
return createDictionary(
|
||||
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
|
||||
}
|
||||
/* Run binary search. */
|
||||
if (size < targetSize) {
|
||||
/* size(a) > targetSize > size(b) && a < m < b */
|
||||
while (a + 1 < b) {
|
||||
size_t m = (a + b) / 2;
|
||||
size = dryRun(
|
||||
sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
|
||||
if (size < targetSize) {
|
||||
b = m;
|
||||
} else if (size > targetSize) {
|
||||
a = m;
|
||||
} else {
|
||||
return createDictionary(
|
||||
data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
a = b;
|
||||
}
|
||||
/* size(minPresence) > targetSize > size(minPresence + 1) */
|
||||
size_t minPresence = a;
|
||||
a = 0;
|
||||
b = end;
|
||||
/* size(a) < targetSize < size(b) && a < m < b */
|
||||
while (a + 1 < b) {
|
||||
size_t m = (a + b) / 2;
|
||||
size = dryRun(
|
||||
sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
|
||||
if (size < targetSize) {
|
||||
a = m;
|
||||
} else if (size > targetSize) {
|
||||
b = m;
|
||||
} else {
|
||||
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
|
||||
m, minPresence, ++piece);
|
||||
}
|
||||
}
|
||||
|
||||
bool unrestricted = false;
|
||||
if (minPresence <= 2 && !unrestricted) {
|
||||
minPresence = 2;
|
||||
a = end;
|
||||
}
|
||||
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a,
|
||||
minPresence, ++piece);
|
||||
}
|
21
research/sieve.h
Executable file
21
research/sieve.h
Executable file
@ -0,0 +1,21 @@
|
||||
#ifndef BROTLI_RESEARCH_SIEVE_H_
|
||||
#define BROTLI_RESEARCH_SIEVE_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* Generate a dictionary for given samples.
|
||||
*
|
||||
* @param dictionary_size_limit maximal dictionary size
|
||||
* @param sample_sizes vector with sample sizes
|
||||
* @param sample_data concatenated samples
|
||||
* @return generated dictionary
|
||||
*/
|
||||
std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
|
||||
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
|
||||
|
||||
#endif // BROTLI_RESEARCH_SIEVE_H_
|
Loading…
Reference in New Issue
Block a user