From 39ef4bbdcffd6316b5d3e95904cf9b0aa5e19504 Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Fri, 13 Oct 2017 11:25:03 +0200 Subject: [PATCH] Add new (fast) dictionary generator engine. (#616) Add CLI for dictionary generation. Add BUILD file for research folder --- research/BUILD | 29 +++++ research/deorummolae.cc | 33 +++-- research/deorummolae.h | 14 +- research/dictionary_generator.cc | 153 ++++++++++++++++++++++ research/sieve.cc | 211 +++++++++++++++++++++++++++++++ research/sieve.h | 21 +++ 6 files changed, 436 insertions(+), 25 deletions(-) create mode 100755 research/BUILD create mode 100755 research/dictionary_generator.cc create mode 100755 research/sieve.cc create mode 100755 research/sieve.h diff --git a/research/BUILD b/research/BUILD new file mode 100755 index 0000000..6ff5ac2 --- /dev/null +++ b/research/BUILD @@ -0,0 +1,29 @@ +# Description: brotli research tools. + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # MIT + +cc_library( + name = "dm", + srcs = ["deorummolae.cc"], + hdrs = [ + "deorummolae.h", + "esaxx/sais.hxx", + ], +) + +cc_library( + name = "sieve", + srcs = ["sieve.cc"], + hdrs = ["sieve.h"], +) + +cc_binary( + name = "dictionary_generator", + srcs = ["dictionary_generator.cc"], + deps = [ + ":dm", + ":sieve", + ], +) diff --git a/research/deorummolae.cc b/research/deorummolae.cc index f4cc53e..c53a53c 100644 --- a/research/deorummolae.cc +++ b/research/deorummolae.cc @@ -1,7 +1,7 @@ #include "./deorummolae.h" #include -#include +#include #include "./esaxx/sais.hxx" @@ -20,9 +20,7 @@ /* File coverage: every bit set to 1 denotes a file covered by an isle. */ typedef std::array Coverage; -static int popcount(uint64_t u) { - return __builtin_popcountll(u); -} +static int popcount(uint64_t u) { return __builtin_popcountll(u); } /* Condense terminators and pad file entries. */ static void rewriteText(std::vector* text) { @@ -155,9 +153,8 @@ static void cutMatch(std::vector>* data, int index, int length, } } -size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit, - size_t num_samples, const size_t* sample_sizes, - const uint8_t* sample_data) { +std::string DM_generate(size_t dictionary_size_limit, + const std::vector& sample_sizes, const uint8_t* sample_data) { { uint64_t tmp = 0; if (popcount(tmp - 1u) != 64) { @@ -169,9 +166,11 @@ size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit, /* Could use 256 + '0' for easier debugging. */ int next_terminator = 256; + std::string output; std::vector> data; size_t offset = 0; + size_t num_samples = sample_sizes.size(); if (num_samples > MAX_FILES) num_samples = MAX_FILES; for (size_t n = 0; n < num_samples; ++n) { size_t next_offset = offset + sample_sizes[n]; @@ -208,7 +207,7 @@ size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit, buildLcp(&full_text, &sa, &lcp, &invese_sa); /* Do not rebuild SA/LCP, just use different selection. */ -retry: + retry: best_cost = 0; best_isle = {0, 0, 0, {{0}}}; isles.resize(0); @@ -259,18 +258,16 @@ retry: } /* Save the entry. */ - fprintf(stderr, - "Savings: %zu+%zu, dictionary: %zu+%d\n", - total_cost, best_cost, total, best_isle.lcp); - for (size_t i = 0; i < best_isle.lcp; ++i) { - dictionary[total + i] = - static_cast(full_text[sa[best_isle.l] + i]); - } + fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n", + total_cost, best_cost, total, best_isle.lcp); + int* piece = &full_text[sa[best_isle.l]]; + output.insert(output.end(), piece, piece + best_isle.lcp); total += best_isle.lcp; total_cost += best_cost; - cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, - &invese_sa, &next_terminator, &file_map, &file_offset); + cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa, + &next_terminator, &file_map, &file_offset); if (total >= dictionary_size_limit) break; } - return total; + + return output; } diff --git a/research/deorummolae.h b/research/deorummolae.h index f37015c..7f24add 100644 --- a/research/deorummolae.h +++ b/research/deorummolae.h @@ -4,6 +4,9 @@ #include #include +#include +#include + /* log2(maximal number of files). Value 6 provides some speedups. */ #define LOG_MAX_FILES 6 @@ -13,15 +16,12 @@ /** * Generate a dictionary for given samples. * - * @param dictionary storage for generated dictionary * @param dictionary_size_limit maximal dictionary size - * @param num_samples number of samples - * @param sample_sizes array with sample sizes + * @param sample_sizes vector with sample sizes * @param sample_data concatenated samples - * @return generated dictionary size + * @return generated dictionary */ -size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit, - size_t num_samples, const size_t* sample_sizes, - const uint8_t* sample_data); +std::string DM_generate(size_t dictionary_size_limit, + const std::vector& sample_sizes, const uint8_t* sample_data); #endif // BROTLI_RESEARCH_DEORUMMOLAE_H_ diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc new file mode 100755 index 0000000..b3ee89c --- /dev/null +++ b/research/dictionary_generator.cc @@ -0,0 +1,153 @@ +#include +#include +#include +#include + +#include "./deorummolae.h" +#include "./sieve.h" + +#define METHOD_DM 0 +#define METHOD_SIEVE 1 + +size_t readInt(const char* str) { + size_t result = 0; + if (str[0] == 0 || str[0] == '0') { + return 0; + } + for (size_t i = 0; i < 13; ++i) { + if (str[i] == 0) { + return result; + } + if (str[i] == 'k' || str[i] == 'K') { + if ((str[i + 1] == 0) && ((result << 10) > result)) { + return result << 10; + } + return 0; + } + if (str[i] == 'm' || str[i] == 'M') { + if ((str[i + 1] == 0) && ((result << 20) > result)) { + return result << 20; + } + return 0; + } + if (str[i] < '0' || str[i] > '9') { + return 0; + } + size_t next = (10 * result) + (str[i] - '0'); + if (next <= result) { + return 0; + } + result = next; + } + return 0; +} + +static std::string readFile(const std::string& path) { + std::ifstream file(path); + std::string content( + (std::istreambuf_iterator(file)), std::istreambuf_iterator()); + return content; +} + +static void writeFile(const char* file, const std::string& content) { + std::ofstream outfile(file, std::ofstream::binary); + outfile.write(content.c_str(), content.size()); + outfile.close(); +} + +/* Returns "base file name" or its tail, if it contains '/' or '\'. */ +static const char* fileName(const char* path) { + const char* separator_position = strrchr(path, '/'); + if (separator_position) path = separator_position + 1; + separator_position = strrchr(path, '\\'); + if (separator_position) path = separator_position + 1; + return path; +} + +static void printHelp(const char* name) { + fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name); + fprintf(stderr, + "Options:\n" + " --dm use 'deorummolae' engine\n" + " --sieve use 'sieve' engine (default)\n" + " -t# set target dictionary size (limit); default: 16K\n" + " -s# set slize length for 'sieve'; default: 33\n" + "# is a decimal number with optional k/K/m/M suffix.\n\n"); +} + +int main(int argc, char const* argv[]) { + int dictionaryArg = -1; + int method = METHOD_SIEVE; + int sieveSliceLen = 33; + int targetSize = 16 << 10; + + std::vector data; + std::vector sizes; + size_t total = 0; + for (int i = 1; i < argc; ++i) { + if (argv[i] == nullptr) { + continue; + } + if (argv[i][0] == '-') { + if (argv[i][1] == '-') { + if (std::strcmp("--sieve", argv[i]) == 0) { + method = METHOD_SIEVE; + continue; + } + if (std::strcmp("--dm", argv[i]) == 0) { + method = METHOD_DM; + continue; + } + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + if (argv[i][1] == 's') { + sieveSliceLen = readInt(&argv[i][2]); + if (sieveSliceLen < 4 || sieveSliceLen > 256) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (argv[i][1] == 't') { + targetSize = readInt(&argv[i][2]); + if (targetSize < 256 || targetSize > (1 << 25)) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else { + printHelp(fileName(argv[0])); + fprintf(stderr, "Unrecognized option '%s'\n", argv[i]); + exit(1); + } + continue; + } + if (dictionaryArg == -1) { + dictionaryArg = i; + continue; + } + std::string content = readFile(argv[i]); + data.insert(data.end(), content.begin(), content.end()); + total += content.size(); + sizes.push_back(content.size()); + } + if (dictionaryArg == -1 || total == 0) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Not enough arguments\n"); + exit(1); + } + + if (method == METHOD_SIEVE) { + writeFile(argv[dictionaryArg], + sieve_generate(targetSize, sieveSliceLen, sizes, data.data())); + } else if (method == METHOD_DM) { + writeFile(argv[dictionaryArg], + DM_generate(targetSize, sizes, data.data())); + } else { + printHelp(fileName(argv[0])); + fprintf(stderr, "Unknown generator\n"); + exit(1); + } + return 0; +} diff --git a/research/sieve.cc b/research/sieve.cc new file mode 100755 index 0000000..fbc1dbf --- /dev/null +++ b/research/sieve.cc @@ -0,0 +1,211 @@ +#include "./sieve.h" + +typedef struct Slot { + uint32_t next; + uint32_t offset; + uint16_t presence; + uint16_t mark; +} Slot; + +static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end, + size_t middle, uint16_t minPresence, uint16_t iteration) { + int from = -2; + int to = -1; + size_t result = 0; + uint16_t targetPresence = minPresence; + for (uint32_t i = 0; i < end; ++i) { + if (i == middle) { + targetPresence++; + } + Slot& item = map[shortcut[i]]; + if (item.mark != iteration) { + item.mark = iteration; + if (item.presence >= targetPresence) { + if (to < i) { + if (from > 0) { + result += to - from; + } + from = i; + } + to = i + sliceLen; + } + } + } + if (from > 0) { + result += to - from; + } + return result; +} + +static std::string createDictionary(const uint8_t* data, size_t sliceLen, + Slot* map, uint32_t* shortcut, size_t end, size_t middle, + uint16_t minPresence, uint16_t iteration) { + std::string output; + int from = -2; + int to = -1; + uint16_t targetPresence = minPresence; + for (uint32_t i = 0; i < end; ++i) { + if (i == middle) { + targetPresence++; + } + Slot& item = map[shortcut[i]]; + if (item.mark != iteration) { + item.mark = iteration; + if (item.presence >= targetPresence) { + if (to < i) { + if (from > 0) { + output.insert(output.end(), &data[from], &data[to]); + } + from = i; + } + to = i + sliceLen; + } + } + } + if (from > 0) { + output.insert(output.end(), &data[from], &data[to]); + } + return output; +} + +std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, + const std::vector& sample_sizes, const uint8_t* sample_data) { + /* Parameters aliasing */ + size_t targetSize = dictionary_size_limit; + size_t sliceLen = slice_len; + const uint8_t* data = sample_data; + + size_t total = 0; + std::vector offsets; + for (size_t i = 0; i < sample_sizes.size(); ++i) { + total += sample_sizes[i]; + offsets.push_back(total); + } + + /***************************************************************************** + * Build coverage map. + ****************************************************************************/ + std::vector map; + std::vector shortcut; + map.push_back({0, 0, 0, 0}); + size_t end = total - sliceLen; + int hashLen = 8; + while ((1 << hashLen) < end) { + hashLen += 3; + } + hashLen -= 3; + uint32_t hashMask = (1u << hashLen) - 1u; + std::vector hashHead(1 << hashLen); + uint32_t hashSlot = 1; + uint16_t piece = 0; + uint32_t hash = 0; + int lShift = 3; + int rShift = hashLen - lShift; + for (int i = 0; i < sliceLen - 1; ++i) { + uint32_t v = data[i]; + hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; + } + int lShiftX = (lShift * (sliceLen - 1)) % hashLen; + int rShiftX = hashLen - lShiftX; + for (uint32_t i = 0; i < end; ++i) { + uint32_t v = data[i + sliceLen - 1]; + hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; + + if (offsets[piece] == i) { + piece++; + } + uint32_t slot = hashHead[hash]; + while (slot != 0) { + Slot& item = map[slot]; + int start = item.offset; + bool miss = false; + for (size_t j = 0; j < sliceLen; ++j) { + if (data[i + j] != data[start + j]) { + miss = true; + break; + } + } + if (!miss) { + if (item.mark != piece) { + item.mark = piece; + item.presence++; + } + shortcut.push_back(slot); + break; + } + slot = item.next; + } + if (slot == 0) { + map.push_back({hashHead[hash], i, 1, piece}); + hashHead[hash] = hashSlot; + shortcut.push_back(hashSlot); + hashSlot++; + } + v = data[i]; + hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask; + } + + /***************************************************************************** + * Build dictionary of specified size. + ****************************************************************************/ + size_t a = 1; + size_t size = dryRun( + sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); + /* Maximal output is smaller than target. */ + if (size <= targetSize) { + return createDictionary( + data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); + } + + size_t b = offsets.size(); + size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); + if (size == targetSize) { + return createDictionary( + data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); + } + /* Run binary search. */ + if (size < targetSize) { + /* size(a) > targetSize > size(b) && a < m < b */ + while (a + 1 < b) { + size_t m = (a + b) / 2; + size = dryRun( + sliceLen, map.data(), shortcut.data(), end, end, m, ++piece); + if (size < targetSize) { + b = m; + } else if (size > targetSize) { + a = m; + } else { + return createDictionary( + data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); + } + } + } else { + a = b; + } + /* size(minPresence) > targetSize > size(minPresence + 1) */ + size_t minPresence = a; + a = 0; + b = end; + /* size(a) < targetSize < size(b) && a < m < b */ + while (a + 1 < b) { + size_t m = (a + b) / 2; + size = dryRun( + sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); + if (size < targetSize) { + a = m; + } else if (size > targetSize) { + b = m; + } else { + return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, + m, minPresence, ++piece); + } + } + + bool unrestricted = false; + if (minPresence <= 2 && !unrestricted) { + minPresence = 2; + a = end; + } + return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a, + minPresence, ++piece); +} diff --git a/research/sieve.h b/research/sieve.h new file mode 100755 index 0000000..f529835 --- /dev/null +++ b/research/sieve.h @@ -0,0 +1,21 @@ +#ifndef BROTLI_RESEARCH_SIEVE_H_ +#define BROTLI_RESEARCH_SIEVE_H_ + +#include +#include + +#include +#include + +/** + * Generate a dictionary for given samples. + * + * @param dictionary_size_limit maximal dictionary size + * @param sample_sizes vector with sample sizes + * @param sample_data concatenated samples + * @return generated dictionary + */ +std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, + const std::vector& sample_sizes, const uint8_t* sample_data); + +#endif // BROTLI_RESEARCH_SIEVE_H_