mirror of
https://github.com/google/brotli.git
synced 2024-11-21 19:20:09 +00:00
148 lines
4.2 KiB
C++
148 lines
4.2 KiB
C++
/* Copyright 2016 Google Inc. All Rights Reserved.
|
|
Author: vanickulin@google.com (Ivan Nikulin)
|
|
|
|
Distributed under MIT license.
|
|
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
|
*/
|
|
|
|
/* Tool for generating optimal backward references for the input file. Uses
|
|
sais-lite library for building suffix array. */
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
|
|
#include "./sais.hxx"
|
|
|
|
const int FLAGS_min_length = 1; // Minimal length of found backward references.
|
|
const int FLAGS_skip = 1; // Number of bytes to skip.
|
|
|
|
const size_t kFileBufferSize = (1 << 16); // 64KB
|
|
|
|
typedef int sarray_type; // Can't make it unsigned because of templates :(
|
|
typedef uint8_t input_type;
|
|
typedef uint32_t lcp_type;
|
|
|
|
void ReadInput(FILE* fin, input_type* storage, size_t input_size) {
|
|
size_t last_pos = 0;
|
|
size_t available_in;
|
|
fseek(fin, 0, SEEK_SET);
|
|
do {
|
|
available_in = fread(storage + last_pos, 1, kFileBufferSize, fin);
|
|
last_pos += available_in;
|
|
} while (available_in != 0);
|
|
assert(last_pos == input_size);
|
|
}
|
|
|
|
void BuildLCP(input_type* storage, sarray_type* sarray, lcp_type* lcp,
|
|
size_t size, uint32_t* pos) {
|
|
for (int i = 0; i < size; ++i) {
|
|
pos[sarray[i]] = i;
|
|
}
|
|
uint32_t k = 0;
|
|
lcp[size - 1] = 0;
|
|
for (int i = 0; i < size; ++i) {
|
|
if (pos[i] == size - 1) {
|
|
k = 0;
|
|
continue;
|
|
}
|
|
uint32_t j = sarray[pos[i] + 1]; // Suffix which follow i-th suffix in SA.
|
|
while (i + k < size && j + k < size && storage[i + k] == storage[j + k]) {
|
|
++k;
|
|
}
|
|
lcp[pos[i]] = k;
|
|
if (k > 0) --k;
|
|
}
|
|
}
|
|
|
|
void ProcessReferences(input_type* storage, sarray_type* sarray, lcp_type* lcp,
|
|
size_t size, uint32_t* pos, FILE* fout) {
|
|
int min_length = FLAGS_min_length;
|
|
for (int idx = FLAGS_skip; idx < size; ++idx) {
|
|
int max_lcp = -1;
|
|
int max_lcp_ix;
|
|
int left_lcp = -1;
|
|
int left_ix;
|
|
for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) {
|
|
if (left_lcp == -1 || left_lcp > lcp[left_ix]) {
|
|
left_lcp = lcp[left_ix];
|
|
}
|
|
if (left_lcp == 0) break;
|
|
if (sarray[left_ix] < idx) break;
|
|
}
|
|
if (left_ix >= 0) {
|
|
max_lcp = left_lcp;
|
|
max_lcp_ix = left_ix;
|
|
}
|
|
|
|
int right_lcp = -1;
|
|
int right_ix;
|
|
for (right_ix = pos[idx]; right_ix < size - 1; ++right_ix) {
|
|
if (right_lcp == -1 || right_lcp > lcp[right_ix]) {
|
|
right_lcp = lcp[right_ix];
|
|
}
|
|
// Stop if we have better result from the left side already.
|
|
if (right_lcp < max_lcp) break;
|
|
if (right_lcp == 0) break;
|
|
if (sarray[right_ix] < idx) break;
|
|
}
|
|
if (right_lcp > max_lcp && right_ix < size - 1) {
|
|
max_lcp = right_lcp;
|
|
max_lcp_ix = right_ix;
|
|
}
|
|
|
|
if (max_lcp >= min_length) {
|
|
int dist = idx - sarray[max_lcp_ix];
|
|
if (dist <= 0) {
|
|
printf("idx = %d, pos[idx] = %u\n", idx, pos[idx]);
|
|
printf("left_ix = %d, right_ix = %d\n",
|
|
left_ix, right_ix);
|
|
printf("left_lcp = %d, right_lcp = %d\n",
|
|
left_lcp, right_lcp);
|
|
printf("sarray[left_ix] = %d, sarray[right_ix] = %d\n",
|
|
sarray[left_ix], sarray[right_ix]);
|
|
assert(dist > 0);
|
|
}
|
|
fputc(1, fout);
|
|
fwrite(&idx, sizeof(int), 1, fout); // Position in input.
|
|
fwrite(&dist, sizeof(int), 1, fout); // Backward distance.
|
|
}
|
|
}
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
if (argc != 3) {
|
|
printf("usage: %s input_file output_file\n", argv[0]);
|
|
return 1;
|
|
}
|
|
|
|
FILE* fin = fopen(argv[1], "rb");
|
|
FILE* fout = fopen(argv[2], "w");
|
|
|
|
fseek(fin, 0, SEEK_END);
|
|
int input_size = ftell(fin);
|
|
fseek(fin, 0, SEEK_SET);
|
|
printf("The file size is %u bytes\n", input_size);
|
|
|
|
input_type* storage = new input_type[input_size];
|
|
|
|
ReadInput(fin, storage, input_size);
|
|
fclose(fin);
|
|
|
|
sarray_type* sarray = new sarray_type[input_size];
|
|
saisxx(storage, sarray, input_size);
|
|
printf("Suffix array calculated.\n");
|
|
|
|
// Inverse suffix array.
|
|
uint32_t* pos = new uint32_t[input_size];
|
|
|
|
lcp_type* lcp = new lcp_type[input_size];
|
|
BuildLCP(storage, sarray, lcp, input_size, pos);
|
|
printf("LCP array constructed.\n");
|
|
|
|
ProcessReferences(storage, sarray, lcp, input_size, pos, fout);
|
|
fclose(fout);
|
|
return 0;
|
|
}
|