Add the initial version of the static dictionary and transforms to Brotli.

This commit is contained in:
Zoltan Szabadka 2014-02-17 14:25:36 +01:00
parent 0454ab4ec0
commit 2f268ad158
9 changed files with 12645 additions and 30 deletions

File diff suppressed because it is too large Load Diff

View File

@ -19,6 +19,8 @@
#include "./bit_reader.h"
#include "./context.h"
#include "./decode.h"
#include "./dictionary.h"
#include "./transform.h"
#include "./huffman.h"
#include "./prefix.h"
#include "./safe_malloc.h"
@ -977,11 +979,44 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) {
copy_dst = &ringbuffer[pos & ringbuffer_mask];
if (distance > max_distance) {
printf("Invalid backward reference. pos: %d distance: %d "
"len: %d bytes left: %d\n", pos, distance, copy_length,
meta_block_remaining_len);
ok = 0;
goto End;
if (copy_length >= 3 && copy_length <= kMaxDictionaryWordLength) {
int offset = kBrotliDictionaryOffsetsByLength[copy_length];
int word_id = distance - max_distance - 1;
int shift = kBrotliDictionarySizeBitsByLength[copy_length];
int mask = (1 << shift) - 1;
int word_idx = word_id & mask;
int transform_idx = word_id >> shift;
offset += word_idx * copy_length;
if (transform_idx < kNumTransforms) {
const uint8_t* word = &kBrotliDictionary[offset];
int len = TransformDictionaryWord(
copy_dst, word, copy_length, transform_idx);
copy_dst += len;
pos += len;
meta_block_remaining_len -= len;
if (copy_dst >= ringbuffer_end) {
if (BrotliWrite(output, ringbuffer,
(size_t)ringbuffer_size) < 0) {
ok = 0;
goto End;
}
memcpy(ringbuffer, ringbuffer_end,
(size_t)(copy_dst - ringbuffer_end));
}
} else {
printf("Invalid backward reference. pos: %d distance: %d "
"len: %d bytes left: %d\n", pos, distance, copy_length,
meta_block_remaining_len);
ok = 0;
goto End;
}
} else {
printf("Invalid backward reference. pos: %d distance: %d "
"len: %d bytes left: %d\n", pos, distance, copy_length,
meta_block_remaining_len);
ok = 0;
goto End;
}
} else {
if (copy_length > meta_block_remaining_len) {
printf("Invalid backward reference. pos: %d distance: %d "

4984
dec/dictionary.h Normal file

File diff suppressed because it is too large Load Diff

202
dec/transform.h Normal file
View File

@ -0,0 +1,202 @@
/* Copyright 2013 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Transformations on dictionary words.
*/
#ifndef BROTLI_DEC_TRANSFORM_H_
#define BROTLI_DEC_TRANSFORM_H_
#include <stdio.h>
#include <ctype.h>
#include "./types.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
/* The edit applied to the dictionary word itself, between the prefix and the
   suffix of a Transform. The numeric values matter: TransformDictionaryWord
   subtracts the value directly from the word length for every kind up to and
   including kOmit9, so kOmitN must stay equal to N and kIdentity must stay 0. */
enum WordTransformType {
/* Copy the word unchanged. */
kIdentity = 0,
/* kOmitN: copy the word without its last N bytes. */
kOmit1 = 1,
kOmit2 = 2,
kOmit3 = 3,
kOmit4 = 4,
kOmit5 = 5,
kOmit6 = 6,
kOmit7 = 7,
kOmit8 = 8,
kOmit9 = 9,
/* Run ToUpperCase once, on the first character of the copied word. */
kUppercaseFirst = 10,
/* Run ToUpperCase repeatedly until the whole copied word has been covered. */
kUppercaseAll = 11
};
/* One dictionary-word transform: the output is prefix, then the (possibly
   shortened or uppercased) word, then suffix. */
typedef struct {
/* NUL-terminated bytes emitted before the word; may be empty. */
const char* prefix;
/* How the word itself is modified. */
enum WordTransformType transform;
/* NUL-terminated bytes emitted after the word; may be empty. */
const char* suffix;
} Transform;
/* Table of all supported transforms, as { prefix, word edit, suffix }.
   TransformDictionaryWord indexes this array directly with the transform id it
   is given, so the position of an entry is its id: do not reorder entries.
   NOTE(review): enc/transform.h carries its own copy of this table; the two
   presumably have to stay entry-for-entry identical - confirm when editing. */
static const Transform kTransforms[] = {
{ "", kIdentity, "" },
{ "", kIdentity, " ", },
{ "", kIdentity, "\">" },
{ "", kUppercaseFirst, "" },
{ "", kIdentity, "\"" },
{ "", kIdentity, ".", },
{ "", kIdentity, "=\"" },
{ "", kUppercaseFirst, " ", },
{ " ", kIdentity, "=\"" },
{ " ", kIdentity, " ", },
{ "", kIdentity, ":", },
{ " ", kIdentity, "" },
{ "", kIdentity, "\n" },
{ "", kIdentity, "(", },
{ "", kUppercaseAll, "" },
{ ".", kIdentity, "(", },
{ "", kIdentity, "'" },
{ "", kUppercaseFirst, "\"" },
{ " ", kUppercaseFirst, " ", },
{ "", kOmit3, "" },
{ "", kOmit4, "" },
{ ".", kIdentity, "" },
{ "", kOmit1, "" },
{ "", kOmit2, "" },
{ "", kUppercaseFirst, "\">" },
{ "", kOmit5, "" },
{ "", kUppercaseAll, " ", },
{ " ", kUppercaseFirst, "" },
{ "", kIdentity, ", ", },
{ "", kUppercaseFirst, "(", },
{ "", kIdentity, "\n\t" },
{ "", kUppercaseFirst, "'" },
{ ".", kIdentity, " ", },
{ " ", kUppercaseAll, " ", },
{ "", kIdentity, "='" },
{ "", kUppercaseFirst, ".", },
{ " ", kIdentity, ".", },
{ " ", kIdentity, ", ", },
{ " ", kUppercaseAll, "" },
{ "", kOmit6, "" },
{ "", kOmit9, "" },
{ "", kUppercaseAll, "\"" },
{ "", kIdentity, " the " },
{ "", kIdentity, " in " },
{ "", kIdentity, " of " },
{ "", kIdentity, " to " },
{ "", kIdentity, " and " },
{ "", kIdentity, " is " },
{ "", kIdentity, " on " },
{ "", kIdentity, " by " },
{ "", kIdentity, " for " },
{ "", kIdentity, " with " },
{ "", kIdentity, " from " },
{ "", kIdentity, " as " },
{ "", kIdentity, " at " },
{ "", kIdentity, "er " },
{ " ", kIdentity, "='" },
{ "", kIdentity, " a " },
{ "", kOmit7, "" },
{ "", kOmit8, "" },
{ " ", kIdentity, "(", },
{ " ", kIdentity, ". ", },
{ "", kIdentity, ". ", },
{ "", kIdentity, ",", },
{ "", kOmit1, "ing " },
{ "", kIdentity, "ed " },
{ "", kUppercaseFirst, ", ", },
{ "", kUppercaseAll, ".", },
{ "", kUppercaseAll, "=\"" },
{ "", kUppercaseAll, ", ", },
{ "", kUppercaseAll, "\">" },
{ " ", kUppercaseFirst, ".", },
{ " ", kUppercaseAll, "=\"" },
{ " ", kUppercaseFirst, ", ", },
{ "", kUppercaseAll, "'" },
{ "", kUppercaseFirst, "=\"" },
{ " ", kIdentity, ",", },
{ "", kIdentity, " that " },
{ "", kUppercaseFirst, "='" },
{ "", kUppercaseFirst, ". ", },
{ "", kUppercaseFirst, ",", },
{ "", kIdentity, ". The " },
{ "\xc2\xa0", kIdentity, "" },
{ " ", kUppercaseFirst, ". ", },
{ "", kUppercaseAll, ",", },
{ "", kUppercaseAll, "(", },
{ " ", kUppercaseAll, "='" },
{ "", kIdentity, "]" },
{ "", kUppercaseAll, "='" },
{ " ", kUppercaseAll, ".", },
{ "", kUppercaseAll, ". ", },
{ " ", kUppercaseFirst, "=\"" },
{ " ", kUppercaseAll, ". ", },
{ " ", kUppercaseFirst, ",", },
{ " ", kUppercaseAll, ", ", },
{ "", kIdentity, "ize " },
{ " ", kUppercaseFirst, "='" },
{ "", kIdentity, "est " },
{ "", kIdentity, ". This " },
};
/* Number of entries in kTransforms; valid transform ids are
   0 .. kNumTransforms - 1. */
static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
/* Applies the simplified "uppercase" rule to the single character that starts
   at p[0], in place, and returns how many bytes that character occupies so the
   caller can step to the next one.

   p    points at the first byte of the character.
   len  is the number of bytes of the word that remain starting at p; bytes at
        p[len] and beyond are never touched.

   The character width is taken from the lead byte only (< 0xc0: one byte,
   < 0xe0: two bytes, otherwise three). When the word ends before the
   character is complete, nothing past the end is modified and only the bytes
   that are actually available are reported as consumed. */
static int ToUpperCase(uint8_t *p, int len) {
  if (len == 1 || p[0] < 0xc0) {
    /* Single byte: only ASCII lowercase letters change. */
    if (p[0] >= 'a' && p[0] <= 'z') {
      p[0] ^= 32;
    }
    return 1;
  }
  /* An overly simplified uppercasing model for utf-8. */
  if (p[0] < 0xe0) {
    p[1] ^= 32;
    return 2;
  }
  /* Three-byte lead but only two bytes left: p[2] is outside the word. */
  if (len == 2) {
    return 2;
  }
  /* An arbitrary transform for three byte characters. */
  p[2] ^= 5;
  return 3;
}
static BROTLI_INLINE int TransformDictionaryWord(
uint8_t* dst, const uint8_t* word, int len, int transform) {
const char* prefix = kTransforms[transform].prefix;
const char* suffix = kTransforms[transform].suffix;
const int t = kTransforms[transform].transform;
int idx = 0;
int i = 0;
uint8_t* uppercase;
while (*prefix) { dst[idx++] = (uint8_t)*prefix++; }
if (t <= kOmit9) {
len -= t;
}
while (i < len) { dst[idx++] = word[i++]; }
uppercase = &dst[idx - len];
if (t == kUppercaseFirst) {
ToUpperCase(uppercase, len);
} else if (t == kUppercaseAll) {
while (len > 0) {
int step = ToUpperCase(uppercase, len);
uppercase += step;
len -= step;
}
}
while (*suffix) { dst[idx++] = (uint8_t)*suffix++; }
return idx;
}
#if defined(__cplusplus) || defined(c_plusplus)
} /* extern "C" */
#endif
#endif /* BROTLI_DEC_TRANSFORM_H_ */

4973
enc/dictionary.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -24,6 +24,7 @@
#include "./block_splitter.h"
#include "./cluster.h"
#include "./context.h"
#include "./transform.h"
#include "./entropy_encode.h"
#include "./fast_log.h"
#include "./hash.h"
@ -858,6 +859,7 @@ BrotliCompressor::BrotliCompressor()
dist_ringbuffer_[2] = 11;
dist_ringbuffer_[3] = 4;
storage_[0] = 0;
StoreDictionaryWordHashes();
}
BrotliCompressor::~BrotliCompressor() {
@ -865,6 +867,22 @@ BrotliCompressor::~BrotliCompressor() {
delete[] storage_;
}
// Seeds the hasher with every (transform, dictionary word) combination whose
// transformed text is at least 3 bytes long.
//
// Each entry is stored under the negative value -((length << 20) + word_id + 1)
// where word_id = transform * words_of_that_length + index. All three loops
// run from the highest value down to the lowest.
void BrotliCompressor::StoreDictionaryWordHashes() {
  for (int transform = kNumTransforms; transform-- > 0;) {
    for (int length = kMaxDictionaryWordLength; length >= 3; --length) {
      const int words_of_length =
          1 << kBrotliDictionarySizeBitsByLength[length];
      for (int index = words_of_length; index-- > 0;) {
        const int word_id = transform * words_of_length + index;
        std::string transformed =
            GetTransformedDictionaryWord(length, word_id);
        if (transformed.size() < 3) {
          continue;
        }
        const uint8_t* bytes =
            reinterpret_cast<const uint8_t*>(&transformed[0]);
        const int packed = (length << 20) + word_id + 1;
        hasher_->Store(bytes, (-1) * packed);
      }
    }
  }
}
void BrotliCompressor::WriteStreamHeader() {
// Encode window size.
if (window_bits_ == 16) {

View File

@ -50,6 +50,9 @@ class BrotliCompressor {
private:
// Initializes the hasher with the hashes of dictionary words.
void StoreDictionaryWordHashes();
int window_bits_;
Hasher* hasher_;
int dist_ringbuffer_[4];

View File

@ -24,7 +24,9 @@
#include <sys/types.h>
#include <algorithm>
#include <cstdlib>
#include <string>
#include "./transform.h"
#include "./fast_log.h"
#include "./find_match_length.h"
#include "./port.h"
@ -276,7 +278,41 @@ class HashLongestMatch {
for (int i = num_[key] - 1; i >= down; --i) {
int prev_ix = bucket[i & kBlockMask];
if (prev_ix < 0) {
continue;
prev_ix *= -1;
prev_ix -= 1;
int copy_len_code = prev_ix >> 20;
int word_id = prev_ix & 0xfffff;
std::string word = GetTransformedDictionaryWord(copy_len_code, word_id);
int len = word.size();
const size_t backward = max_backward + word_id + 1;
bool word_matched = (len >= 3 && len <= max_length);
for (int k = 0; k < len && word_matched; ++k) {
if ((uint8_t)(word[k]) != data[cur_ix_masked + k]) {
word_matched = false;
}
}
if (word_matched) {
const double score = BackwardReferenceScore(average_cost_,
start_cost4,
start_cost3,
start_cost2,
len, backward,
last_distance1_,
last_distance2_,
last_distance3_,
last_distance4_);
if (best_score < score) {
best_score = score;
best_len = len;
best_ix = backward;
*best_len_out = best_len;
*best_len_code_out = copy_len_code;
*best_distance_out = best_ix;
*best_score_out = best_score;
match_found = true;
*in_dictionary = true;
}
}
} else {
const size_t backward = cur_ix - prev_ix;
if (PREDICT_FALSE(backward > max_backward)) {

204
enc/transform.h Normal file
View File

@ -0,0 +1,204 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Transformations on dictionary words.
#ifndef BROTLI_ENC_TRANSFORM_H_
#define BROTLI_ENC_TRANSFORM_H_
#include <string>
#include "./dictionary.h"
namespace brotli {
// The edit applied to the dictionary word itself, between the prefix and the
// suffix of a Transform. The numeric values matter: ApplyTransform subtracts
// the value directly from the word length for every kind up to and including
// kOmit9, so kOmitN must stay equal to N and kIdentity must stay 0.
enum WordTransformType {
// Copy the word unchanged.
kIdentity = 0,
// kOmitN: copy the word without its last N bytes.
kOmit1 = 1,
kOmit2 = 2,
kOmit3 = 3,
kOmit4 = 4,
kOmit5 = 5,
kOmit6 = 6,
kOmit7 = 7,
kOmit8 = 8,
kOmit9 = 9,
// Run ToUpperCase once, on the first character of the copied word.
kUppercaseFirst = 10,
// Run ToUpperCase repeatedly until the whole copied word has been covered.
kUppercaseAll = 11,
};
// One dictionary-word transform: the result is prefix, then the (possibly
// shortened or uppercased) word, then suffix.
struct Transform {
// NUL-terminated text placed before the word; may be empty.
const char* prefix;
// How the word itself is modified.
WordTransformType word_transform;
// NUL-terminated text placed after the word; may be empty.
const char* suffix;
};
// Table of all supported transforms, as { prefix, word edit, suffix }.
// GetTransformedDictionaryWord indexes this array with word_id / num_words,
// so the position of an entry is its transform id: do not reorder entries.
// NOTE(review): dec/transform.h carries its own copy of this table; the two
// presumably have to stay entry-for-entry identical - confirm when editing.
static const Transform kTransforms[] = {
{ "", kIdentity, "" },
{ "", kIdentity, " ", },
{ "", kIdentity, "\">" },
{ "", kUppercaseFirst, "" },
{ "", kIdentity, "\"" },
{ "", kIdentity, ".", },
{ "", kIdentity, "=\"" },
{ "", kUppercaseFirst, " ", },
{ " ", kIdentity, "=\"" },
{ " ", kIdentity, " ", },
{ "", kIdentity, ":", },
{ " ", kIdentity, "" },
{ "", kIdentity, "\n" },
{ "", kIdentity, "(", },
{ "", kUppercaseAll, "" },
{ ".", kIdentity, "(", },
{ "", kIdentity, "'" },
{ "", kUppercaseFirst, "\"" },
{ " ", kUppercaseFirst, " ", },
{ "", kOmit3, "" },
{ "", kOmit4, "" },
{ ".", kIdentity, "" },
{ "", kOmit1, "" },
{ "", kOmit2, "" },
{ "", kUppercaseFirst, "\">" },
{ "", kOmit5, "" },
{ "", kUppercaseAll, " ", },
{ " ", kUppercaseFirst, "" },
{ "", kIdentity, ", ", },
{ "", kUppercaseFirst, "(", },
{ "", kIdentity, "\n\t" },
{ "", kUppercaseFirst, "'" },
{ ".", kIdentity, " ", },
{ " ", kUppercaseAll, " ", },
{ "", kIdentity, "='" },
{ "", kUppercaseFirst, ".", },
{ " ", kIdentity, ".", },
{ " ", kIdentity, ", ", },
{ " ", kUppercaseAll, "" },
{ "", kOmit6, "" },
{ "", kOmit9, "" },
{ "", kUppercaseAll, "\"" },
{ "", kIdentity, " the " },
{ "", kIdentity, " in " },
{ "", kIdentity, " of " },
{ "", kIdentity, " to " },
{ "", kIdentity, " and " },
{ "", kIdentity, " is " },
{ "", kIdentity, " on " },
{ "", kIdentity, " by " },
{ "", kIdentity, " for " },
{ "", kIdentity, " with " },
{ "", kIdentity, " from " },
{ "", kIdentity, " as " },
{ "", kIdentity, " at " },
{ "", kIdentity, "er " },
{ " ", kIdentity, "='" },
{ "", kIdentity, " a " },
{ "", kOmit7, "" },
{ "", kOmit8, "" },
{ " ", kIdentity, "(", },
{ " ", kIdentity, ". ", },
{ "", kIdentity, ". ", },
{ "", kIdentity, ",", },
{ "", kOmit1, "ing " },
{ "", kIdentity, "ed " },
{ "", kUppercaseFirst, ", ", },
{ "", kUppercaseAll, ".", },
{ "", kUppercaseAll, "=\"" },
{ "", kUppercaseAll, ", ", },
{ "", kUppercaseAll, "\">" },
{ " ", kUppercaseFirst, ".", },
{ " ", kUppercaseAll, "=\"" },
{ " ", kUppercaseFirst, ", ", },
{ "", kUppercaseAll, "'" },
{ "", kUppercaseFirst, "=\"" },
{ " ", kIdentity, ",", },
{ "", kIdentity, " that " },
{ "", kUppercaseFirst, "='" },
{ "", kUppercaseFirst, ". ", },
{ "", kUppercaseFirst, ",", },
{ "", kIdentity, ". The " },
{ "\xc2\xa0", kIdentity, "" },
{ " ", kUppercaseFirst, ". ", },
{ "", kUppercaseAll, ",", },
{ "", kUppercaseAll, "(", },
{ " ", kUppercaseAll, "='" },
{ "", kIdentity, "]" },
{ "", kUppercaseAll, "='" },
{ " ", kUppercaseAll, ".", },
{ "", kUppercaseAll, ". ", },
{ " ", kUppercaseFirst, "=\"" },
{ " ", kUppercaseAll, ". ", },
{ " ", kUppercaseFirst, ",", },
{ " ", kUppercaseAll, ", ", },
{ "", kIdentity, "ize " },
{ " ", kUppercaseFirst, "='" },
{ "", kIdentity, "est " },
{ "", kIdentity, ". This " },
};
// Number of entries in kTransforms; valid transform ids are
// 0 .. kNumTransforms - 1.
static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
// Applies the simplified "uppercase" rule to the single character starting at
// p[0], in place, and returns the number of bytes consumed.
//
// The character width is taken from the lead byte only: below 0xc0 is one
// byte, below 0xe0 is two bytes, anything else is three. `len` is the number
// of bytes left in the word from p; when the word ends before the character
// is complete, nothing is modified and only the available bytes are consumed.
static int ToUpperCase(uint8_t *p, int len) {
  const uint8_t lead = p[0];

  if (lead < 0xc0 || len == 1) {
    // One byte: only ASCII lowercase letters are changed.
    const bool is_ascii_lower = ('a' <= lead) && (lead <= 'z');
    if (is_ascii_lower) {
      p[0] = static_cast<uint8_t>(lead ^ 32);
    }
    return 1;
  }

  if (lead >= 0xe0) {
    // Three-byte lead; flip bits in the third byte if it is inside the word.
    if (len == 2) {
      return 2;
    }
    p[2] ^= 5;
    return 3;
  }

  // Two-byte lead; flip bit 5 of the continuation byte.
  p[1] ^= 32;
  return 2;
}
inline std::string ApplyTransform(
const Transform& t, const uint8_t* word, int len) {
std::string ret(t.prefix);
if (t.word_transform <= kOmit9) {
len -= t.word_transform;
}
if (len > 0) {
ret += std::string(word, word + len);
uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]);
if (t.word_transform == kUppercaseFirst) {
ToUpperCase(uppercase, len);
} else if (t.word_transform == kUppercaseAll) {
while (len > 0) {
int step = ToUpperCase(uppercase, len);
uppercase += step;
len -= step;
}
}
}
ret += std::string(t.suffix);
return ret;
}
// Returns the transformed dictionary word identified by (len_code, word_id).
//
// len_code selects the group of words of that length and is also the raw word
// length in bytes. word_id packs two values: word_id % num_words is the word's
// index within the group and word_id / num_words is the index into
// kTransforms. Neither argument is range-checked here.
inline std::string GetTransformedDictionaryWord(int len_code, int word_id) {
  const int words_in_group = 1 << kBrotliDictionarySizeBitsByLength[len_code];
  const int transform_index = word_id / words_in_group;
  const int index_in_group = word_id % words_in_group;

  // Words of one length are stored back to back, len_code bytes each.
  int byte_offset = kBrotliDictionaryOffsetsByLength[len_code];
  byte_offset += len_code * index_in_group;

  const uint8_t* raw_word = &kBrotliDictionary[byte_offset];
  return ApplyTransform(kTransforms[transform_index], raw_word, len_code);
}
} // namespace brotli
#endif // BROTLI_ENC_TRANSFORM_H_