brotli/enc/transform.h
Zoltan Szabadka 66098830a2 Use a static hash table to look up dictionary words and transforms.
This is used for quality 11, for qualities <= 9 we already
have a simpler hash table.

The static data size is 252 kB, and this removes the
need to initialize a huge hash map at startup, which was
the reason why transforms had to be disabled by default.
In comparison, the static dictionary itself is 120 kB.
This supports every transform, except the kOmitFirstN.
2015-06-12 16:45:17 +02:00

251 lines
9.4 KiB
C++

// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Transformations on dictionary words.
#ifndef BROTLI_ENC_TRANSFORM_H_
#define BROTLI_ENC_TRANSFORM_H_
#include <string>
#include "./dictionary.h"
namespace brotli {
// Selects how the dictionary word itself is altered by a Transform, before
// the Transform's prefix and suffix are attached (see ApplyTransform).
//
// The numeric values are load-bearing, not arbitrary:
//   * kIdentity..kOmitLast9 (0..9) are subtracted directly from the word
//     length, i.e. the value is the number of trailing bytes dropped.
//   * For kOmitFirst1..kOmitFirst9 the number of leading bytes skipped is
//     computed as value - (kOmitFirst1 - 1), so these must stay contiguous
//     and in ascending order.
enum WordTransformType {
kIdentity = 0,  // Word is copied unchanged.
kOmitLast1 = 1,  // kOmitLastN: the last N bytes of the word are dropped.
kOmitLast2 = 2,
kOmitLast3 = 3,
kOmitLast4 = 4,
kOmitLast5 = 5,
kOmitLast6 = 6,
kOmitLast7 = 7,
kOmitLast8 = 8,
kOmitLast9 = 9,
kUppercaseFirst = 10,  // ToUpperCase is applied once, at the start of the word.
kUppercaseAll = 11,  // ToUpperCase is applied repeatedly over the whole word.
kOmitFirst1 = 12,  // kOmitFirstN: the first N bytes of the word are skipped.
kOmitFirst2 = 13,
kOmitFirst3 = 14,
kOmitFirst4 = 15,
kOmitFirst5 = 16,
kOmitFirst6 = 17,
kOmitFirst7 = 18,
kOmitFirst8 = 19,
kOmitFirst9 = 20,
};
// One complete transform: ApplyTransform produces
//   prefix + (word altered according to word_transform) + suffix.
// Member order matters: kTransforms is filled in with positional aggregate
// initializers of the form { prefix, word_transform, suffix }.
struct Transform {
// NUL-terminated text placed before the word.
const char* prefix;
// How the word itself is altered.
WordTransformType word_transform;
// NUL-terminated text placed after the word.
const char* suffix;
};
// Table of every available transform. The array index is the transform id:
// GetTransformedDictionaryWord selects an entry with word_id / num_words, and
// kOmitFirstNTransforms / kOmitLastNTransforms store indices into this table.
// The order of the entries is therefore significant; do not sort or insert.
// NOTE(review): presumably the order must also match the table used by the
// decoder -- confirm before changing anything here.
// Trailing "// N" comments mark the index of every tenth entry.
static const Transform kTransforms[] = {
{ "", kIdentity, "" },  // 0
{ "", kIdentity, " " },
{ " ", kIdentity, " " },
{ "", kOmitFirst1, "" },
{ "", kUppercaseFirst, " " },
{ "", kIdentity, " the " },
{ " ", kIdentity, "" },
{ "s ", kIdentity, " " },
{ "", kIdentity, " of " },
{ "", kUppercaseFirst, "" },
{ "", kIdentity, " and " },  // 10
{ "", kOmitFirst2, "" },
{ "", kOmitLast1, "" },
{ ", ", kIdentity, " " },
{ "", kIdentity, ", " },
{ " ", kUppercaseFirst, " " },
{ "", kIdentity, " in " },
{ "", kIdentity, " to " },
{ "e ", kIdentity, " " },
{ "", kIdentity, "\"" },
{ "", kIdentity, "." },  // 20
{ "", kIdentity, "\">" },
{ "", kIdentity, "\n" },
{ "", kOmitLast3, "" },
{ "", kIdentity, "]" },
{ "", kIdentity, " for " },
{ "", kOmitFirst3, "" },
{ "", kOmitLast2, "" },
{ "", kIdentity, " a " },
{ "", kIdentity, " that " },
{ " ", kUppercaseFirst, "" },  // 30
{ "", kIdentity, ". " },
{ ".", kIdentity, "" },
{ " ", kIdentity, ", " },
{ "", kOmitFirst4, "" },
{ "", kIdentity, " with " },
{ "", kIdentity, "'" },
{ "", kIdentity, " from " },
{ "", kIdentity, " by " },
{ "", kOmitFirst5, "" },
{ "", kOmitFirst6, "" },  // 40
{ " the ", kIdentity, "" },
{ "", kOmitLast4, "" },
{ "", kIdentity, ". The " },
{ "", kUppercaseAll, "" },
{ "", kIdentity, " on " },
{ "", kIdentity, " as " },
{ "", kIdentity, " is " },
{ "", kOmitLast7, "" },
{ "", kOmitLast1, "ing " },
{ "", kIdentity, "\n\t" },  // 50
{ "", kIdentity, ":" },
{ " ", kIdentity, ". " },
{ "", kIdentity, "ed " },
{ "", kOmitFirst9, "" },
{ "", kOmitFirst7, "" },
{ "", kOmitLast6, "" },
{ "", kIdentity, "(" },
{ "", kUppercaseFirst, ", " },
{ "", kOmitLast8, "" },
{ "", kIdentity, " at " },  // 60
{ "", kIdentity, "ly " },
{ " the ", kIdentity, " of " },
{ "", kOmitLast5, "" },
{ "", kOmitLast9, "" },
{ " ", kUppercaseFirst, ", " },
{ "", kUppercaseFirst, "\"" },
{ ".", kIdentity, "(" },
{ "", kUppercaseAll, " " },
{ "", kUppercaseFirst, "\">" },
{ "", kIdentity, "=\"" },  // 70
{ " ", kIdentity, "." },
{ ".com/", kIdentity, "" },
{ " the ", kIdentity, " of the " },
{ "", kUppercaseFirst, "'" },
{ "", kIdentity, ". This " },
{ "", kIdentity, "," },
{ ".", kIdentity, " " },
{ "", kUppercaseFirst, "(" },
{ "", kUppercaseFirst, "." },
{ "", kIdentity, " not " },  // 80
{ " ", kIdentity, "=\"" },
{ "", kIdentity, "er " },
{ " ", kUppercaseAll, " " },
{ "", kIdentity, "al " },
{ " ", kUppercaseAll, "" },
{ "", kIdentity, "='" },
{ "", kUppercaseAll, "\"" },
{ "", kUppercaseFirst, ". " },
{ " ", kIdentity, "(" },
{ "", kIdentity, "ful " },  // 90
{ " ", kUppercaseFirst, ". " },
{ "", kIdentity, "ive " },
{ "", kIdentity, "less " },
{ "", kUppercaseAll, "'" },
{ "", kIdentity, "est " },
{ " ", kUppercaseFirst, "." },
{ "", kUppercaseAll, "\">" },
{ " ", kIdentity, "='" },
{ "", kUppercaseFirst, "," },
{ "", kIdentity, "ize " },  // 100
{ "", kUppercaseAll, "." },
{ "\xc2\xa0", kIdentity, "" },
{ " ", kIdentity, "," },
{ "", kUppercaseFirst, "=\"" },
{ "", kUppercaseAll, "=\"" },
{ "", kIdentity, "ous " },
{ "", kUppercaseAll, ", " },
{ "", kUppercaseFirst, "='" },
{ " ", kUppercaseFirst, "," },
{ " ", kUppercaseAll, "=\"" },  // 110
{ " ", kUppercaseAll, ", " },
{ "", kUppercaseAll, "," },
{ "", kUppercaseAll, "(" },
{ "", kUppercaseAll, ". " },
{ " ", kUppercaseAll, "." },
{ "", kUppercaseAll, "='" },
{ " ", kUppercaseAll, ". " },
{ " ", kUppercaseFirst, "=\"" },
{ " ", kUppercaseAll, "='" },
{ " ", kUppercaseFirst, "='" },  // 120
};
// Number of entries in kTransforms, derived from the array size at compile
// time so it cannot drift from the table.
static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
// kOmitFirstNTransforms[n] is the index in kTransforms of the entry
// { "", kOmitFirst<n>, "" } (empty prefix and suffix), for n = 1..9.
// Slot 0 is the plain identity transform at index 0.
// kTransforms contains no kOmitFirst8 entry, so slot 8 holds 0.
// NOTE(review): presumably 0 acts as a "not available" placeholder there --
// confirm against the code that consumes this table.
static const int kOmitFirstNTransforms[10] = {
0, 3, 11, 26, 34, 39, 40, 55, 0, 54
};
// kOmitLastNTransforms[n] is the index in kTransforms of the entry
// { "", kOmitLast<n>, "" } (empty prefix and suffix), for n = 1..9.
// Slot 0 is the plain identity transform at index 0.
static const int kOmitLastNTransforms[10] = {
0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
};
// Upper-cases the single character that starts at p[0] and returns how many
// bytes the caller should advance by (1, 2 or 3). `len` is the number of bytes
// available starting at p; p[0] is always read, so len must be at least 1.
//
// Behaviour by leading byte:
//   * len == 1, or p[0] < 0xc0: an ASCII 'a'..'z' is turned into 'A'..'Z'
//     (XOR 32), every other byte is left alone; returns 1.
//   * 0xc0 <= p[0] < 0xe0: p[1] is XOR-ed with 32; returns 2.
//   * p[0] >= 0xe0: with exactly two bytes available nothing is changed and 2
//     is returned; otherwise p[2] is XOR-ed with 5 and 3 is returned.
//
// NOTE(review): the 0xc0 / 0xe0 thresholds look like UTF-8 two- and
// three-byte lead bytes; the XORs are a fixed byte-level rule, not a real
// Unicode case mapping -- confirm against the format specification.
static int ToUpperCase(uint8_t *p, int len) {
  const uint8_t lead = p[0];

  // Multi-byte path: taken only when more than the lead byte may be touched.
  if (len != 1 && lead >= 0xc0) {
    if (lead < 0xe0) {
      p[1] ^= 32;
      return 2;
    }
    if (len == 2) {
      // The third byte is not available; leave the data untouched.
      return 2;
    }
    p[2] ^= 5;
    return 3;
  }

  // Single-byte path: only ASCII lower-case letters are modified.
  const bool is_ascii_lower = (lead >= 'a') && (lead <= 'z');
  if (is_ascii_lower) {
    p[0] = static_cast<uint8_t>(lead ^ 32);
  }
  return 1;
}
inline std::string ApplyTransform(
const Transform& t, const uint8_t* word, int len) {
std::string ret(t.prefix);
if (t.word_transform <= kOmitLast9) {
len -= t.word_transform;
}
if (len > 0) {
if (t.word_transform >= kOmitFirst1) {
const int skip = t.word_transform - (kOmitFirst1 - 1);
if (len > skip) {
ret += std::string(word + skip, word + len);
}
} else {
ret += std::string(word, word + len);
uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]);
if (t.word_transform == kUppercaseFirst) {
ToUpperCase(uppercase, len);
} else if (t.word_transform == kUppercaseAll) {
while (len > 0) {
int step = ToUpperCase(uppercase, len);
uppercase += step;
len -= step;
}
}
}
}
ret += std::string(t.suffix);
return ret;
}
// Returns the dictionary word selected by (len_code, word_id) with its
// transform applied.
//
// word_id packs two values: with
//   num_words = 1 << kBrotliDictionarySizeBitsByLength[len_code]
// the transform index into kTransforms is word_id / num_words and the index
// of the word among those of length len_code is word_id % num_words. Words of
// one length are read from kBrotliDictionary as consecutive len_code-byte
// records starting at kBrotliDictionaryOffsetsByLength[len_code].
//
// No range checks are performed: len_code must be a valid index into the two
// per-length tables and the derived transform index must be below
// kNumTransforms.
// NOTE(review): the kBrotliDictionary* tables are not declared in this file;
// presumably they come from "./dictionary.h" -- confirm.
inline std::string GetTransformedDictionaryWord(int len_code, int word_id) {
  const int words_of_this_length =
      1 << kBrotliDictionarySizeBitsByLength[len_code];
  const int transform_index = word_id / words_of_this_length;
  const int index_in_bucket = word_id % words_of_this_length;

  const int bucket_start = kBrotliDictionaryOffsetsByLength[len_code];
  const int word_start = bucket_start + len_code * index_in_bucket;

  return ApplyTransform(kTransforms[transform_index],
                        &kBrotliDictionary[word_start],
                        len_code);
}
} // namespace brotli
#endif // BROTLI_ENC_TRANSFORM_H_