Move literal cost computation to where it's used.

Move utf8 heuristics functions to their own file.
This commit is contained in:
Zoltan Szabadka 2015-10-01 15:10:42 +02:00
parent dc416abcb7
commit 4c37566f4b
11 changed files with 155 additions and 199 deletions

View File

@ -2,7 +2,7 @@
include ../shared.mk
OBJS_NODICT = backward_references.o block_splitter.o brotli_bit_stream.o encode.o encode_parallel.o entropy_encode.o histogram.o literal_cost.o metablock.o static_dict.o streams.o
OBJS_NODICT = backward_references.o block_splitter.o brotli_bit_stream.o encode.o encode_parallel.o entropy_encode.o histogram.o literal_cost.o metablock.o static_dict.o streams.o utf8_util.o
OBJS = $(OBJS_NODICT) dictionary.o
nodict : $(OBJS_NODICT)

View File

@ -22,6 +22,7 @@
#include "./command.h"
#include "./fast_log.h"
#include "./literal_cost.h"
namespace brotli {
@ -78,19 +79,15 @@ class ZopfliCostModel {
void SetFromLiteralCosts(size_t num_bytes,
size_t position,
const float* literal_cost,
size_t literal_cost_mask) {
const uint8_t* ringbuffer,
size_t ringbuffer_mask) {
std::vector<float> literal_cost(num_bytes);
EstimateBitCostsForLiterals(position, num_bytes, ringbuffer_mask,
ringbuffer, &literal_cost[0]);
literal_costs_.resize(num_bytes + 1);
literal_costs_[0] = 0.0;
if (literal_cost) {
for (int i = 0; i < num_bytes; ++i) {
literal_costs_[i + 1] = literal_costs_[i] +
literal_cost[(position + i) & literal_cost_mask];
}
} else {
for (int i = 1; i <= num_bytes; ++i) {
literal_costs_[i] = i * 5.4;
}
for (int i = 0; i < num_bytes; ++i) {
literal_costs_[i + 1] = literal_costs_[i] + literal_cost[i];
}
cost_cmd_.resize(kNumCommandPrefixes);
cost_dist_.resize(kNumDistancePrefixes);
@ -623,8 +620,6 @@ void CreateBackwardReferences(size_t num_bytes,
size_t position,
const uint8_t* ringbuffer,
size_t ringbuffer_mask,
const float* literal_cost,
size_t literal_cost_mask,
const size_t max_backward_limit,
const int quality,
Hashers* hashers,
@ -688,7 +683,7 @@ void CreateBackwardReferences(size_t num_bytes,
ZopfliCostModel model;
if (i == 0) {
model.SetFromLiteralCosts(num_bytes, position,
literal_cost, literal_cost_mask);
ringbuffer, ringbuffer_mask);
} else {
model.SetFromCommands(num_bytes, position,
ringbuffer, ringbuffer_mask,

View File

@ -33,8 +33,6 @@ void CreateBackwardReferences(size_t num_bytes,
size_t position,
const uint8_t* ringbuffer,
size_t ringbuffer_mask,
const float* literal_cost,
size_t literal_cost_mask,
const size_t max_backward_limit,
const int quality,
Hashers* hashers,

View File

@ -31,79 +31,16 @@
#include "./fast_log.h"
#include "./hash.h"
#include "./histogram.h"
#include "./literal_cost.h"
#include "./prefix.h"
#include "./utf8_util.h"
#include "./write_bits.h"
namespace brotli {
static const double kMinUTF8Ratio = 0.75;
static const int kMinQualityForBlockSplit = 4;
static const int kMinQualityForContextModeling = 5;
static const int kMinQualityForOptimizeHistograms = 4;
// Decodes one UTF-8 sequence starting at input[0], looking at no more than
// |size| bytes. On success the code point is stored in *symbol and the number
// of bytes consumed (1 to 4) is returned. NUL bytes, overlong encodings,
// truncated sequences, bad continuation bytes and values above 0x10ffff are
// rejected: *symbol then receives 0x110000 | input[0], a value above the
// UTF-8 code space, and a single byte is consumed.
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  const int lead = input[0];

  // 1-byte sequence: ASCII, with NUL deliberately excluded.
  if (lead > 0 && lead < 0x80) {
    *symbol = lead;
    return 1;
  }

  int code = 0;
  int width = 0;  // Stays 0 until a well-formed multi-byte sequence is found.
  if ((lead & 0xe0) == 0xc0) {
    // 2-byte UTF8
    if (size > 1 && (input[1] & 0xc0) == 0x80) {
      code = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
      if (code > 0x7f) width = 2;
    }
  } else if ((lead & 0xf0) == 0xe0) {
    // 3-byte UTF8
    if (size > 2 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80) {
      code = ((lead & 0x0f) << 12) |
             ((input[1] & 0x3f) << 6) |
             (input[2] & 0x3f);
      if (code > 0x7ff) width = 3;
    }
  } else if ((lead & 0xf8) == 0xf0) {
    // 4-byte UTF8
    if (size > 3 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80 &&
        (input[3] & 0xc0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3f) << 12) |
             ((input[2] & 0x3f) << 6) |
             (input[3] & 0x3f);
      if (code > 0xffff && code <= 0x10ffff) width = 4;
    }
  }

  if (width == 0) {
    // Not UTF8, emit a special symbol above the UTF8-code space
    code = 0x110000 | lead;
    width = 1;
  }
  *symbol = code;
  return width;
}
// Returns true if more than min_fraction of the |length| bytes at |data|
// belong to UTF-8 sequences accepted by ParseAsUTF8. An empty input yields
// false because the comparison below is strict.
bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
  size_t size_utf8 = 0;
  size_t pos = 0;
  while (pos < length) {
    // ParseAsUTF8 takes an int size but never inspects more than 4 bytes, so
    // clamp the remaining length rather than narrowing a size_t that may not
    // fit in an int.
    const size_t remaining = length - pos;
    const int window = remaining < 4 ? static_cast<int>(remaining) : 4;
    int symbol;
    const int bytes_read = ParseAsUTF8(&symbol, data + pos, window);
    pos += bytes_read;
    // Symbols at or above 0x110000 mark bytes that did not parse as UTF-8.
    if (symbol < 0x110000) size_utf8 += bytes_read;
  }
  return size_utf8 > min_fraction * length;
}
void RecomputeDistancePrefixes(Command* cmds,
size_t num_commands,
int num_direct_distance_codes,
@ -136,7 +73,6 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
: params_(params),
hashers_(new Hashers()),
input_pos_(0),
literal_cost_(0),
num_commands_(0),
num_literals_(0),
last_insert_len_(0),
@ -173,10 +109,6 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
// smaller than ringbuffer size.
int ringbuffer_bits = std::max(params_.lgwin + 1, params_.lgblock + 1);
ringbuffer_ = new RingBuffer(ringbuffer_bits, params_.lgblock);
if (params_.quality > 9) {
literal_cost_mask_ = (1 << params_.lgblock) - 1;
literal_cost_ = new float[literal_cost_mask_ + 1];
}
// Allocate command buffer.
cmd_buffer_size_ = std::max(1 << 18, 1 << params_.lgblock);
@ -213,7 +145,6 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
BrotliCompressor::~BrotliCompressor() {
delete[] storage_;
delete[] literal_cost_;
delete[] commands_;
delete ringbuffer_;
delete hashers_;
@ -296,24 +227,7 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
return false;
}
bool utf8_mode =
params_.quality >= 9 &&
IsMostlyUTF8(&data[last_processed_pos_ & mask], bytes, kMinUTF8Ratio);
if (literal_cost_) {
if (utf8_mode) {
EstimateBitCostsForLiteralsUTF8(last_processed_pos_, bytes, mask,
literal_cost_mask_, data,
literal_cost_);
} else {
EstimateBitCostsForLiterals(last_processed_pos_, bytes, mask,
literal_cost_mask_,
data, literal_cost_);
}
}
CreateBackwardReferences(bytes, last_processed_pos_, data, mask,
literal_cost_,
literal_cost_mask_,
max_backward_distance_,
params_.quality,
hashers_,
@ -347,7 +261,7 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
last_insert_len_ = 0;
}
return WriteMetaBlockInternal(is_last, utf8_mode, out_size, output);
return WriteMetaBlockInternal(is_last, out_size, output);
}
// Decide about the context map based on the ability of the prediction
@ -449,7 +363,6 @@ void DecideOverLiteralContextModeling(const uint8_t* input,
}
bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
const bool utf8_mode,
size_t* out_size,
uint8_t** output) {
const size_t bytes = input_pos_ - last_flush_pos_;
@ -511,7 +424,7 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
}
} else {
MetaBlockSplit mb;
int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
int literal_context_mode = CONTEXT_UTF8;
if (params_.quality <= 9) {
int num_literal_contexts = 1;
const int* literal_context_map = NULL;
@ -534,6 +447,9 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
&mb);
}
} else {
if (!IsMostlyUTF8(data, last_flush_pos_, mask, bytes, kMinUTF8Ratio)) {
literal_context_mode = CONTEXT_SIGNED;
}
BuildMetaBlock(data, last_flush_pos_, mask,
prev_byte_, prev_byte2_,
commands_, num_commands_,

View File

@ -142,7 +142,6 @@ class BrotliCompressor {
uint8_t* GetBrotliStorage(size_t size);
bool WriteMetaBlockInternal(const bool is_last,
const bool utf8_mode,
size_t* out_size,
uint8_t** output);
@ -152,8 +151,6 @@ class BrotliCompressor {
int hash_type_;
size_t input_pos_;
RingBuffer* ringbuffer_;
float* literal_cost_;
size_t literal_cost_mask_;
size_t cmd_buffer_size_;
Command* commands_;
int num_commands_;

View File

@ -31,75 +31,14 @@
#include "./fast_log.h"
#include "./hash.h"
#include "./histogram.h"
#include "./literal_cost.h"
#include "./prefix.h"
#include "./utf8_util.h"
#include "./write_bits.h"
namespace brotli {
namespace {
// Decodes one UTF-8 sequence starting at input[0], looking at no more than
// |size| bytes. On success the code point is stored in *symbol and the number
// of bytes consumed (1 to 4) is returned. NUL bytes, overlong encodings,
// truncated sequences, bad continuation bytes and values above 0x10ffff are
// rejected: *symbol then receives 0x110000 | input[0], a value above the
// UTF-8 code space, and a single byte is consumed.
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  const int lead = input[0];

  // 1-byte sequence: ASCII, with NUL deliberately excluded.
  if (lead > 0 && lead < 0x80) {
    *symbol = lead;
    return 1;
  }

  int code = 0;
  int width = 0;  // Stays 0 until a well-formed multi-byte sequence is found.
  if ((lead & 0xe0) == 0xc0) {
    // 2-byte UTF8
    if (size > 1 && (input[1] & 0xc0) == 0x80) {
      code = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
      if (code > 0x7f) width = 2;
    }
  } else if ((lead & 0xf0) == 0xe0) {
    // 3-byte UTF8
    if (size > 2 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80) {
      code = ((lead & 0x0f) << 12) |
             ((input[1] & 0x3f) << 6) |
             (input[2] & 0x3f);
      if (code > 0x7ff) width = 3;
    }
  } else if ((lead & 0xf8) == 0xf0) {
    // 4-byte UTF8
    if (size > 3 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80 &&
        (input[3] & 0xc0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3f) << 12) |
             ((input[2] & 0x3f) << 6) |
             (input[3] & 0x3f);
      if (code > 0xffff && code <= 0x10ffff) width = 4;
    }
  }

  if (width == 0) {
    // Not UTF8, emit a special symbol above the UTF8-code space
    code = 0x110000 | lead;
    width = 1;
  }
  *symbol = code;
  return width;
}
// Returns true if more than min_fraction of the |length| bytes at |data|
// belong to UTF-8 sequences accepted by ParseAsUTF8. An empty input yields
// false because the comparison below is strict.
bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
  size_t size_utf8 = 0;
  for (size_t pos = 0; pos < length; ) {
    // ParseAsUTF8 takes an int size but never inspects more than 4 bytes, so
    // clamp the remaining length rather than narrowing a size_t that may not
    // fit in an int.
    const size_t remaining = length - pos;
    const int window = remaining < 4 ? static_cast<int>(remaining) : 4;
    int symbol;
    const int bytes_read = ParseAsUTF8(&symbol, data + pos, window);
    pos += bytes_read;
    // Symbols at or above 0x110000 mark bytes that did not parse as UTF-8.
    if (symbol < 0x110000) size_utf8 += bytes_read;
  }
  return size_utf8 > min_fraction * length;
}
void RecomputeDistancePrefixes(std::vector<Command>* cmds,
int num_direct_distance_codes,
int distance_postfix_bits) {
@ -151,19 +90,8 @@ bool WriteMetaBlockParallel(const BrotliParams& params,
// Decide about UTF8 mode.
static const double kMinUTF8Ratio = 0.75;
bool utf8_mode = IsMostlyUTF8(&input[input_pos], input_size, kMinUTF8Ratio);
// Compute literal costs. The 4 bytes at the end are there to cover for an
// over-read past the end of input, but not past the mask, in
// CreateBackwardReferences.
std::vector<float> literal_cost(prefix_size + input_size + 4);
if (utf8_mode) {
EstimateBitCostsForLiteralsUTF8(input_pos, input_size, mask, mask,
&input[0], &literal_cost[0]);
} else {
EstimateBitCostsForLiterals(input_pos, input_size, mask, mask,
&input[0], &literal_cost[0]);
}
bool utf8_mode = IsMostlyUTF8(&input[0], input_pos, mask, input_size,
kMinUTF8Ratio);
// Initialize hashers.
int hash_type = std::min(9, params.quality);
@ -180,7 +108,6 @@ bool WriteMetaBlockParallel(const BrotliParams& params,
CreateBackwardReferences(
input_size, input_pos,
&input[0], mask,
&literal_cost[0], mask,
max_backward_distance,
params.quality,
hashers,

View File

@ -21,6 +21,7 @@
#include "./fast_log.h"
#include "./types.h"
#include "./utf8_util.h"
namespace brotli {
@ -61,8 +62,7 @@ static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
}
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
size_t cost_mask, const uint8_t *data,
float *cost) {
const uint8_t *data, float *cost) {
// max_utf8 is 0 (normal ascii single byte modeling),
// 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
@ -126,13 +126,16 @@ void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
if (i < 2000) {
lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35);
}
cost[(pos + i) & cost_mask] = lit_cost;
cost[i] = lit_cost;
}
}
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
size_t cost_mask, const uint8_t *data,
float *cost) {
const uint8_t *data, float *cost) {
if (IsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost);
return;
}
int histogram[256] = { 0 };
int window_half = 2000;
int in_window = std::min(static_cast<size_t>(window_half), len);
@ -164,7 +167,7 @@ void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
lit_cost *= 0.5;
lit_cost += 0.5;
}
cost[(pos + i) & cost_mask] = lit_cost;
cost[i] = lit_cost;
}
}

View File

@ -23,14 +23,9 @@ namespace brotli {
// Estimates how many bits the literals in the interval [pos, pos + len) in the
// ringbuffer (data, mask) will take entropy coded and writes these estimates
// to the ringbuffer (cost, mask).
// to the cost[0..len) array.
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
size_t cost_mask, const uint8_t *data,
float *cost);
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
size_t cost_mask, const uint8_t *data,
float *cost);
const uint8_t *data, float *cost);
} // namespace brotli

90
enc/utf8_util.cc Normal file
View File

@ -0,0 +1,90 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Heuristics for deciding about the UTF8-ness of strings.
#include "./utf8_util.h"
#include "./types.h"
namespace brotli {
namespace {
// Decodes one UTF-8 sequence starting at input[0], looking at no more than
// |size| bytes. On success the code point is stored in *symbol and the number
// of bytes consumed (1 to 4) is returned. NUL bytes, overlong encodings,
// truncated sequences, bad continuation bytes and values above 0x10ffff are
// rejected: *symbol then receives 0x110000 | input[0], a value above the
// UTF-8 code space, and a single byte is consumed.
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  const int lead = input[0];

  // 1-byte sequence: ASCII, with NUL deliberately excluded.
  if (lead > 0 && lead < 0x80) {
    *symbol = lead;
    return 1;
  }

  int code = 0;
  int width = 0;  // Stays 0 until a well-formed multi-byte sequence is found.
  if ((lead & 0xe0) == 0xc0) {
    // 2-byte UTF8
    if (size > 1 && (input[1] & 0xc0) == 0x80) {
      code = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
      if (code > 0x7f) width = 2;
    }
  } else if ((lead & 0xf0) == 0xe0) {
    // 3-byte UTF8
    if (size > 2 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80) {
      code = ((lead & 0x0f) << 12) |
             ((input[1] & 0x3f) << 6) |
             (input[2] & 0x3f);
      if (code > 0x7ff) width = 3;
    }
  } else if ((lead & 0xf8) == 0xf0) {
    // 4-byte UTF8
    if (size > 3 &&
        (input[1] & 0xc0) == 0x80 &&
        (input[2] & 0xc0) == 0x80 &&
        (input[3] & 0xc0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3f) << 12) |
             ((input[2] & 0x3f) << 6) |
             (input[3] & 0x3f);
      if (code > 0xffff && code <= 0x10ffff) width = 4;
    }
  }

  if (width == 0) {
    // Not UTF8, emit a special symbol above the UTF8-code space
    code = 0x110000 | lead;
    width = 1;
  }
  *symbol = code;
  return width;
}
} // namespace
// Returns true if more than min_fraction of the |length| bytes starting at
// position |pos| of the (data, mask) ring buffer belong to UTF-8 sequences
// accepted by ParseAsUTF8. An empty range yields false because the comparison
// below is strict.
//
// NOTE(review): each sequence is read linearly from &data[(pos + i) & mask],
// so a multi-byte sequence that starts within the last three bytes before the
// wrap-around point is read past data[mask]. This presumably relies on the
// buffer having readable slack after that index -- confirm against callers.
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
                  const size_t length, const double min_fraction) {
  size_t size_utf8 = 0;
  size_t i = 0;
  while (i < length) {
    // ParseAsUTF8 takes an int size but never inspects more than 4 bytes, so
    // clamp the remaining length rather than narrowing a size_t that may not
    // fit in an int.
    const size_t remaining = length - i;
    const int window = remaining < 4 ? static_cast<int>(remaining) : 4;
    int symbol;
    const int bytes_read =
        ParseAsUTF8(&symbol, &data[(pos + i) & mask], window);
    i += bytes_read;
    // Symbols at or above 0x110000 mark bytes that did not parse as UTF-8.
    if (symbol < 0x110000) size_utf8 += bytes_read;
  }
  return size_utf8 > min_fraction * length;
}
} // namespace brotli

33
enc/utf8_util.h Normal file
View File

@ -0,0 +1,33 @@
#ifndef BROTLI_ENC_UTF8_UTIL_H_
#define BROTLI_ENC_UTF8_UTIL_H_
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Heuristics for deciding about the UTF8-ness of strings.
#include "./types.h"
namespace brotli {
static const double kMinUTF8Ratio = 0.75;
// Returns true if more than min_fraction of the bytes between pos and
// pos + length in the (data, mask) ringbuffer are UTF8-encoded.
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
const size_t length, const double min_fraction);
} // namespace brotli
#endif // BROTLI_ENC_UTF8_UTIL_H_

View File

@ -144,6 +144,7 @@ brotli = Extension("brotli",
"enc/metablock.cc",
"enc/static_dict.cc",
"enc/streams.cc",
"enc/utf8_util.cc",
"dec/bit_reader.c",
"dec/decode.c",
"dec/dictionary.c",
@ -177,6 +178,7 @@ brotli = Extension("brotli",
"enc/streams.h",
"enc/transform.h",
"enc/types.h",
"enc/utf8_util.h",
"enc/write_bits.h",
"dec/bit_reader.h",
"dec/context.h",