mirror of
https://github.com/google/brotli.git
synced 2024-12-29 11:11:09 +00:00
Move literal cost computation to where it's used.
Move utf8 heuristics functions to their own file.
This commit is contained in:
parent
dc416abcb7
commit
4c37566f4b
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
include ../shared.mk
|
include ../shared.mk
|
||||||
|
|
||||||
OBJS_NODICT = backward_references.o block_splitter.o brotli_bit_stream.o encode.o encode_parallel.o entropy_encode.o histogram.o literal_cost.o metablock.o static_dict.o streams.o
|
OBJS_NODICT = backward_references.o block_splitter.o brotli_bit_stream.o encode.o encode_parallel.o entropy_encode.o histogram.o literal_cost.o metablock.o static_dict.o streams.o utf8_util.o
|
||||||
OBJS = $(OBJS_NODICT) dictionary.o
|
OBJS = $(OBJS_NODICT) dictionary.o
|
||||||
|
|
||||||
nodict : $(OBJS_NODICT)
|
nodict : $(OBJS_NODICT)
|
||||||
|
@ -22,6 +22,7 @@
|
|||||||
|
|
||||||
#include "./command.h"
|
#include "./command.h"
|
||||||
#include "./fast_log.h"
|
#include "./fast_log.h"
|
||||||
|
#include "./literal_cost.h"
|
||||||
|
|
||||||
namespace brotli {
|
namespace brotli {
|
||||||
|
|
||||||
@ -78,19 +79,15 @@ class ZopfliCostModel {
|
|||||||
|
|
||||||
void SetFromLiteralCosts(size_t num_bytes,
|
void SetFromLiteralCosts(size_t num_bytes,
|
||||||
size_t position,
|
size_t position,
|
||||||
const float* literal_cost,
|
const uint8_t* ringbuffer,
|
||||||
size_t literal_cost_mask) {
|
size_t ringbuffer_mask) {
|
||||||
|
std::vector<float> literal_cost(num_bytes);
|
||||||
|
EstimateBitCostsForLiterals(position, num_bytes, ringbuffer_mask,
|
||||||
|
ringbuffer, &literal_cost[0]);
|
||||||
literal_costs_.resize(num_bytes + 1);
|
literal_costs_.resize(num_bytes + 1);
|
||||||
literal_costs_[0] = 0.0;
|
literal_costs_[0] = 0.0;
|
||||||
if (literal_cost) {
|
for (int i = 0; i < num_bytes; ++i) {
|
||||||
for (int i = 0; i < num_bytes; ++i) {
|
literal_costs_[i + 1] = literal_costs_[i] + literal_cost[i];
|
||||||
literal_costs_[i + 1] = literal_costs_[i] +
|
|
||||||
literal_cost[(position + i) & literal_cost_mask];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int i = 1; i <= num_bytes; ++i) {
|
|
||||||
literal_costs_[i] = i * 5.4;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
cost_cmd_.resize(kNumCommandPrefixes);
|
cost_cmd_.resize(kNumCommandPrefixes);
|
||||||
cost_dist_.resize(kNumDistancePrefixes);
|
cost_dist_.resize(kNumDistancePrefixes);
|
||||||
@ -623,8 +620,6 @@ void CreateBackwardReferences(size_t num_bytes,
|
|||||||
size_t position,
|
size_t position,
|
||||||
const uint8_t* ringbuffer,
|
const uint8_t* ringbuffer,
|
||||||
size_t ringbuffer_mask,
|
size_t ringbuffer_mask,
|
||||||
const float* literal_cost,
|
|
||||||
size_t literal_cost_mask,
|
|
||||||
const size_t max_backward_limit,
|
const size_t max_backward_limit,
|
||||||
const int quality,
|
const int quality,
|
||||||
Hashers* hashers,
|
Hashers* hashers,
|
||||||
@ -688,7 +683,7 @@ void CreateBackwardReferences(size_t num_bytes,
|
|||||||
ZopfliCostModel model;
|
ZopfliCostModel model;
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
model.SetFromLiteralCosts(num_bytes, position,
|
model.SetFromLiteralCosts(num_bytes, position,
|
||||||
literal_cost, literal_cost_mask);
|
ringbuffer, ringbuffer_mask);
|
||||||
} else {
|
} else {
|
||||||
model.SetFromCommands(num_bytes, position,
|
model.SetFromCommands(num_bytes, position,
|
||||||
ringbuffer, ringbuffer_mask,
|
ringbuffer, ringbuffer_mask,
|
||||||
|
@ -33,8 +33,6 @@ void CreateBackwardReferences(size_t num_bytes,
|
|||||||
size_t position,
|
size_t position,
|
||||||
const uint8_t* ringbuffer,
|
const uint8_t* ringbuffer,
|
||||||
size_t ringbuffer_mask,
|
size_t ringbuffer_mask,
|
||||||
const float* literal_cost,
|
|
||||||
size_t literal_cost_mask,
|
|
||||||
const size_t max_backward_limit,
|
const size_t max_backward_limit,
|
||||||
const int quality,
|
const int quality,
|
||||||
Hashers* hashers,
|
Hashers* hashers,
|
||||||
|
@ -31,79 +31,16 @@
|
|||||||
#include "./fast_log.h"
|
#include "./fast_log.h"
|
||||||
#include "./hash.h"
|
#include "./hash.h"
|
||||||
#include "./histogram.h"
|
#include "./histogram.h"
|
||||||
#include "./literal_cost.h"
|
|
||||||
#include "./prefix.h"
|
#include "./prefix.h"
|
||||||
|
#include "./utf8_util.h"
|
||||||
#include "./write_bits.h"
|
#include "./write_bits.h"
|
||||||
|
|
||||||
namespace brotli {
|
namespace brotli {
|
||||||
|
|
||||||
static const double kMinUTF8Ratio = 0.75;
|
|
||||||
static const int kMinQualityForBlockSplit = 4;
|
static const int kMinQualityForBlockSplit = 4;
|
||||||
static const int kMinQualityForContextModeling = 5;
|
static const int kMinQualityForContextModeling = 5;
|
||||||
static const int kMinQualityForOptimizeHistograms = 4;
|
static const int kMinQualityForOptimizeHistograms = 4;
|
||||||
|
|
||||||
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
|
|
||||||
// ASCII
|
|
||||||
if ((input[0] & 0x80) == 0) {
|
|
||||||
*symbol = input[0];
|
|
||||||
if (*symbol > 0) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 2-byte UTF8
|
|
||||||
if (size > 1 &&
|
|
||||||
(input[0] & 0xe0) == 0xc0 &&
|
|
||||||
(input[1] & 0xc0) == 0x80) {
|
|
||||||
*symbol = (((input[0] & 0x1f) << 6) |
|
|
||||||
(input[1] & 0x3f));
|
|
||||||
if (*symbol > 0x7f) {
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 3-byte UFT8
|
|
||||||
if (size > 2 &&
|
|
||||||
(input[0] & 0xf0) == 0xe0 &&
|
|
||||||
(input[1] & 0xc0) == 0x80 &&
|
|
||||||
(input[2] & 0xc0) == 0x80) {
|
|
||||||
*symbol = (((input[0] & 0x0f) << 12) |
|
|
||||||
((input[1] & 0x3f) << 6) |
|
|
||||||
(input[2] & 0x3f));
|
|
||||||
if (*symbol > 0x7ff) {
|
|
||||||
return 3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 4-byte UFT8
|
|
||||||
if (size > 3 &&
|
|
||||||
(input[0] & 0xf8) == 0xf0 &&
|
|
||||||
(input[1] & 0xc0) == 0x80 &&
|
|
||||||
(input[2] & 0xc0) == 0x80 &&
|
|
||||||
(input[3] & 0xc0) == 0x80) {
|
|
||||||
*symbol = (((input[0] & 0x07) << 18) |
|
|
||||||
((input[1] & 0x3f) << 12) |
|
|
||||||
((input[2] & 0x3f) << 6) |
|
|
||||||
(input[3] & 0x3f));
|
|
||||||
if (*symbol > 0xffff && *symbol <= 0x10ffff) {
|
|
||||||
return 4;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Not UTF8, emit a special symbol above the UTF8-code space
|
|
||||||
*symbol = 0x110000 | input[0];
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns true if at least min_fraction of the data is UTF8-encoded.
|
|
||||||
bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
|
|
||||||
size_t size_utf8 = 0;
|
|
||||||
size_t pos = 0;
|
|
||||||
while (pos < length) {
|
|
||||||
int symbol;
|
|
||||||
int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos);
|
|
||||||
pos += bytes_read;
|
|
||||||
if (symbol < 0x110000) size_utf8 += bytes_read;
|
|
||||||
}
|
|
||||||
return size_utf8 > min_fraction * length;
|
|
||||||
}
|
|
||||||
|
|
||||||
void RecomputeDistancePrefixes(Command* cmds,
|
void RecomputeDistancePrefixes(Command* cmds,
|
||||||
size_t num_commands,
|
size_t num_commands,
|
||||||
int num_direct_distance_codes,
|
int num_direct_distance_codes,
|
||||||
@ -136,7 +73,6 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
|
|||||||
: params_(params),
|
: params_(params),
|
||||||
hashers_(new Hashers()),
|
hashers_(new Hashers()),
|
||||||
input_pos_(0),
|
input_pos_(0),
|
||||||
literal_cost_(0),
|
|
||||||
num_commands_(0),
|
num_commands_(0),
|
||||||
num_literals_(0),
|
num_literals_(0),
|
||||||
last_insert_len_(0),
|
last_insert_len_(0),
|
||||||
@ -173,10 +109,6 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
|
|||||||
// smaller than ringbuffer size.
|
// smaller than ringbuffer size.
|
||||||
int ringbuffer_bits = std::max(params_.lgwin + 1, params_.lgblock + 1);
|
int ringbuffer_bits = std::max(params_.lgwin + 1, params_.lgblock + 1);
|
||||||
ringbuffer_ = new RingBuffer(ringbuffer_bits, params_.lgblock);
|
ringbuffer_ = new RingBuffer(ringbuffer_bits, params_.lgblock);
|
||||||
if (params_.quality > 9) {
|
|
||||||
literal_cost_mask_ = (1 << params_.lgblock) - 1;
|
|
||||||
literal_cost_ = new float[literal_cost_mask_ + 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Allocate command buffer.
|
// Allocate command buffer.
|
||||||
cmd_buffer_size_ = std::max(1 << 18, 1 << params_.lgblock);
|
cmd_buffer_size_ = std::max(1 << 18, 1 << params_.lgblock);
|
||||||
@ -213,7 +145,6 @@ BrotliCompressor::BrotliCompressor(BrotliParams params)
|
|||||||
|
|
||||||
BrotliCompressor::~BrotliCompressor() {
|
BrotliCompressor::~BrotliCompressor() {
|
||||||
delete[] storage_;
|
delete[] storage_;
|
||||||
delete[] literal_cost_;
|
|
||||||
delete[] commands_;
|
delete[] commands_;
|
||||||
delete ringbuffer_;
|
delete ringbuffer_;
|
||||||
delete hashers_;
|
delete hashers_;
|
||||||
@ -296,24 +227,7 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool utf8_mode =
|
|
||||||
params_.quality >= 9 &&
|
|
||||||
IsMostlyUTF8(&data[last_processed_pos_ & mask], bytes, kMinUTF8Ratio);
|
|
||||||
|
|
||||||
if (literal_cost_) {
|
|
||||||
if (utf8_mode) {
|
|
||||||
EstimateBitCostsForLiteralsUTF8(last_processed_pos_, bytes, mask,
|
|
||||||
literal_cost_mask_, data,
|
|
||||||
literal_cost_);
|
|
||||||
} else {
|
|
||||||
EstimateBitCostsForLiterals(last_processed_pos_, bytes, mask,
|
|
||||||
literal_cost_mask_,
|
|
||||||
data, literal_cost_);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
CreateBackwardReferences(bytes, last_processed_pos_, data, mask,
|
CreateBackwardReferences(bytes, last_processed_pos_, data, mask,
|
||||||
literal_cost_,
|
|
||||||
literal_cost_mask_,
|
|
||||||
max_backward_distance_,
|
max_backward_distance_,
|
||||||
params_.quality,
|
params_.quality,
|
||||||
hashers_,
|
hashers_,
|
||||||
@ -347,7 +261,7 @@ bool BrotliCompressor::WriteBrotliData(const bool is_last,
|
|||||||
last_insert_len_ = 0;
|
last_insert_len_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return WriteMetaBlockInternal(is_last, utf8_mode, out_size, output);
|
return WriteMetaBlockInternal(is_last, out_size, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decide about the context map based on the ability of the prediction
|
// Decide about the context map based on the ability of the prediction
|
||||||
@ -449,7 +363,6 @@ void DecideOverLiteralContextModeling(const uint8_t* input,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
||||||
const bool utf8_mode,
|
|
||||||
size_t* out_size,
|
size_t* out_size,
|
||||||
uint8_t** output) {
|
uint8_t** output) {
|
||||||
const size_t bytes = input_pos_ - last_flush_pos_;
|
const size_t bytes = input_pos_ - last_flush_pos_;
|
||||||
@ -511,7 +424,7 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
MetaBlockSplit mb;
|
MetaBlockSplit mb;
|
||||||
int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
|
int literal_context_mode = CONTEXT_UTF8;
|
||||||
if (params_.quality <= 9) {
|
if (params_.quality <= 9) {
|
||||||
int num_literal_contexts = 1;
|
int num_literal_contexts = 1;
|
||||||
const int* literal_context_map = NULL;
|
const int* literal_context_map = NULL;
|
||||||
@ -534,6 +447,9 @@ bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
|||||||
&mb);
|
&mb);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if (!IsMostlyUTF8(data, last_flush_pos_, mask, bytes, kMinUTF8Ratio)) {
|
||||||
|
literal_context_mode = CONTEXT_SIGNED;
|
||||||
|
}
|
||||||
BuildMetaBlock(data, last_flush_pos_, mask,
|
BuildMetaBlock(data, last_flush_pos_, mask,
|
||||||
prev_byte_, prev_byte2_,
|
prev_byte_, prev_byte2_,
|
||||||
commands_, num_commands_,
|
commands_, num_commands_,
|
||||||
|
@ -142,7 +142,6 @@ class BrotliCompressor {
|
|||||||
uint8_t* GetBrotliStorage(size_t size);
|
uint8_t* GetBrotliStorage(size_t size);
|
||||||
|
|
||||||
bool WriteMetaBlockInternal(const bool is_last,
|
bool WriteMetaBlockInternal(const bool is_last,
|
||||||
const bool utf8_mode,
|
|
||||||
size_t* out_size,
|
size_t* out_size,
|
||||||
uint8_t** output);
|
uint8_t** output);
|
||||||
|
|
||||||
@ -152,8 +151,6 @@ class BrotliCompressor {
|
|||||||
int hash_type_;
|
int hash_type_;
|
||||||
size_t input_pos_;
|
size_t input_pos_;
|
||||||
RingBuffer* ringbuffer_;
|
RingBuffer* ringbuffer_;
|
||||||
float* literal_cost_;
|
|
||||||
size_t literal_cost_mask_;
|
|
||||||
size_t cmd_buffer_size_;
|
size_t cmd_buffer_size_;
|
||||||
Command* commands_;
|
Command* commands_;
|
||||||
int num_commands_;
|
int num_commands_;
|
||||||
|
@ -31,75 +31,14 @@
|
|||||||
#include "./fast_log.h"
|
#include "./fast_log.h"
|
||||||
#include "./hash.h"
|
#include "./hash.h"
|
||||||
#include "./histogram.h"
|
#include "./histogram.h"
|
||||||
#include "./literal_cost.h"
|
|
||||||
#include "./prefix.h"
|
#include "./prefix.h"
|
||||||
|
#include "./utf8_util.h"
|
||||||
#include "./write_bits.h"
|
#include "./write_bits.h"
|
||||||
|
|
||||||
namespace brotli {
|
namespace brotli {
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
|
|
||||||
// ASCII
|
|
||||||
if ((input[0] & 0x80) == 0) {
|
|
||||||
*symbol = input[0];
|
|
||||||
if (*symbol > 0) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 2-byte UTF8
|
|
||||||
if (size > 1 &&
|
|
||||||
(input[0] & 0xe0) == 0xc0 &&
|
|
||||||
(input[1] & 0xc0) == 0x80) {
|
|
||||||
*symbol = (((input[0] & 0x1f) << 6) |
|
|
||||||
(input[1] & 0x3f));
|
|
||||||
if (*symbol > 0x7f) {
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 3-byte UFT8
|
|
||||||
if (size > 2 &&
|
|
||||||
(input[0] & 0xf0) == 0xe0 &&
|
|
||||||
(input[1] & 0xc0) == 0x80 &&
|
|
||||||
(input[2] & 0xc0) == 0x80) {
|
|
||||||
*symbol = (((input[0] & 0x0f) << 12) |
|
|
||||||
((input[1] & 0x3f) << 6) |
|
|
||||||
(input[2] & 0x3f));
|
|
||||||
if (*symbol > 0x7ff) {
|
|
||||||
return 3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 4-byte UFT8
|
|
||||||
if (size > 3 &&
|
|
||||||
(input[0] & 0xf8) == 0xf0 &&
|
|
||||||
(input[1] & 0xc0) == 0x80 &&
|
|
||||||
(input[2] & 0xc0) == 0x80 &&
|
|
||||||
(input[3] & 0xc0) == 0x80) {
|
|
||||||
*symbol = (((input[0] & 0x07) << 18) |
|
|
||||||
((input[1] & 0x3f) << 12) |
|
|
||||||
((input[2] & 0x3f) << 6) |
|
|
||||||
(input[3] & 0x3f));
|
|
||||||
if (*symbol > 0xffff && *symbol <= 0x10ffff) {
|
|
||||||
return 4;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Not UTF8, emit a special symbol above the UTF8-code space
|
|
||||||
*symbol = 0x110000 | input[0];
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns true if at least min_fraction of the data is UTF8-encoded.
|
|
||||||
bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
|
|
||||||
size_t size_utf8 = 0;
|
|
||||||
for (size_t pos = 0; pos < length; ) {
|
|
||||||
int symbol;
|
|
||||||
int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos);
|
|
||||||
pos += bytes_read;
|
|
||||||
if (symbol < 0x110000) size_utf8 += bytes_read;
|
|
||||||
}
|
|
||||||
return size_utf8 > min_fraction * length;
|
|
||||||
}
|
|
||||||
|
|
||||||
void RecomputeDistancePrefixes(std::vector<Command>* cmds,
|
void RecomputeDistancePrefixes(std::vector<Command>* cmds,
|
||||||
int num_direct_distance_codes,
|
int num_direct_distance_codes,
|
||||||
int distance_postfix_bits) {
|
int distance_postfix_bits) {
|
||||||
@ -151,19 +90,8 @@ bool WriteMetaBlockParallel(const BrotliParams& params,
|
|||||||
|
|
||||||
// Decide about UTF8 mode.
|
// Decide about UTF8 mode.
|
||||||
static const double kMinUTF8Ratio = 0.75;
|
static const double kMinUTF8Ratio = 0.75;
|
||||||
bool utf8_mode = IsMostlyUTF8(&input[input_pos], input_size, kMinUTF8Ratio);
|
bool utf8_mode = IsMostlyUTF8(&input[0], input_pos, mask, input_size,
|
||||||
|
kMinUTF8Ratio);
|
||||||
// Compute literal costs. The 4 bytes at the end are there to cover for an
|
|
||||||
// over-read past the end of input, but not past the mask, in
|
|
||||||
// CreateBackwardReferences.
|
|
||||||
std::vector<float> literal_cost(prefix_size + input_size + 4);
|
|
||||||
if (utf8_mode) {
|
|
||||||
EstimateBitCostsForLiteralsUTF8(input_pos, input_size, mask, mask,
|
|
||||||
&input[0], &literal_cost[0]);
|
|
||||||
} else {
|
|
||||||
EstimateBitCostsForLiterals(input_pos, input_size, mask, mask,
|
|
||||||
&input[0], &literal_cost[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize hashers.
|
// Initialize hashers.
|
||||||
int hash_type = std::min(9, params.quality);
|
int hash_type = std::min(9, params.quality);
|
||||||
@ -180,7 +108,6 @@ bool WriteMetaBlockParallel(const BrotliParams& params,
|
|||||||
CreateBackwardReferences(
|
CreateBackwardReferences(
|
||||||
input_size, input_pos,
|
input_size, input_pos,
|
||||||
&input[0], mask,
|
&input[0], mask,
|
||||||
&literal_cost[0], mask,
|
|
||||||
max_backward_distance,
|
max_backward_distance,
|
||||||
params.quality,
|
params.quality,
|
||||||
hashers,
|
hashers,
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
|
|
||||||
#include "./fast_log.h"
|
#include "./fast_log.h"
|
||||||
#include "./types.h"
|
#include "./types.h"
|
||||||
|
#include "./utf8_util.h"
|
||||||
|
|
||||||
namespace brotli {
|
namespace brotli {
|
||||||
|
|
||||||
@ -61,8 +62,7 @@ static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
|
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
|
||||||
size_t cost_mask, const uint8_t *data,
|
const uint8_t *data, float *cost) {
|
||||||
float *cost) {
|
|
||||||
|
|
||||||
// max_utf8 is 0 (normal ascii single byte modeling),
|
// max_utf8 is 0 (normal ascii single byte modeling),
|
||||||
// 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
|
// 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
|
||||||
@ -126,13 +126,16 @@ void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
|
|||||||
if (i < 2000) {
|
if (i < 2000) {
|
||||||
lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35);
|
lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35);
|
||||||
}
|
}
|
||||||
cost[(pos + i) & cost_mask] = lit_cost;
|
cost[i] = lit_cost;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
|
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
|
||||||
size_t cost_mask, const uint8_t *data,
|
const uint8_t *data, float *cost) {
|
||||||
float *cost) {
|
if (IsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
|
||||||
|
EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost);
|
||||||
|
return;
|
||||||
|
}
|
||||||
int histogram[256] = { 0 };
|
int histogram[256] = { 0 };
|
||||||
int window_half = 2000;
|
int window_half = 2000;
|
||||||
int in_window = std::min(static_cast<size_t>(window_half), len);
|
int in_window = std::min(static_cast<size_t>(window_half), len);
|
||||||
@ -164,7 +167,7 @@ void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
|
|||||||
lit_cost *= 0.5;
|
lit_cost *= 0.5;
|
||||||
lit_cost += 0.5;
|
lit_cost += 0.5;
|
||||||
}
|
}
|
||||||
cost[(pos + i) & cost_mask] = lit_cost;
|
cost[i] = lit_cost;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,14 +23,9 @@ namespace brotli {
|
|||||||
|
|
||||||
// Estimates how many bits the literals in the interval [pos, pos + len) in the
|
// Estimates how many bits the literals in the interval [pos, pos + len) in the
|
||||||
// ringbuffer (data, mask) will take entropy coded and writes these estimates
|
// ringbuffer (data, mask) will take entropy coded and writes these estimates
|
||||||
// to the ringbuffer (cost, mask).
|
// to the cost[0..len) array.
|
||||||
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
|
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
|
||||||
size_t cost_mask, const uint8_t *data,
|
const uint8_t *data, float *cost);
|
||||||
float *cost);
|
|
||||||
|
|
||||||
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
|
|
||||||
size_t cost_mask, const uint8_t *data,
|
|
||||||
float *cost);
|
|
||||||
|
|
||||||
} // namespace brotli
|
} // namespace brotli
|
||||||
|
|
||||||
|
90
enc/utf8_util.cc
Normal file
90
enc/utf8_util.cc
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
//
|
||||||
|
// Heuristics for deciding about the UTF8-ness of strings.
|
||||||
|
|
||||||
|
#include "./utf8_util.h"
|
||||||
|
|
||||||
|
#include "./types.h"
|
||||||
|
|
||||||
|
namespace brotli {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Decodes one code point from the start of |input|, looking at no more than
// |size| bytes, and stores the result in |*symbol|.
//
// Returns the number of bytes consumed (1 to 4). A sequence is accepted only
// when its lead byte, its continuation bytes and its decoded value all match
// one of the 1-, 2-, 3- or 4-byte UTF8 forms: overlong encodings, values above
// 0x10ffff and the NUL byte are rejected. For a rejected byte, |*symbol| is
// set to 0x110000 | input[0], which lies above the UTF8 code space, and 1 is
// returned; callers can tell the two cases apart by testing
// *symbol < 0x110000.
//
// input[0] is read unconditionally, so |input| must point at one readable
// byte at minimum. Bytes input[1..3] are only read when |size| allows it.
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  // ASCII. A NUL byte is not accepted here: it fails every multi-byte test
  // below and ends up in the "not UTF8" case.
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte UTF8: 110xxxxx 10xxxxxx. Values up to 0x7f would be overlong.
  if (size > 1 &&
      (input[0] & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x1f) << 6) |
               (input[1] & 0x3f));
    if (*symbol > 0x7f) {
      return 2;
    }
  }
  // 3-byte UTF8: 1110xxxx 10xxxxxx 10xxxxxx. Values up to 0x7ff would be
  // overlong.
  if (size > 2 &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x0f) << 12) |
               ((input[1] & 0x3f) << 6) |
               (input[2] & 0x3f));
    if (*symbol > 0x7ff) {
      return 3;
    }
  }
  // 4-byte UTF8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. Values up to 0xffff
  // would be overlong; values above 0x10ffff are outside the code space.
  if (size > 3 &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x07) << 18) |
               ((input[1] & 0x3f) << 12) |
               ((input[2] & 0x3f) << 6) |
               (input[3] & 0x3f));
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
      return 4;
    }
  }
  // Not UTF8, emit a special symbol above the UTF8-code space. This also
  // overwrites any value a rejected multi-byte attempt left in *symbol.
  *symbol = 0x110000 | input[0];
  return 1;
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Returns true if at least min_fraction of the data is UTF8-encoded.
|
||||||
|
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
|
||||||
|
const size_t length, const double min_fraction) {
|
||||||
|
size_t size_utf8 = 0;
|
||||||
|
size_t i = 0;
|
||||||
|
while (i < length) {
|
||||||
|
int symbol;
|
||||||
|
int bytes_read = ParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
|
||||||
|
i += bytes_read;
|
||||||
|
if (symbol < 0x110000) size_utf8 += bytes_read;
|
||||||
|
}
|
||||||
|
return size_utf8 > min_fraction * length;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace brotli
|
33
enc/utf8_util.h
Normal file
33
enc/utf8_util.h
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#ifndef BROTLI_ENC_UTF8_UTIL_H_
|
||||||
|
#define BROTLI_ENC_UTF8_UTIL_H_
|
||||||
|
|
||||||
|
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
//
|
||||||
|
// Heuristics for deciding about the UTF8-ness of strings.
|
||||||
|
|
||||||
|
#include "./types.h"
|
||||||
|
|
||||||
|
namespace brotli {
|
||||||
|
|
||||||
|
static const double kMinUTF8Ratio = 0.75;
|
||||||
|
|
||||||
|
// Returns true if more than min_fraction of the bytes between pos and
|
||||||
|
// pos + length in the (data, mask) ringbuffer are UTF8-encoded.
|
||||||
|
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
|
||||||
|
const size_t length, const double min_fraction);
|
||||||
|
|
||||||
|
} // namespace brotli
|
||||||
|
|
||||||
|
#endif // BROTLI_ENC_UTF8_UTIL_H_
|
2
setup.py
2
setup.py
@ -144,6 +144,7 @@ brotli = Extension("brotli",
|
|||||||
"enc/metablock.cc",
|
"enc/metablock.cc",
|
||||||
"enc/static_dict.cc",
|
"enc/static_dict.cc",
|
||||||
"enc/streams.cc",
|
"enc/streams.cc",
|
||||||
|
"enc/utf8_util.cc",
|
||||||
"dec/bit_reader.c",
|
"dec/bit_reader.c",
|
||||||
"dec/decode.c",
|
"dec/decode.c",
|
||||||
"dec/dictionary.c",
|
"dec/dictionary.c",
|
||||||
@ -177,6 +178,7 @@ brotli = Extension("brotli",
|
|||||||
"enc/streams.h",
|
"enc/streams.h",
|
||||||
"enc/transform.h",
|
"enc/transform.h",
|
||||||
"enc/types.h",
|
"enc/types.h",
|
||||||
|
"enc/utf8_util.h",
|
||||||
"enc/write_bits.h",
|
"enc/write_bits.h",
|
||||||
"dec/bit_reader.h",
|
"dec/bit_reader.h",
|
||||||
"dec/context.h",
|
"dec/context.h",
|
||||||
|
Loading…
Reference in New Issue
Block a user