diff --git a/c/enc/block_splitter_inc.h b/c/enc/block_splitter_inc.h index 7e96204..64409ec 100644 --- a/c/enc/block_splitter_inc.h +++ b/c/enc/block_splitter_inc.h @@ -118,7 +118,8 @@ static size_t FN(FindBlocks)(const DataType* data, const size_t length, size_t insert_cost_ix = symbol * num_histograms; double min_cost = 1e99; double block_switch_cost = block_switch_bitcost; - static const double threshold = 0.07 / 2000.0; + static const size_t prologue_length = 2000; + static const double multiplier = 0.07 / 2000; size_t k; for (k = 0; k < num_histograms; ++k) { /* We are coding the symbol with entropy code k. */ @@ -129,8 +130,8 @@ static size_t FN(FindBlocks)(const DataType* data, const size_t length, } } /* More blocks for the beginning. */ - if (byte_ix < 2000) { - block_switch_cost *= 0.77 + threshold * (double)byte_ix; + if (byte_ix < prologue_length) { + block_switch_cost *= 0.77 + multiplier * (double)byte_ix; } for (k = 0; k < num_histograms; ++k) { cost[k] -= min_cost; diff --git a/c/enc/literal_cost.c b/c/enc/literal_cost.c index b063afc..a129657 100644 --- a/c/enc/literal_cost.c +++ b/c/enc/literal_cost.c @@ -106,7 +106,8 @@ static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, size_t utf8_pos = UTF8Position(last_c, c, max_utf8); size_t masked_pos = (pos + i) & mask; size_t histo = histogram[256 * utf8_pos + data[masked_pos]]; - static const double threshold = 0.35 / 2000.0; + static const size_t prologue_length = 2000; + static const double multiplier = 0.35 / 2000; double lit_cost; if (histo == 0) { histo = 1; @@ -121,8 +122,8 @@ static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, Perhaps because the entropy source is changing its properties rapidly in the beginning of the file, perhaps because the beginning of the data is a statistical "anomaly". */ - if (i < 2000) { - lit_cost += 0.7 - ((double)(2000 - i) * threshold); + if (i < prologue_length) { + lit_cost += 0.35 + multiplier * (double)i; } cost[i] = (float)lit_cost; }