Fix computing lower boundaries for smallest normalized double
This commit is contained in:
parent
bb728a572a
commit
2bc5585ff0
@ -364,6 +364,29 @@ class fp {
|
||||
static FMT_CONSTEXPR_DECL const uint64_t implicit_bit =
|
||||
1ull << double_significand_size;
|
||||
|
||||
// Assigns d to this and return true iff predecessor is closer than successor.
|
||||
template <typename Double> bool assign(Double d) {
|
||||
// Assume double is in the format [sign][exponent][significand].
|
||||
using limits = std::numeric_limits<Double>;
|
||||
const int exponent_size =
|
||||
bits<Double>::value - double_significand_size - 1; // -1 for sign
|
||||
const uint64_t significand_mask = implicit_bit - 1;
|
||||
const uint64_t exponent_mask = (~0ull >> 1) & ~significand_mask;
|
||||
const int exponent_bias = (1 << exponent_size) - limits::max_exponent - 1;
|
||||
auto u = bit_cast<uint64_t>(d);
|
||||
f = u & significand_mask;
|
||||
auto biased_e = (u & exponent_mask) >> double_significand_size;
|
||||
// Predecessor is closer if d is a normalized power of 2 (f == 0) other than
|
||||
// the smallest normalized number (biased_e > 1).
|
||||
bool is_predecessor_closer = f == 0 && biased_e > 1;
|
||||
if (biased_e != 0)
|
||||
f += implicit_bit;
|
||||
else
|
||||
biased_e = 1; // Subnormals use biased exponent 1 (min exponent).
|
||||
e = static_cast<int>(biased_e - exponent_bias - double_significand_size);
|
||||
return is_predecessor_closer;
|
||||
}
|
||||
|
||||
public:
|
||||
significand_type f;
|
||||
int e;
|
||||
@ -376,23 +399,7 @@ class fp {
|
||||
|
||||
// Constructs fp from an IEEE754 double. It is a template to prevent compile
|
||||
// errors on platforms where double is not IEEE754.
|
||||
template <typename Double> explicit fp(Double d) {
|
||||
// Assume double is in the format [sign][exponent][significand].
|
||||
using limits = std::numeric_limits<Double>;
|
||||
const int exponent_size =
|
||||
bits<Double>::value - double_significand_size - 1; // -1 for sign
|
||||
const uint64_t significand_mask = implicit_bit - 1;
|
||||
const uint64_t exponent_mask = (~0ull >> 1) & ~significand_mask;
|
||||
const int exponent_bias = (1 << exponent_size) - limits::max_exponent - 1;
|
||||
auto u = bit_cast<uint64_t>(d);
|
||||
auto biased_e = (u & exponent_mask) >> double_significand_size;
|
||||
f = u & significand_mask;
|
||||
if (biased_e != 0)
|
||||
f += implicit_bit;
|
||||
else
|
||||
biased_e = 1; // Subnormals use biased exponent 1 (min exponent).
|
||||
e = static_cast<int>(biased_e - exponent_bias - double_significand_size);
|
||||
}
|
||||
template <typename Double> explicit fp(Double d) { assign(d); }
|
||||
|
||||
// Normalizes the value converted from double and multiplied by (1 << SHIFT).
|
||||
template <int SHIFT> friend fp normalize(fp value) {
|
||||
@ -410,20 +417,23 @@ class fp {
|
||||
return value;
|
||||
}
|
||||
|
||||
// Computes lower and upper boundaries (m^- and m^+ in the Grisu paper), where
|
||||
// a boundary is a value half way between the number and its predecessor
|
||||
// Assigns d to this together with computing lower and upper boundaries,
|
||||
// where a boundary is a value half way between the number and its predecessor
|
||||
// (lower) or successor (upper). The upper boundary is normalized and lower
|
||||
// has the same exponent but may be not normalized.
|
||||
void compute_boundaries(fp& lower, fp& upper) const {
|
||||
lower =
|
||||
f == implicit_bit ? fp((f << 2) - 1, e - 2) : fp((f << 1) - 1, e - 1);
|
||||
template <typename Double>
|
||||
void assign_with_boundaries(Double d, fp& lower, fp& upper) {
|
||||
bool is_lower_closer = assign(d);
|
||||
lower = is_lower_closer ? fp((f << 2) - 1, e - 2) : fp((f << 1) - 1, e - 1);
|
||||
// 1 in normalize accounts for the exponent shift above.
|
||||
upper = normalize<1>(fp((f << 1) + 1, e - 1));
|
||||
lower.f <<= lower.e - upper.e;
|
||||
lower.e = upper.e;
|
||||
}
|
||||
|
||||
void compute_float_boundaries(fp& lower, fp& upper) const {
|
||||
template <typename Double>
|
||||
void assign_float_with_boundaries(Double d, fp& lower, fp& upper) {
|
||||
assign(d);
|
||||
constexpr int min_normal_e = std::numeric_limits<float>::min_exponent -
|
||||
std::numeric_limits<double>::digits;
|
||||
significand_type half_ulp = 1 << (std::numeric_limits<double>::digits -
|
||||
@ -436,6 +446,8 @@ class fp {
|
||||
}
|
||||
};
|
||||
|
||||
inline bool operator==(fp x, fp y) { return x.f == y.f && x.e == y.e; }
|
||||
|
||||
// Returns an fp number representing x - y. Result may not be normalized.
|
||||
inline fp operator-(fp x, fp y) {
|
||||
FMT_ASSERT(x.f >= y.f && x.e == y.e, "invalid operands");
|
||||
@ -948,30 +960,28 @@ template <int GRISU_VERSION> struct grisu_shortest_handler {
|
||||
|
||||
// Formats value using a variation of the Fixed-Precision Positive
|
||||
// Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
|
||||
// http://kurtstephens.com/files/p372-steele.pdf.
|
||||
template <typename Double>
|
||||
FMT_FUNC void fallback_format(Double value, buffer<char>& buf, int& exp10) {
|
||||
fp fp_value(value);
|
||||
// https://fmt.dev/p372-steele.pdf.
|
||||
FMT_FUNC void fallback_format(fp value, buffer<char>& buf, int& exp10) {
|
||||
// Shift to account for unequal gaps when lower boundary is 2 times closer.
|
||||
// TODO: handle denormals
|
||||
int shift = 0; // fp_value.f == 1 ? 1 : 0;
|
||||
bigint numerator; // 2 * R in (FPP)^2.
|
||||
bigint denominator; // 2 * S in (FPP)^2.
|
||||
bigint lower; // (M^- in (FPP)^2).
|
||||
bigint upper_store;
|
||||
int shift = 0; // fp_value.f == 1 ? 1 : 0;
|
||||
bigint numerator; // 2 * R in (FPP)^2.
|
||||
bigint denominator; // 2 * S in (FPP)^2.
|
||||
bigint lower; // (M^- in (FPP)^2).
|
||||
bigint upper_store; // upper's value if different from lower.
|
||||
bigint* upper = nullptr; // (M^+ in (FPP)^2).
|
||||
// Shift numerator and denominator by an extra bit to make lower and upper
|
||||
// which are normally half ulp integers. This eliminates multiplication by 2
|
||||
// during later computations.
|
||||
uint64_t significand = fp_value.f << (shift + 1);
|
||||
if (fp_value.e >= 0) {
|
||||
uint64_t significand = value.f << (shift + 1);
|
||||
if (value.e >= 0) {
|
||||
numerator.assign(significand);
|
||||
numerator <<= fp_value.e;
|
||||
numerator <<= value.e;
|
||||
lower.assign(1);
|
||||
lower <<= fp_value.e;
|
||||
lower <<= value.e;
|
||||
if (shift != 0) {
|
||||
upper_store.assign(1);
|
||||
upper_store <<= fp_value.e + shift;
|
||||
upper_store <<= value.e + shift;
|
||||
upper = &upper_store;
|
||||
}
|
||||
denominator.assign_pow10(exp10);
|
||||
@ -986,11 +996,11 @@ FMT_FUNC void fallback_format(Double value, buffer<char>& buf, int& exp10) {
|
||||
}
|
||||
numerator *= significand;
|
||||
denominator.assign(1);
|
||||
denominator <<= 1 - fp_value.e;
|
||||
denominator <<= 1 - value.e;
|
||||
} else {
|
||||
numerator.assign(significand);
|
||||
denominator.assign_pow10(exp10);
|
||||
denominator <<= 1 - fp_value.e;
|
||||
denominator <<= 1 - value.e;
|
||||
lower.assign(1);
|
||||
if (shift != 0) {
|
||||
upper_store.assign(1ull << shift);
|
||||
@ -999,7 +1009,7 @@ FMT_FUNC void fallback_format(Double value, buffer<char>& buf, int& exp10) {
|
||||
}
|
||||
if (!upper) upper = &lower;
|
||||
// Invariant: value == (numerator / denominator) * pow(10, exp10).
|
||||
bool even = (fp_value.f & 1) == 0;
|
||||
bool even = (value.f & 1) == 0;
|
||||
int num_digits = 0;
|
||||
char* data = buf.data();
|
||||
for (;;) {
|
||||
@ -1045,11 +1055,11 @@ bool grisu_format(Double value, buffer<char>& buf, int precision,
|
||||
return true;
|
||||
}
|
||||
|
||||
const fp fp_value(value);
|
||||
const int min_exp = -60; // alpha in Grisu.
|
||||
int cached_exp10 = 0; // K in Grisu.
|
||||
if (precision != -1) {
|
||||
if (precision > 17) return false;
|
||||
fp fp_value(value);
|
||||
fp normalized = normalize(fp_value);
|
||||
const auto cached_pow = get_cached_power(
|
||||
min_exp - (normalized.e + fp::significand_size), cached_exp10);
|
||||
@ -1059,11 +1069,12 @@ bool grisu_format(Double value, buffer<char>& buf, int precision,
|
||||
return false;
|
||||
buf.resize(to_unsigned(handler.size));
|
||||
} else {
|
||||
fp fp_value;
|
||||
fp lower, upper; // w^- and w^+ in the Grisu paper.
|
||||
if ((options & grisu_options::binary32) != 0)
|
||||
fp_value.compute_float_boundaries(lower, upper);
|
||||
fp_value.assign_float_with_boundaries(value, lower, upper);
|
||||
else
|
||||
fp_value.compute_boundaries(lower, upper);
|
||||
fp_value.assign_with_boundaries(value, lower, upper);
|
||||
|
||||
// Find a cached power of 10 such that multiplying upper by it will bring
|
||||
// the exponent in the range [min_exp, -32].
|
||||
@ -1092,7 +1103,7 @@ bool grisu_format(Double value, buffer<char>& buf, int precision,
|
||||
size = handler.size;
|
||||
if (result == digits::error) {
|
||||
exp = exp + size - cached_exp10 - 1;
|
||||
fallback_format(value, buf, exp);
|
||||
fallback_format(fp_value, buf, exp);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -180,18 +180,40 @@ TEST(BigIntTest, DivModAssign) {
|
||||
EXPECT_EQ("2a", fmt::format("{}", n2));
|
||||
}
|
||||
|
||||
template <bool is_iec559> void test_construct_from_double() {
|
||||
template <bool is_iec559> void run_double_tests() {
|
||||
fmt::print("warning: double is not IEC559, skipping FP tests\n");
|
||||
}
|
||||
|
||||
template <> void test_construct_from_double<true>() {
|
||||
auto v = fp(1.23);
|
||||
EXPECT_EQ(v.f, 0x13ae147ae147aeu);
|
||||
EXPECT_EQ(v.e, -52);
|
||||
template <> void run_double_tests<true>() {
|
||||
// Construct from double.
|
||||
EXPECT_EQ(fp(1.23), fp(0x13ae147ae147aeu, -52));
|
||||
|
||||
// Compute boundaries:
|
||||
fp value, lower, upper;
|
||||
// Normalized & not power of 2 - equidistant boundaries:
|
||||
value.assign_with_boundaries(1.23, lower, upper);
|
||||
EXPECT_EQ(value, fp(0x0013ae147ae147ae, -52));
|
||||
EXPECT_EQ(lower, fp(0x9d70a3d70a3d6c00, -63));
|
||||
EXPECT_EQ(upper, fp(0x9d70a3d70a3d7400, -63));
|
||||
// Normalized power of 2 - lower boundary is closer:
|
||||
value.assign_with_boundaries(1.9807040628566084e+28, lower, upper); // 2**94
|
||||
EXPECT_EQ(value, fp(0x0010000000000000, 42));
|
||||
EXPECT_EQ(lower, fp(0x7ffffffffffffe00, 31));
|
||||
EXPECT_EQ(upper, fp(0x8000000000000400, 31));
|
||||
// Smallest normalized double - equidistant boundaries:
|
||||
value.assign_with_boundaries(2.2250738585072014e-308, lower, upper);
|
||||
EXPECT_EQ(value, fp(0x0010000000000000, -1074));
|
||||
EXPECT_EQ(lower, fp(0x7ffffffffffffc00, -1085));
|
||||
EXPECT_EQ(upper, fp(0x8000000000000400, -1085));
|
||||
// Subnormal - equidistant boundaries:
|
||||
value.assign_with_boundaries(4.9406564584124654e-324, lower, upper);
|
||||
EXPECT_EQ(value, fp(0x0000000000000001, -1074));
|
||||
EXPECT_EQ(lower, fp(0x4000000000000000, -1137));
|
||||
EXPECT_EQ(upper, fp(0xc000000000000000, -1137));
|
||||
}
|
||||
|
||||
TEST(FPTest, ConstructFromDouble) {
|
||||
test_construct_from_double<std::numeric_limits<double>::is_iec559>();
|
||||
TEST(FPTest, DoubleTests) {
|
||||
run_double_tests<std::numeric_limits<double>::is_iec559>();
|
||||
}
|
||||
|
||||
TEST(FPTest, Normalize) {
|
||||
@ -201,26 +223,6 @@ TEST(FPTest, Normalize) {
|
||||
EXPECT_EQ(-6, normalized.e);
|
||||
}
|
||||
|
||||
TEST(FPTest, ComputeBoundariesSubnormal) {
|
||||
auto v = fp(0xbeef, 42);
|
||||
fp lower, upper;
|
||||
v.compute_boundaries(lower, upper);
|
||||
EXPECT_EQ(0xbeee800000000000, lower.f);
|
||||
EXPECT_EQ(-6, lower.e);
|
||||
EXPECT_EQ(0xbeef800000000000, upper.f);
|
||||
EXPECT_EQ(-6, upper.e);
|
||||
}
|
||||
|
||||
TEST(FPTest, ComputeBoundaries) {
|
||||
auto v = fp(0x10000000000000, 42);
|
||||
fp lower, upper;
|
||||
v.compute_boundaries(lower, upper);
|
||||
EXPECT_EQ(0x7ffffffffffffe00, lower.f);
|
||||
EXPECT_EQ(31, lower.e);
|
||||
EXPECT_EQ(0x8000000000000400, upper.f);
|
||||
EXPECT_EQ(31, upper.e);
|
||||
}
|
||||
|
||||
TEST(FPTest, ComputeFloatBoundaries) {
|
||||
struct {
|
||||
double x, lower, upper;
|
||||
@ -237,13 +239,12 @@ TEST(FPTest, ComputeFloatBoundaries) {
|
||||
{1e-45f, 7.006492321624085e-46, 2.1019476964872256e-45},
|
||||
};
|
||||
for (auto test : tests) {
|
||||
auto v = fp(test.x);
|
||||
fp vlower = normalize(fp(test.lower));
|
||||
fp vupper = normalize(fp(test.upper));
|
||||
vlower.f >>= vupper.e - vlower.e;
|
||||
vlower.e = vupper.e;
|
||||
fp lower, upper;
|
||||
v.compute_float_boundaries(lower, upper);
|
||||
fp value, lower, upper;
|
||||
value.assign_float_with_boundaries(test.x, lower, upper);
|
||||
EXPECT_EQ(vlower.f, lower.f);
|
||||
EXPECT_EQ(vlower.e, lower.e);
|
||||
EXPECT_EQ(vupper.f, upper.f);
|
||||
|
Loading…
Reference in New Issue
Block a user