Move format_float to format.h for __float128

2025-01-10 14:30:09 +00:00 · 2022-05-29 09:55:42 -07:00 · 2022-05-29 09:55:42 -07:00 · c2fcdc54e2
commit c2fcdc54e2
parent 2b9037a190
2 changed files with 819 additions and 823 deletions
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@ -128,567 +128,16 @@ FMT_FUNC std::system_error vsystem_error(int error_code, string_view format_str,

 namespace detail {

-template <typename T = void> struct basic_impl_data {
-  // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
-  // These are generated by support/compute-powers.py.
-  static constexpr uint64_t pow10_significands[87] = {
-      0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
-      0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
-      0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
-      0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5,
-      0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57,
-      0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7,
-      0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e,
-      0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996,
-      0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126,
-      0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053,
-      0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f,
-      0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b,
-      0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06,
-      0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb,
-      0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000,
-      0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984,
-      0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068,
-      0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8,
-      0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758,
-      0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85,
-      0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d,
-      0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25,
-      0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2,
-      0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a,
-      0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410,
-      0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129,
-      0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85,
-      0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841,
-      0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b,
-  };
-
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wnarrowing"
-#endif
-  // Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
-  // to significands above.
-  static constexpr int16_t pow10_exponents[87] = {
-      -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
-      -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
-      -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
-      -343,  -316,  -289,  -263,  -236,  -210,  -183,  -157,  -130,  -103, -77,
-      -50,   -24,   3,     30,    56,    83,    109,   136,   162,   189,  216,
-      242,   269,   295,   322,   348,   375,   402,   428,   455,   481,  508,
-      534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
-      827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-#  pragma GCC diagnostic pop
-#endif
-
-  static constexpr uint64_t power_of_10_64[20] = {
-      1, FMT_POWERS_OF_10(1ULL), FMT_POWERS_OF_10(1000000000ULL),
-      10000000000000000000ULL};
-};
-
-// This is a struct rather than an alias to avoid shadowing warnings in gcc.
-struct impl_data : basic_impl_data<> {};
-
 #if __cplusplus < 201703L
-template <typename T>
-constexpr uint64_t basic_impl_data<T>::pow10_significands[];
-template <typename T> constexpr int16_t basic_impl_data<T>::pow10_exponents[];
-template <typename T> constexpr uint64_t basic_impl_data<T>::power_of_10_64[];
+template <typename T> constexpr uint64_t basic_data<T>::pow10_significands[];
+template <typename T> constexpr int16_t basic_data<T>::pow10_exponents[];
+template <typename T> constexpr uint64_t basic_data<T>::power_of_10_64[];
 #endif

-// Normalizes the value converted from double and multiplied by (1 << SHIFT).
-template <int SHIFT = 0, typename F>
-FMT_CONSTEXPR basic_fp<F> normalize(basic_fp<F> value) {
-  // Handle subnormals.
-  const auto implicit_bit = F(1) << num_significand_bits<double>();
-  const auto shifted_implicit_bit = implicit_bit << SHIFT;
-  while ((value.f & shifted_implicit_bit) == 0) {
-    value.f <<= 1;
-    --value.e;
-  }
-  // Subtract 1 to account for hidden bit.
-  const auto offset = basic_fp<F>::num_significand_bits -
-                      num_significand_bits<double>() - SHIFT - 1;
-  value.f <<= offset;
-  value.e -= offset;
-  return value;
-}
-
 template <typename F> inline bool operator==(basic_fp<F> x, basic_fp<F> y) {
  return x.f == y.f && x.e == y.e;
 }

-// Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking.
-FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
-#if FMT_USE_INT128
-  auto product = static_cast<__uint128_t>(lhs) * rhs;
-  auto f = static_cast<uint64_t>(product >> 64);
-  return (static_cast<uint64_t>(product) & (1ULL << 63)) != 0 ? f + 1 : f;
-#else
-  // Multiply 32-bit parts of significands.
-  uint64_t mask = (1ULL << 32) - 1;
-  uint64_t a = lhs >> 32, b = lhs & mask;
-  uint64_t c = rhs >> 32, d = rhs & mask;
-  uint64_t ac = a * c, bc = b * c, ad = a * d, bd = b * d;
-  // Compute mid 64-bit of result and round.
-  uint64_t mid = (bd >> 32) + (ad & mask) + (bc & mask) + (1U << 31);
-  return ac + (ad >> 32) + (bc >> 32) + (mid >> 32);
-#endif
-}
-
-using fp = basic_fp<unsigned long long>;
-
-FMT_CONSTEXPR inline fp operator*(fp x, fp y) {
-  return {multiply(x.f, y.f), x.e + y.e + 64};
-}
-
-// Returns a cached power of 10 `c_k = c_k.f * pow(2, c_k.e)` such that its
-// (binary) exponent satisfies `min_exponent <= c_k.e <= min_exponent + 28`.
-FMT_CONSTEXPR inline fp get_cached_power(int min_exponent,
-                                         int& pow10_exponent) {
-  const int shift = 32;
-  // log10(2) = 0x0.4d104d427de7fbcc...
-  const int64_t significand = 0x4d104d427de7fbcc;
-  int index = static_cast<int>(
-      ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) +
-       ((int64_t(1) << shift) - 1))  // ceil
-      >> 32                          // arithmetic shift
-  );
-  // Decimal exponent of the first (smallest) cached power of 10.
-  const int first_dec_exp = -348;
-  // Difference between 2 consecutive decimal exponents in cached powers of 10.
-  const int dec_exp_step = 8;
-  index = (index - first_dec_exp - 1) / dec_exp_step + 1;
-  pow10_exponent = first_dec_exp + index * dec_exp_step;
-  return {impl_data::pow10_significands[index],
-          impl_data::pow10_exponents[index]};
-}
-
-class bigint {
- private:
-  // A bigint is stored as an array of bigits (big digits), with bigit at index
-  // 0 being the least significant one.
-  using bigit = uint32_t;
-  using double_bigit = uint64_t;
-  enum { bigits_capacity = 32 };
-  basic_memory_buffer<bigit, bigits_capacity> bigits_;
-  int exp_;
-
-  FMT_CONSTEXPR20 bigit operator[](int index) const {
-    return bigits_[to_unsigned(index)];
-  }
-  FMT_CONSTEXPR20 bigit& operator[](int index) {
-    return bigits_[to_unsigned(index)];
-  }
-
-  static FMT_CONSTEXPR_DECL const int bigit_bits = num_bits<bigit>();
-
-  friend struct formatter<bigint>;
-
-  FMT_CONSTEXPR20 void subtract_bigits(int index, bigit other, bigit& borrow) {
-    auto result = static_cast<double_bigit>((*this)[index]) - other - borrow;
-    (*this)[index] = static_cast<bigit>(result);
-    borrow = static_cast<bigit>(result >> (bigit_bits * 2 - 1));
-  }
-
-  FMT_CONSTEXPR20 void remove_leading_zeros() {
-    int num_bigits = static_cast<int>(bigits_.size()) - 1;
-    while (num_bigits > 0 && (*this)[num_bigits] == 0) --num_bigits;
-    bigits_.resize(to_unsigned(num_bigits + 1));
-  }
-
-  // Computes *this -= other assuming aligned bigints and *this >= other.
-  FMT_CONSTEXPR20 void subtract_aligned(const bigint& other) {
-    FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints");
-    FMT_ASSERT(compare(*this, other) >= 0, "");
-    bigit borrow = 0;
-    int i = other.exp_ - exp_;
-    for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j)
-      subtract_bigits(i, other.bigits_[j], borrow);
-    while (borrow > 0) subtract_bigits(i, 0, borrow);
-    remove_leading_zeros();
-  }
-
-  FMT_CONSTEXPR20 void multiply(uint32_t value) {
-    const double_bigit wide_value = value;
-    bigit carry = 0;
-    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
-      double_bigit result = bigits_[i] * wide_value + carry;
-      bigits_[i] = static_cast<bigit>(result);
-      carry = static_cast<bigit>(result >> bigit_bits);
-    }
-    if (carry != 0) bigits_.push_back(carry);
-  }
-
-  template <typename UInt, FMT_ENABLE_IF(std::is_same<UInt, uint64_t>::value ||
-                                         std::is_same<UInt, uint128_t>::value)>
-  FMT_CONSTEXPR20 void multiply(UInt value) {
-    using half_uint =
-        conditional_t<std::is_same<UInt, uint128_t>::value, uint64_t, uint32_t>;
-    const int shift = num_bits<half_uint>() - bigit_bits;
-    const UInt lower = static_cast<half_uint>(value);
-    const UInt upper = value >> num_bits<half_uint>();
-    UInt carry = 0;
-    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
-      UInt result = lower * bigits_[i] + static_cast<bigit>(carry);
-      carry = (upper * bigits_[i] << shift) + (result >> bigit_bits) +
-              (carry >> bigit_bits);
-      bigits_[i] = static_cast<bigit>(result);
-    }
-    while (carry != 0) {
-      bigits_.push_back(static_cast<bigit>(carry));
-      carry >>= bigit_bits;
-    }
-  }
-
-  template <typename UInt, FMT_ENABLE_IF(std::is_same<UInt, uint64_t>::value ||
-                                         std::is_same<UInt, uint128_t>::value)>
-  FMT_CONSTEXPR20 void assign(UInt n) {
-    size_t num_bigits = 0;
-    do {
-      bigits_[num_bigits++] = static_cast<bigit>(n);
-      n >>= bigit_bits;
-    } while (n != 0);
-    bigits_.resize(num_bigits);
-    exp_ = 0;
-  }
-
- public:
-  FMT_CONSTEXPR20 bigint() : exp_(0) {}
-  explicit bigint(uint64_t n) { assign(n); }
-
-  bigint(const bigint&) = delete;
-  void operator=(const bigint&) = delete;
-
-  FMT_CONSTEXPR20 void assign(const bigint& other) {
-    auto size = other.bigits_.size();
-    bigits_.resize(size);
-    auto data = other.bigits_.data();
-    std::copy(data, data + size, make_checked(bigits_.data(), size));
-    exp_ = other.exp_;
-  }
-
-  template <typename Int> FMT_CONSTEXPR20 void operator=(Int n) {
-    FMT_ASSERT(n > 0, "");
-    assign(uint64_or_128_t<Int>(n));
-  }
-
-  FMT_CONSTEXPR20 int num_bigits() const {
-    return static_cast<int>(bigits_.size()) + exp_;
-  }
-
-  FMT_NOINLINE FMT_CONSTEXPR20 bigint& operator<<=(int shift) {
-    FMT_ASSERT(shift >= 0, "");
-    exp_ += shift / bigit_bits;
-    shift %= bigit_bits;
-    if (shift == 0) return *this;
-    bigit carry = 0;
-    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
-      bigit c = bigits_[i] >> (bigit_bits - shift);
-      bigits_[i] = (bigits_[i] << shift) + carry;
-      carry = c;
-    }
-    if (carry != 0) bigits_.push_back(carry);
-    return *this;
-  }
-
-  template <typename Int> FMT_CONSTEXPR20 bigint& operator*=(Int value) {
-    FMT_ASSERT(value > 0, "");
-    multiply(uint32_or_64_or_128_t<Int>(value));
-    return *this;
-  }
-
-  friend FMT_CONSTEXPR20 int compare(const bigint& lhs, const bigint& rhs) {
-    int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
-    if (num_lhs_bigits != num_rhs_bigits)
-      return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
-    int i = static_cast<int>(lhs.bigits_.size()) - 1;
-    int j = static_cast<int>(rhs.bigits_.size()) - 1;
-    int end = i - j;
-    if (end < 0) end = 0;
-    for (; i >= end; --i, --j) {
-      bigit lhs_bigit = lhs[i], rhs_bigit = rhs[j];
-      if (lhs_bigit != rhs_bigit) return lhs_bigit > rhs_bigit ? 1 : -1;
-    }
-    if (i != j) return i > j ? 1 : -1;
-    return 0;
-  }
-
-  // Returns compare(lhs1 + lhs2, rhs).
-  friend FMT_CONSTEXPR20 int add_compare(const bigint& lhs1, const bigint& lhs2,
-                                         const bigint& rhs) {
-    int max_lhs_bigits = (std::max)(lhs1.num_bigits(), lhs2.num_bigits());
-    int num_rhs_bigits = rhs.num_bigits();
-    if (max_lhs_bigits + 1 < num_rhs_bigits) return -1;
-    if (max_lhs_bigits > num_rhs_bigits) return 1;
-    auto get_bigit = [](const bigint& n, int i) -> bigit {
-      return i >= n.exp_ && i < n.num_bigits() ? n[i - n.exp_] : 0;
-    };
-    double_bigit borrow = 0;
-    int min_exp = (std::min)((std::min)(lhs1.exp_, lhs2.exp_), rhs.exp_);
-    for (int i = num_rhs_bigits - 1; i >= min_exp; --i) {
-      double_bigit sum =
-          static_cast<double_bigit>(get_bigit(lhs1, i)) + get_bigit(lhs2, i);
-      bigit rhs_bigit = get_bigit(rhs, i);
-      if (sum > rhs_bigit + borrow) return 1;
-      borrow = rhs_bigit + borrow - sum;
-      if (borrow > 1) return -1;
-      borrow <<= bigit_bits;
-    }
-    return borrow != 0 ? -1 : 0;
-  }
-
-  // Assigns pow(10, exp) to this bigint.
-  FMT_CONSTEXPR20 void assign_pow10(int exp) {
-    FMT_ASSERT(exp >= 0, "");
-    if (exp == 0) return *this = 1;
-    // Find the top bit.
-    int bitmask = 1;
-    while (exp >= bitmask) bitmask <<= 1;
-    bitmask >>= 1;
-    // pow(10, exp) = pow(5, exp) * pow(2, exp). First compute pow(5, exp) by
-    // repeated squaring and multiplication.
-    *this = 5;
-    bitmask >>= 1;
-    while (bitmask != 0) {
-      square();
-      if ((exp & bitmask) != 0) *this *= 5;
-      bitmask >>= 1;
-    }
-    *this <<= exp;  // Multiply by pow(2, exp) by shifting.
-  }
-
-  FMT_CONSTEXPR20 void square() {
-    int num_bigits = static_cast<int>(bigits_.size());
-    int num_result_bigits = 2 * num_bigits;
-    basic_memory_buffer<bigit, bigits_capacity> n(std::move(bigits_));
-    bigits_.resize(to_unsigned(num_result_bigits));
-    auto sum = uint128_t();
-    for (int bigit_index = 0; bigit_index < num_bigits; ++bigit_index) {
-      // Compute bigit at position bigit_index of the result by adding
-      // cross-product terms n[i] * n[j] such that i + j == bigit_index.
-      for (int i = 0, j = bigit_index; j >= 0; ++i, --j) {
-        // Most terms are multiplied twice which can be optimized in the future.
-        sum += static_cast<double_bigit>(n[i]) * n[j];
-      }
-      (*this)[bigit_index] = static_cast<bigit>(sum);
-      sum >>= num_bits<bigit>();  // Compute the carry.
-    }
-    // Do the same for the top half.
-    for (int bigit_index = num_bigits; bigit_index < num_result_bigits;
-         ++bigit_index) {
-      for (int j = num_bigits - 1, i = bigit_index - j; i < num_bigits;)
-        sum += static_cast<double_bigit>(n[i++]) * n[j--];
-      (*this)[bigit_index] = static_cast<bigit>(sum);
-      sum >>= num_bits<bigit>();
-    }
-    remove_leading_zeros();
-    exp_ *= 2;
-  }
-
-  // If this bigint has a bigger exponent than other, adds trailing zero to make
-  // exponents equal. This simplifies some operations such as subtraction.
-  FMT_CONSTEXPR20 void align(const bigint& other) {
-    int exp_difference = exp_ - other.exp_;
-    if (exp_difference <= 0) return;
-    int num_bigits = static_cast<int>(bigits_.size());
-    bigits_.resize(to_unsigned(num_bigits + exp_difference));
-    for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
-      bigits_[j] = bigits_[i];
-    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0);
-    exp_ -= exp_difference;
-  }
-
-  // Divides this bignum by divisor, assigning the remainder to this and
-  // returning the quotient.
-  FMT_CONSTEXPR20 int divmod_assign(const bigint& divisor) {
-    FMT_ASSERT(this != &divisor, "");
-    if (compare(*this, divisor) < 0) return 0;
-    FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
-    align(divisor);
-    int quotient = 0;
-    do {
-      subtract_aligned(divisor);
-      ++quotient;
-    } while (compare(*this, divisor) >= 0);
-    return quotient;
-  }
-};
-
-enum class round_direction { unknown, up, down };
-
-// Given the divisor (normally a power of 10), the remainder = v % divisor for
-// some number v and the error, returns whether v should be rounded up, down, or
-// whether the rounding direction can't be determined due to error.
-// error should be less than divisor / 2.
-FMT_CONSTEXPR inline round_direction get_round_direction(uint64_t divisor,
-                                                         uint64_t remainder,
-                                                         uint64_t error) {
-  FMT_ASSERT(remainder < divisor, "");  // divisor - remainder won't overflow.
-  FMT_ASSERT(error < divisor, "");      // divisor - error won't overflow.
-  FMT_ASSERT(error < divisor - error, "");  // error * 2 won't overflow.
-  // Round down if (remainder + error) * 2 <= divisor.
-  if (remainder <= divisor - remainder && error * 2 <= divisor - remainder * 2)
-    return round_direction::down;
-  // Round up if (remainder - error) * 2 >= divisor.
-  if (remainder >= error &&
-      remainder - error >= divisor - (remainder - error)) {
-    return round_direction::up;
-  }
-  return round_direction::unknown;
-}
-
-namespace digits {
-enum result {
-  more,  // Generate more digits.
-  done,  // Done generating digits.
-  error  // Digit generation cancelled due to an error.
-};
-}
-
-struct gen_digits_handler {
-  char* buf;
-  int size;
-  int precision;
-  int exp10;
-  bool fixed;
-
-  FMT_CONSTEXPR digits::result on_digit(char digit, uint64_t divisor,
-                                        uint64_t remainder, uint64_t error,
-                                        bool integral) {
-    FMT_ASSERT(remainder < divisor, "");
-    buf[size++] = digit;
-    if (!integral && error >= remainder) return digits::error;
-    if (size < precision) return digits::more;
-    if (!integral) {
-      // Check if error * 2 < divisor with overflow prevention.
-      // The check is not needed for the integral part because error = 1
-      // and divisor > (1 << 32) there.
-      if (error >= divisor || error >= divisor - error) return digits::error;
-    } else {
-      FMT_ASSERT(error == 1 && divisor > 2, "");
-    }
-    auto dir = get_round_direction(divisor, remainder, error);
-    if (dir != round_direction::up)
-      return dir == round_direction::down ? digits::done : digits::error;
-    ++buf[size - 1];
-    for (int i = size - 1; i > 0 && buf[i] > '9'; --i) {
-      buf[i] = '0';
-      ++buf[i - 1];
-    }
-    if (buf[0] > '9') {
-      buf[0] = '1';
-      if (fixed)
-        buf[size++] = '0';
-      else
-        ++exp10;
-    }
-    return digits::done;
-  }
-};
-
-inline FMT_CONSTEXPR20 void adjust_precision(int& precision, int exp10) {
-  // Adjust fixed precision by exponent because it is relative to decimal
-  // point.
-  if (exp10 > 0 && precision > max_value<int>() - exp10)
-    FMT_THROW(format_error("number is too big"));
-  precision += exp10;
-}
-
-// Generates output using the Grisu digit-gen algorithm.
-// error: the size of the region (lower, upper) outside of which numbers
-// definitely do not round to value (Delta in Grisu3).
-FMT_INLINE FMT_CONSTEXPR20 digits::result grisu_gen_digits(
-    fp value, uint64_t error, int& exp, gen_digits_handler& handler) {
-  const fp one(1ULL << -value.e, value.e);
-  // The integral part of scaled value (p1 in Grisu) = value / one. It cannot be
-  // zero because it contains a product of two 64-bit numbers with MSB set (due
-  // to normalization) - 1, shifted right by at most 60 bits.
-  auto integral = static_cast<uint32_t>(value.f >> -one.e);
-  FMT_ASSERT(integral != 0, "");
-  FMT_ASSERT(integral == value.f >> -one.e, "");
-  // The fractional part of scaled value (p2 in Grisu) c = value % one.
-  uint64_t fractional = value.f & (one.f - 1);
-  exp = count_digits(integral);  // kappa in Grisu.
-  // Non-fixed formats require at least one digit and no precision adjustment.
-  if (handler.fixed) {
-    adjust_precision(handler.precision, exp + handler.exp10);
-    // Check if precision is satisfied just by leading zeros, e.g.
-    // format("{:.2f}", 0.001) gives "0.00" without generating any digits.
-    if (handler.precision <= 0) {
-      if (handler.precision < 0) return digits::done;
-      // Divide by 10 to prevent overflow.
-      uint64_t divisor = impl_data::power_of_10_64[exp - 1] << -one.e;
-      auto dir = get_round_direction(divisor, value.f / 10, error * 10);
-      if (dir == round_direction::unknown) return digits::error;
-      handler.buf[handler.size++] = dir == round_direction::up ? '1' : '0';
-      return digits::done;
-    }
-  }
-  // Generate digits for the integral part. This can produce up to 10 digits.
-  do {
-    uint32_t digit = 0;
-    auto divmod_integral = [&](uint32_t divisor) {
-      digit = integral / divisor;
-      integral %= divisor;
-    };
-    // This optimization by Milo Yip reduces the number of integer divisions by
-    // one per iteration.
-    switch (exp) {
-    case 10:
-      divmod_integral(1000000000);
-      break;
-    case 9:
-      divmod_integral(100000000);
-      break;
-    case 8:
-      divmod_integral(10000000);
-      break;
-    case 7:
-      divmod_integral(1000000);
-      break;
-    case 6:
-      divmod_integral(100000);
-      break;
-    case 5:
-      divmod_integral(10000);
-      break;
-    case 4:
-      divmod_integral(1000);
-      break;
-    case 3:
-      divmod_integral(100);
-      break;
-    case 2:
-      divmod_integral(10);
-      break;
-    case 1:
-      digit = integral;
-      integral = 0;
-      break;
-    default:
-      FMT_ASSERT(false, "invalid number of digits");
-    }
-    --exp;
-    auto remainder = (static_cast<uint64_t>(integral) << -one.e) + fractional;
-    auto result = handler.on_digit(static_cast<char>('0' + digit),
-                                   impl_data::power_of_10_64[exp] << -one.e,
-                                   remainder, error, true);
-    if (result != digits::more) return result;
-  } while (exp > 0);
-  // Generate digits for the fractional part.
-  for (;;) {
-    fractional *= 10;
-    error *= 10;
-    char digit = static_cast<char>('0' + (fractional >> -one.e));
-    fractional &= one.f - 1;
-    --exp;
-    auto result = handler.on_digit(digit, one.f, fractional, error, false);
-    if (result != digits::more) return result;
-  }
-}
-
 inline FMT_CONSTEXPR20 uint128_fallback& uint128_fallback::operator+=(
    uint64_t n) noexcept {
  if (is_constant_evaluated()) {
@ -1992,143 +1441,6 @@ small_divisor_case_label:
 }
 }  // namespace dragonbox

-// format_dragon flags.
-enum dragon {
-  predecessor_closer = 1,
-  fixup = 2,  // Run fixup to correct exp10 which can be off by one.
-  fixed = 4,
-};
-
-// Formats a floating-point number using a variation of the Fixed-Precision
-// Positive Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
-// https://fmt.dev/papers/p372-steele.pdf.
-FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
-                                          unsigned flags, int num_digits,
-                                          buffer<char>& buf, int& exp10) {
-  bigint numerator;    // 2 * R in (FPP)^2.
-  bigint denominator;  // 2 * S in (FPP)^2.
-  // lower and upper are differences between value and corresponding boundaries.
-  bigint lower;             // (M^- in (FPP)^2).
-  bigint upper_store;       // upper's value if different from lower.
-  bigint* upper = nullptr;  // (M^+ in (FPP)^2).
-  // Shift numerator and denominator by an extra bit or two (if lower boundary
-  // is closer) to make lower and upper integers. This eliminates multiplication
-  // by 2 during later computations.
-  bool is_predecessor_closer = (flags & dragon::predecessor_closer) != 0;
-  int shift = is_predecessor_closer ? 2 : 1;
-  if (value.e >= 0) {
-    numerator = value.f;
-    numerator <<= value.e + shift;
-    lower = 1;
-    lower <<= value.e;
-    if (is_predecessor_closer) {
-      upper_store = 1;
-      upper_store <<= value.e + 1;
-      upper = &upper_store;
-    }
-    denominator.assign_pow10(exp10);
-    denominator <<= shift;
-  } else if (exp10 < 0) {
-    numerator.assign_pow10(-exp10);
-    lower.assign(numerator);
-    if (is_predecessor_closer) {
-      upper_store.assign(numerator);
-      upper_store <<= 1;
-      upper = &upper_store;
-    }
-    numerator *= value.f;
-    numerator <<= shift;
-    denominator = 1;
-    denominator <<= shift - value.e;
-  } else {
-    numerator = value.f;
-    numerator <<= shift;
-    denominator.assign_pow10(exp10);
-    denominator <<= shift - value.e;
-    lower = 1;
-    if (is_predecessor_closer) {
-      upper_store = 1ULL << 1;
-      upper = &upper_store;
-    }
-  }
-  bool even = (value.f & 1) == 0;
-  if (!upper) upper = &lower;
-  if ((flags & dragon::fixup) != 0) {
-    if (add_compare(numerator, *upper, denominator) + even <= 0) {
-      --exp10;
-      numerator *= 10;
-      if (num_digits < 0) {
-        lower *= 10;
-        if (upper != &lower) *upper *= 10;
-      }
-    }
-    if ((flags & dragon::fixed) != 0) adjust_precision(num_digits, exp10 + 1);
-  }
-  // Invariant: value == (numerator / denominator) * pow(10, exp10).
-  if (num_digits < 0) {
-    // Generate the shortest representation.
-    num_digits = 0;
-    char* data = buf.data();
-    for (;;) {
-      int digit = numerator.divmod_assign(denominator);
-      bool low = compare(numerator, lower) - even < 0;  // numerator <[=] lower.
-      // numerator + upper >[=] pow10:
-      bool high = add_compare(numerator, *upper, denominator) + even > 0;
-      data[num_digits++] = static_cast<char>('0' + digit);
-      if (low || high) {
-        if (!low) {
-          ++data[num_digits - 1];
-        } else if (high) {
-          int result = add_compare(numerator, numerator, denominator);
-          // Round half to even.
-          if (result > 0 || (result == 0 && (digit % 2) != 0))
-            ++data[num_digits - 1];
-        }
-        buf.try_resize(to_unsigned(num_digits));
-        exp10 -= num_digits - 1;
-        return;
-      }
-      numerator *= 10;
-      lower *= 10;
-      if (upper != &lower) *upper *= 10;
-    }
-  }
-  // Generate the given number of digits.
-  exp10 -= num_digits - 1;
-  if (num_digits == 0) {
-    denominator *= 10;
-    auto digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
-    buf.push_back(digit);
-    return;
-  }
-  buf.try_resize(to_unsigned(num_digits));
-  for (int i = 0; i < num_digits - 1; ++i) {
-    int digit = numerator.divmod_assign(denominator);
-    buf[i] = static_cast<char>('0' + digit);
-    numerator *= 10;
-  }
-  int digit = numerator.divmod_assign(denominator);
-  auto result = add_compare(numerator, numerator, denominator);
-  if (result > 0 || (result == 0 && (digit % 2) != 0)) {
-    if (digit == 9) {
-      const auto overflow = '0' + 10;
-      buf[num_digits - 1] = overflow;
-      // Propagate the carry.
-      for (int i = num_digits - 1; i > 0 && buf[i] == overflow; --i) {
-        buf[i] = '0';
-        ++buf[i - 1];
-      }
-      if (buf[0] == overflow) {
-        buf[0] = '1';
-        ++exp10;
-      }
-      return;
-    }
-    ++digit;
-  }
-  buf[num_digits - 1] = static_cast<char>('0' + digit);
-}
-
 #ifdef _MSC_VER
 FMT_FUNC auto fmt_snprintf(char* buf, size_t size, const char* fmt, ...)
    -> int {
@ -2139,95 +1451,6 @@ FMT_FUNC auto fmt_snprintf(char* buf, size_t size, const char* fmt, ...)
  return result;
 }
 #endif
-
-template <typename Float>
-FMT_HEADER_ONLY_CONSTEXPR20 int format_float(Float value, int precision,
-                                             float_specs specs,
-                                             buffer<char>& buf) {
-  // float is passed as double to reduce the number of instantiations.
-  static_assert(!std::is_same<Float, float>::value, "");
-  FMT_ASSERT(value >= 0, "value is negative");
-  auto converted_value = convert_float(value);
-
-  const bool fixed = specs.format == float_format::fixed;
-  if (value <= 0) {  // <= instead of == to silence a warning.
-    if (precision <= 0 || !fixed) {
-      buf.push_back('0');
-      return 0;
-    }
-    buf.try_resize(to_unsigned(precision));
-    fill_n(buf.data(), precision, '0');
-    return -precision;
-  }
-
-  int exp = 0;
-  bool use_dragon = true;
-  unsigned dragon_flags = 0;
-  if (!is_fast_float<Float>()) {
-    const auto inv_log2_10 = 0.3010299956639812;  // 1 / log2(10)
-    using info = dragonbox::float_info<decltype(converted_value)>;
-    const auto f = basic_fp<typename info::carrier_uint>(converted_value);
-    // Compute exp, an approximate power of 10, such that
-    //   10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1).
-    // This is based on log10(value) == log2(value) / log2(10) and approximation
-    // of log2(value) by e + num_fraction_bits idea from double-conversion.
-    exp = static_cast<int>(
-        std::ceil((f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10));
-    dragon_flags = dragon::fixup;
-  } else if (!is_constant_evaluated() && precision < 0) {
-    // Use Dragonbox for the shortest format.
-    if (specs.binary32) {
-      auto dec = dragonbox::to_decimal(static_cast<float>(value));
-      write<char>(buffer_appender<char>(buf), dec.significand);
-      return dec.exponent;
-    }
-    auto dec = dragonbox::to_decimal(static_cast<double>(value));
-    write<char>(buffer_appender<char>(buf), dec.significand);
-    return dec.exponent;
-  } else {
-    // Use Grisu + Dragon4 for the given precision:
-    // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf.
-    const int min_exp = -60;  // alpha in Grisu.
-    int cached_exp10 = 0;     // K in Grisu.
-    fp normalized = normalize(fp(converted_value));
-    const auto cached_pow = get_cached_power(
-        min_exp - (normalized.e + fp::num_significand_bits), cached_exp10);
-    normalized = normalized * cached_pow;
-    gen_digits_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
-    if (grisu_gen_digits(normalized, 1, exp, handler) != digits::error &&
-        !is_constant_evaluated()) {
-      exp += handler.exp10;
-      buf.try_resize(to_unsigned(handler.size));
-      use_dragon = false;
-    } else {
-      exp += handler.size - cached_exp10 - 1;
-      precision = handler.precision;
-    }
-  }
-  if (use_dragon) {
-    auto f = basic_fp<uint128_t>();
-    bool is_predecessor_closer = specs.binary32
-                                     ? f.assign(static_cast<float>(value))
-                                     : f.assign(converted_value);
-    if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer;
-    if (fixed) dragon_flags |= dragon::fixed;
-    // Limit precision to the maximum possible number of significant digits in
-    // an IEEE754 double because we don't need to generate zeros.
-    const int max_double_digits = 767;
-    if (precision > max_double_digits) precision = max_double_digits;
-    format_dragon(f, dragon_flags, precision, buf, exp);
-  }
-  if (!fixed && !specs.showpoint) {
-    // Remove trailing zeros.
-    auto num_digits = buf.size();
-    while (num_digits > 0 && buf[num_digits - 1] == '0') {
-      --num_digits;
-      ++exp;
-    }
-    buf.try_resize(num_digits);
-  }
-  return exp;
-}
 }  // namespace detail

 template <> struct formatter<detail::bigint> {
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@ -1410,10 +1410,130 @@ template <typename F> struct basic_fp {
  }
 };

-template <typename T>
-FMT_HEADER_ONLY_CONSTEXPR20 auto format_float(T value, int precision,
-                                              float_specs specs,
-                                              buffer<char>& buf) -> int;
+using fp = basic_fp<unsigned long long>;
+
+// Normalizes the value converted from double and multiplied by (1 << SHIFT).
+template <int SHIFT = 0, typename F>
+FMT_CONSTEXPR basic_fp<F> normalize(basic_fp<F> value) {
+  // Handle subnormals.
+  const auto implicit_bit = F(1) << num_significand_bits<double>();
+  const auto shifted_implicit_bit = implicit_bit << SHIFT;
+  while ((value.f & shifted_implicit_bit) == 0) {
+    value.f <<= 1;
+    --value.e;
+  }
+  // Subtract 1 to account for hidden bit.
+  const auto offset = basic_fp<F>::num_significand_bits -
+                      num_significand_bits<double>() - SHIFT - 1;
+  value.f <<= offset;
+  value.e -= offset;
+  return value;
+}
+
+// Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking.
+FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
+#if FMT_USE_INT128
+  auto product = static_cast<__uint128_t>(lhs) * rhs;
+  auto f = static_cast<uint64_t>(product >> 64);
+  return (static_cast<uint64_t>(product) & (1ULL << 63)) != 0 ? f + 1 : f;
+#else
+  // Multiply 32-bit parts of significands.
+  uint64_t mask = (1ULL << 32) - 1;
+  uint64_t a = lhs >> 32, b = lhs & mask;
+  uint64_t c = rhs >> 32, d = rhs & mask;
+  uint64_t ac = a * c, bc = b * c, ad = a * d, bd = b * d;
+  // Compute mid 64-bit of result and round.
+  uint64_t mid = (bd >> 32) + (ad & mask) + (bc & mask) + (1U << 31);
+  return ac + (ad >> 32) + (bc >> 32) + (mid >> 32);
+#endif
+}
+
+FMT_CONSTEXPR inline fp operator*(fp x, fp y) {
+  return {multiply(x.f, y.f), x.e + y.e + 64};
+}
+
+template <typename T = void> struct basic_data {
+  // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
+  // These are generated by support/compute-powers.py.
+  static constexpr uint64_t pow10_significands[87] = {
+      0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
+      0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
+      0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
+      0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5,
+      0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57,
+      0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7,
+      0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e,
+      0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996,
+      0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126,
+      0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053,
+      0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f,
+      0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b,
+      0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06,
+      0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb,
+      0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000,
+      0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984,
+      0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068,
+      0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8,
+      0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758,
+      0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85,
+      0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d,
+      0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25,
+      0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2,
+      0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a,
+      0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410,
+      0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129,
+      0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85,
+      0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841,
+      0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b,
+  };
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wnarrowing"
+#endif
+  // Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
+  // to significands above.
+  static constexpr int16_t pow10_exponents[87] = {
+      -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
+      -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
+      -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
+      -343,  -316,  -289,  -263,  -236,  -210,  -183,  -157,  -130,  -103, -77,
+      -50,   -24,   3,     30,    56,    83,    109,   136,   162,   189,  216,
+      242,   269,   295,   322,   348,   375,   402,   428,   455,   481,  508,
+      534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
+      827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+#  pragma GCC diagnostic pop
+#endif
+
+  static constexpr uint64_t power_of_10_64[20] = {
+      1, FMT_POWERS_OF_10(1ULL), FMT_POWERS_OF_10(1000000000ULL),
+      10000000000000000000ULL};
+};
+
+// This is a struct rather than an alias to avoid shadowing warnings in gcc.
+struct data : basic_data<> {};
+
+// Returns a cached power of 10 `c_k = c_k.f * pow(2, c_k.e)` such that its
+// (binary) exponent satisfies `min_exponent <= c_k.e <= min_exponent + 28`.
+FMT_CONSTEXPR inline fp get_cached_power(int min_exponent,
+                                         int& pow10_exponent) {
+  const int shift = 32;
+  // log10(2) = 0x0.4d104d427de7fbcc...
+  const int64_t significand = 0x4d104d427de7fbcc;
+  int index = static_cast<int>(
+      ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) +
+       ((int64_t(1) << shift) - 1))  // ceil
+      >> 32                          // arithmetic shift
+  );
+  // Decimal exponent of the first (smallest) cached power of 10.
+  const int first_dec_exp = -348;
+  // Difference between 2 consecutive decimal exponents in cached powers of 10.
+  const int dec_exp_step = 8;
+  index = (index - first_dec_exp - 1) / dec_exp_step + 1;
+  pow10_exponent = first_dec_exp + index * dec_exp_step;
+  return {data::pow10_significands[index], data::pow10_exponents[index]};
+}

 #ifndef _MSC_VER
 #  define FMT_SNPRINTF snprintf
@ -2055,6 +2175,28 @@ FMT_CONSTEXPR auto write(OutputIt out, const Char* s,
             : write_ptr<Char>(out, bit_cast<uintptr_t>(s), &specs);
 }

+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_integral<T>::value &&
+                        !std::is_same<T, bool>::value &&
+                        !std::is_same<T, Char>::value)>
+FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
+  auto abs_value = static_cast<uint32_or_64_or_128_t<T>>(value);
+  bool negative = is_negative(value);
+  // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer.
+  if (negative) abs_value = ~abs_value + 1;
+  int num_digits = count_digits(abs_value);
+  auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
+  auto it = reserve(out, size);
+  if (auto ptr = to_pointer<Char>(it, size)) {
+    if (negative) *ptr++ = static_cast<Char>('-');
+    format_decimal<Char>(ptr, abs_value, num_digits);
+    return out;
+  }
+  if (negative) *it++ = static_cast<Char>('-');
+  it = format_decimal<Char>(it, abs_value, num_digits).end;
+  return base_iterator(out, it);
+}
+
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR20 auto write_nonfinite(OutputIt out, bool isnan,
                                     basic_format_specs<Char> specs,
@ -2081,12 +2223,12 @@ struct big_decimal_fp {
  int exponent;
 };

-constexpr auto get_significand_size(const big_decimal_fp& fp) -> int {
-  return fp.significand_size;
+constexpr auto get_significand_size(const big_decimal_fp& f) -> int {
+  return f.significand_size;
 }
 template <typename T>
-inline auto get_significand_size(const dragonbox::decimal_fp<T>& fp) -> int {
-  return count_digits(fp.significand);
+inline auto get_significand_size(const dragonbox::decimal_fp<T>& f) -> int {
+  return count_digits(f.significand);
 }

 template <typename Char, typename OutputIt>
@ -2180,12 +2322,12 @@ FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,

 template <typename OutputIt, typename DecimalFP, typename Char,
          typename Grouping = digit_grouping<Char>>
-FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& fp,
+FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
                                    const basic_format_specs<Char>& specs,
                                    float_specs fspecs, locale_ref loc)
    -> OutputIt {
-  auto significand = fp.significand;
-  int significand_size = get_significand_size(fp);
+  auto significand = f.significand;
+  int significand_size = get_significand_size(f);
  const Char zero = static_cast<Char>('0');
  auto sign = fspecs.sign;
  size_t size = to_unsigned(significand_size) + (sign ? 1 : 0);
@ -2194,7 +2336,7 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& fp,
  Char decimal_point =
      fspecs.locale ? detail::decimal_point<Char>(loc) : static_cast<Char>('.');

-  int output_exp = fp.exponent + significand_size - 1;
+  int output_exp = f.exponent + significand_size - 1;
  auto use_exp_format = [=]() {
    if (fspecs.format == float_format::exp) return true;
    if (fspecs.format != float_format::general) return false;
@ -2232,10 +2374,10 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& fp,
                           : base_iterator(out, write(reserve(out, size)));
  }

-  int exp = fp.exponent + significand_size;
-  if (fp.exponent >= 0) {
+  int exp = f.exponent + significand_size;
+  if (f.exponent >= 0) {
    // 1234e5 -> 123400000[.0+]
-    size += to_unsigned(fp.exponent);
+    size += to_unsigned(f.exponent);
    int num_zeros = fspecs.precision - exp;
    abort_fuzzing_if(num_zeros > 5000);
    if (fspecs.showpoint) {
@ -2248,7 +2390,7 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& fp,
    return write_padded<align::right>(out, specs, size, [&](iterator it) {
      if (sign) *it++ = detail::sign<Char>(sign);
      it = write_significand<Char>(it, significand, significand_size,
-                                   fp.exponent, grouping);
+                                   f.exponent, grouping);
      if (!fspecs.showpoint) return it;
      *it++ = decimal_point;
      return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it;
@ -2299,16 +2441,16 @@ template <typename Char> class fallback_digit_grouping {
 };

 template <typename OutputIt, typename DecimalFP, typename Char>
-FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& fp,
+FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& f,
                                 const basic_format_specs<Char>& specs,
                                 float_specs fspecs, locale_ref loc)
    -> OutputIt {
  if (is_constant_evaluated()) {
    return do_write_float<OutputIt, DecimalFP, Char,
-                          fallback_digit_grouping<Char>>(out, fp, specs, fspecs,
+                          fallback_digit_grouping<Char>>(out, f, specs, fspecs,
                                                         loc);
  } else {
-    return do_write_float(out, fp, specs, fspecs, loc);
+    return do_write_float(out, f, specs, fspecs, loc);
  }
 }

@ -2351,6 +2493,659 @@ FMT_INLINE FMT_CONSTEXPR bool signbit(T value) {
  return std::signbit(static_cast<double>(value));
 }

+enum class round_direction { unknown, up, down };
+
+// Given the divisor (normally a power of 10), the remainder = v % divisor for
+// some number v and the error, returns whether v should be rounded up, down, or
+// whether the rounding direction can't be determined due to error.
+// error should be less than divisor / 2.
+FMT_CONSTEXPR inline round_direction get_round_direction(uint64_t divisor,
+                                                         uint64_t remainder,
+                                                         uint64_t error) {
+  FMT_ASSERT(remainder < divisor, "");  // divisor - remainder won't overflow.
+  FMT_ASSERT(error < divisor, "");      // divisor - error won't overflow.
+  FMT_ASSERT(error < divisor - error, "");  // error * 2 won't overflow.
+  // Round down if (remainder + error) * 2 <= divisor.
+  if (remainder <= divisor - remainder && error * 2 <= divisor - remainder * 2)
+    return round_direction::down;
+  // Round up if (remainder - error) * 2 >= divisor.
+  if (remainder >= error &&
+      remainder - error >= divisor - (remainder - error)) {
+    return round_direction::up;
+  }
+  return round_direction::unknown;
+}
+
+namespace digits {
+enum result {
+  more,  // Generate more digits.
+  done,  // Done generating digits.
+  error  // Digit generation cancelled due to an error.
+};
+}
+
+struct gen_digits_handler {
+  char* buf;
+  int size;
+  int precision;
+  int exp10;
+  bool fixed;
+
+  FMT_CONSTEXPR digits::result on_digit(char digit, uint64_t divisor,
+                                        uint64_t remainder, uint64_t error,
+                                        bool integral) {
+    FMT_ASSERT(remainder < divisor, "");
+    buf[size++] = digit;
+    if (!integral && error >= remainder) return digits::error;
+    if (size < precision) return digits::more;
+    if (!integral) {
+      // Check if error * 2 < divisor with overflow prevention.
+      // The check is not needed for the integral part because error = 1
+      // and divisor > (1 << 32) there.
+      if (error >= divisor || error >= divisor - error) return digits::error;
+    } else {
+      FMT_ASSERT(error == 1 && divisor > 2, "");
+    }
+    auto dir = get_round_direction(divisor, remainder, error);
+    if (dir != round_direction::up)
+      return dir == round_direction::down ? digits::done : digits::error;
+    ++buf[size - 1];
+    for (int i = size - 1; i > 0 && buf[i] > '9'; --i) {
+      buf[i] = '0';
+      ++buf[i - 1];
+    }
+    if (buf[0] > '9') {
+      buf[0] = '1';
+      if (fixed)
+        buf[size++] = '0';
+      else
+        ++exp10;
+    }
+    return digits::done;
+  }
+};
+
+inline FMT_CONSTEXPR20 void adjust_precision(int& precision, int exp10) {
+  // Adjust fixed precision by exponent because it is relative to decimal
+  // point.
+  if (exp10 > 0 && precision > max_value<int>() - exp10)
+    FMT_THROW(format_error("number is too big"));
+  precision += exp10;
+}
+
+// Generates output using the Grisu digit-gen algorithm.
+// error: the size of the region (lower, upper) outside of which numbers
+// definitely do not round to value (Delta in Grisu3).
+FMT_INLINE FMT_CONSTEXPR20 auto grisu_gen_digits(fp value, uint64_t error,
+                                                 int& exp,
+                                                 gen_digits_handler& handler)
+    -> digits::result {
+  const fp one(1ULL << -value.e, value.e);
+  // The integral part of scaled value (p1 in Grisu) = value / one. It cannot be
+  // zero because it contains a product of two 64-bit numbers with MSB set (due
+  // to normalization) - 1, shifted right by at most 60 bits.
+  auto integral = static_cast<uint32_t>(value.f >> -one.e);
+  FMT_ASSERT(integral != 0, "");
+  FMT_ASSERT(integral == value.f >> -one.e, "");
+  // The fractional part of scaled value (p2 in Grisu) c = value % one.
+  uint64_t fractional = value.f & (one.f - 1);
+  exp = count_digits(integral);  // kappa in Grisu.
+  // Non-fixed formats require at least one digit and no precision adjustment.
+  if (handler.fixed) {
+    adjust_precision(handler.precision, exp + handler.exp10);
+    // Check if precision is satisfied just by leading zeros, e.g.
+    // format("{:.2f}", 0.001) gives "0.00" without generating any digits.
+    if (handler.precision <= 0) {
+      if (handler.precision < 0) return digits::done;
+      // Divide by 10 to prevent overflow.
+      uint64_t divisor = data::power_of_10_64[exp - 1] << -one.e;
+      auto dir = get_round_direction(divisor, value.f / 10, error * 10);
+      if (dir == round_direction::unknown) return digits::error;
+      handler.buf[handler.size++] = dir == round_direction::up ? '1' : '0';
+      return digits::done;
+    }
+  }
+  // Generate digits for the integral part. This can produce up to 10 digits.
+  do {
+    uint32_t digit = 0;
+    auto divmod_integral = [&](uint32_t divisor) {
+      digit = integral / divisor;
+      integral %= divisor;
+    };
+    // This optimization by Milo Yip reduces the number of integer divisions by
+    // one per iteration.
+    switch (exp) {
+    case 10:
+      divmod_integral(1000000000);
+      break;
+    case 9:
+      divmod_integral(100000000);
+      break;
+    case 8:
+      divmod_integral(10000000);
+      break;
+    case 7:
+      divmod_integral(1000000);
+      break;
+    case 6:
+      divmod_integral(100000);
+      break;
+    case 5:
+      divmod_integral(10000);
+      break;
+    case 4:
+      divmod_integral(1000);
+      break;
+    case 3:
+      divmod_integral(100);
+      break;
+    case 2:
+      divmod_integral(10);
+      break;
+    case 1:
+      digit = integral;
+      integral = 0;
+      break;
+    default:
+      FMT_ASSERT(false, "invalid number of digits");
+    }
+    --exp;
+    auto remainder = (static_cast<uint64_t>(integral) << -one.e) + fractional;
+    auto result = handler.on_digit(static_cast<char>('0' + digit),
+                                   data::power_of_10_64[exp] << -one.e,
+                                   remainder, error, true);
+    if (result != digits::more) return result;
+  } while (exp > 0);
+  // Generate digits for the fractional part.
+  for (;;) {
+    fractional *= 10;
+    error *= 10;
+    char digit = static_cast<char>('0' + (fractional >> -one.e));
+    fractional &= one.f - 1;
+    --exp;
+    auto result = handler.on_digit(digit, one.f, fractional, error, false);
+    if (result != digits::more) return result;
+  }
+}
+
+class bigint {
+ private:
+  // A bigint is stored as an array of bigits (big digits), with bigit at index
+  // 0 being the least significant one.
+  using bigit = uint32_t;
+  using double_bigit = uint64_t;
+  enum { bigits_capacity = 32 };
+  basic_memory_buffer<bigit, bigits_capacity> bigits_;
+  int exp_;
+
+  FMT_CONSTEXPR20 bigit operator[](int index) const {
+    return bigits_[to_unsigned(index)];
+  }
+  FMT_CONSTEXPR20 bigit& operator[](int index) {
+    return bigits_[to_unsigned(index)];
+  }
+
+  static FMT_CONSTEXPR_DECL const int bigit_bits = num_bits<bigit>();
+
+  friend struct formatter<bigint>;
+
+  FMT_CONSTEXPR20 void subtract_bigits(int index, bigit other, bigit& borrow) {
+    auto result = static_cast<double_bigit>((*this)[index]) - other - borrow;
+    (*this)[index] = static_cast<bigit>(result);
+    borrow = static_cast<bigit>(result >> (bigit_bits * 2 - 1));
+  }
+
+  FMT_CONSTEXPR20 void remove_leading_zeros() {
+    int num_bigits = static_cast<int>(bigits_.size()) - 1;
+    while (num_bigits > 0 && (*this)[num_bigits] == 0) --num_bigits;
+    bigits_.resize(to_unsigned(num_bigits + 1));
+  }
+
+  // Computes *this -= other assuming aligned bigints and *this >= other.
+  FMT_CONSTEXPR20 void subtract_aligned(const bigint& other) {
+    FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints");
+    FMT_ASSERT(compare(*this, other) >= 0, "");
+    bigit borrow = 0;
+    int i = other.exp_ - exp_;
+    for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j)
+      subtract_bigits(i, other.bigits_[j], borrow);
+    while (borrow > 0) subtract_bigits(i, 0, borrow);
+    remove_leading_zeros();
+  }
+
+  FMT_CONSTEXPR20 void multiply(uint32_t value) {
+    const double_bigit wide_value = value;
+    bigit carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      double_bigit result = bigits_[i] * wide_value + carry;
+      bigits_[i] = static_cast<bigit>(result);
+      carry = static_cast<bigit>(result >> bigit_bits);
+    }
+    if (carry != 0) bigits_.push_back(carry);
+  }
+
+  template <typename UInt, FMT_ENABLE_IF(std::is_same<UInt, uint64_t>::value ||
+                                         std::is_same<UInt, uint128_t>::value)>
+  FMT_CONSTEXPR20 void multiply(UInt value) {
+    using half_uint =
+        conditional_t<std::is_same<UInt, uint128_t>::value, uint64_t, uint32_t>;
+    const int shift = num_bits<half_uint>() - bigit_bits;
+    const UInt lower = static_cast<half_uint>(value);
+    const UInt upper = value >> num_bits<half_uint>();
+    UInt carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      UInt result = lower * bigits_[i] + static_cast<bigit>(carry);
+      carry = (upper * bigits_[i] << shift) + (result >> bigit_bits) +
+              (carry >> bigit_bits);
+      bigits_[i] = static_cast<bigit>(result);
+    }
+    while (carry != 0) {
+      bigits_.push_back(static_cast<bigit>(carry));
+      carry >>= bigit_bits;
+    }
+  }
+
+  template <typename UInt, FMT_ENABLE_IF(std::is_same<UInt, uint64_t>::value ||
+                                         std::is_same<UInt, uint128_t>::value)>
+  FMT_CONSTEXPR20 void assign(UInt n) {
+    size_t num_bigits = 0;
+    do {
+      bigits_[num_bigits++] = static_cast<bigit>(n);
+      n >>= bigit_bits;
+    } while (n != 0);
+    bigits_.resize(num_bigits);
+    exp_ = 0;
+  }
+
+ public:
+  FMT_CONSTEXPR20 bigint() : exp_(0) {}
+  explicit bigint(uint64_t n) { assign(n); }
+
+  bigint(const bigint&) = delete;
+  void operator=(const bigint&) = delete;
+
+  FMT_CONSTEXPR20 void assign(const bigint& other) {
+    auto size = other.bigits_.size();
+    bigits_.resize(size);
+    auto data = other.bigits_.data();
+    std::copy(data, data + size, make_checked(bigits_.data(), size));
+    exp_ = other.exp_;
+  }
+
+  template <typename Int> FMT_CONSTEXPR20 void operator=(Int n) {
+    FMT_ASSERT(n > 0, "");
+    assign(uint64_or_128_t<Int>(n));
+  }
+
+  FMT_CONSTEXPR20 int num_bigits() const {
+    return static_cast<int>(bigits_.size()) + exp_;
+  }
+
+  FMT_NOINLINE FMT_CONSTEXPR20 bigint& operator<<=(int shift) {
+    FMT_ASSERT(shift >= 0, "");
+    exp_ += shift / bigit_bits;
+    shift %= bigit_bits;
+    if (shift == 0) return *this;
+    bigit carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      bigit c = bigits_[i] >> (bigit_bits - shift);
+      bigits_[i] = (bigits_[i] << shift) + carry;
+      carry = c;
+    }
+    if (carry != 0) bigits_.push_back(carry);
+    return *this;
+  }
+
+  template <typename Int> FMT_CONSTEXPR20 bigint& operator*=(Int value) {
+    FMT_ASSERT(value > 0, "");
+    multiply(uint32_or_64_or_128_t<Int>(value));
+    return *this;
+  }
+
+  friend FMT_CONSTEXPR20 int compare(const bigint& lhs, const bigint& rhs) {
+    int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
+    if (num_lhs_bigits != num_rhs_bigits)
+      return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
+    int i = static_cast<int>(lhs.bigits_.size()) - 1;
+    int j = static_cast<int>(rhs.bigits_.size()) - 1;
+    int end = i - j;
+    if (end < 0) end = 0;
+    for (; i >= end; --i, --j) {
+      bigit lhs_bigit = lhs[i], rhs_bigit = rhs[j];
+      if (lhs_bigit != rhs_bigit) return lhs_bigit > rhs_bigit ? 1 : -1;
+    }
+    if (i != j) return i > j ? 1 : -1;
+    return 0;
+  }
+
+  // Returns compare(lhs1 + lhs2, rhs).
+  friend FMT_CONSTEXPR20 int add_compare(const bigint& lhs1, const bigint& lhs2,
+                                         const bigint& rhs) {
+    auto minimum = [](int a, int b) { return a < b ? a : b; };
+    auto maximum = [](int a, int b) { return a > b ? a : b; };
+    int max_lhs_bigits = maximum(lhs1.num_bigits(), lhs2.num_bigits());
+    int num_rhs_bigits = rhs.num_bigits();
+    if (max_lhs_bigits + 1 < num_rhs_bigits) return -1;
+    if (max_lhs_bigits > num_rhs_bigits) return 1;
+    auto get_bigit = [](const bigint& n, int i) -> bigit {
+      return i >= n.exp_ && i < n.num_bigits() ? n[i - n.exp_] : 0;
+    };
+    double_bigit borrow = 0;
+    int min_exp = minimum(minimum(lhs1.exp_, lhs2.exp_), rhs.exp_);
+    for (int i = num_rhs_bigits - 1; i >= min_exp; --i) {
+      double_bigit sum =
+          static_cast<double_bigit>(get_bigit(lhs1, i)) + get_bigit(lhs2, i);
+      bigit rhs_bigit = get_bigit(rhs, i);
+      if (sum > rhs_bigit + borrow) return 1;
+      borrow = rhs_bigit + borrow - sum;
+      if (borrow > 1) return -1;
+      borrow <<= bigit_bits;
+    }
+    return borrow != 0 ? -1 : 0;
+  }
+
+  // Assigns pow(10, exp) to this bigint.
+  FMT_CONSTEXPR20 void assign_pow10(int exp) {
+    FMT_ASSERT(exp >= 0, "");
+    if (exp == 0) return *this = 1;
+    // Find the top bit.
+    int bitmask = 1;
+    while (exp >= bitmask) bitmask <<= 1;
+    bitmask >>= 1;
+    // pow(10, exp) = pow(5, exp) * pow(2, exp). First compute pow(5, exp) by
+    // repeated squaring and multiplication.
+    *this = 5;
+    bitmask >>= 1;
+    while (bitmask != 0) {
+      square();
+      if ((exp & bitmask) != 0) *this *= 5;
+      bitmask >>= 1;
+    }
+    *this <<= exp;  // Multiply by pow(2, exp) by shifting.
+  }
+
+  FMT_CONSTEXPR20 void square() {
+    int num_bigits = static_cast<int>(bigits_.size());
+    int num_result_bigits = 2 * num_bigits;
+    basic_memory_buffer<bigit, bigits_capacity> n(std::move(bigits_));
+    bigits_.resize(to_unsigned(num_result_bigits));
+    auto sum = uint128_t();
+    for (int bigit_index = 0; bigit_index < num_bigits; ++bigit_index) {
+      // Compute bigit at position bigit_index of the result by adding
+      // cross-product terms n[i] * n[j] such that i + j == bigit_index.
+      for (int i = 0, j = bigit_index; j >= 0; ++i, --j) {
+        // Most terms are multiplied twice which can be optimized in the future.
+        sum += static_cast<double_bigit>(n[i]) * n[j];
+      }
+      (*this)[bigit_index] = static_cast<bigit>(sum);
+      sum >>= num_bits<bigit>();  // Compute the carry.
+    }
+    // Do the same for the top half.
+    for (int bigit_index = num_bigits; bigit_index < num_result_bigits;
+         ++bigit_index) {
+      for (int j = num_bigits - 1, i = bigit_index - j; i < num_bigits;)
+        sum += static_cast<double_bigit>(n[i++]) * n[j--];
+      (*this)[bigit_index] = static_cast<bigit>(sum);
+      sum >>= num_bits<bigit>();
+    }
+    remove_leading_zeros();
+    exp_ *= 2;
+  }
+
+  // If this bigint has a bigger exponent than other, adds trailing zero to make
+  // exponents equal. This simplifies some operations such as subtraction.
+  FMT_CONSTEXPR20 void align(const bigint& other) {
+    int exp_difference = exp_ - other.exp_;
+    if (exp_difference <= 0) return;
+    int num_bigits = static_cast<int>(bigits_.size());
+    bigits_.resize(to_unsigned(num_bigits + exp_difference));
+    for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
+      bigits_[j] = bigits_[i];
+    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0);
+    exp_ -= exp_difference;
+  }
+
+  // Divides this bignum by divisor, assigning the remainder to this and
+  // returning the quotient.
+  FMT_CONSTEXPR20 int divmod_assign(const bigint& divisor) {
+    FMT_ASSERT(this != &divisor, "");
+    if (compare(*this, divisor) < 0) return 0;
+    FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
+    align(divisor);
+    int quotient = 0;
+    do {
+      subtract_aligned(divisor);
+      ++quotient;
+    } while (compare(*this, divisor) >= 0);
+    return quotient;
+  }
+};
+
+// format_dragon flags.
+enum dragon {
+  predecessor_closer = 1,
+  fixup = 2,  // Run fixup to correct exp10 which can be off by one.
+  fixed = 4,
+};
+
+// Formats a floating-point number using a variation of the Fixed-Precision
+// Positive Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
+// https://fmt.dev/papers/p372-steele.pdf.
+FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
+                                          unsigned flags, int num_digits,
+                                          buffer<char>& buf, int& exp10) {
+  bigint numerator;    // 2 * R in (FPP)^2.
+  bigint denominator;  // 2 * S in (FPP)^2.
+  // lower and upper are differences between value and corresponding boundaries.
+  bigint lower;             // (M^- in (FPP)^2).
+  bigint upper_store;       // upper's value if different from lower.
+  bigint* upper = nullptr;  // (M^+ in (FPP)^2).
+  // Shift numerator and denominator by an extra bit or two (if lower boundary
+  // is closer) to make lower and upper integers. This eliminates multiplication
+  // by 2 during later computations.
+  bool is_predecessor_closer = (flags & dragon::predecessor_closer) != 0;
+  int shift = is_predecessor_closer ? 2 : 1;
+  if (value.e >= 0) {
+    numerator = value.f;
+    numerator <<= value.e + shift;
+    lower = 1;
+    lower <<= value.e;
+    if (is_predecessor_closer) {
+      upper_store = 1;
+      upper_store <<= value.e + 1;
+      upper = &upper_store;
+    }
+    denominator.assign_pow10(exp10);
+    denominator <<= shift;
+  } else if (exp10 < 0) {
+    numerator.assign_pow10(-exp10);
+    lower.assign(numerator);
+    if (is_predecessor_closer) {
+      upper_store.assign(numerator);
+      upper_store <<= 1;
+      upper = &upper_store;
+    }
+    numerator *= value.f;
+    numerator <<= shift;
+    denominator = 1;
+    denominator <<= shift - value.e;
+  } else {
+    numerator = value.f;
+    numerator <<= shift;
+    denominator.assign_pow10(exp10);
+    denominator <<= shift - value.e;
+    lower = 1;
+    if (is_predecessor_closer) {
+      upper_store = 1ULL << 1;
+      upper = &upper_store;
+    }
+  }
+  bool even = (value.f & 1) == 0;
+  if (!upper) upper = &lower;
+  if ((flags & dragon::fixup) != 0) {
+    if (add_compare(numerator, *upper, denominator) + even <= 0) {
+      --exp10;
+      numerator *= 10;
+      if (num_digits < 0) {
+        lower *= 10;
+        if (upper != &lower) *upper *= 10;
+      }
+    }
+    if ((flags & dragon::fixed) != 0) adjust_precision(num_digits, exp10 + 1);
+  }
+  // Invariant: value == (numerator / denominator) * pow(10, exp10).
+  if (num_digits < 0) {
+    // Generate the shortest representation.
+    num_digits = 0;
+    char* data = buf.data();
+    for (;;) {
+      int digit = numerator.divmod_assign(denominator);
+      bool low = compare(numerator, lower) - even < 0;  // numerator <[=] lower.
+      // numerator + upper >[=] pow10:
+      bool high = add_compare(numerator, *upper, denominator) + even > 0;
+      data[num_digits++] = static_cast<char>('0' + digit);
+      if (low || high) {
+        if (!low) {
+          ++data[num_digits - 1];
+        } else if (high) {
+          int result = add_compare(numerator, numerator, denominator);
+          // Round half to even.
+          if (result > 0 || (result == 0 && (digit % 2) != 0))
+            ++data[num_digits - 1];
+        }
+        buf.try_resize(to_unsigned(num_digits));
+        exp10 -= num_digits - 1;
+        return;
+      }
+      numerator *= 10;
+      lower *= 10;
+      if (upper != &lower) *upper *= 10;
+    }
+  }
+  // Generate the given number of digits.
+  exp10 -= num_digits - 1;
+  if (num_digits == 0) {
+    denominator *= 10;
+    auto digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+    buf.push_back(digit);
+    return;
+  }
+  buf.try_resize(to_unsigned(num_digits));
+  for (int i = 0; i < num_digits - 1; ++i) {
+    int digit = numerator.divmod_assign(denominator);
+    buf[i] = static_cast<char>('0' + digit);
+    numerator *= 10;
+  }
+  int digit = numerator.divmod_assign(denominator);
+  auto result = add_compare(numerator, numerator, denominator);
+  if (result > 0 || (result == 0 && (digit % 2) != 0)) {
+    if (digit == 9) {
+      const auto overflow = '0' + 10;
+      buf[num_digits - 1] = overflow;
+      // Propagate the carry.
+      for (int i = num_digits - 1; i > 0 && buf[i] == overflow; --i) {
+        buf[i] = '0';
+        ++buf[i - 1];
+      }
+      if (buf[0] == overflow) {
+        buf[0] = '1';
+        ++exp10;
+      }
+      return;
+    }
+    ++digit;
+  }
+  buf[num_digits - 1] = static_cast<char>('0' + digit);
+}
+
+template <typename Float>
+FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
+                                  buffer<char>& buf) -> int {
+  // float is passed as double to reduce the number of instantiations.
+  static_assert(!std::is_same<Float, float>::value, "");
+  FMT_ASSERT(value >= 0, "value is negative");
+  auto converted_value = convert_float(value);
+
+  const bool fixed = specs.format == float_format::fixed;
+  if (value <= 0) {  // <= instead of == to silence a warning.
+    if (precision <= 0 || !fixed) {
+      buf.push_back('0');
+      return 0;
+    }
+    buf.try_resize(to_unsigned(precision));
+    fill_n(buf.data(), precision, '0');
+    return -precision;
+  }
+
+  int exp = 0;
+  bool use_dragon = true;
+  unsigned dragon_flags = 0;
+  if (!is_fast_float<Float>()) {
+    const auto inv_log2_10 = 0.3010299956639812;  // 1 / log2(10)
+    using info = dragonbox::float_info<decltype(converted_value)>;
+    const auto f = basic_fp<typename info::carrier_uint>(converted_value);
+    // Compute exp, an approximate power of 10, such that
+    //   10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1).
+    // This is based on log10(value) == log2(value) / log2(10) and approximation
+    // of log2(value) by e + num_fraction_bits idea from double-conversion.
+    exp = static_cast<int>(
+        std::ceil((f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10));
+    dragon_flags = dragon::fixup;
+  } else if (!is_constant_evaluated() && precision < 0) {
+    // Use Dragonbox for the shortest format.
+    if (specs.binary32) {
+      auto dec = dragonbox::to_decimal(static_cast<float>(value));
+      write<char>(buffer_appender<char>(buf), dec.significand);
+      return dec.exponent;
+    }
+    auto dec = dragonbox::to_decimal(static_cast<double>(value));
+    write<char>(buffer_appender<char>(buf), dec.significand);
+    return dec.exponent;
+  } else {
+    // Use Grisu + Dragon4 for the given precision:
+    // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf.
+    const int min_exp = -60;  // alpha in Grisu.
+    int cached_exp10 = 0;     // K in Grisu.
+    fp normalized = normalize(fp(converted_value));
+    const auto cached_pow = get_cached_power(
+        min_exp - (normalized.e + fp::num_significand_bits), cached_exp10);
+    normalized = normalized * cached_pow;
+    gen_digits_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
+    if (grisu_gen_digits(normalized, 1, exp, handler) != digits::error &&
+        !is_constant_evaluated()) {
+      exp += handler.exp10;
+      buf.try_resize(to_unsigned(handler.size));
+      use_dragon = false;
+    } else {
+      exp += handler.size - cached_exp10 - 1;
+      precision = handler.precision;
+    }
+  }
+  if (use_dragon) {
+    auto f = basic_fp<uint128_t>();
+    bool is_predecessor_closer = specs.binary32
+                                     ? f.assign(static_cast<float>(value))
+                                     : f.assign(converted_value);
+    if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer;
+    if (fixed) dragon_flags |= dragon::fixed;
+    // Limit precision to the maximum possible number of significant digits in
+    // an IEEE754 double because we don't need to generate zeros.
+    const int max_double_digits = 767;
+    if (precision > max_double_digits) precision = max_double_digits;
+    format_dragon(f, dragon_flags, precision, buf, exp);
+  }
+  if (!fixed && !specs.showpoint) {
+    // Remove trailing zeros.
+    auto num_digits = buf.size();
+    while (num_digits > 0 && buf[num_digits - 1] == '0') {
+      --num_digits;
+      ++exp;
+    }
+    buf.try_resize(num_digits);
+  }
+  return exp;
+}
+
 template <typename Char, typename OutputIt, typename T,
          FMT_ENABLE_IF(is_floating_point<T>::value)>
 FMT_CONSTEXPR20 auto write(OutputIt out, T value,
@ -2398,8 +3193,8 @@ FMT_CONSTEXPR20 auto write(OutputIt out, T value,
  if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
  int exp = format_float(convert_float(value), precision, fspecs, buffer);
  fspecs.precision = precision;
-  auto fp = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
-  return write_float(out, fp, specs, fspecs, loc);
+  auto f = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
+  return write_float(out, f, specs, fspecs, loc);
 }

 template <typename Char, typename OutputIt, typename T,
@ -2454,28 +3249,6 @@ constexpr auto write(OutputIt out, const T& value) -> OutputIt {
  return write<Char>(out, to_string_view(value));
 }

-template <typename Char, typename OutputIt, typename T,
-          FMT_ENABLE_IF(is_integral<T>::value &&
-                        !std::is_same<T, bool>::value &&
-                        !std::is_same<T, Char>::value)>
-FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
-  auto abs_value = static_cast<uint32_or_64_or_128_t<T>>(value);
-  bool negative = is_negative(value);
-  // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer.
-  if (negative) abs_value = ~abs_value + 1;
-  int num_digits = count_digits(abs_value);
-  auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
-  auto it = reserve(out, size);
-  if (auto ptr = to_pointer<Char>(it, size)) {
-    if (negative) *ptr++ = static_cast<Char>('-');
-    format_decimal<Char>(ptr, abs_value, num_digits);
-    return out;
-  }
-  if (negative) *it++ = static_cast<Char>('-');
-  it = format_decimal<Char>(it, abs_value, num_digits).end;
-  return base_iterator(out, it);
-}
-
 // FMT_ENABLE_IF() condition separated to workaround an MSVC bug.
 template <
    typename Char, typename OutputIt, typename T,