Microoptimizations in FastDtoa.
Optimize FastDtoa, in particular Grisu3. In addition to making a microbenchmark, there are a number of smaller and larger changes here: - Replace divisions by power-of-ten with multiplications by their inverses, using an algorithm very similar to the one in libdivide. - For DiyFp::Times(), use 128-bit hardware multiplication if available (which it generally is on 64-bit platforms). - Where possible, send around a pointer to the end of the string, instead of a pointer and a length, reducing register pressure (especially for Intel). Where not (easily) possible, add a local variable to make the compiler understand that length and decimal_point cannot alias. - Change some ints to unsigneds where it helps us avoid sign extensions. - Some minor changes to reduce instruction dependency chains. - Inline BiggestPowerTen(). Actual performance gain is wildly different between platforms. On my 3990X workstation (Zen 2), gains are about 21%. On a M1 Mac Mini, they are about 17%. But on my i7-10610U laptop (Comet Lake, so Skylake microarchitecture), the function is 78% faster. This is probably because large divisions (divisor over 255) seem to hurt a lot on Skylake, but I haven't gone through it in detail. Change-Id: I5b67c257d788a3f7d1be7065d055456852451d68 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4110741 Commit-Queue: Steinar H Gunderson <sesse@chromium.org> Reviewed-by: Michael Lippautz <mlippautz@chromium.org> Cr-Commit-Position: refs/heads/main@{#84906}
This commit is contained in:
parent
9b3e66263b
commit
6da6e45099
@ -48,9 +48,18 @@ class DiyFp {
|
||||
|
||||
// returns a * b;
|
||||
static DiyFp Times(const DiyFp& a, const DiyFp& b) {
|
||||
#ifdef __SIZEOF_INT128__
|
||||
// If we have compiler-assisted 64x64 -> 128 muls (e.g. x86-64 and
|
||||
// aarch64), we can use that for a faster, inlined implementation.
|
||||
// This rounds the same way as Multiply().
|
||||
uint64_t hi = (a.f_ * static_cast<unsigned __int128>(b.f_)) >> 64;
|
||||
uint64_t lo = (a.f_ * static_cast<unsigned __int128>(b.f_));
|
||||
return {hi + (lo >> 63), a.e_ + b.e_ + 64};
|
||||
#else
|
||||
DiyFp result = a;
|
||||
result.Multiply(b);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
void Normalize() {
|
||||
|
@ -134,10 +134,9 @@ class Double {
|
||||
void NormalizedBoundaries(DiyFp* out_m_minus, DiyFp* out_m_plus) const {
|
||||
DCHECK_GT(value(), 0.0);
|
||||
DiyFp v = this->AsDiyFp();
|
||||
bool significand_is_zero = (v.f() == kHiddenBit);
|
||||
DiyFp m_plus = DiyFp::Normalize(DiyFp((v.f() << 1) + 1, v.e() - 1));
|
||||
DiyFp m_minus;
|
||||
if (significand_is_zero && v.e() != kDenormalExponent) {
|
||||
if ((AsUint64() & kSignificandMask) == 0 && v.e() != kDenormalExponent) {
|
||||
// The boundary is closer. Think of v = 1000e10 and v- = 9999e9.
|
||||
// Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but
|
||||
// at a distance of 1e8.
|
||||
|
@ -39,9 +39,9 @@ static const int kMaximalTargetExponent = -32;
|
||||
// Output: returns true if the buffer is guaranteed to contain the closest
|
||||
// representable number to the input.
|
||||
// Modifies the generated digits in the buffer to approach (round towards) w.
|
||||
static bool RoundWeed(Vector<char> buffer, int length,
|
||||
uint64_t distance_too_high_w, uint64_t unsafe_interval,
|
||||
uint64_t rest, uint64_t ten_kappa, uint64_t unit) {
|
||||
static bool RoundWeed(char* last_digit, uint64_t distance_too_high_w,
|
||||
uint64_t unsafe_interval, uint64_t rest,
|
||||
uint64_t ten_kappa, uint64_t unit) {
|
||||
uint64_t small_distance = distance_too_high_w - unit;
|
||||
uint64_t big_distance = distance_too_high_w + unit;
|
||||
// Let w_low = too_high - big_distance, and
|
||||
@ -120,7 +120,7 @@ static bool RoundWeed(Vector<char> buffer, int length,
|
||||
unsafe_interval - rest >= ten_kappa && // Negated condition 2
|
||||
(rest + ten_kappa < small_distance || // buffer{-1} > w_high
|
||||
small_distance - rest >= rest + ten_kappa - small_distance)) {
|
||||
buffer[length - 1]--;
|
||||
--*last_digit;
|
||||
rest += ten_kappa;
|
||||
}
|
||||
|
||||
@ -200,13 +200,62 @@ static const uint32_t kTen7 = 10000000;
|
||||
static const uint32_t kTen8 = 100000000;
|
||||
static const uint32_t kTen9 = 1000000000;
|
||||
|
||||
struct DivMagic {
|
||||
uint32_t mul;
|
||||
uint32_t shift;
|
||||
};
|
||||
|
||||
// This table was computed by libdivide. Essentially, the shift is
|
||||
// floor(log2(x)), and the mul is 2^(33 + shift) / x, rounded up and truncated
|
||||
// to 32 bits.
|
||||
static const DivMagic div[] = {
|
||||
{0, 0}, // Not used, since 1 is not supported by the algorithm.
|
||||
{0x9999999a, 3}, // 10
|
||||
{0x47ae147b, 6}, // 100
|
||||
{0x0624dd30, 9}, // 1000
|
||||
{0xa36e2eb2, 13}, // 10000
|
||||
{0x4f8b588f, 16}, // 100000
|
||||
{0x0c6f7a0c, 19}, // 1000000
|
||||
{0xad7f29ac, 23}, // 10000000
|
||||
{0x5798ee24, 26} // 100000000
|
||||
};
|
||||
|
||||
// Returns *val / divisor, and does *val %= divisor. d must be the DivMagic
|
||||
// corresponding to the divisor.
|
||||
//
|
||||
// This algorithm is exactly the same as libdivide's branch-free u32 algorithm,
|
||||
// except that we add back a branch anyway to support 1.
|
||||
//
|
||||
// GCC/Clang uses a slightly different algorithm that doesn't need
|
||||
// the extra rounding step (and that would allow us to do 1 without
|
||||
// a branch), but it requires a pre-shift for the case of 10000,
|
||||
// so it ends up slower, at least on x86-64.
|
||||
//
|
||||
// Note that this is actually a small loss for certain CPUs with
|
||||
// a very fast divider (e.g. Zen 3), but a significant win for most
|
||||
// others (including the entire Skylake family).
|
||||
static inline uint32_t fast_divmod(uint32_t* val, uint32_t divisor,
|
||||
const DivMagic& d) {
|
||||
if (divisor == 1) {
|
||||
uint32_t digit = *val;
|
||||
*val = 0;
|
||||
return digit;
|
||||
} else {
|
||||
uint32_t q = (static_cast<uint64_t>(*val) * d.mul) >> 32;
|
||||
uint32_t t = ((*val - q) >> 1) + q;
|
||||
uint32_t digit = t >> d.shift;
|
||||
*val -= digit * divisor;
|
||||
return digit;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the biggest power of ten that is less than or equal than the given
|
||||
// number. We furthermore receive the maximum number of bits 'number' has.
|
||||
// If number_bits == 0 then 0^-1 is returned
|
||||
// The number of bits must be <= 32.
|
||||
// Precondition: number < (1 << (number_bits + 1)).
|
||||
static void BiggestPowerTen(uint32_t number, int number_bits, uint32_t* power,
|
||||
int* exponent) {
|
||||
static inline void BiggestPowerTen(uint32_t number, int number_bits,
|
||||
uint32_t* power, unsigned* exponent) {
|
||||
switch (number_bits) {
|
||||
case 32:
|
||||
case 31:
|
||||
@ -354,8 +403,8 @@ static void BiggestPowerTen(uint32_t number, int number_bits, uint32_t* power,
|
||||
// represent 'w' we can stop. Everything inside the interval low - high
|
||||
// represents w. However we have to pay attention to low, high and w's
|
||||
// imprecision.
|
||||
static bool DigitGen(DiyFp low, DiyFp w, DiyFp high, Vector<char> buffer,
|
||||
int* length, int* kappa) {
|
||||
static bool DigitGen(DiyFp low, DiyFp w, DiyFp high, char** outptr,
|
||||
int* kappa) {
|
||||
DCHECK(low.e() == w.e() && w.e() == high.e());
|
||||
DCHECK(low.f() + 1 <= high.f() - 1);
|
||||
DCHECK(kMinimalTargetExponent <= w.e() && w.e() <= kMaximalTargetExponent);
|
||||
@ -389,20 +438,18 @@ static bool DigitGen(DiyFp low, DiyFp w, DiyFp high, Vector<char> buffer,
|
||||
// Modulo by one is an and.
|
||||
uint64_t fractionals = too_high.f() & (one.f() - 1);
|
||||
uint32_t divisor;
|
||||
int divisor_exponent;
|
||||
unsigned divisor_exponent;
|
||||
BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), &divisor,
|
||||
&divisor_exponent);
|
||||
*kappa = divisor_exponent + 1;
|
||||
*length = 0;
|
||||
// Loop invariant: buffer = too_high / 10^kappa (integer division)
|
||||
// The invariant holds for the first iteration: kappa has been initialized
|
||||
// with the divisor exponent + 1. And the divisor is the biggest power of ten
|
||||
// that is smaller than integrals.
|
||||
while (*kappa > 0) {
|
||||
int digit = integrals / divisor;
|
||||
buffer[*length] = '0' + digit;
|
||||
(*length)++;
|
||||
integrals %= divisor;
|
||||
uint32_t digit = fast_divmod(&integrals, divisor, div[divisor_exponent]);
|
||||
**outptr = '0' + digit;
|
||||
(*outptr)++;
|
||||
(*kappa)--;
|
||||
// Note that kappa now equals the exponent of the divisor and that the
|
||||
// invariant thus holds again.
|
||||
@ -413,11 +460,17 @@ static bool DigitGen(DiyFp low, DiyFp w, DiyFp high, Vector<char> buffer,
|
||||
if (rest < unsafe_interval.f()) {
|
||||
// Rounding down (by not emitting the remaining digits) yields a number
|
||||
// that lies within the unsafe interval.
|
||||
return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f(),
|
||||
return RoundWeed(*outptr - 1, DiyFp::Minus(too_high, w).f(),
|
||||
unsafe_interval.f(), rest,
|
||||
static_cast<uint64_t>(divisor) << -one.e(), unit);
|
||||
}
|
||||
if (*kappa <= 0) {
|
||||
// Don't bother doing the division below. (The compiler ought to
|
||||
// figure this out itself, but it doesn't.)
|
||||
break;
|
||||
}
|
||||
divisor /= 10;
|
||||
--divisor_exponent;
|
||||
}
|
||||
|
||||
// The integrals have been generated. We are at the point of the decimal
|
||||
@ -435,12 +488,12 @@ static bool DigitGen(DiyFp low, DiyFp w, DiyFp high, Vector<char> buffer,
|
||||
unsafe_interval.set_f(unsafe_interval.f() * 10);
|
||||
// Integer division by one.
|
||||
int digit = static_cast<int>(fractionals >> -one.e());
|
||||
buffer[*length] = '0' + digit;
|
||||
(*length)++;
|
||||
**outptr = '0' + digit;
|
||||
(*outptr)++;
|
||||
fractionals &= one.f() - 1; // Modulo by one.
|
||||
(*kappa)--;
|
||||
if (fractionals < unsafe_interval.f()) {
|
||||
return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f() * unit,
|
||||
return RoundWeed(*outptr - 1, DiyFp::Minus(too_high, w).f() * unit,
|
||||
unsafe_interval.f(), fractionals, one.f(), unit);
|
||||
}
|
||||
}
|
||||
@ -492,7 +545,7 @@ static bool DigitGenCounted(DiyFp w, int requested_digits, Vector<char> buffer,
|
||||
// Modulo by one is an and.
|
||||
uint64_t fractionals = w.f() & (one.f() - 1);
|
||||
uint32_t divisor;
|
||||
int divisor_exponent;
|
||||
unsigned divisor_exponent;
|
||||
BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), &divisor,
|
||||
&divisor_exponent);
|
||||
*kappa = divisor_exponent + 1;
|
||||
@ -503,16 +556,16 @@ static bool DigitGenCounted(DiyFp w, int requested_digits, Vector<char> buffer,
|
||||
// with the divisor exponent + 1. And the divisor is the biggest power of ten
|
||||
// that is smaller than 'integrals'.
|
||||
while (*kappa > 0) {
|
||||
int digit = integrals / divisor;
|
||||
uint32_t digit = fast_divmod(&integrals, divisor, div[divisor_exponent]);
|
||||
buffer[*length] = '0' + digit;
|
||||
(*length)++;
|
||||
requested_digits--;
|
||||
integrals %= divisor;
|
||||
(*kappa)--;
|
||||
// Note that kappa now equals the exponent of the divisor and that the
|
||||
// invariant thus holds again.
|
||||
if (requested_digits == 0) break;
|
||||
divisor /= 10;
|
||||
--divisor_exponent;
|
||||
}
|
||||
|
||||
if (requested_digits == 0) {
|
||||
@ -559,8 +612,7 @@ static bool DigitGenCounted(DiyFp w, int requested_digits, Vector<char> buffer,
|
||||
// The last digit will be closest to the actual v. That is, even if several
|
||||
// digits might correctly yield 'v' when read again, the closest will be
|
||||
// computed.
|
||||
static bool Grisu3(double v, Vector<char> buffer, int* length,
|
||||
int* decimal_exponent) {
|
||||
static bool Grisu3(double v, char** outptr, int* decimal_exponent) {
|
||||
DiyFp w = Double(v).AsNormalizedDiyFp();
|
||||
// boundary_minus and boundary_plus are the boundaries between v and its
|
||||
// closest floating-point neighbors. Any number strictly between
|
||||
@ -610,7 +662,7 @@ static bool Grisu3(double v, Vector<char> buffer, int* length,
|
||||
// decreased by 2.
|
||||
int kappa;
|
||||
bool result = DigitGen(scaled_boundary_minus, scaled_w, scaled_boundary_plus,
|
||||
buffer, length, &kappa);
|
||||
outptr, &kappa);
|
||||
*decimal_exponent = -mk + kappa;
|
||||
return result;
|
||||
}
|
||||
@ -665,15 +717,20 @@ bool FastDtoa(double v, FastDtoaMode mode, int requested_digits,
|
||||
DCHECK(!Double(v).IsSpecial());
|
||||
|
||||
bool result = false;
|
||||
char* outptr = buffer.data();
|
||||
int decimal_exponent = 0;
|
||||
switch (mode) {
|
||||
case FAST_DTOA_SHORTEST:
|
||||
result = Grisu3(v, buffer, length, &decimal_exponent);
|
||||
result = Grisu3(v, &outptr, &decimal_exponent);
|
||||
*length = static_cast<int>(outptr - buffer.data());
|
||||
break;
|
||||
case FAST_DTOA_PRECISION:
|
||||
result =
|
||||
Grisu3Counted(v, requested_digits, buffer, length, &decimal_exponent);
|
||||
case FAST_DTOA_PRECISION: {
|
||||
int local_length = 0;
|
||||
result = Grisu3Counted(v, requested_digits, buffer, &local_length,
|
||||
&decimal_exponent);
|
||||
*length = local_length;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
@ -30,4 +30,17 @@ if (v8_enable_google_benchmark) {
|
||||
"//third_party/google_benchmark:benchmark_main",
|
||||
]
|
||||
}
|
||||
|
||||
v8_executable("dtoa_benchmark") {
|
||||
testonly = true
|
||||
|
||||
configs = []
|
||||
|
||||
sources = [ "dtoa.cc" ]
|
||||
|
||||
deps = [
|
||||
"//:v8_libbase",
|
||||
"//third_party/google_benchmark:benchmark_main",
|
||||
]
|
||||
}
|
||||
}
|
||||
|
1069
test/benchmarks/cpp/dtoa.cc
Normal file
1069
test/benchmarks/cpp/dtoa.cc
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user