[bigint] Faster parsing from long strings

Combining parts in a balanced-binary-tree like order allows us to use fast multiplication algorithms. Bug: v8:11515 Change-Id: I6829929671770f009f10f6f3b383501fede476ab Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3049079 Reviewed-by: Maya Lekova <mslekova@chromium.org> Commit-Queue: Jakob Kummerow <jkummerow@chromium.org> Cr-Commit-Position: refs/heads/main@{#76404}
2021-08-10 22:21:56 +02:00 · 2021-08-10 22:21:56 +02:00 · dd5e5535ea
commit dd5e5535ea
parent 45424f1a58
5 changed files with 247 additions and 10 deletions
--- a/src/bigint/bigint-internal.h
+++ b/src/bigint/bigint-internal.h
@ -22,6 +22,7 @@ constexpr int kNewtonInversionThreshold = 50;
 // kBarrettThreshold is defined in bigint.h.

 constexpr int kToStringFastThreshold = 43;
+constexpr int kFromStringLargeThreshold = 300;

 class ProcessorImpl : public Processor {
 public:
@ -69,6 +70,7 @@ class ProcessorImpl : public Processor {

  void FromString(RWDigits Z, FromStringAccumulator* accumulator);
  void FromStringClassic(RWDigits Z, FromStringAccumulator* accumulator);
+  void FromStringLarge(RWDigits Z, FromStringAccumulator* accumulator);

  bool should_terminate() { return status_ == Status::kInterrupted; }

--- a/src/bigint/bigint.h
+++ b/src/bigint/bigint.h
@ -262,6 +262,8 @@ class Processor {
  // upon return will be set to the actual length of the result string.
  Status ToString(char* out, int* out_length, Digits X, int radix, bool sign);

+  // Z := the contents of {accumulator}.
+  // Assume that this leaves {accumulator} in an unusable state.
  Status FromString(RWDigits Z, FromStringAccumulator* accumulator);
 };

--- a/src/bigint/fromstring.cc
+++ b/src/bigint/fromstring.cc
@ -40,7 +40,6 @@ void ProcessorImpl::FromStringClassic(RWDigits Z,
  // Parts are stored on the heap.
  for (int i = 1; i < num_heap_parts - 1; i++) {
    MultiplySingle(Z, already_set, max_multiplier);
-    if (should_terminate()) return;
    Add(Z, accumulator->heap_parts_[i]);
    already_set.set_len(already_set.len() + 1);
  }
@ -48,6 +47,171 @@ void ProcessorImpl::FromStringClassic(RWDigits Z,
  Add(Z, accumulator->heap_parts_.back());
 }

+// The fast algorithm: combine parts in a balanced-binary-tree like order:
+// Multiply-and-add neighboring pairs of parts, then loop, until only one
+// part is left. The benefit is that the multiplications will have inputs of
+// similar sizes, which makes them amenable to fast multiplication algorithms.
+// We have to do more multiplications than the classic algorithm though,
+// because we also have to multiply the multipliers.
+// Optimizations:
+// - We can skip the multiplier for the first part, because we never need it.
+// - Most multipliers are the same; we can avoid repeated multiplications and
+//   just copy the previous result. (In theory we could even de-dupe them, but
+//   as the parts/multipliers grow, we'll need most of the memory anyway.)
+//   Copied results are marked with a * below.
+// - We can re-use memory using a system of three buffers whose usage rotates:
+//   - one is considered empty, and is overwritten with the new parts,
+//   - one holds the multipliers (and will be "empty" in the next round), and
+//   - one initially holds the parts and is overwritten with the new multipliers
+//   Parts and multipliers both grow in each iteration, and get fewer, so we
+//   use the space of two adjacent old chunks for one new chunk.
+//   Since the {heap_parts_} vector has the right size, and so does the
+//   result {Z}, we can use that memory, and only need to allocate one scratch
+//   vector. If the final result ends up in the wrong bucket, we have to copy it
+//   to the correct one.
+// - We don't have to keep track of the positions and sizes of the chunks,
+//   because we can deduce their precise placement from the iteration index.
+//
+// Example, assuming digit_t is 4 bits, fitting one decimal digit:
+// Initial state:
+// parts_:        1  2  3  4  5  6  7  8  9  0  1  2  3  4  5
+// multipliers_: 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
+// After the first iteration of the outer loop:
+// parts:         12    34    56    78    90    12    34    5
+// multipliers:        100  *100  *100  *100  *100  *100   10
+// After the second iteration:
+// parts:         1234        5678        9012        345
+// multipliers:              10000      *10000       1000
+// After the third iteration:
+// parts:         12345678                9012345
+// multipliers:                          10000000
+// And then there's an obvious last iteration.
+void ProcessorImpl::FromStringLarge(RWDigits Z,
+                                    FromStringAccumulator* accumulator) {
+  int num_parts = static_cast<int>(accumulator->heap_parts_.size());
+  DCHECK(num_parts >= 2);  // NOLINT(readability/check)
+  DCHECK(Z.len() >= num_parts);
+  RWDigits parts(accumulator->heap_parts_.data(), num_parts);  // In-place view; gets overwritten.
+  Storage multipliers_storage(num_parts);  // The one scratch allocation.
+  RWDigits multipliers(multipliers_storage.get(), num_parts);
+  RWDigits temp(Z, 0, num_parts);  // First {num_parts} digits of Z act as the third buffer.
+  // Unrolled and specialized first iteration: part_len == 1, so instead of
+  // Digits sub-vectors we have individual digit_t values, and the multipliers
+  // are known up front.
+  {
+    digit_t max_multiplier = accumulator->max_multiplier_;
+    digit_t last_multiplier = accumulator->last_multiplier_;
+    RWDigits new_parts = temp;
+    RWDigits new_multipliers = parts;
+    int i = 0;
+    for (; i + 1 < num_parts; i += 2) {  // Consume the parts pairwise.
+      digit_t p_in = parts[i];
+      digit_t p_in2 = parts[i + 1];
+      digit_t m_in = max_multiplier;
+      digit_t m_in2 = i == num_parts - 2 ? last_multiplier : max_multiplier;  // Only the final part uses last_multiplier.
+      // p[j] = p[i] * m[i+1] + p[i+1]
+      digit_t p_high;
+      digit_t p_low = digit_mul(p_in, m_in2, &p_high);
+      digit_t carry;
+      new_parts[i] = digit_add2(p_low, p_in2, &carry);
+      new_parts[i + 1] = p_high + carry;  // High digit of the two-digit result.
+      // m[j] = m[i] * m[i+1]
+      if (i > 0) {  // The first pair's multiplier is never needed.
+        if (i > 2 && m_in2 != last_multiplier) {  // Same factors as the previous pair: copy its product.
+          new_multipliers[i] = new_multipliers[i - 2];
+          new_multipliers[i + 1] = new_multipliers[i - 1];
+        } else {
+          digit_t m_high;
+          new_multipliers[i] = digit_mul(m_in, m_in2, &m_high);
+          new_multipliers[i + 1] = m_high;
+        }
+      }
+    }
+    // Trailing last part (if {num_parts} was odd).
+    if (i < num_parts) {
+      new_parts[i] = parts[i];
+      new_multipliers[i] = last_multiplier;
+      i += 2;  // Count the lone part as a pair for the halving below.
+    }
+    num_parts = i >> 1;  // Each pair became one part.
+    RWDigits new_temp = multipliers;  // Rotate buffers: the old multipliers become scratch.
+    parts = new_parts;
+    multipliers = new_multipliers;
+    temp = new_temp;
+    AddWorkEstimate(num_parts);
+  }
+  int part_len = 2;  // Each part now spans two digits.
+
+  // Remaining iterations.
+  while (num_parts > 1) {
+    RWDigits new_parts = temp;
+    RWDigits new_multipliers = parts;
+    int new_part_len = part_len * 2;
+    int i = 0;
+    for (; i + 1 < num_parts; i += 2) {
+      int start = i * part_len;  // Digit offset of the pair's first part.
+      Digits p_in(parts, start, part_len);
+      Digits p_in2(parts, start + part_len, part_len);
+      Digits m_in(multipliers, start, part_len);
+      Digits m_in2(multipliers, start + part_len, part_len);
+      RWDigits p_out(new_parts, start, new_part_len);
+      RWDigits m_out(new_multipliers, start, new_part_len);
+      // p[j] = p[i] * m[i+1] + p[i+1]
+      Multiply(p_out, p_in, m_in2);
+      if (should_terminate()) return;  // Interrupted: give up early.
+      digit_t overflow = AddAndReturnOverflow(p_out, p_in2);
+      DCHECK(overflow == 0);  // NOLINT(readability/check)
+      USE(overflow);
+      // m[j] = m[i] * m[i+1]
+      if (i > 0) {  // The first pair's multiplier is never needed.
+        bool copied = false;
+        if (i > 2) {  // The pair at index 0 has no computed multiplier to reuse.
+          int prev_start = (i - 2) * part_len;
+          Digits m_in_prev(multipliers, prev_start, part_len);
+          Digits m_in2_prev(multipliers, prev_start + part_len, part_len);
+          if (Compare(m_in, m_in_prev) == 0 &&
+              Compare(m_in2, m_in2_prev) == 0) {
+            copied = true;  // Same factors as the previous pair: reuse its product.
+            Digits m_out_prev(new_multipliers, prev_start, new_part_len);
+            for (int k = 0; k < new_part_len; k++) m_out[k] = m_out_prev[k];
+          }
+        }
+        if (!copied) {
+          Multiply(m_out, m_in, m_in2);
+          if (should_terminate()) return;  // Interrupted: give up early.
+        }
+      }
+    }
+    // Trailing last part (if {num_parts} was odd).
+    if (i < num_parts) {
+      Digits p_in(parts, i * part_len, part_len);
+      Digits m_in(multipliers, i * part_len, part_len);
+      RWDigits p_out(new_parts, i * part_len, new_part_len);
+      RWDigits m_out(new_multipliers, i * part_len, new_part_len);
+      int k = 0;
+      for (; k < p_in.len(); k++) p_out[k] = p_in[k];  // Copy the part unchanged...
+      for (; k < p_out.len(); k++) p_out[k] = 0;  // ...and zero-fill the rest of its new slot.
+      k = 0;
+      for (; k < m_in.len(); k++) m_out[k] = m_in[k];  // Same for its multiplier.
+      for (; k < m_out.len(); k++) m_out[k] = 0;
+      i += 2;  // Count the lone part as a pair for the halving below.
+    }
+    num_parts = i >> 1;  // Each pair became one part.
+    part_len = new_part_len;
+    RWDigits new_temp = multipliers;  // Rotate buffers: the old multipliers become scratch.
+    parts = new_parts;
+    multipliers = new_multipliers;
+    temp = new_temp;
+  }
+  // Copy the result to Z, if it doesn't happen to be there already.
+  if (parts.digits() != Z.digits()) {
+    int i = 0;
+    for (; i < parts.len(); i++) Z[i] = parts[i];
+    // Z might be bigger than we requested; be robust towards that.
+    for (; i < Z.len(); i++) Z[i] = 0;
+  }
+}
+
 void ProcessorImpl::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
  if (accumulator->inline_everything_) {
    int i = 0;
@ -57,8 +221,10 @@ void ProcessorImpl::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
    for (; i < Z.len(); i++) Z[i] = 0;
  } else if (accumulator->stack_parts_used_ == 0) {
    for (int i = 0; i < Z.len(); i++) Z[i] = 0;
-  } else {
+  } else if (accumulator->ResultLength() < kFromStringLargeThreshold) {
    FromStringClassic(Z, accumulator);
+  } else {
+    FromStringLarge(Z, accumulator);
  }
 }

--- a/test/bigint/bigint-shell.cc
+++ b/test/bigint/bigint-shell.cc
@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

+#include <cmath>
 #include <memory>
 #include <string>

@ -28,12 +29,13 @@ int PrintHelp(char** argv) {
  return 1;
 }

-#define TESTS(V)             \
-  V(kBarrett, "barrett")     \
-  V(kBurnikel, "burnikel")   \
-  V(kFFT, "fft")             \
-  V(kKaratsuba, "karatsuba") \
-  V(kToom, "toom")           \
+#define TESTS(V)               \
+  V(kBarrett, "barrett")       \
+  V(kBurnikel, "burnikel")     \
+  V(kFFT, "fft")               \
+  V(kFromString, "fromstring") \
+  V(kKaratsuba, "karatsuba")   \
+  V(kToom, "toom")             \
  V(kToString, "tostring")

 enum Operation { kNoOp, kList, kTest };
@ -86,7 +88,7 @@ class RNG {

 static constexpr int kCharsPerDigit = kDigitBits / 4;

-static const char kConversionChars[] = "0123456789abcdef";
+static const char kConversionChars[] = "0123456789abcdefghijklmnopqrstuvwxyz";

 std::string FormatHex(Digits X) {
  X.Normalize();
@ -173,6 +175,16 @@ class Runner {
    error_ = true;
  }

+  // Compares {expected} with {actual}. When they differ, prints the parsed
+  // input string, its radix and both values (as hex) to stderr, and flags the
+  // run as failed by setting {error_}.
+  void AssertEquals(const char* input, int input_length, int radix,
+                    Digits expected, Digits actual) {
+    const bool values_match = Compare(expected, actual) == 0;
+    if (!values_match) {
+      std::cerr << "Input:    " << std::string(input, input_length) << "\n"
+                << "Radix:    " << radix << "\n"
+                << "Expected: " << FormatHex(expected) << "\n"
+                << "Actual:   " << FormatHex(actual) << "\n";
+      error_ = true;
+    }
+  }
+
  int RunTest() {
    int count = 0;
    if (test_ == kBarrett) {
@ -199,6 +211,10 @@ class Runner {
      for (int i = 0; i < runs_; i++) {
        TestToString(&count);
      }
+    } else if (test_ == kFromString) {
+      for (int i = 0; i < runs_; i++) {
+        TestFromString(&count);
+      }
    } else {
      DCHECK(false);  // Unreachable.
    }
@ -391,6 +407,33 @@ class Runner {
    }
  }

+  // Cross-checks FromStringLarge against FromStringClassic on random input
+  // strings in random radixes, for result sizes from half of
+  // kFromStringLargeThreshold up to (but excluding) twice that value.
+  // Increments {*count} once per input whose two results agree, and returns
+  // early after the first mismatch.
+  void TestFromString(int* count) {
+    constexpr int kMaxDigits = 1 << 20;  // Any large-enough value will do.
+    constexpr int kMin = kFromStringLargeThreshold / 2;
+    constexpr int kMax = kFromStringLargeThreshold * 2;
+    for (int size = kMin; size < kMax; size++) {
+      // To keep test execution times low, test one random radix every time.
+      // Valid range is 2 <= radix <= 36 (inclusive).
+      int radix = 2 + (rng_.NextUint64() % 35);
+      // Pick enough characters to fill roughly {size} digits.
+      int num_chars = std::round(size * kDigitBits / std::log2(radix));
+      std::unique_ptr<char[]> chars(new char[num_chars]);
+      GenerateRandomString(chars.get(), num_chars, radix);
+      // Each algorithm gets its own accumulator, parsed from the same input:
+      // FromStringLarge reuses the accumulator's heap parts as working memory.
+      FromStringAccumulator accumulator(kMaxDigits);
+      FromStringAccumulator ref_accumulator(kMaxDigits);
+      const char* start = chars.get();
+      const char* end = chars.get() + num_chars;
+      accumulator.Parse(start, end, radix);
+      ref_accumulator.Parse(start, end, radix);
+      ScratchDigits result(accumulator.ResultLength());
+      ScratchDigits reference(ref_accumulator.ResultLength());
+      processor()->FromStringLarge(result, &accumulator);
+      processor()->FromStringClassic(reference, &ref_accumulator);
+      // The classic algorithm is the trusted reference, so its output is the
+      // expected value and the fast algorithm's output is the actual one.
+      AssertEquals(start, num_chars, radix, reference, result);
+      if (error_) return;
+      (*count)++;
+    }
+  }
+
  int ParseOptions(int argc, char** argv) {
    for (int i = 1; i < argc; i++) {
      if (strcmp(argv[i], "--list") == 0) {
@ -447,6 +490,30 @@ class Runner {
    }
  }

+  // Fills {str} with {len} random characters that are valid digits in the
+  // given {radix}, taken from kConversionChars. No terminating NUL is written.
+  // Candidates are drawn {bits_per_char} bits at a time from 64-bit random
+  // words; candidates that are not below {radix} are discarded and redrawn.
+  void GenerateRandomString(char* str, int len, int radix) {
+    DCHECK(2 <= radix && radix <= 36);
+    if (len == 0) return;
+    const int bits_per_char = BitLength(radix - 1);
+    const uint64_t candidate_mask = (1u << bits_per_char) - 1u;
+    uint64_t pool = 0;  // Random bits that have not been consumed yet.
+    int pool_bits = 0;  // Number of unconsumed bits left in {pool}.
+    int remaining = len;
+    while (remaining > 0) {
+      // Refill once the pool can no longer supply a whole candidate; any
+      // leftover bits are dropped.
+      if (pool_bits < bits_per_char) {
+        pool = rng_.NextUint64();
+        pool_bits = 64;
+      }
+      const int candidate = static_cast<int>(pool & candidate_mask);
+      pool >>= bits_per_char;
+      pool_bits -= bits_per_char;
+      // Rejection sampling: only candidates below {radix} become characters.
+      if (candidate < radix) {
+        *str++ = kConversionChars[candidate];
+        remaining--;
+      }
+    }
+  }
+
  Operation op_{kNoOp};
  Test test_;
  bool error_{false};
--- a/test/cctest/test-thread-termination.cc
+++ b/test/cctest/test-thread-termination.cc
@ -241,7 +241,7 @@ TEST(TerminateBigIntToString) {

 TEST(TerminateBigIntFromString) {
  TestTerminatingSlowOperation(
-      "var a = '12344567890'.repeat(10000);\n"
+      "var a = '12344567890'.repeat(100000);\n"
      "terminate();\n"
      "BigInt(a);\n"
      "fail();\n");