From 485c15c1ca4a581b3796ecbd8953bda28ead5c59 Mon Sep 17 00:00:00 2001 From: Jakob Kummerow Date: Thu, 15 Jul 2021 18:20:09 +0200 Subject: [PATCH] [bigint] Barrett-Newton division Dividing by first computing a multiplicative inverse is faster than Burnikel-Ziegler division for very large inputs. Bug: v8:11515 Change-Id: Ice45690c3fa4eef7102d418cdd3d82a942a076c5 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3015573 Commit-Queue: Jakob Kummerow Reviewed-by: Maya Lekova Cr-Commit-Position: refs/heads/master@{#75743} --- BUILD.bazel | 1 + BUILD.gn | 1 + src/bigint/bigint-internal.cc | 17 ++ src/bigint/bigint-internal.h | 10 + src/bigint/bigint.h | 9 +- src/bigint/div-barrett.cc | 378 +++++++++++++++++++++++++++++++++ src/bigint/mul-karatsuba.cc | 2 + src/bigint/vector-arithmetic.h | 3 + test/bigint/bigint-shell.cc | 74 ++++++- 9 files changed, 487 insertions(+), 8 deletions(-) create mode 100644 src/bigint/div-barrett.cc diff --git a/BUILD.bazel b/BUILD.bazel index 9583948835..17d96d42f2 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -2673,6 +2673,7 @@ filegroup( "src/bigint/bigint-internal.h", "src/bigint/bigint.h", "src/bigint/digit-arithmetic.h", + "src/bigint/div-barrett.cc", "src/bigint/div-burnikel.cc", "src/bigint/div-helpers.cc", "src/bigint/div-helpers.h", diff --git a/BUILD.gn b/BUILD.gn index d0a7a56847..b2cadd5713 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -4969,6 +4969,7 @@ v8_source_set("v8_bigint") { if (v8_advanced_bigint_algorithms) { sources += [ + "src/bigint/div-barrett.cc", "src/bigint/mul-fft.cc", "src/bigint/mul-toom.cc", ] diff --git a/src/bigint/bigint-internal.cc b/src/bigint/bigint-internal.cc index 828a450e8a..fc501f5a9a 100644 --- a/src/bigint/bigint-internal.cc +++ b/src/bigint/bigint-internal.cc @@ -58,7 +58,16 @@ void ProcessorImpl::Divide(RWDigits Q, Digits A, Digits B) { if (B.len() < kBurnikelThreshold) { return DivideSchoolbook(Q, RWDigits(nullptr, 0), A, B); } +#if !V8_ADVANCED_BIGINT_ALGORITHMS return DivideBurnikelZiegler(Q, RWDigits(nullptr, 0), A, B); +#else + if (B.len() < kBarrettThreshold || A.len() == B.len()) { + DivideBurnikelZiegler(Q, RWDigits(nullptr, 0), A, B); + } else { + ScratchDigits R(B.len()); + DivideBarrett(Q, R, A, B); + } +#endif } void ProcessorImpl::Modulo(RWDigits R, Digits A, Digits B) { @@ -84,7 +93,15 @@ void ProcessorImpl::Modulo(RWDigits R, Digits A, Digits B) { } int q_len = DivideResultLength(A, B); ScratchDigits Q(q_len); +#if !V8_ADVANCED_BIGINT_ALGORITHMS return DivideBurnikelZiegler(Q, R, A, B); +#else + if (B.len() < kBarrettThreshold || A.len() == B.len()) { + DivideBurnikelZiegler(Q, R, A, B); + } else { + DivideBarrett(Q, R, A, B); + } +#endif } Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) { diff --git a/src/bigint/bigint-internal.h b/src/bigint/bigint-internal.h index 41ef9526e5..193f311f28 100644 --- a/src/bigint/bigint-internal.h +++ b/src/bigint/bigint-internal.h @@ -18,6 +18,8 @@ constexpr int kFftThreshold = 1500; constexpr int kFftInnerThreshold = 200; constexpr int kBurnikelThreshold = 57; +constexpr int kNewtonInversionThreshold = 50; +// kBarrettThreshold is defined in bigint.h. 
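+// (With 64-bit digits, kBarrettThreshold = 13310 corresponds to divisors of roughly 850,000 bits, so Barrett division only kicks in for very large inputs.)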
class ProcessorImpl : public Processor { public: @@ -47,6 +49,14 @@ class ProcessorImpl : public Processor { void Toom3Main(RWDigits Z, Digits X, Digits Y); void MultiplyFFT(RWDigits Z, Digits X, Digits Y); + + void DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B); + void DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B, Digits I, + RWDigits scratch); + + void Invert(RWDigits Z, Digits V, RWDigits scratch); + void InvertBasecase(RWDigits Z, Digits V, RWDigits scratch); + void InvertNewton(RWDigits Z, Digits V, RWDigits scratch); #endif // V8_ADVANCED_BIGINT_ALGORITHMS // {out_length} initially contains the allocated capacity of {out}, and diff --git a/src/bigint/bigint.h b/src/bigint/bigint.h index 6d3790808c..78ac9a9d05 100644 --- a/src/bigint/bigint.h +++ b/src/bigint/bigint.h @@ -274,8 +274,15 @@ inline int SubtractSignedResultLength(int x_length, int y_length, inline int MultiplyResultLength(Digits X, Digits Y) { return X.len() + Y.len(); } +constexpr int kBarrettThreshold = 13310; inline int DivideResultLength(Digits A, Digits B) { - return A.len() - B.len() + 1; +#if V8_ADVANCED_BIGINT_ALGORITHMS + // The Barrett division algorithm needs one extra digit for temporary use. + int kBarrettExtraScratch = B.len() >= kBarrettThreshold ? 1 : 0; +#else + constexpr int kBarrettExtraScratch = 0; +#endif + return A.len() - B.len() + 1 + kBarrettExtraScratch; } inline int ModuloResultLength(Digits B) { return B.len(); } diff --git a/src/bigint/div-barrett.cc b/src/bigint/div-barrett.cc new file mode 100644 index 0000000000..6947a5b68e --- /dev/null +++ b/src/bigint/div-barrett.cc @@ -0,0 +1,378 @@ +// Copyright 2021 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Barrett division, finding the inverse with Newton's method. +// Reference: "Fast Division of Large Integers" by Karl Hasselström, +// found at https://treskal.com/s/masters-thesis.pdf + +// Many thanks to Karl Wiberg, k@w5.se, for both writing up an +// understandable theoretical description of the algorithm and privately +// providing a demo implementation, on which the implementation in this file is +// based. + +#include <algorithm> + +#include "src/bigint/bigint-internal.h" +#include "src/bigint/digit-arithmetic.h" +#include "src/bigint/div-helpers.h" +#include "src/bigint/vector-arithmetic.h" + +namespace v8 { +namespace bigint { + +namespace { + +constexpr int DivideBarrettScratchSpace(int n) { return n + 2; } + +// Local values S and W need "n plus a few" digits; U needs 2*n "plus a few". +// In all tested cases the "few" were either 2 or 3, so give 5 to be safe. +// S and W are not live at the same time. +constexpr int kInvertNewtonExtraSpace = 5; +constexpr int InvertNewtonScratchSpace(int n) { + return 3 * n + 2 * kInvertNewtonExtraSpace; +} + +constexpr int InvertScratchSpace(int n) { + return n < kNewtonInversionThreshold ? 2 * n : InvertNewtonScratchSpace(n); +} + +void DcheckIntegerPartRange(Digits X, digit_t min, digit_t max) { +#if DEBUG + digit_t integer_part = X.msd(); + DCHECK(integer_part >= min); + DCHECK(integer_part <= max); +#else + USE(X); + USE(min); + USE(max); +#endif +} + +} // namespace + +// Z := (the fractional part of) 1/V, via naive division. +// See comments at {Invert} and {InvertNewton} below for details.
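+// Example with hypothetical 8-bit digits (the real digit size is 32 or 64 bits): for V = {0xC0}, this computes (2^16 - 0xC0*2^8) / 0xC0 = 0x4000 / 0xC0 = 0x55, i.e. 1/V is approximated by (0x100 + 0x55) / 2^16, where the leading 0x100 is the implicit top digit described at {Invert} below.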
+void ProcessorImpl::InvertBasecase(RWDigits Z, Digits V, RWDigits scratch) { + DCHECK(Z.len() > V.len()); + DCHECK(V.len() > 0); // NOLINT(readability/check) + DCHECK(scratch.len() >= 2 * V.len()); + int n = V.len(); + RWDigits X(scratch, 0, 2 * n); + digit_t borrow = 0; + int i = 0; + for (; i < n; i++) X[i] = 0; + for (; i < 2 * n; i++) X[i] = digit_sub2(0, V[i - n], borrow, &borrow); + DCHECK(borrow == 1); // NOLINT(readability/check) + RWDigits R(nullptr, 0); // We don't need the remainder. + if (n < kBurnikelThreshold) { + DivideSchoolbook(Z, R, X, V); + } else { + DivideBurnikelZiegler(Z, R, X, V); + } +} + +// This is Algorithm 4.2 from the paper. +// Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to +// V.len+1 digits. The V.len low digits of the result digits will be written +// to Z, plus there is an implicit top digit with value 1. +// Needs InvertNewtonScratchSpace(V.len) of scratch space. +// The result is either correct or off by one (about half the time it is +// correct, half the time it is one too much, and in the corner case where V is +// minimal and the implicit top digit would have to be 2 it is one too little). +// Barrett's division algorithm can handle that, so we don't care. +void ProcessorImpl::InvertNewton(RWDigits Z, Digits V, RWDigits scratch) { + const int vn = V.len(); + DCHECK(Z.len() >= vn); + DCHECK(scratch.len() >= InvertNewtonScratchSpace(vn)); + const int kSOffset = 0; + const int kWOffset = 0; // S and W can share their scratch space. + const int kUOffset = vn + kInvertNewtonExtraSpace; + + // The base case won't work otherwise. + DCHECK(V.len() >= 3); // NOLINT(readability/check) + + constexpr int kBasecasePrecision = kNewtonInversionThreshold - 1; + // V must have more digits than the basecase. + DCHECK(V.len() > kBasecasePrecision); + DCHECK(IsBitNormalized(V)); + + // Step (1): Setup. + // Calculate precision required at each step. + // {k} is the number of fraction bits for the current iteration. + int k = vn * kDigitBits; + int target_fraction_bits[8 * sizeof(vn)]; // "k_i" in the paper. + int iteration = -1; // "i" in the paper, except inverted to run downwards. + while (k > kBasecasePrecision * kDigitBits) { + iteration++; + target_fraction_bits[iteration] = k; + k = DIV_CEIL(k, 2); + } + // At this point, k <= kBasecasePrecision*kDigitBits is the number of + // fraction bits to use in the base case. {iteration} is the highest index + // in use for f[]. + + // Step (2): Initial approximation. + int initial_digits = DIV_CEIL(k + 1, kDigitBits); + Digits top_part_of_v(V, vn - initial_digits, initial_digits); + InvertBasecase(Z, top_part_of_v, scratch); + Z[initial_digits] = Z[initial_digits] + 1; // Implicit top digit. + // From now on, we'll keep Z.len updated to the part that's already computed. + Z.set_len(initial_digits + 1); + + // Step (3): Precision doubling loop. + while (true) { + DcheckIntegerPartRange(Z, 1, 2); + + // (3b): S = Z^2 + RWDigits S(scratch, kSOffset, 2 * Z.len()); + Multiply(S, Z, Z); + if (should_terminate()) return; + S.TrimOne(); // Top digit of S is unused. + DcheckIntegerPartRange(S, 1, 4); + + // (3c): T = V, truncated so that at least 2k+3 fraction bits remain. + int fraction_digits = DIV_CEIL(2 * k + 3, kDigitBits); + int t_len = std::min(V.len(), fraction_digits); + Digits T(V, V.len() - t_len, t_len); + + // (3d): U = T * S, truncated so that at least 2k+1 fraction bits remain + // (U has one integer digit, which might be zero). 
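+ // (Together with steps (3e) and (3f) below, this implements the Newton-Raphson update Z := 2*Z - V*Z^2 for approximating 1/V.)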
+ RWDigits U(scratch, kUOffset, S.len() + T.len()); + DCHECK(U.len() > fraction_digits); + Multiply(U, S, T); + if (should_terminate()) return; + U = U + (U.len() - (1 + fraction_digits)); + DcheckIntegerPartRange(U, 0, 3); + + // (3e): W = 2 * Z, padded with "0" fraction bits so that it has the + // same number of fraction bits as U. + DCHECK(U.len() >= Z.len()); + RWDigits W(scratch, kWOffset, U.len()); + int padding_digits = U.len() - Z.len(); + for (int i = 0; i < padding_digits; i++) W[i] = 0; + LeftShift(W + padding_digits, Z, 1); + DcheckIntegerPartRange(W, 2, 4); + + // (3f): Z = W - U. + // This check is '<=' instead of '<' because U's top digit is its + // integer part, and we want vn fraction digits. + if (U.len() <= vn) { + // Normal subtraction. + // This is not the last iteration. + DCHECK(iteration > 0); // NOLINT(readability/check) + Z.set_len(U.len()); + digit_t borrow = SubtractAndReturnBorrow(Z, W, U); + DCHECK(borrow == 0); // NOLINT(readability/check) + USE(borrow); + DcheckIntegerPartRange(Z, 1, 2); + } else { + // Truncate some least significant digits so that we get vn + // fraction digits, and compute the integer digit separately. + // This is the last iteration. + DCHECK(iteration == 0); // NOLINT(readability/check) + Z.set_len(vn); + Digits W_part(W, W.len() - vn - 1, vn); + Digits U_part(U, U.len() - vn - 1, vn); + digit_t borrow = SubtractAndReturnBorrow(Z, W_part, U_part); + digit_t integer_part = W.msd() - U.msd() - borrow; + DCHECK(integer_part == 1 || integer_part == 2); + if (integer_part == 2) { + // This is the rare case where the correct result would be 2.0, but + // since we can't express that by returning only the fractional part + // with an implicit 1-digit, we have to return [1.]9999... instead. + for (int i = 0; i < Z.len(); i++) Z[i] = ~digit_t{0}; + } + break; + } + // (3g, 3h): Update local variables and loop. + k = target_fraction_bits[iteration]; + iteration--; + } +} + +// Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to +// V.len+1 digits. The V.len low digits of the result digits will be written +// to Z, plus there is an implicit top digit with value 1. +// (Corner case: if V is minimal, the implicit digit should be 2; in that case +// we return one less than the correct answer. DivideBarrett can handle that.) +// Needs InvertScratchSpace(V.len) digits of scratch space. +void ProcessorImpl::Invert(RWDigits Z, Digits V, RWDigits scratch) { + DCHECK(Z.len() > V.len()); + DCHECK(V.len() >= 1); // NOLINT(readability/check) + DCHECK(IsBitNormalized(V)); + DCHECK(scratch.len() >= InvertScratchSpace(V.len())); + + int vn = V.len(); + if (vn >= kNewtonInversionThreshold) { + return InvertNewton(Z, V, scratch); + } + if (vn == 1) { + digit_t d = V[0]; + digit_t dummy_remainder; + Z[0] = digit_div(~d, ~digit_t{0}, d, &dummy_remainder); + Z[1] = 0; + } else { + InvertBasecase(Z, V, scratch); + if (Z[vn] == 1) { + for (int i = 0; i < vn; i++) Z[i] = ~digit_t{0}; + } + } +} + +// This is algorithm 3.5 from the paper. +// Computes Q(uotient) and R(emainder) for A/B using I, which is a +// precomputed approximation of 1/B (e.g. with Invert() above). +// Needs DivideBarrettScratchSpace(A.len) scratch space. +void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B, + Digits I, RWDigits scratch) { + DCHECK(Q.len() > A.len() - B.len()); + DCHECK(R.len() >= B.len()); + DCHECK(A.len() > B.len()); // Careful: This is *not* '>=' ! 
+ DCHECK(A.len() <= 2 * B.len()); + DCHECK(B.len() > 0); // NOLINT(readability/check) + DCHECK(IsBitNormalized(B)); + DCHECK(I.len() == A.len() - B.len()); + DCHECK(scratch.len() >= DivideBarrettScratchSpace(A.len())); + + int orig_q_len = Q.len(); + + // (1): A1 = A with B.len fewer digits. + Digits A1 = A + B.len(); + DCHECK(A1.len() == I.len()); + + // (2): Q = A1*I with I.len fewer digits. + // {I} has an implicit high digit with value 1, so we add {A1} to the high + // part of the multiplication result. + RWDigits K(scratch, 0, 2 * I.len()); + Multiply(K, A1, I); + if (should_terminate()) return; + Q.set_len(I.len() + 1); + Add(Q, K + I.len(), A1); + // K is no longer used, can re-use {scratch} for P. + + // (3): R = A - B*Q (approximate remainder). + RWDigits P(scratch, 0, A.len() + 1); + Multiply(P, B, Q); + if (should_terminate()) return; + digit_t borrow = SubtractAndReturnBorrow(R, A, Digits(P, 0, B.len())); + // R may be allocated wider than B, zero out any extra digits if so. + for (int i = B.len(); i < R.len(); i++) R[i] = 0; + digit_t r_high = A[B.len()] - P[B.len()] - borrow; + + // Adjust R and Q so that they become the correct remainder and quotient. + // The number of iterations is guaranteed to be at most some very small + // constant, unless the caller gave us a bad approximate quotient. + if (r_high >> (kDigitBits - 1) == 1) { + // (5b): R < 0, so R += B + digit_t q_sub = 0; + do { + r_high += AddAndReturnCarry(R, R, B); + q_sub++; + DCHECK(q_sub <= 5); // NOLINT(readability/check) + } while (r_high != 0); + Subtract(Q, q_sub); + } else { + digit_t q_add = 0; + while (r_high != 0 || GreaterThanOrEqual(R, B)) { + // (5c): R >= B, so R -= B + r_high -= SubtractAndReturnBorrow(R, R, B); + q_add++; + DCHECK(q_add <= 5); // NOLINT(readability/check) + } + Add(Q, q_add); + } + // (5a): Return. + int final_q_len = Q.len(); + Q.set_len(orig_q_len); + for (int i = final_q_len; i < orig_q_len; i++) Q[i] = 0; +} + +// Computes Q(uotient) and R(emainder) for A/B, using Barrett division. +void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B) { + DCHECK(Q.len() > A.len() - B.len()); + DCHECK(R.len() >= B.len()); + DCHECK(A.len() > B.len()); // Careful: This is *not* '>=' ! + DCHECK(B.len() > 0); // NOLINT(readability/check) + + // Normalize B, and shift A by the same amount. + ShiftedDigits b_normalized(B); + ShiftedDigits a_normalized(A, b_normalized.shift()); + // Keep the code below more concise. + B = b_normalized; + A = a_normalized; + + // The core DivideBarrett function above only supports A having at most + // twice as many digits as B. We generalize this to arbitrary inputs + // similar to Burnikel-Ziegler division by performing a t-by-1 division + // of B-sized chunks. It's easy to special-case the situation where we + // don't need to bother. + int barrett_dividend_length = A.len() <= 2 * B.len() ? A.len() : 2 * B.len(); + int i_len = barrett_dividend_length - B.len(); + ScratchDigits I(i_len + 1); // +1 is for temporary use by Invert(). + int scratch_len = + std::max(InvertScratchSpace(i_len), + DivideBarrettScratchSpace(barrett_dividend_length)); + ScratchDigits scratch(scratch_len); + Invert(I, Digits(B, B.len() - i_len, i_len), scratch); + if (should_terminate()) return; + I.TrimOne(); + DCHECK(I.len() == i_len); + if (A.len() > 2 * B.len()) { + // This follows the variable names and algorithmic steps of + // DivideBurnikelZiegler(). + int n = B.len(); // Chunk length. + // (5): {t} is the number of B-sized chunks of A.
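+ // (For example, if A has 5*n digits, then t = 5 and the code below performs four 2n-by-n divisions, producing quotient chunks Q_3 through Q_0.)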
+ int t = DIV_CEIL(A.len(), n); + DCHECK(t >= 3); // NOLINT(readability/check) + // (6)/(7): Z is used for the current 2-chunk block to be divided by B, + // initialized to the two topmost chunks of A. + int z_len = n * 2; + ScratchDigits Z(z_len); + PutAt(Z, A + n * (t - 2), z_len); + // (8): For i from t-2 downto 0 do + int qi_len = n + 1; + ScratchDigits Qi(qi_len); + ScratchDigits Ri(n); + // First iteration unrolled and specialized. + { + int i = t - 2; + DivideBarrett(Qi, Ri, Z, B, I, scratch); + if (should_terminate()) return; + RWDigits target = Q + n * i; + // In the first iteration, all qi_len = n + 1 digits may be used. + int to_copy = std::min(qi_len, target.len()); + for (int j = 0; j < to_copy; j++) target[j] = Qi[j]; + for (int j = to_copy; j < target.len(); j++) target[j] = 0; +#if DEBUG + for (int j = to_copy; j < Qi.len(); j++) { + DCHECK(Qi[j] == 0); // NOLINT(readability/check) + } +#endif + } + // Now loop over any remaining iterations. + for (int i = t - 3; i >= 0; i--) { + // (8b): If i > 0, set Z_(i-1) = [Ri, A_(i-1)]. + // (De-duped with unrolled first iteration, hence reading A_(i).) + PutAt(Z + n, Ri, n); + PutAt(Z, A + n * i, n); + // (8a): Compute Qi, Ri such that Zi = B*Qi + Ri. + DivideBarrett(Qi, Ri, Z, B, I, scratch); + DCHECK(Qi[qi_len - 1] == 0); // NOLINT(readability/check) + if (should_terminate()) return; + // (9): Return Q = [Q_(t-2), ..., Q_0]... + PutAt(Q + n * i, Qi, n); + } + Ri.Normalize(); + DCHECK(Ri.len() <= R.len()); + // (9): ...and R = R_0 * 2^(-leading_zeros). + RightShift(R, Ri, b_normalized.shift()); + } else { + DivideBarrett(Q, R, A, B, I, scratch); + if (should_terminate()) return; + RightShift(R, R, b_normalized.shift()); + } +} + +} // namespace bigint +} // namespace v8 diff --git a/src/bigint/mul-karatsuba.cc b/src/bigint/mul-karatsuba.cc index 2a141f213c..d4b5a58383 100644 --- a/src/bigint/mul-karatsuba.cc +++ b/src/bigint/mul-karatsuba.cc @@ -201,5 +201,7 @@ void ProcessorImpl::KaratsubaMain(RWDigits Z, Digits X, Digits Y, USE(overflow); } +#undef MAYBE_TERMINATE + } // namespace bigint } // namespace v8 diff --git a/src/bigint/vector-arithmetic.h b/src/bigint/vector-arithmetic.h index 3247660f95..d8b79a961a 100644 --- a/src/bigint/vector-arithmetic.h +++ b/src/bigint/vector-arithmetic.h @@ -45,6 +45,9 @@ digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y); digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y); inline bool IsDigitNormalized(Digits X) { return X.len() == 0 || X.msd() != 0; } +inline bool IsBitNormalized(Digits X) { + return (X.msd() >> (kDigitBits - 1)) == 1; +} inline bool GreaterThanOrEqual(Digits A, Digits B) { return Compare(A, B) >= 0; diff --git a/test/bigint/bigint-shell.cc b/test/bigint/bigint-shell.cc index b524944625..593bc47dc1 100644 --- a/test/bigint/bigint-shell.cc +++ b/test/bigint/bigint-shell.cc @@ -28,8 +28,11 @@ int PrintHelp(char** argv) { } #define TESTS(V) \ + V(kBarrett, "barrett") \ + V(kBurnikel, "burnikel") \ + V(kFFT, "fft") \ V(kKaratsuba, "karatsuba") \ - V(kBurnikel, "burnikel") V(kToom, "toom") V(kFFT, "fft") + V(kToom, "toom") enum Operation { kNoOp, kList, kTest }; @@ -157,22 +160,26 @@ class Runner { int RunTest() { int count = 0; - if (test_ == kKaratsuba) { + if (test_ == kBarrett) { for (int i = 0; i < runs_; i++) { - TestKaratsuba(&count); + TestBarrett(&count); } } else if (test_ == kBurnikel) { for (int i = 0; i < runs_; i++) { TestBurnikel(&count); } - } else if (test_ == kToom) { - for (int i = 0; i < runs_; i++) { - TestToom(&count); - } } else if (test_ 
== kFFT) { for (int i = 0; i < runs_; i++) { TestFFT(&count); } + } else if (test_ == kKaratsuba) { + for (int i = 0; i < runs_; i++) { + TestKaratsuba(&count); + } + } else if (test_ == kToom) { + for (int i = 0; i < runs_; i++) { + TestToom(&count); + } } else { DCHECK(false); // Unreachable. } @@ -291,6 +298,56 @@ class Runner { } } +#if V8_ADVANCED_BIGINT_ALGORITHMS + void TestBarrett_Internal(int left_size, int right_size) { + ScratchDigits A(left_size); + ScratchDigits B(right_size); + GenerateRandom(A); + GenerateRandom(B); + int quotient_len = DivideResultLength(A, B); + // {DivideResultLength} doesn't expect to be called for sizes below + // {kBarrettThreshold} (which we do here to save time), so we have to + // manually adjust the allocated result length. + if (B.len() < kBarrettThreshold) quotient_len++; + int remainder_len = right_size; + ScratchDigits quotient(quotient_len); + ScratchDigits quotient_burnikel(quotient_len); + ScratchDigits remainder(remainder_len); + ScratchDigits remainder_burnikel(remainder_len); + processor()->DivideBarrett(quotient, remainder, A, B); + processor()->DivideBurnikelZiegler(quotient_burnikel, remainder_burnikel, A, + B); + AssertEquals(A, B, quotient_burnikel, quotient); + AssertEquals(A, B, remainder_burnikel, remainder); + } + + void TestBarrett(int* count) { + // We pick a range around kBurnikelThreshold (instead of kBarrettThreshold) + // to save test execution time. + constexpr int kMin = kBurnikelThreshold / 2; + constexpr int kMax = 2 * kBurnikelThreshold; + // {DivideBarrett(A, B)} requires that A.len > B.len! + for (int right_size = kMin; right_size <= kMax; right_size++) { + for (int left_size = right_size + 1; left_size <= kMax; left_size++) { + TestBarrett_Internal(left_size, right_size); + if (error_) return; + (*count)++; + } + } + // We also test one random large case. + uint64_t random_bits = rng_.NextUint64(); + int right_size = kBarrettThreshold + static_cast<int>(random_bits & 0x3FF); + random_bits >>= 10; + int left_size = right_size + 1 + static_cast<int>(random_bits & 0x3FFF); + random_bits >>= 14; + TestBarrett_Internal(left_size, right_size); + if (error_) return; + (*count)++; + } +#else + void TestBarrett(int* count) {} +#endif // V8_ADVANCED_BIGINT_ALGORITHMS + int ParseOptions(int argc, char** argv) { for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--list") == 0) { @@ -325,6 +382,9 @@ class Runner { } private: + // TODO(jkummerow): Also generate "non-random-looking" inputs, i.e. long + // strings of zeros and ones in the binary representation, such as + // ((1 << random) ± 1). void GenerateRandom(RWDigits Z) { if (Z.len() == 0) return; if (sizeof(digit_t) == 8) {