[bigint] Toom-Cook multiplication
A generalization of Karatsuba's idea for even larger inputs. Bug: v8:11515 Change-Id: I50eac2d313bf4217bf2f55ca2e64b5f120f40206 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2999870 Auto-Submit: Jakob Kummerow <jkummerow@chromium.org> Commit-Queue: Michael Achenbach <machenbach@chromium.org> Reviewed-by: Michael Achenbach <machenbach@chromium.org> Reviewed-by: Maya Lekova <mslekova@chromium.org> Cr-Commit-Position: refs/heads/master@{#75598}
This commit is contained in:
parent
6c68cd14e6
commit
ffb08a6809
@ -137,6 +137,7 @@ v8_config(
|
||||
defines = [
|
||||
"GOOGLE3",
|
||||
"CHROMIUM_ZLIB_NO_CHROMECONF",
|
||||
"V8_ADVANCED_BIGINT_ALGORITHMS",
|
||||
"V8_CONCURRENT_MARKING",
|
||||
] + select({
|
||||
":is_ia32": [ "V8_TARGET_ARCH_IA32" ],
|
||||
@ -2509,6 +2510,7 @@ filegroup(
|
||||
"src/bigint/div-schoolbook.cc",
|
||||
"src/bigint/mul-karatsuba.cc",
|
||||
"src/bigint/mul-schoolbook.cc",
|
||||
"src/bigint/mul-toom.cc",
|
||||
"src/bigint/tostring.cc",
|
||||
"src/bigint/util.h",
|
||||
"src/bigint/vector-arithmetic.cc",
|
||||
|
6
BUILD.gn
6
BUILD.gn
@ -4961,6 +4961,12 @@ v8_source_set("v8_bigint") {
|
||||
"src/bigint/vector-arithmetic.h",
|
||||
]
|
||||
|
||||
if (v8_advanced_bigint_algorithms) {
|
||||
sources += [ "src/bigint/mul-toom.cc" ]
|
||||
|
||||
defines = [ "V8_ADVANCED_BIGINT_ALGORITHMS" ]
|
||||
}
|
||||
|
||||
configs = [ ":internal_config" ]
|
||||
}
|
||||
|
||||
|
@ -87,6 +87,10 @@ declare_args() {
|
||||
v8_enable_google_benchmark = false
|
||||
|
||||
cppgc_is_standalone = false
|
||||
|
||||
# Enable advanced BigInt algorithms, costing about 10-30 KB binary size
|
||||
# depending on platform. Disabled on Android to save binary size.
|
||||
v8_advanced_bigint_algorithms = !is_android
|
||||
}
|
||||
|
||||
if (v8_use_external_startup_data == "") {
|
||||
|
@ -31,7 +31,12 @@ void ProcessorImpl::Multiply(RWDigits Z, Digits X, Digits Y) {
|
||||
if (X.len() < Y.len()) std::swap(X, Y);
|
||||
if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]);
|
||||
if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y);
|
||||
#if !V8_ADVANCED_BIGINT_ALGORITHMS
|
||||
return MultiplyKaratsuba(Z, X, Y);
|
||||
#else
|
||||
if (Y.len() < kToomThreshold) return MultiplyKaratsuba(Z, X, Y);
|
||||
return MultiplyToomCook(Z, X, Y);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ProcessorImpl::Divide(RWDigits Q, Digits A, Digits B) {
|
||||
|
@ -13,6 +13,7 @@ namespace v8 {
|
||||
namespace bigint {
|
||||
|
||||
constexpr int kKaratsubaThreshold = 34;
|
||||
constexpr int kToomThreshold = 193;
|
||||
constexpr int kBurnikelThreshold = 57;
|
||||
|
||||
class ProcessorImpl : public Processor {
|
||||
@ -38,6 +39,11 @@ class ProcessorImpl : public Processor {
|
||||
|
||||
void Modulo(RWDigits R, Digits A, Digits B);
|
||||
|
||||
#if V8_ADVANCED_BIGINT_ALGORITHMS
|
||||
void MultiplyToomCook(RWDigits Z, Digits X, Digits Y);
|
||||
void Toom3Main(RWDigits Z, Digits X, Digits Y);
|
||||
#endif // V8_ADVANCED_BIGINT_ALGORITHMS
|
||||
|
||||
// {out_length} initially contains the allocated capacity of {out}, and
|
||||
// upon return will be set to the actual length of the result string.
|
||||
void ToString(char* out, int* out_length, Digits X, int radix, bool sign);
|
||||
|
@ -22,6 +22,17 @@
|
||||
namespace v8 {
|
||||
namespace bigint {
|
||||
|
||||
// If Karatsuba is the best supported algorithm, then it must check for
|
||||
// termination requests. If there are more advanced algorithms available
|
||||
// for larger inputs, then Karatsuba will only be used for sufficiently
|
||||
// small chunks that checking for termination requests is not necessary.
|
||||
#if V8_ADVANCED_BIGINT_ALGORITHMS
|
||||
#define MAYBE_TERMINATE
|
||||
#else
|
||||
#define MAYBE_TERMINATE \
|
||||
if (should_terminate()) return;
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
// The Karatsuba algorithm sometimes finishes more quickly when the
|
||||
@ -92,7 +103,7 @@ void ProcessorImpl::MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y) {
|
||||
void ProcessorImpl::KaratsubaStart(RWDigits Z, Digits X, Digits Y,
|
||||
RWDigits scratch, int k) {
|
||||
KaratsubaMain(Z, X, Y, scratch, k);
|
||||
if (should_terminate()) return;
|
||||
MAYBE_TERMINATE
|
||||
for (int i = 2 * k; i < Z.len(); i++) Z[i] = 0;
|
||||
if (k < Y.len() || X.len() != Y.len()) {
|
||||
ScratchDigits T(2 * k);
|
||||
@ -101,7 +112,7 @@ void ProcessorImpl::KaratsubaStart(RWDigits Z, Digits X, Digits Y,
|
||||
Digits Y1 = Y + std::min(k, Y.len());
|
||||
if (Y1.len() > 0) {
|
||||
KaratsubaChunk(T, X0, Y1, scratch);
|
||||
if (should_terminate()) return;
|
||||
MAYBE_TERMINATE
|
||||
AddAndReturnOverflow(Z + k, T); // Can't overflow.
|
||||
}
|
||||
|
||||
@ -110,11 +121,11 @@ void ProcessorImpl::KaratsubaStart(RWDigits Z, Digits X, Digits Y,
|
||||
for (int i = k; i < X.len(); i += k) {
|
||||
Digits Xi(X, i, k);
|
||||
KaratsubaChunk(T, Xi, Y0, scratch);
|
||||
if (should_terminate()) return;
|
||||
MAYBE_TERMINATE
|
||||
AddAndReturnOverflow(Z + i, T); // Can't overflow.
|
||||
if (Y1.len() > 0) {
|
||||
KaratsubaChunk(T, Xi, Y1, scratch);
|
||||
if (should_terminate()) return;
|
||||
MAYBE_TERMINATE
|
||||
AddAndReturnOverflow(Z + (i + k), T); // Can't overflow.
|
||||
}
|
||||
}
|
||||
@ -158,11 +169,11 @@ void ProcessorImpl::KaratsubaMain(RWDigits Z, Digits X, Digits Y,
|
||||
RWDigits scratch_for_recursion(scratch, 2 * n, 2 * n);
|
||||
RWDigits P0(scratch, 0, n);
|
||||
KaratsubaMain(P0, X0, Y0, scratch_for_recursion, n2);
|
||||
if (should_terminate()) return;
|
||||
MAYBE_TERMINATE
|
||||
for (int i = 0; i < n; i++) Z[i] = P0[i];
|
||||
RWDigits P2(scratch, n, n);
|
||||
KaratsubaMain(P2, X1, Y1, scratch_for_recursion, n2);
|
||||
if (should_terminate()) return;
|
||||
MAYBE_TERMINATE
|
||||
RWDigits Z2 = Z + n;
|
||||
int end = std::min(Z2.len(), P2.len());
|
||||
for (int i = 0; i < end; i++) Z2[i] = P2[i];
|
||||
|
229
src/bigint/mul-toom.cc
Normal file
229
src/bigint/mul-toom.cc
Normal file
@ -0,0 +1,229 @@
|
||||
// Copyright 2021 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// Toom-Cook multiplication.
|
||||
// Reference: https://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "src/bigint/bigint-internal.h"
|
||||
#include "src/bigint/digit-arithmetic.h"
|
||||
#include "src/bigint/vector-arithmetic.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace bigint {
|
||||
|
||||
namespace {
|
||||
|
||||
void TimesTwo(RWDigits X) {
|
||||
digit_t carry = 0;
|
||||
for (int i = 0; i < X.len(); i++) {
|
||||
digit_t d = X[i];
|
||||
X[i] = (d << 1) | carry;
|
||||
carry = d >> (kDigitBits - 1);
|
||||
}
|
||||
}
|
||||
|
||||
void DivideByTwo(RWDigits X) {
|
||||
digit_t carry = 0;
|
||||
for (int i = X.len() - 1; i >= 0; i--) {
|
||||
digit_t d = X[i];
|
||||
X[i] = (d >> 1) | carry;
|
||||
carry = d << (kDigitBits - 1);
|
||||
}
|
||||
}
|
||||
|
||||
void DivideByThree(RWDigits X) {
|
||||
digit_t remainder = 0;
|
||||
for (int i = X.len() - 1; i >= 0; i--) {
|
||||
digit_t d = X[i];
|
||||
digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits);
|
||||
digit_t u_result = upper / 3;
|
||||
remainder = upper - 3 * u_result;
|
||||
digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask);
|
||||
digit_t l_result = lower / 3;
|
||||
remainder = lower - 3 * l_result;
|
||||
X[i] = (u_result << kHalfDigitBits) | l_result;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#if DEBUG
|
||||
// Set {len_} to 1 rather than 0 so that attempts to access the first digit
|
||||
// will crash.
|
||||
#define MARK_INVALID(D) D = RWDigits(nullptr, 1)
|
||||
#else
|
||||
#define MARK_INVALID(D) (void(0))
|
||||
#endif
|
||||
|
||||
void ProcessorImpl::Toom3Main(RWDigits Z, Digits X, Digits Y) {
|
||||
DCHECK(Z.len() >= X.len() + Y.len());
|
||||
// Phase 1: Splitting.
|
||||
int i = DIV_CEIL(std::max(X.len(), Y.len()), 3);
|
||||
Digits X0(X, 0, i);
|
||||
Digits X1(X, i, i);
|
||||
Digits X2(X, 2 * i, i);
|
||||
Digits Y0(Y, 0, i);
|
||||
Digits Y1(Y, i, i);
|
||||
Digits Y2(Y, 2 * i, i);
|
||||
|
||||
// Temporary storage.
|
||||
int p_len = i + 1; // For all px, qx below.
|
||||
int r_len = 2 * p_len; // For all r_x, Rx below.
|
||||
Storage temp_storage(4 * r_len);
|
||||
// We will use the same variable names as the Wikipedia article, as much as
|
||||
// C++ lets us: our "p_m1" is their "p(-1)" etc. For consistency with other
|
||||
// algorithms, we use X and Y where Wikipedia uses m and n.
|
||||
// We will use and re-use the temporary storage as follows:
|
||||
//
|
||||
// chunk | -------- time ----------->
|
||||
// [0 .. i] |( po )( p_m1 ) ( r_m2 )
|
||||
// [i+1 .. rlen-1] |( qo )( q_m1 ) ( r_m2 )
|
||||
// [rlen .. rlen+i] | (p_1 ) ( p_m2 ) (r_inf)
|
||||
// [rlen+i+1 .. 2*rlen-1] | (q_1 ) ( q_m2 ) (r_inf)
|
||||
// [2*rlen .. 3*rlen-1] | ( r_1 )
|
||||
// [3*rlen .. 4*rlen-1] | ( r_m1 )
|
||||
//
|
||||
// This requires interleaving phases 2 and 3 a bit: after computing
|
||||
// r_1 = p_1 * q_1, we can re-use p_1's storage for p_m2, and so on.
|
||||
digit_t* t = temp_storage.get();
|
||||
RWDigits po(t, p_len);
|
||||
RWDigits qo(t + p_len, p_len);
|
||||
RWDigits p_1(t + r_len, p_len);
|
||||
RWDigits q_1(t + r_len + p_len, p_len);
|
||||
RWDigits r_1(t + 2 * r_len, r_len);
|
||||
RWDigits r_m1(t + 3 * r_len, r_len);
|
||||
|
||||
// We can also share the backing stores of Z, r_0, R0.
|
||||
DCHECK(Z.len() >= r_len);
|
||||
RWDigits r_0(Z, 0, r_len);
|
||||
|
||||
// Phase 2a: Evaluation, steps 0, 1, m1.
|
||||
// po = X0 + X2
|
||||
Add(po, X0, X2);
|
||||
// p_0 = X0
|
||||
// p_1 = po + X1
|
||||
Add(p_1, po, X1);
|
||||
// p_m1 = po - X1
|
||||
RWDigits p_m1 = po;
|
||||
bool p_m1_sign = SubtractSigned(p_m1, po, false, X1, false);
|
||||
MARK_INVALID(po);
|
||||
|
||||
// qo = Y0 + Y2
|
||||
Add(qo, Y0, Y2);
|
||||
// q_0 = Y0
|
||||
// q_1 = qo + Y1
|
||||
Add(q_1, qo, Y1);
|
||||
// q_m1 = qo - Y1
|
||||
RWDigits q_m1 = qo;
|
||||
bool q_m1_sign = SubtractSigned(q_m1, qo, false, Y1, false);
|
||||
MARK_INVALID(qo);
|
||||
|
||||
// Phase 3a: Pointwise multiplication, steps 0, 1, m1.
|
||||
Multiply(r_0, X0, Y0);
|
||||
if (should_terminate()) return;
|
||||
Multiply(r_1, p_1, q_1);
|
||||
if (should_terminate()) return;
|
||||
Multiply(r_m1, p_m1, q_m1);
|
||||
if (should_terminate()) return;
|
||||
bool r_m1_sign = p_m1_sign != q_m1_sign;
|
||||
|
||||
// Phase 2b: Evaluation, steps m2 and inf.
|
||||
// p_m2 = (p_m1 + X2) * 2 - X0
|
||||
RWDigits p_m2 = p_1;
|
||||
MARK_INVALID(p_1);
|
||||
bool p_m2_sign = AddSigned(p_m2, p_m1, p_m1_sign, X2, false);
|
||||
TimesTwo(p_m2);
|
||||
p_m2_sign = SubtractSigned(p_m2, p_m2, p_m2_sign, X0, false);
|
||||
// p_inf = X2
|
||||
|
||||
// q_m2 = (q_m1 + Y2) * 2 - Y0
|
||||
RWDigits q_m2 = q_1;
|
||||
MARK_INVALID(q_1);
|
||||
bool q_m2_sign = AddSigned(q_m2, q_m1, q_m1_sign, Y2, false);
|
||||
TimesTwo(q_m2);
|
||||
q_m2_sign = SubtractSigned(q_m2, q_m2, q_m2_sign, Y0, false);
|
||||
// q_inf = Y2
|
||||
|
||||
// Phase 3b: Pointwise multiplication, steps m2 and inf.
|
||||
RWDigits r_m2(t, r_len);
|
||||
MARK_INVALID(p_m1);
|
||||
MARK_INVALID(q_m1);
|
||||
Multiply(r_m2, p_m2, q_m2);
|
||||
if (should_terminate()) return;
|
||||
bool r_m2_sign = p_m2_sign != q_m2_sign;
|
||||
|
||||
RWDigits r_inf(t + r_len, r_len);
|
||||
MARK_INVALID(p_m2);
|
||||
MARK_INVALID(q_m2);
|
||||
Multiply(r_inf, X2, Y2);
|
||||
if (should_terminate()) return;
|
||||
|
||||
// Phase 4: Interpolation.
|
||||
Digits R0 = r_0;
|
||||
Digits R4 = r_inf;
|
||||
// R3 <- (r_m2 - r_1) / 3
|
||||
RWDigits R3 = r_m2;
|
||||
bool R3_sign = SubtractSigned(R3, r_m2, r_m2_sign, r_1, false);
|
||||
DivideByThree(R3);
|
||||
// R1 <- (r_1 - r_m1) / 2
|
||||
RWDigits R1 = r_1;
|
||||
bool R1_sign = SubtractSigned(R1, r_1, false, r_m1, r_m1_sign);
|
||||
DivideByTwo(R1);
|
||||
// R2 <- r_m1 - r_0
|
||||
RWDigits R2 = r_m1;
|
||||
bool R2_sign = SubtractSigned(R2, r_m1, r_m1_sign, R0, false);
|
||||
// R3 <- (R2 - R3) / 2 + 2 * r_inf
|
||||
R3_sign = SubtractSigned(R3, R2, R2_sign, R3, R3_sign);
|
||||
DivideByTwo(R3);
|
||||
// TODO(jkummerow): Would it be a measurable improvement to write an
|
||||
// "AddTwice" helper?
|
||||
R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
|
||||
R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
|
||||
// R2 <- R2 + R1 - R4
|
||||
R2_sign = AddSigned(R2, R2, R2_sign, R1, R1_sign);
|
||||
R2_sign = SubtractSigned(R2, R2, R2_sign, R4, false);
|
||||
// R1 <- R1 - R3
|
||||
R1_sign = SubtractSigned(R1, R1, R1_sign, R3, R3_sign);
|
||||
|
||||
#if DEBUG
|
||||
R1.Normalize();
|
||||
R2.Normalize();
|
||||
R3.Normalize();
|
||||
DCHECK(R1_sign == false || R1.len() == 0);
|
||||
DCHECK(R2_sign == false || R2.len() == 0);
|
||||
DCHECK(R3_sign == false || R3.len() == 0);
|
||||
#endif
|
||||
|
||||
// Phase 5: Recomposition. R0 is already in place. Overflow can't happen.
|
||||
for (int j = R0.len(); j < Z.len(); j++) Z[j] = 0;
|
||||
AddAndReturnOverflow(Z + i, R1);
|
||||
AddAndReturnOverflow(Z + 2 * i, R2);
|
||||
AddAndReturnOverflow(Z + 3 * i, R3);
|
||||
AddAndReturnOverflow(Z + 4 * i, R4);
|
||||
}
|
||||
|
||||
void ProcessorImpl::MultiplyToomCook(RWDigits Z, Digits X, Digits Y) {
|
||||
DCHECK(X.len() >= Y.len());
|
||||
int k = Y.len();
|
||||
// TODO(jkummerow): Would it be a measurable improvement to share the
|
||||
// scratch memory for several invocations?
|
||||
Digits X0(X, 0, k);
|
||||
Toom3Main(Z, X0, Y);
|
||||
if (X.len() > Y.len()) {
|
||||
ScratchDigits T(2 * k);
|
||||
for (int i = k; i < X.len(); i += k) {
|
||||
if (should_terminate()) return;
|
||||
Digits Xi(X, i, k);
|
||||
// TODO(jkummerow): would it be a measurable improvement to craft a
|
||||
// "ToomChunk" method in the style of {KaratsubaChunk}?
|
||||
Toom3Main(T, Xi, Y);
|
||||
AddAndReturnOverflow(Z + i, T); // Can't overflow.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace bigint
|
||||
} // namespace v8
|
@ -22,4 +22,8 @@ v8_executable("bigint_shell") {
|
||||
configs = [ "../..:internal_config_base" ]
|
||||
|
||||
sources = [ "bigint-shell.cc" ]
|
||||
|
||||
if (v8_advanced_bigint_algorithms) {
|
||||
defines = [ "V8_ADVANCED_BIGINT_ALGORITHMS" ]
|
||||
}
|
||||
}
|
||||
|
@ -27,7 +27,8 @@ int PrintHelp(char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#define TESTS(V) V(kKaratsuba, "karatsuba") V(kBurnikel, "burnikel")
|
||||
#define TESTS(V) \
|
||||
V(kKaratsuba, "karatsuba") V(kBurnikel, "burnikel") V(kToom, "toom")
|
||||
|
||||
enum Operation { kNoOp, kList, kTest };
|
||||
|
||||
@ -163,6 +164,10 @@ class Runner {
|
||||
for (int i = 0; i < runs_; i++) {
|
||||
TestBurnikel(&count);
|
||||
}
|
||||
} else if (test_ == kToom) {
|
||||
for (int i = 0; i < runs_; i++) {
|
||||
TestToom(&count);
|
||||
}
|
||||
} else {
|
||||
DCHECK(false); // Unreachable.
|
||||
}
|
||||
@ -174,8 +179,8 @@ class Runner {
|
||||
void TestKaratsuba(int* count) {
|
||||
// Calling {MultiplyKaratsuba} directly is only valid if
|
||||
// left_size >= right_size and right_size >= kKaratsubaThreshold.
|
||||
static const int kMin = kKaratsubaThreshold;
|
||||
static const int kMax = 3 * kKaratsubaThreshold;
|
||||
constexpr int kMin = kKaratsubaThreshold;
|
||||
constexpr int kMax = 3 * kKaratsubaThreshold;
|
||||
for (int right_size = kMin; right_size <= kMax; right_size++) {
|
||||
for (int left_size = right_size; left_size <= kMax; left_size++) {
|
||||
ScratchDigits A(left_size);
|
||||
@ -194,10 +199,36 @@ class Runner {
|
||||
}
|
||||
}
|
||||
|
||||
void TestToom(int* count) {
|
||||
#if V8_ADVANCED_BIGINT_ALGORITHMS
|
||||
// {MultiplyToomCook} works fine even below the threshold, so we can
|
||||
// save some time by starting small.
|
||||
constexpr int kMin = kToomThreshold - 60;
|
||||
constexpr int kMax = kToomThreshold + 10;
|
||||
for (int right_size = kMin; right_size <= kMax; right_size++) {
|
||||
for (int left_size = right_size; left_size <= kMax; left_size++) {
|
||||
ScratchDigits A(left_size);
|
||||
ScratchDigits B(right_size);
|
||||
int result_len = MultiplyResultLength(A, B);
|
||||
ScratchDigits result(result_len);
|
||||
ScratchDigits result_karatsuba(result_len);
|
||||
GenerateRandom(A);
|
||||
GenerateRandom(B);
|
||||
processor()->MultiplyToomCook(result, A, B);
|
||||
// Using Karatsuba as reference.
|
||||
processor()->MultiplyKaratsuba(result_karatsuba, A, B);
|
||||
AssertEquals(A, B, result_karatsuba, result);
|
||||
if (error_) return;
|
||||
(*count)++;
|
||||
}
|
||||
}
|
||||
#endif // V8_ADVANCED_BIGINT_ALGORITHMS
|
||||
}
|
||||
|
||||
void TestBurnikel(int* count) {
|
||||
// Start small to save test execution time.
|
||||
static const int kMin = kBurnikelThreshold / 2;
|
||||
static const int kMax = 2 * kBurnikelThreshold;
|
||||
constexpr int kMin = kBurnikelThreshold / 2;
|
||||
constexpr int kMax = 2 * kBurnikelThreshold;
|
||||
for (int right_size = kMin; right_size <= kMax; right_size++) {
|
||||
for (int left_size = right_size; left_size <= kMax; left_size++) {
|
||||
ScratchDigits A(left_size);
|
||||
|
Loading…
Reference in New Issue
Block a user