├── Makefile ├── README.md ├── benchmarks └── benchmark.cpp ├── include └── backwardmultiply.h └── unit └── basictests.cpp /Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | all: basictests benchmark 4 | 5 | benchmark: benchmarks/benchmark.cpp include/backwardmultiply.h 6 | c++ -std=c++17 -o benchmark benchmarks/benchmark.cpp -O2 -I include 7 | 8 | basictests: unit/basictests.cpp include/backwardmultiply.h 9 | c++ -std=c++17 -o basictests unit/basictests.cpp -O2 -I include 10 | 11 | 12 | clean: 13 | rm -r -f benchmark basictests -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Backward Multiplication 2 | 3 | This C/C++ code illustrates how we might do a multiword multiplication backward, 4 | starting from the most significant words. 5 | 6 | Blog post: [Multiplying backward for profit](https://lemire.me/blog/2020/04/05/multiplying-backward-for-profit/). 7 | 8 | To do the full computation the usual way, use the following function: 9 | 10 | ```c++ 11 | #include "backwardmultiply.h" 12 | using namespace backwardmultiply; 13 | 14 | // Multiply the integer represented by 'w' with the integer represented 15 | // by the multiword integer b[0], b[1],..., b[n-1] 16 | // Result gets written to output, up to n+1 words can be written: caller is 17 | // responsible to ensure that the memory was allocated. 18 | void multiplication(uint64_t w, const uint64_t *b, size_t n, uint64_t *output); 19 | ``` 20 | 21 | If you just want a few of the most significant words, use the following function: 22 | 23 | ```c++ 24 | // Multiply the integer represented by 'w' with the integer represented 25 | // by the multiword integer b[0], b[1],..., b[n-1] 26 | // Result gets written to output, up to n+1 words can be written: caller is 27 | // responsible to ensure that the memory was allocated. 
The parameter "minexactwords" 28 | // represents the minimal number of exact words (starting from the most 29 | // significant words) that you need. Returns the number of exact words computed 30 | // (starting from the most significant words). 31 | size_t multiplication_backward_limit(uint64_t w, const uint64_t *b, size_t n, 32 | uint64_t *output, size_t minexactwords) 33 | ``` 34 | 35 | 36 | ## Requirements 37 | 38 | - Modern C++ compiler 39 | 40 | ## Usage 41 | 42 | There is a single header. You can run tests with `make && make test`. 43 | 44 | 45 | -------------------------------------------------------------------------------- /benchmarks/benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "backwardmultiply.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | static inline uint64_t rng(uint64_t h) { 8 | h ^= h >> 33; 9 | h *= UINT64_C(0xff51afd7ed558ccd); 10 | h ^= h >> 33; 11 | h *= UINT64_C(0xc4ceb9fe1a85ec53); 12 | h ^= h >> 33; 13 | return h; 14 | } 15 | 16 | 17 | void demo(size_t integer_size = 32) { 18 | size_t seed = 17; 19 | std::vector out1, out2; 20 | std::chrono::time_point start_clock, end_clock; 21 | std::chrono::duration elapsed1 = std::chrono::duration::max(); 22 | std::chrono::duration elapsed2 = std::chrono::duration::max(); 23 | 24 | for (size_t t = 0; t < 100; t++) { 25 | std::vector v; 26 | for(size_t i = 0; i < integer_size; i++) v.push_back(rng(seed++)); 27 | uint64_t multiplier = rng(seed++); 28 | out1.resize(v.size() + 1); 29 | start_clock = std::chrono::steady_clock::now(); 30 | backwardmultiply::multiplication(multiplier, v.data(), v.size(), 31 | out1.data()); 32 | 33 | end_clock = std::chrono::steady_clock::now(); 34 | elapsed1 = 35 | end_clock - start_clock < elapsed1 ? 
end_clock - start_clock : elapsed1; 36 | out2.resize(v.size() + 1); 37 | start_clock = std::chrono::steady_clock::now(); 38 | backwardmultiply::multiplication_backward(multiplier, v.data(), v.size(), 39 | out2.data()); 40 | end_clock = std::chrono::steady_clock::now(); 41 | elapsed2 = 42 | end_clock - start_clock < elapsed2 ? end_clock - start_clock : elapsed2; 43 | 44 | } 45 | std::cout 46 | << "forward : " 47 | << std::chrono::duration_cast(elapsed1).count() / 48 | 128.0 49 | << " ns" << std::endl; 50 | std::cout 51 | << "backward : " 52 | << std::chrono::duration_cast(elapsed2).count() / 53 | 128.0 54 | << " ns" << std::endl; 55 | } 56 | 57 | int main() { 58 | demo(); 59 | } -------------------------------------------------------------------------------- /include/backwardmultiply.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace backwardmultiply { 6 | 7 | struct value128 { 8 | uint64_t low; 9 | uint64_t high; 10 | 11 | void add(uint64_t lowbits) { 12 | low = lowbits + low; 13 | if (low < lowbits) { 14 | high++; 15 | } 16 | } 17 | }; 18 | 19 | #ifdef _MSC_VER 20 | #include 21 | #endif 22 | 23 | value128 full_multiplication(uint64_t value1, uint64_t value2) { 24 | value128 answer; 25 | #ifdef _MSC_VER 26 | // todo: this might fail under visual studio for ARM 27 | answer.low = _umul128(value1, value2, &answer.high); 28 | #else 29 | __uint128_t r = ((__uint128_t)value1) * value2; 30 | answer.low = r; 31 | answer.high = r >> 64; 32 | #endif 33 | return answer; 34 | } 35 | 36 | // Multiply the integer represented by 'w' with the integer represented 37 | // by the multiword integer b[0], b[1],..., b[n-1] 38 | // Result gets written to output, up to n+1 words can be written: caller is 39 | // responsible to ensure that the memory was allocated. 
40 | void multiplication(uint64_t w, const uint64_t *b, size_t n, uint64_t *output) { 41 | if ((w == 0) || (n == 0)) { // special case 42 | std::fill(output, output + n + 1, 0); 43 | return; 44 | } 45 | auto p = full_multiplication(w, b[0]); 46 | output[0] = p.low; 47 | uint64_t r = p.high; 48 | for (size_t i = 1; i < n; i++) { 49 | p = full_multiplication(w, b[i]); 50 | p.add(r); 51 | output[i] = p.low; 52 | r = p.high; 53 | } 54 | output[n] = r; 55 | } 56 | 57 | // Multiply the integer represented by 'w' with the integer represented 58 | // by the multiword integer b[0], b[1],..., b[n-1] 59 | // Result gets written to output, up to n+1 words can be written: caller is 60 | // responsible to ensure that the memory was allocated. 61 | void multiplication_backward(uint64_t w, const uint64_t *b, size_t n, 62 | uint64_t *output) { 63 | if ((w == 0) || (n == 0)) { // special case 64 | std::fill(output, output + n + 1, 0); 65 | return; 66 | } 67 | auto p = full_multiplication(w, b[n - 1]); 68 | output[n - 1] = p.low; 69 | output[n] = p.high; 70 | for (size_t i = n - 2; i != SIZE_MAX; i--) { 71 | p = full_multiplication(w, b[i]); 72 | output[i] = p.low; 73 | bool overflow = (output[i + 1] + p.high < output[i + 1]); 74 | output[i + 1] += p.high; 75 | for (size_t j = i + 2; overflow; j++) { 76 | output[j]++; 77 | overflow = (output[j] == 0); 78 | } 79 | } 80 | } 81 | 82 | 83 | // Multiply the integer represented by 'w' with the integer represented 84 | // by the multiword integer b[0], b[1],..., b[n-1] 85 | // Result gets written to output, up to n+1 words can be written: caller is 86 | // responsible to ensure that the memory was allocated. The parameter "minexactwords" 87 | // represents the minimal number of exact words (starting from the most 88 | // significant words) that you need. Returns the number of exact words computed 89 | // (starting from the most significant words). 
90 | size_t multiplication_backward_limit(uint64_t w, const uint64_t *b, size_t n, 91 | uint64_t *output, size_t minexactwords) { 92 | if ((w == 0) || (n == 0)) { // special case 93 | std::fill(output, output + n + 1, 0); 94 | return n + 1; 95 | } 96 | 97 | auto p = full_multiplication(w, b[n - 1]); 98 | output[n - 1] = p.low; 99 | output[n] = p.high; 100 | if (minexactwords <= 1) { 101 | // check if addition overflows 102 | if (p.low + w - 1 >= p.low) { 103 | // we are good at output[n] 104 | return 1; 105 | } 106 | } 107 | for (size_t i = n - 2; i != SIZE_MAX; i--) { 108 | p = full_multiplication(w, b[i]); 109 | output[i] = p.low; 110 | bool overflow = (output[i + 1] + p.high < output[i + 1]); 111 | output[i + 1] += p.high; 112 | for (size_t j = i + 2; overflow; j++) { 113 | output[j]++; 114 | overflow = (output[j] == 0); 115 | } 116 | if (minexactwords <= n - i) { 117 | // check if addition overflows 118 | if (p.low + w - 1 >= p.low) { 119 | // we are good at output[n], ..., output[i+1], so we have 120 | // n - (i+1) + 1 = n - i good words 121 | return n - i; 122 | } 123 | if (minexactwords < n - i) { 124 | // then the only way we could overflow is if the following is true 125 | if (output[i + 1] != 0xFFFFFFFFFFFFFFFF) { 126 | // we are good at output[n], ..., output[i+2], so we have 127 | // n - (i+2) + 1 = n - i - 1 good words 128 | 129 | return n - i - 1; 130 | } 131 | } 132 | } 133 | } 134 | return n + 1; 135 | } 136 | 137 | // Multiply the integer represented by 'w' with the integer represented 138 | // by the multiword integer b[0], b[1],..., b[n-1]. 139 | // Result gets written to output, up to n+1 words can be written: caller is 140 | // responsible to ensure that the memory was allocated. previousindex should 141 | // start with n + 1 This function can be used to iteratively compute the answer. 142 | // Returns the number of exact words computed (starting from the most 143 | // significant words). 
144 | size_t multiplication_backward_limit_resume(uint64_t w, const uint64_t *b, 145 | size_t n, uint64_t *output, 146 | size_t minexactwords, 147 | size_t &previousindex) { 148 | if (n == 0) { // special case 149 | std::fill(output, output + n + 1, 0); 150 | return n + 1; // we are done!!! 151 | } 152 | if (previousindex > n) { 153 | // assume we are done 154 | return n + 1; 155 | } 156 | if (previousindex == n) { 157 | previousindex = n - 1; 158 | auto p = full_multiplication(w, b[n - 1]); 159 | output[n - 1] = p.low; 160 | output[n] = p.high; 161 | if (minexactwords <= 1) { 162 | // check if addition overflows 163 | if (p.low + w - 1 >= p.low) { 164 | // we are good 165 | return 1; 166 | } 167 | } 168 | } 169 | 170 | for (size_t i = previousindex - 1; i != SIZE_MAX; i--) { 171 | auto p = full_multiplication(w, b[i]); 172 | output[i] = p.low; 173 | bool overflow = (output[i + 1] + p.high < output[i + 1]); 174 | output[i + 1] += p.high; 175 | for (size_t j = i + 2; overflow; j++) { 176 | output[j]++; 177 | overflow = (output[j] == 0); 178 | } 179 | if (minexactwords <= n - i) { 180 | // check if addition overflows 181 | if (p.low + w - 1 >= p.low) { 182 | // we are good 183 | previousindex = i; 184 | return n - i; 185 | } 186 | if (minexactwords < n - i) { 187 | // then the only way we could overflow is if the following is true 188 | if (output[i + 1] != 0xFFFFFFFFFFFFFFFF) { 189 | previousindex = i; 190 | return n - i - 1; 191 | } 192 | } 193 | } 194 | } 195 | previousindex = SIZE_MAX; 196 | return n + 1; 197 | } 198 | 199 | }; // namespace backwardmultiply -------------------------------------------------------------------------------- /unit/basictests.cpp: -------------------------------------------------------------------------------- 1 | #include "backwardmultiply.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | static inline uint64_t rng(uint64_t h) { 8 | h ^= h >> 33; 9 | h *= UINT64_C(0xff51afd7ed558ccd); 10 | h ^= h >> 33; 11 | h *= 
UINT64_C(0xc4ceb9fe1a85ec53); 12 | h ^= h >> 33; 13 | return h; 14 | } 15 | 16 | void print(const std::vector &b) { 17 | if (b.size() == 0) 18 | return; 19 | for (size_t i = b.size() - 1; i != SIZE_MAX; i--) { 20 | std::cout << std::hex << std::setfill('0') << std::setw(8) << b[i]; 21 | std::cout << " "; 22 | } 23 | } 24 | 25 | bool unittest(bool smallmultiplier) { 26 | uint64_t seed = 1; 27 | std::vector out1, out2; 28 | for (size_t t = 0; t < 2000000; t++) { 29 | if ((t % 10000) == 0) { 30 | std::cout << "."; 31 | std::cout.flush(); 32 | } 33 | std::vector v = {rng(seed++), rng(seed++), rng(seed++), 34 | rng(seed++), rng(seed++), rng(seed++), 35 | rng(seed++), rng(seed++), rng(seed++)}; 36 | uint64_t multiplier = rng(seed++); 37 | if (smallmultiplier) { 38 | multiplier &= 7; 39 | } 40 | out1.resize(v.size() + 1); 41 | backwardmultiply::multiplication(multiplier, v.data(), v.size(), 42 | out1.data()); 43 | out2.resize(v.size() + 1); 44 | backwardmultiply::multiplication_backward(multiplier, v.data(), v.size(), 45 | out2.data()); 46 | if (out1 != out2) { 47 | printf("out1 and out2 differs\n"); 48 | std::cout << "input : "; 49 | print(v); 50 | std::cout << std::endl; 51 | std::cout << multiplier << std::endl; 52 | printf("out1:"); 53 | print(out1); 54 | std::cout << std::endl; 55 | printf("out2:"); 56 | print(out2); 57 | std::cout << std::endl; 58 | std::cerr << "bug\n" << std::endl; 59 | return false; 60 | } 61 | //////////// 62 | /// multiplication_backward_limit 63 | ////////////// 64 | for (size_t words = 1; words <= v.size(); words++) { 65 | out2.resize(v.size() + 1); 66 | std::fill(out2.begin(), out2.end(), 0); // clearing to be sure 67 | size_t exactwords = backwardmultiply::multiplication_backward_limit( 68 | multiplier, v.data(), v.size(), out2.data(), words); 69 | if (exactwords < words) { 70 | printf("exactwords = %zu words = %zu\n", exactwords, words); 71 | printf("bug\n"); 72 | abort(); 73 | } 74 | 75 | size_t n = out2.size(); 76 | 77 | // check that we 
are good... 78 | for (size_t i = n - exactwords; i < n; i++) { 79 | if (out2[i] != out1[i]) { 80 | printf("bug in multiplication_backward_limit\n"); 81 | return false; 82 | } 83 | } 84 | } 85 | //////////// 86 | /// multiplication_backward_limit_resume 87 | ////////////// 88 | for (size_t words = 1; words <= v.size(); words++) { 89 | std::fill(out2.begin(), out2.end(), 0); // clearing to be sure 90 | size_t index = v.size(); 91 | size_t exactwords = 92 | backwardmultiply::multiplication_backward_limit_resume( 93 | multiplier, v.data(), v.size(), out2.data(), words, index); 94 | if (exactwords < words) { 95 | printf("exactwords = %zu words = %zu\n", exactwords, words); 96 | printf("bug\n"); 97 | abort(); 98 | } 99 | 100 | size_t n = out2.size(); 101 | // check that we are good... 102 | for (size_t i = n - exactwords; i < n; i++) { 103 | if (out2[i] != out1[i]) { 104 | printf("out : "); 105 | print(out2); 106 | printf("\n"); 107 | printf("true: "); 108 | print(out1); 109 | printf("\n"); 110 | 111 | printf("bug in multiplication_backward_limit_resume\n"); 112 | return false; 113 | } 114 | } 115 | while (exactwords != n) { 116 | exactwords = backwardmultiply::multiplication_backward_limit_resume( 117 | multiplier, v.data(), v.size(), out2.data(), exactwords + 1, index); 118 | 119 | for (size_t i = n - exactwords; i < n; i++) { 120 | if (out2[i] != out1[i]) { 121 | printf("out : "); 122 | print(out2); 123 | printf("\n"); 124 | printf("true: "); 125 | print(out1); 126 | printf("\n"); 127 | 128 | printf("**bug in multiplication_backward_limit_resume\n"); 129 | return false; 130 | } 131 | } 132 | } 133 | } 134 | } 135 | std::cout << std::endl; 136 | return true; 137 | } 138 | 139 | int main() { 140 | if (unittest(false) && unittest(true)) { 141 | printf("Code looks good.\n"); 142 | return EXIT_SUCCESS; 143 | } 144 | printf("Error detected.\n"); 145 | return EXIT_FAILURE; 146 | } 147 | --------------------------------------------------------------------------------