├── LICENSE ├── README.md ├── fuzzy_test.py ├── generate_updated_tables.py ├── wider.h └── wider_io.h /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Arthur O'Dwyer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Wide Integer Math (Proof of Concept) 2 | ------------------------------------ 3 | 4 | This header provides a class template `Wider`, such that `Wider` 5 | acts like an unsigned 128-bit type, `Wider>` acts like an unsigned 6 | 256-bit type, and so on. `Wider` overloads all the arithmetic operators, 7 | and also provides a free function `countleadingzeros(w)`. 8 | 9 | A sufficiently smart compiler should produce the same codegen for `Wider` 10 | as it does for the built-in primitive type `__uint128_t`. Any significant differences 11 | in codegen (in either direction) should probably be filed as "missed optimization" 12 | bugs against the relevant compiler. 13 | 14 | [As of April 2020, Clang supports](http://blog.llvm.org/2020/04/the-new-clang-extint-feature-provides.html) 15 | a built-in type `unsigned _ExtInt(n)` that implements `n`-bit arithmetic for any `n` at all, even 16 | non-powers of two. A sufficiently smart compiler should produce the same codegen for `Wider>` 17 | as it does for `unsigned _ExtInt(n)`. 18 | 19 | The following tables show naïve instruction counts (not counting labels, but counting the `ret`) 20 | for each member function of `wider_tests::Tests`. The letter "P" indicates codegen 21 | that appears perfect as far as I know. The word "call" indicates failure to completely 22 | inline the code (usually because it's so big). 23 | 24 | "`__udivti3`" and "`__umodti3`" indicate that certain arithmetic operations on `__uint128_t` 25 | are delegated to a library function. The library function is macro-optimized with 26 | many special cases (division by small integers, division by integers with many trailing zeros, 27 | et cetera). In contrast, `Wider`'s `operator/` omits these special cases, 28 | resulting in smaller code but almost certainly slower code in common situations. 29 | 30 | All numbers are produced by Godbolt Compiler Explorer, using `-O3 -std=c++14`, on GCC trunk and Clang trunk. 31 | 32 | 128-bit math using `__uint128_t`, `unsigned _BitInt(128)`, and `Wider`: 33 | 34 | | Test name | Clang `uint128` | Clang `_BitInt` | Clang `W` | GCC `uint128` | GCC `W` | 35 | | ---------- | --------------- | --------------- | -------------- | ------------- | ------------ | 36 | | preinc | 3 P | 3 P | 3 P | 3 P | 6 | 37 | | postinc | 11 | 11 | 11 | 11 | 12 | 38 | | predec | 3 P | 3 P | 3 P | 3 P | 6 | 39 | | postdec | 11 | 11 | 11 | 11 | 13 | 40 | | plus | 5 P | 5 P | 5 P | 5 P | 11 | 41 | | pluseq | 5 P | 5 P | 5 P | 5 P | 6 | 42 | | minus | 5 P | 5 P | 5 P | 5 P | 14 | 43 | | minuseq | 5 P | 5 P | 5 P | 5 P | 9 | 44 | | mul | 11 P | 11 P | 11 P | 11 P | 11 P | 45 | | muleq | 11 P | 11 P | 11 P | 11 P | 11 P | 46 | | div | __udivti3 | __udivti3 | 47 | __udivti3 | 57 | 47 | | diveq | __udivti3 | __udivti3 | 47 | __udivti3 | 57 | 48 | | mod | __umodti3 | __umodti3 | 38 | __umodti3 | 53 | 49 | | modeq | __umodti3 | __umodti3 | 38 | __umodti3 | 53 | 50 | | xor_ | 5 P | 5 P | 5 P | 5 P | 5 P | 51 | | xoreq | 5 P | 5 P | 5 P | 5 P | 5 P | 52 | | and_ | 5 P | 5 P | 5 P | 5 P | 5 P | 53 | | andeq | 5 P | 5 P | 5 P | 5 P | 5 P | 54 | | or_ | 5 P | 5 P | 5 P | 5 P | 5 P | 55 | | oreq | 5 P | 5 P | 5 P | 5 P | 5 P | 56 | | shl | 13 | 13 | 12 P | 12 P | 24 | 57 | | shleq | 13 | 13 | 12 P | 12 P | 24 | 58 | | shr | 13 | 13 | 12 P | 12 P | 23 | 59 | | shreq | 13 | 13 | 12 P | 12 P | 23 | 60 | | clz | 7 | 1 | 7 | 13 | 11 | 61 | | lt | 6 P | 6 P | 6 P | 6 P | 9 | 62 | | leq | 6 P | 6 P | 6 P | 7 | 9 | 63 | | gt | 6 P | 6 P | 6 P | 7 | 9 | 64 | | geq | 6 P | 6 P | 6 P | 6 P | 9 | 65 | | eq | 6 P | 7 | 7 | 7 | 7 | 66 | | neq | 6 P | 7 | 7 | 7 | 7 | 67 | | not_ | 4 P | 4 P | 4 P | 4 P | 4 P | 68 | | bool_ | 4 P | 4 P | 4 P | 4 P | 4 P | 69 | | neg | 5 | 5 | 5 | 4 P | 13 | 70 | | flip | 3 P | 3 P | 5 | 3 P | 5 | 71 | 72 | 73 | 256-bit math using `unsigned _ExtInt(256)` and `Wider>`: 74 | 75 | | Test name | Clang 13 `_ExtInt` | Clang `W>` | GCC `W>` | 76 | | ---------- | ------------------ | ----------------- | --------------- | 77 | | preinc | 5 P | 5 P | 11 | 78 | | postinc | 23 | 13 | 18 | 79 | | predec | 5 P | 5 P | 15 | 80 | | postdec | 23 | 13 | 20 | 81 | | plus | 9 P | 9 P | 20 | 82 | | pluseq | 9 P | 9 P | 13 | 83 | | minus | 9 P | 9 P | 24 | 84 | | minuseq | 9 P | 9 P | 17 | 85 | | mul | 60 | 63 | 64 | 86 | | muleq | 62 | 63 | 64 | 87 | | div | 1 | 173 call | 216 call | 88 | | diveq | 1 | 173 call | 216 call | 89 | | mod | 1 | 173 call | 198 | 90 | | modeq | 1 | 166 call | 198 | 91 | | xor_ | 9 P | 9 P | 9 P | 92 | | xoreq | 9 P | 9 P | 9 P | 93 | | and_ | 9 P | 9 P | 9 P | 94 | | andeq | 9 P | 9 P | 9 P | 95 | | or_ | 9 P | 9 P | 9 P | 96 | | oreq | 9 P | 9 P | 9 P | 97 | | shl | 64 | 30 | 60 | 98 | | shleq | 64 | 30 | 60 | 99 | | shr | 61 | 33 | 64 | 100 | | shreq | 61 | 33 | 64 | 101 | | clz | 1 | 17 | 29 | 102 | | lt | 10 P | 10 P | 15 | 103 | | leq | 10 P | 10 P | 15 | 104 | | gt | 10 P | 10 P | 15 | 105 | | geq | 10 P | 10 P | 15 | 106 | | eq | 13 P | 13 P | 13 P | 107 | | neq | 13 P | 13 P | 13 P | 108 | | not_ | 7 | 9 | 6 P | 109 | | bool_ | 7 | 9 | 6 P | 110 | | neg | 11 | 11 | 21 | 111 | | flip | 5 P | 8 | 8 | 112 | 113 | 114 | 512-bit math using `unsigned _ExtInt(512)` and `Wider>>`: 115 | 116 | | Test name | Clang 13 `_ExtInt` | Clang `W>>` | GCC `W>>` | 117 | | ---------- | ------------------ | -------------------- | ------------------ | 118 | | preinc | 9 | 9 | 23 | 119 | | postinc | 45 | 25 | 35 | 120 | | predec | 9 | 9 | 27 | 121 | | postdec | 45 | 25 | 38 | 122 | | plus | 17 | 17 | 42 | 123 | | pluseq | 17 | 17 | 25 | 124 | | minus | 17 | 17 | 46 | 125 | | minuseq | 17 | 17 | 31 | 126 | | mul | 274 | 300 call | 258 | 127 | | muleq | 275 | 300 call | 258 | 128 | | div | 1 | 467 call | 512 call | 129 | | diveq | 1 | 467 call | 512 call | 130 | | mod | 1 | 467 call | 473 | 131 | | modeq | 1 | 456 call | 479 call | 132 | | xor_ | 17 | 17 | 17 | 133 | | xoreq | 17 | 17 | 17 | 134 | | and_ | 17 | 17 | 17 | 135 | | andeq | 17 | 17 | 17 | 136 | | or_ | 17 | 17 | 17 | 137 | | oreq | 17 | 17 | 17 | 138 | | shl | 347 | 133 call | 137 | 139 | | shleq | 347 | 133 call | 137 | 140 | | shr | 362 | 79 | 168 | 141 | | shreq | 362 | 79 | 168 | 142 | | clz | 1 | 39 | 71 | 143 | | lt | 18 | 18 | 28 | 144 | | leq | 18 | 18 | 29 | 145 | | gt | 18 | 18 | 29 | 146 | | geq | 18 | 18 | 28 | 147 | | eq | 25 | 21 | 26 | 148 | | neq | 25 | 21 | 26 | 149 | | not_ | 13 | 13 | 10 | 150 | | bool_ | 13 | 13 | 10 | 151 | | neg | 23 | 23 | 37 | 152 | | flip | 9 | 14 | 14 | 153 | -------------------------------------------------------------------------------- /fuzzy_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | 5 | BITS = 128 6 | MAX = 2**BITS 7 | 8 | def hexit(n): 9 | result = hex(n) 10 | assert result[:2] == '0x' 11 | if result[-1] == 'L': 12 | result = result[:-1] 13 | return result 14 | 15 | def make_testcase(f, BITS): 16 | MAX = 2**BITS 17 | a = random.randint(0, MAX - 1) 18 | b = random.randint(0, MAX - 1) 19 | shiftcount = random.randint(0, BITS - 1) 20 | print >>f, '{' 21 | print >>f, ' Uint%d a = %s_u%d;' % (BITS, hexit(a), BITS) 22 | print >>f, ' Uint%d b = %s_u%d;' % (BITS, hexit(b), BITS) 23 | print >>f, ' int shiftcount = %d;' % shiftcount 24 | print >>f, ' if (a != a) std::cout << "OOPS! " << a << " != " << a << "\\n";' 25 | print >>f, ' if (b != b) std::cout << "OOPS! " << b << " != " << b << "\\n";' 26 | print >>f, ' if (a+b != %s_u%d)' % (hexit((a + b) % MAX), BITS) 27 | print >>f, ' std::cout << "OOPS! " << a << " + " << b << " = " << (a+b) << " not %s\\n";' % (hexit((a + b) % MAX),) 28 | print >>f, ' if (a-b != %s_u%d)' % (hexit((MAX + a - b) % MAX), BITS) 29 | print >>f, ' std::cout << "OOPS! " << a << " - " << b << " = " << (a-b) << " not %s\\n";' % (hexit((MAX + a - b) % MAX),) 30 | print >>f, ' if (a*b != %s_u%d)' % (hexit((a * b) % MAX), BITS) 31 | print >>f, ' std::cout << "OOPS! " << a << " * " << b << " = " << (a*b) << " not %s\\n";' % (hexit((a * b) % MAX),) 32 | print >>f, ' if (b && (a/b != %s_u%d))' % (hexit((a / b) if b else 0), BITS) 33 | print >>f, ' std::cout << "OOPS! " << a << " / " << b << " = " << (a/b) << " not %s\\n";' % (hexit((a / b) if b else 0),) 34 | print >>f, ' if (b && (a%%b != %s_u%d))' % (hexit((a % b) if b else 0), BITS) 35 | print >>f, ' std::cout << "OOPS! " << a << " %% " << b << " = " << (a%%b) << " not %s\\n";' % (hexit((a % b) if b else 0),) 36 | print >>f, ' if ((a << shiftcount) != %s_u%d)' % (hexit((a << shiftcount) % MAX), BITS) 37 | print >>f, ' std::cout << "OOPS! " << a << " << " << shiftcount << " = " << (a << shiftcount) << " not %s\\n";' % (hexit((a << shiftcount) % MAX),) 38 | print >>f, ' if ((a >> shiftcount) != %s_u%d)' % (hexit(a >> shiftcount), BITS) 39 | print >>f, ' std::cout << "OOPS! " << a << " >> " << shiftcount << " = " << (a >> shiftcount) << " not %s\\n";' % (hexit(a >> shiftcount),) 40 | print >>f, '}' 41 | 42 | with open('fuzzy.cc', 'w') as f: 43 | print >>f, '#include "wider.h"' 44 | print >>f, '#include "wider_io.h"' 45 | print >>f, 'int main() {' 46 | print >>f, ' using Uint128 = Wider;' 47 | print >>f, ' using Uint256 = Wider;' 48 | print >>f, ' using Uint512 = Wider;' 49 | for i in xrange(100): 50 | make_testcase(f, 128) 51 | make_testcase(f, 256) 52 | make_testcase(f, 512) 53 | print >>f, '}' 54 | -------------------------------------------------------------------------------- /generate_updated_tables.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import json 5 | import re 6 | import requests 7 | 8 | with open('wider.h') as f: 9 | original_source = f.readlines() 10 | 11 | def process(compiler_name, function_name, type_name, bypass): 12 | global original_source 13 | source = original_source[:] 14 | for i, line in enumerate(source): 15 | if re.match(r' //static .* %s[(]' % function_name, line): 16 | source[i] = re.sub(r'//', '', line) 17 | elif line == '//template struct Tests;\n': 18 | source[i] = 'template struct Tests<%s>;\n' % type_name 19 | 20 | r = requests.post( 21 | 'https://godbolt.org/api/compiler/%s/compile' % compiler_name, 22 | json={ 23 | "bypassCache": bypass, 24 | "source": ''.join(source), 25 | "options": { 26 | "userArguments": "-std=c++14 -O3", 27 | "compilerOptions": {}, 28 | "filters": { 29 | "binary": False, 30 | "execute": False, 31 | "labels": True, 32 | "directives": True, 33 | "commentOnly": True, 34 | "trim": False, 35 | "intel": False, 36 | "demangle": False, 37 | }, 38 | "tools": [] 39 | } 40 | }, 41 | headers={ 42 | 'Accept': 'application/json', 43 | }, 44 | ) 45 | 46 | linecount = 0 47 | has_call = False 48 | has_udivti = False 49 | has_umodti = False 50 | for line in r.json()['asm']: 51 | text = line['text'] 52 | if (not text) or (text[0] in '_.'): 53 | pass 54 | elif 'call' in text: 55 | print (text, '## CALL!') 56 | linecount += 1 57 | has_call = True 58 | if '__udivti3' in text: 59 | has_udivti = True 60 | elif '__umodti3' in text: 61 | has_umodti = True 62 | else: 63 | print (text) 64 | linecount += 1 65 | 66 | if has_umodti: 67 | linecount += 30000 68 | elif has_udivti: 69 | linecount += 20000 70 | elif has_call: 71 | linecount += 10000 72 | 73 | return linecount 74 | 75 | def indicate_perfect_codegen(r, c, lc): 76 | perfect_dict = { 77 | (128, 'preinc'): 3, (128, 'postinc'): 5, 78 | (128, 'predec'): 3, (128, 'postdec'): 5, 79 | (128, 'plus'): 5, (128, 'pluseq'): 5, 80 | (128, 'minus'): 5, (128, 'minuseq'): 5, 81 | (128, 'mul'): 11, (128, 'muleq'): 11, 82 | (128, 'xor_'): 5, (128, 'xoreq'): 5, 83 | (128, 'and_'): 5, (128, 'andeq'): 5, 84 | (128, 'or_'): 5, (128, 'oreq'): 5, 85 | (128, 'shl'): 12, (128, 'shleq'): 12, 86 | (128, 'shr'): 12, (128, 'shreq'): 12, 87 | (128, 'clz'): None, 88 | (128, 'lt'): 6, (128, 'leq'): 6, 89 | (128, 'gt'): 6, (128, 'geq'): 6, 90 | (128, 'eq'): 6, (128, 'neq'): 6, 91 | (128, 'not_'): 4, (128, 'bool_'): 4, 92 | (128, 'neg'): 4, (128, 'flip'): 3, 93 | 94 | (256, 'preinc'): 5, (256, 'postinc'): 9, 95 | (256, 'predec'): 5, (256, 'postdec'): 9, 96 | (256, 'plus'): 9, (256, 'pluseq'): 9, 97 | (256, 'minus'): 9, (256, 'minuseq'): 9, 98 | (256, 'mul'): None, (256, 'muleq'): None, 99 | (256, 'xor_'): 9, (256, 'xoreq'): 9, 100 | (256, 'and_'): 9, (256, 'andeq'): 9, 101 | (256, 'or_'): 9, (256, 'oreq'): 9, 102 | (256, 'shl'): 26, (256, 'shleq'): 26, 103 | (256, 'shr'): 26, (256, 'shreq'): 26, 104 | (256, 'clz'): None, 105 | (256, 'lt'): 10, (256, 'leq'): 10, 106 | (256, 'gt'): 10, (256, 'geq'): 10, 107 | (256, 'eq'): 13, (256, 'neq'): 13, 108 | (256, 'not_'): 6, (256, 'bool_'): 6, 109 | (256, 'neg'): None, (256, 'flip'): 5, 110 | } 111 | perfect_lc = perfect_dict.get((c.bitwidth, r.funcname)) 112 | if lc < (perfect_lc or 1): 113 | print('Uh-oh! %d is less than the perfect %d for %d/%s' % (lc, (perfect_lc or 1), c.bitwidth, r.funcname)) 114 | assert False 115 | elif lc == perfect_lc: 116 | return '%d P' % lc 117 | elif 10000 <= lc < 20000: 118 | return '%d call' % (lc - 10000) 119 | elif 20000 <= lc < 30000: 120 | return '__udivti3' 121 | elif 30000 <= lc < 40000: 122 | return '__umodti3' 123 | return '%d' % lc 124 | 125 | def find_result(r, c): 126 | for tn2, fn2, cn2, lc in results: 127 | if (tn2, fn2, cn2) == (c.typename, r.funcname, c.compiler): 128 | return indicate_perfect_codegen(r, c, lc) 129 | return None 130 | 131 | 132 | class Row: 133 | def __init__(self, funcname): 134 | self.funcname = funcname 135 | 136 | class Column: 137 | def __init__(self, title, compiler, typename): 138 | self.title = title 139 | self.compiler = compiler 140 | self.typename = typename 141 | self.width = len(self.title) 142 | 143 | class Table: 144 | def __init__(self, bitwidth, caption, columns): 145 | self.caption = caption 146 | self.columns = columns 147 | for c in self.columns: 148 | c.bitwidth = bitwidth 149 | 150 | def precompute(self, options, rows): 151 | result = [] 152 | for r in rows: 153 | for c in self.columns: 154 | cn = c.compiler 155 | fn = r.funcname 156 | tn = c.typename 157 | linecount = process(cn, fn, tn, options.bypass) 158 | print('%s/%s/%s: %s' % (tn, fn, cn, linecount)) 159 | result.append((tn, fn, cn, linecount)) 160 | return result 161 | 162 | def produce(self, rows): 163 | result = self.caption + '\n\n' 164 | result += '| Test name | ' + ' | '.join([c.title for c in self.columns]) + ' |\n' 165 | result += '| ---------- | ' + ' | '.join([('-' * c.width) for c in self.columns]) + ' |\n' 166 | for r in rows: 167 | result += '| ' + r.funcname.ljust(10) + ' | ' 168 | result += ' | '.join([ 169 | find_result(r, c).ljust(c.width) 170 | for c in self.columns 171 | ]) 172 | result += ' |\n' 173 | return result 174 | 175 | all_tables = [ 176 | Table( 177 | 128, '128-bit math using `__uint128_t`, `unsigned _BitInt(128)`, and `Wider`:', [ 178 | Column('Clang `uint128`', 'clang_trunk', '__uint128_t'), 179 | Column('Clang `_BitInt`', 'clang_trunk', 'unsigned _BitInt(128)'), 180 | Column('Clang `W`', 'clang_trunk', 'Wider'), 181 | Column('GCC `uint128`', 'gsnapshot', '__uint128_t'), 182 | Column('GCC `W`', 'gsnapshot', 'Wider'), 183 | ] 184 | ), 185 | Table( 186 | 256, '256-bit math using `unsigned _ExtInt(256)` and `Wider>`:', [ 187 | Column('Clang 13 `_ExtInt`', 'clang1301', 'unsigned _ExtInt(256)'), 188 | Column('Clang `W>`', 'clang_trunk', 'Wider>'), 189 | Column('GCC `W>`', 'gsnapshot', 'Wider>'), 190 | ] 191 | ), 192 | Table( 193 | 512, '512-bit math using `unsigned _ExtInt(512)` and `Wider>>`:', [ 194 | Column('Clang 13 `_ExtInt`', 'clang1301', 'unsigned _ExtInt(512)'), 195 | Column('Clang `W>>`', 'clang_trunk', 'Wider>>'), 196 | Column('GCC `W>>`', 'gsnapshot', 'Wider>>'), 197 | ] 198 | ), 199 | ] 200 | 201 | all_rows = [Row(x) for x in [ 202 | 'preinc', 'postinc', 'predec', 'postdec', 203 | 'plus', 'pluseq', 'minus', 'minuseq', 204 | 'mul', 'muleq', 205 | 'div', 'diveq', 'mod', 'modeq', 206 | 'xor_', 'xoreq', 'and_', 'andeq', 'or_', 'oreq', 207 | 'shl', 'shleq', 'shr', 'shreq', 208 | 'clz', 209 | 'lt', 'leq', 'gt', 'geq', 'eq', 'neq', 210 | 'not_', 'bool_', 'neg', 'flip', 211 | ]] 212 | 213 | 214 | if __name__ == '__main__': 215 | parser = argparse.ArgumentParser() 216 | parser.add_argument("--bypass", action="store_true", help="bypass Compiler Explorer's cache") 217 | options = parser.parse_args() 218 | 219 | results = [] 220 | for table in all_tables: 221 | results += table.precompute(options, all_rows) 222 | 223 | for table in all_tables: 224 | print(table.produce(all_rows), '\n') 225 | -------------------------------------------------------------------------------- /wider.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #if WIDER_COMPLETELY_STANDARD 9 | // do nothing 10 | #elif defined(_MSC_VER) 11 | #include 12 | #else // _MSC_VER 13 | #include 14 | #endif // _MSC_VER 15 | 16 | namespace wider_traits { 17 | 18 | template using index_constant = std::integral_constant; 19 | 20 | template struct bit_width { static constexpr size_t value = T::bit_width; }; 21 | template<> struct bit_width { static constexpr size_t value = 64; }; 22 | 23 | template struct array_helper { 24 | template 25 | static T from_array(Ints... ints) { 26 | return T::from_array(std::make_index_sequence(), ints...); 27 | } 28 | template 29 | static auto with_array(const T& t, const F& f) { 30 | using Int64 = typename T::half_type; 31 | return array_helper::with_array(t.lo, [&](auto... los) { 32 | return array_helper::with_array(t.hi, [&](auto... his) { 33 | return f(los..., his...); 34 | }); 35 | }); 36 | } 37 | }; 38 | template<> struct array_helper { 39 | static uint64_t from_array(uint64_t x) { 40 | return x; 41 | } 42 | template 43 | static auto with_array(uint64_t x, const F& f) { 44 | return f(x); 45 | } 46 | }; 47 | 48 | template 49 | uint64_t& get_helper(Ts&&... ts) { 50 | std::tuple tuple(std::forward(ts)...); 51 | return std::get(tuple); 52 | } 53 | 54 | } // namespace wider_traits 55 | 56 | using CarryFlag = bool; 57 | 58 | inline CarryFlag producecarry(uint64_t& x, uint64_t y) { 59 | x += y; 60 | return (x < y); 61 | } 62 | 63 | inline CarryFlag addcarry(CarryFlag cf, uint64_t& x, uint64_t y) { 64 | #if WIDER_COMPLETELY_STANDARD 65 | CarryFlag r1 = producecarry(x, y); 66 | CarryFlag r2 = producecarry(x, cf); 67 | return r1 || r2; 68 | #else 69 | return _addcarry_u64(cf, x, y, (unsigned long long*)&x); 70 | #endif 71 | } 72 | 73 | inline CarryFlag produceborrow(uint64_t& x, uint64_t y) { 74 | CarryFlag cf = (x < y); 75 | x -= y; 76 | return cf; 77 | } 78 | 79 | inline CarryFlag subborrow(CarryFlag cf, uint64_t& x, uint64_t y) { 80 | #if WIDER_COMPLETELY_STANDARD 81 | CarryFlag r1 = produceborrow(x, y); 82 | CarryFlag r2 = produceborrow(x, cf); 83 | return r1 || r2; 84 | #else 85 | return _subborrow_u64(cf, x, y, (unsigned long long*)&x); 86 | #endif 87 | } 88 | 89 | inline uint64_t mulxu(uint64_t a, uint64_t b, uint64_t *rhi) { 90 | #if WIDER_COMPLETELY_STANDARD 91 | auto hi = [](uint64_t x) { return x >> 32; }; 92 | auto lo = [](uint64_t x) { return uint32_t(x); }; 93 | uint64_t xl = lo(a); 94 | uint64_t xh = hi(a); 95 | uint64_t yl = lo(b); 96 | uint64_t yh = hi(b); 97 | 98 | uint64_t rhh = xh * yh; 99 | uint64_t rhl = xh * yl; 100 | uint64_t rlh = xl * yh; 101 | uint64_t rll = xl * yl; 102 | 103 | *rhi = rhh + hi(rhl + hi(rll)) + hi(rlh + lo(rhl + hi(rll))); 104 | return a * b; 105 | #elif defined(_MSC_VER) 106 | return _umul128(a, b, (unsigned long long*)rhi); 107 | #elif defined(__BMI2__) 108 | return _mulx_u64(a, b, (unsigned long long*)rhi); 109 | #else 110 | __uint128_t r = __uint128_t(a) * __uint128_t(b); 111 | *rhi = (r >> 64); 112 | return r; 113 | #endif 114 | } 115 | 116 | inline uint64_t shl128(uint64_t low, uint64_t high, int n) { 117 | #if WIDER_COMPLETELY_STANDARD 118 | n &= 63; 119 | if (n != 0) { 120 | return (high << n) | (low >> (64-n)); 121 | } else { 122 | return high; 123 | } 124 | #elif defined(_MSC_VER) 125 | return __shiftleft128(low, high, n); 126 | #else 127 | __uint128_t v = (__uint128_t(high) << 64) | __uint128_t(low); 128 | return (v << (n & 63)) >> 64; 129 | #endif 130 | } 131 | 132 | inline uint64_t shr128(uint64_t low, uint64_t high, int n) { 133 | #if WIDER_COMPLETELY_STANDARD 134 | n &= 63; 135 | if (n != 0) { 136 | return (low >> n) | (high << (64-n)); 137 | } else { 138 | return low; 139 | } 140 | #elif defined(_MSC_VER) 141 | return __shiftright128(low, high, n); 142 | #else 143 | __uint128_t v = (__uint128_t(high) << 64) | __uint128_t(low); 144 | return v >> (n & 63); 145 | #endif 146 | } 147 | 148 | inline int countleadingzeros(uint64_t x) { 149 | #if WIDER_COMPLETELY_STANDARD 150 | int r = 0; 151 | while ((x & 0x8000000000000000uLL) == 0) { 152 | x <<= 1; 153 | ++r; 154 | } 155 | return r; 156 | #elif defined(_MSC_VER) 157 | return __lzcnt64(x); 158 | #else 159 | return __builtin_clzll(x); 160 | #endif 161 | } 162 | 163 | #ifndef _MSC_VER 164 | inline int countleadingzeros(__uint128_t x) { 165 | int lo = (63 ^ countleadingzeros(uint64_t(x))) + 64; 166 | int hi = 63 ^ countleadingzeros(uint64_t(x >> 64)); 167 | return 63 ^ ((x >> 64) ? hi : lo); 168 | } 169 | #endif 170 | 171 | template 172 | struct Wider { 173 | Int64 lo; 174 | Int64 hi; 175 | 176 | static constexpr size_t bit_width = 2 * wider_traits::bit_width::value; 177 | using half_type = Int64; 178 | 179 | constexpr Wider() = default; 180 | constexpr explicit Wider(int s) noexcept : lo(s), hi((s < 0) ? -1 : 0) {} 181 | constexpr explicit Wider(const Int64& s) noexcept : lo(s), hi(0) {} 182 | constexpr operator bool() const noexcept { return bool(lo | hi); } 183 | 184 | template 185 | static Wider from_array(std::index_sequence, Ints... ints) { 186 | Wider w; 187 | w.lo = wider_traits::array_helper::from_array(wider_traits::get_helper(ints...)...); 188 | w.hi = wider_traits::array_helper::from_array(wider_traits::get_helper(ints...)...); 189 | return w; 190 | } 191 | 192 | friend CarryFlag producecarry(Wider& x, const Wider& y) { 193 | return addcarry(producecarry(x.lo, y.lo), x.hi, y.hi); 194 | } 195 | 196 | friend CarryFlag produceborrow(Wider& x, const Wider& y) { 197 | return subborrow(produceborrow(x.lo, y.lo), x.hi, y.hi); 198 | } 199 | 200 | friend CarryFlag addcarry(CarryFlag cf, Wider& x, const Wider& y) { 201 | cf = addcarry(cf, x.lo, y.lo); 202 | cf = addcarry(cf, x.hi, y.hi); 203 | return cf; 204 | } 205 | 206 | friend CarryFlag subborrow(CarryFlag cf, Wider& x, const Wider& y) { 207 | cf = subborrow(cf, x.lo, y.lo); 208 | cf = subborrow(cf, x.hi, y.hi); 209 | return cf; 210 | } 211 | 212 | friend Wider mulxu(const Wider& a, const Wider& b, Wider *rhi) 213 | { 214 | Wider result; 215 | Wider temp; 216 | temp.lo = mulxu(a.lo, b.lo, &temp.hi); 217 | result.lo.lo = temp.lo; 218 | result.lo.hi = temp.hi; 219 | temp.lo = mulxu(a.hi, b.hi, &temp.hi); 220 | result.hi.lo = temp.lo; 221 | result.hi.hi = temp.hi; 222 | temp.lo = mulxu(a.lo, b.hi, &temp.hi); 223 | bool cf = false; 224 | cf = addcarry(cf, result.lo.hi, temp.lo); 225 | cf = addcarry(cf, result.hi.lo, temp.hi); 226 | result.hi.hi += Int64(int(cf)); 227 | temp.lo = mulxu(a.hi, b.lo, &temp.hi); 228 | cf = false; 229 | cf = addcarry(cf, result.lo.hi, temp.lo); 230 | cf = addcarry(cf, result.hi.lo, temp.hi); 231 | result.hi.hi += Int64(int(cf)); 232 | *rhi = result.hi; 233 | return result.lo; 234 | } 235 | 236 | friend Wider operator*(const Wider& a, const Wider& b) { 237 | Wider result; 238 | result.lo = mulxu(a.lo, b.lo, &result.hi); 239 | result.hi += (a.lo * b.hi); 240 | result.hi += (a.hi * b.lo); 241 | return result; 242 | } 243 | 244 | template 245 | static void shift_left(int n, std::index_sequence, Ts&&... parts) { 246 | using wider_traits::get_helper; 247 | int xx[] = { 248 | [n](auto I, auto&&... parts) { 249 | get_helper(parts...) = shl128( 250 | get_helper(parts...), 251 | get_helper(parts...), 252 | n 253 | ); 254 | return 0; 255 | }(wider_traits::index_constant(), parts...) ... 256 | }; 257 | (void)xx; 258 | get_helper<0>(parts...) <<= (n & 63); 259 | } 260 | 261 | template 262 | static void shift_right(int n, std::index_sequence, Ts&&... parts) { 263 | using wider_traits::get_helper; 264 | int xx[] = { 265 | [n](auto I, auto&&... parts) { 266 | get_helper(parts...) = shr128( 267 | get_helper(parts...), 268 | get_helper(parts...), 269 | n 270 | ); 271 | return 0; 272 | }(wider_traits::index_constant(), parts...) ... 273 | }; 274 | (void)xx; 275 | get_helper(parts...) >>= (n & 63); 276 | } 277 | 278 | friend Wider operator<<(const Wider& a, int n) { 279 | return wider_traits::array_helper::with_array(a, [n](auto... parts) { 280 | Wider::shift_left(n, std::make_index_sequence(), parts...); 281 | uint64_t *ps[] = { &parts... }; 282 | for (int shift = 1; shift < int(sizeof...(parts)); shift *= 2) { 283 | if (n & (shift * 64)) { 284 | for (int i = sizeof...(parts) - 1; i >= 0; --i) { 285 | if (i >= shift) { 286 | *ps[i] = *ps[i - shift]; 287 | } else { 288 | *ps[i] = 0; 289 | } 290 | } 291 | } 292 | } 293 | return wider_traits::array_helper::from_array( parts... ); 294 | }); 295 | } 296 | 297 | friend Wider operator>>(const Wider& a, int n) { 298 | return wider_traits::array_helper::with_array(a, [n](auto... parts) { 299 | Wider::shift_right(n, std::make_index_sequence(), parts...); 300 | uint64_t *ps[] = { &parts... }; 301 | for (int shift = 1; shift < int(sizeof...(parts)); shift *= 2) { 302 | if (n & (shift * 64)) { 303 | for (int i = 0; i < int(sizeof...(parts)); ++i) { 304 | if (i + shift < int(sizeof...(parts))) { 305 | *ps[i] = *ps[i + shift]; 306 | } else { 307 | *ps[i] = 0; 308 | } 309 | } 310 | } 311 | } 312 | return wider_traits::array_helper::from_array( parts... ); 313 | }); 314 | } 315 | 316 | friend int countleadingzeros(const Wider& x) { 317 | constexpr int _64 = bit_width/2; 318 | constexpr int _63 = _64 - 1; 319 | int lo = (_63 ^ countleadingzeros(x.lo)) + _64; 320 | int hi = _63 ^ countleadingzeros(x.hi); 321 | return _63 ^ ((x >> _64) ? hi : lo); 322 | } 323 | 324 | friend Wider divmod(Wider a, const Wider& b, Wider *remainder) { 325 | int leading_zeros = countleadingzeros(b); 326 | Wider subtrahend = b << leading_zeros; 327 | Wider digit = Wider(1) << leading_zeros; 328 | Wider quotient{0}; 329 | while (digit) { 330 | if (subtrahend <= a) { 331 | a -= subtrahend; 332 | quotient |= digit; 333 | } 334 | subtrahend >>= 1; 335 | digit >>= 1; 336 | } 337 | *remainder = a; 338 | return quotient; 339 | } 340 | 341 | friend Wider operator/(const Wider& a, const Wider& b) { 342 | Wider remainder; 343 | return divmod(a, b, &remainder); 344 | } 345 | friend Wider operator%(const Wider& a, const Wider& b) { 346 | Wider remainder; 347 | (void)divmod(a, b, &remainder); 348 | return remainder; 349 | } 350 | friend Wider& operator%=(Wider& a, const Wider& b) { 351 | (void)divmod(a, b, &a); 352 | return a; 353 | } 354 | 355 | Wider& operator++() { *this += Wider(1); return *this; } 356 | Wider operator++(int) { Wider r = *this; ++*this; return r; } 357 | Wider& operator--() { *this -= Wider(1); return *this; } 358 | Wider operator--(int) { Wider r = *this; --*this; return r; } 359 | friend Wider& operator+=(Wider& x, const Wider& y) { (void)producecarry(x, y); return x; } 360 | friend Wider& operator-=(Wider& x, const Wider& y) { (void)produceborrow(x, y); return x; } 361 | friend Wider& operator*=(Wider& x, const Wider& y) { x = (x * y); return x; } 362 | friend Wider& operator/=(Wider& x, const Wider& y) { x = x / y; return x; } 363 | friend Wider& operator^=(Wider& x, const Wider& y) { x.lo ^= y.lo; x.hi ^= y.hi; return x; } 364 | friend Wider& operator&=(Wider& x, const Wider& y) { x.lo &= y.lo; x.hi &= y.hi; return x; } 365 | friend Wider& operator|=(Wider& x, const Wider& y) { x.lo |= y.lo; x.hi |= y.hi; return x; } 366 | friend Wider operator+(Wider x, const Wider& y) { x += y; return x; } 367 | friend Wider operator-(Wider x, const Wider& y) { x -= y; return x; } 368 | friend Wider operator^(Wider x, const Wider& y) { x ^= y; return x; } 369 | friend Wider operator&(Wider x, const Wider& y) { x &= y; return x; } 370 | friend Wider operator|(Wider x, const Wider& y) { x |= y; return x; } 371 | friend Wider& operator<<=(Wider& x, int y) { x = (x << y); return x; } 372 | friend Wider& operator>>=(Wider& x, int y) { x = (x >> y); return x; } 373 | 374 | friend bool operator<(Wider x, const Wider& y) { return produceborrow(x, y); } 375 | friend bool operator>(const Wider& x, const Wider& y) { return (y < x); } 376 | friend bool operator>=(const Wider& x, const Wider& y) { return !(x < y); } 377 | friend bool operator<=(const Wider& x, const Wider& y) { return !(y < x); } 378 | friend bool operator==(const Wider& x, const Wider& y) { return !((x.lo ^ y.lo) | (x.hi ^ y.hi)); } 379 | friend bool operator!=(const Wider& x, const Wider& y) { return !(x == y); } 380 | friend bool operator!(const Wider& x) { return !bool(x); } 381 | friend Wider operator-(const Wider& x) { return Wider(0) - x; } 382 | friend Wider operator~(const Wider& x) { return Wider(-1) ^ x; } 383 | }; 384 | 385 | namespace wider_tests { 386 | 387 | using Uint128 = Wider; 388 | using Uint256 = Wider; 389 | using Uint512 = Wider; 390 | using Uint1024 = Wider; 391 | 392 | template 393 | struct Tests { 394 | //static void preinc(T *p) { ++*p; } 395 | //static void postinc(T *p, T *q) { *p = (*q)++; } 396 | //static void predec(T *p) { --*p; } 397 | //static void postdec(T *p, T *q) { *p = (*q)--; } 398 | //static void plus(T *p, const T *q) { *p = *p + *q; } 399 | //static void pluseq(T *p, const T *q) { *p += *q; } 400 | //static void minus(T *p, const T *q) { *p = *p - *q; } 401 | //static void minuseq(T *p, const T *q) { *p -= *q; } 402 | //static void mul(T *p, const T *q) { *p = *p * *q; } 403 | //static void muleq(T *p, const T *q) { *p *= *q; } 404 | //static void div(T *p, const T *q) { *p = *p / *q; } 405 | //static void diveq(T *p, const T *q) { *p /= *q; } 406 | //static void mod(T *p, const T *q) { *p = *p % *q; } 407 | //static void modeq(T *p, const T *q) { *p %= *q; } 408 | //static void xor_(T *p, const T *q) { *p = *p ^ *q; } 409 | //static void xoreq(T *p, const T *q) { *p ^= *q; } 410 | //static void and_(T *p, const T *q) { *p = *p & *q; } 411 | //static void andeq(T *p, const T *q) { *p &= *q; } 412 | //static void or_(T *p, const T *q) { *p = *p | *q; } 413 | //static void oreq(T *p, const T *q) { *p |= *q; } 414 | //static void shl(T *p, int q) { *p = *p << q; } 415 | //static void shleq(T *p, int q) { *p <<= q; } 416 | //static void shr(T *p, int q) { *p = *p >> q; } 417 | //static void shreq(T *p, int q) { *p >>= q; } 418 | //static int clz(T *p) { return countleadingzeros(*p); } 419 | //static bool lt(const T *p, const T *q) { return *p < *q; } 420 | //static bool leq(const T *p, const T *q) { return *p <= *q; } 421 | //static bool gt(const T *p, const T *q) { return *p > *q; } 422 | //static bool geq(const T *p, const T *q) { return *p >= *q; } 423 | //static bool eq(const T *p, const T *q) { return *p == *q; } 424 | //static bool neq(const T *p, const T *q) { return *p != *q; } 425 | //static bool not_(const T *p) { return !*p; } 426 | //static bool bool_(const T *p) { return *p; } 427 | //static void neg(T *p) { *p = -*p; } 428 | //static void flip(T *p) { *p = ~*p; } 429 | }; 430 | 431 | //template struct Tests<__uint128_t>; 432 | //template struct Tests; 433 | //template struct Tests; 434 | //template struct Tests; 435 | 436 | } // namespace wider_tests 437 | -------------------------------------------------------------------------------- /wider_io.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "wider.h" 4 | #include 5 | #include 6 | #include 7 | 8 | namespace wider_detail { 9 | 10 | template constexpr void multiply_by(T& t, wider_traits::index_constant<10>) { t *= 10; } 11 | template constexpr void multiply_by(T& t, wider_traits::index_constant<16>) { t <<= 4; } 12 | 13 | template 14 | constexpr T parse_it2(const char *first, const char *last) { 15 | T result = T(0); 16 | for (const char *p = first; p != last; ++p) { 17 | char c = *p; 18 | if ('0' <= c && c <= '9') { 19 | multiply_by(result, wider_traits::index_constant{}); 20 | result += T(c - '0'); 21 | } else if (Base == 16 && 'a' <= c && c <= 'f') { 22 | result <<= 4; 23 | result += T(c - 'a' + 10); 24 | } else if (Base == 16 && 'A' <= c && c <= 'F') { 25 | result <<= 4; 26 | result += T(c - 'A' + 10); 27 | } 28 | } 29 | return result; 30 | } 31 | 32 | template 33 | constexpr T parse_it(const char *first, const char *last) { 34 | if (last - first >= 2 && first[0] == '0' && first[1] == 'x') { 35 | return wider_detail::parse_it2(first + 2, last); 36 | } else { 37 | // Wide multiplication by 10 is not yet implemented. 38 | return T(wider_detail::parse_it2(first, last)); 39 | } 40 | } 41 | 42 | inline void print_it(std::ostream& os, const uint64_t& value, bool trailing) { 43 | char buf[17]; 44 | if (trailing) { 45 | snprintf(buf, sizeof buf, "%016llx", (unsigned long long)value); 46 | } else { 47 | snprintf(buf, sizeof buf, "%llx", (unsigned long long)value); 48 | } 49 | std::ostream os2(os.rdbuf()); 50 | os2 << buf; 51 | } 52 | 53 | template 54 | void print_it(std::ostream& os, const Wider& value, bool trailing) { 55 | if (trailing) { 56 | wider_detail::print_it(os, value.hi, true); 57 | wider_detail::print_it(os, value.lo, true); 58 | } else if (value.hi) { 59 | wider_detail::print_it(os, value.hi, false); 60 | wider_detail::print_it(os, value.lo, true); 61 | } else { 62 | wider_detail::print_it(os, value.lo, false); 63 | } 64 | } 65 | 66 | } // namespace wider_detail 67 | 68 | template 69 | constexpr auto operator""_u128() { 70 | using Uint128 = Wider; 71 | const char arr[] = { Cs... }; 72 | return wider_detail::parse_it(arr, arr + sizeof...(Cs)); 73 | } 74 | 75 | template 76 | constexpr auto operator""_u256() { 77 | using Uint128 = Wider; 78 | using Uint256 = Wider; 79 | const char arr[] = { Cs... }; 80 | return wider_detail::parse_it(arr, arr + sizeof...(Cs)); 81 | } 82 | 83 | template 84 | constexpr auto operator""_u512() { 85 | using Uint128 = Wider; 86 | using Uint256 = Wider; 87 | using Uint512 = Wider; 88 | const char arr[] = { Cs... }; 89 | return wider_detail::parse_it(arr, arr + sizeof...(Cs)); 90 | } 91 | 92 | template 93 | std::ostream& operator<<(std::ostream& os, const Wider& value) 94 | { 95 | os << "0x"; 96 | wider_detail::print_it(os, value, false); 97 | return os; 98 | } 99 | 100 | template 101 | std::istream& operator>>(std::istream& is, Wider& value) 102 | { 103 | std::string input; 104 | is >> input; 105 | value = wider_detail::parse_it>(input.data(), input.data() + input.size()); 106 | return is; 107 | } 108 | --------------------------------------------------------------------------------