├── .gitignore ├── gzint.egg-info ├── top_level.txt ├── dependency_links.txt ├── SOURCES.txt └── PKG-INFO ├── gzint ├── __init__.py ├── tests.py └── main.py ├── setup.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | -------------------------------------------------------------------------------- /gzint.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | gzint 2 | -------------------------------------------------------------------------------- /gzint.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /gzint/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import HugeInt, is_huge 2 | -------------------------------------------------------------------------------- /gzint.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | gzint/__init__.py 3 | gzint/main.py 4 | gzint/tests.py 5 | gzint.egg-info/PKG-INFO 6 | gzint.egg-info/SOURCES.txt 7 | gzint.egg-info/dependency_links.txt 8 | gzint.egg-info/top_level.txt -------------------------------------------------------------------------------- /gzint.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: gzint 3 | Version: 0.0.4 4 | Summary: A library for storing huge integeters efficiently. 5 | Home-page: https://github.com/pirate/gzint 6 | Author: Nick Sweeting 7 | Author-email: gzint@sweeting.me 8 | License: MIT 9 | Description: It makes storing and comparing huge integers fast and lightweight, while gracefully falling back to normal integer operations when math is needed. It works as a drop-in replacement for int. 
10 | Keywords: int bigint integers memory gzip storage compression math 11 | Platform: UNKNOWN 12 | Classifier: Development Status :: 3 - Alpha 13 | Classifier: Intended Audience :: Developers 14 | Classifier: Topic :: Utilities 15 | Classifier: Programming Language :: Python :: 3 :: Only 16 | Classifier: License :: OSI Approved :: MIT License 17 | Classifier: Programming Language :: Python :: 3.5 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from codecs import open 4 | from os import path 5 | 6 | here = path.abspath(path.dirname(__file__)) 7 | 8 | # parse description from README.md 9 | try: 10 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | readme_header = f.read().split('\n', 2) 12 | short_description = readme_header[0].split(': ')[-1] + '.' 13 | long_description = readme_header[2].split('\n', 2)[2].split('\n\n')[0].replace('\n', ' ').replace('`', '') 14 | except Exception: 15 | print('[!] Failed to parse package description from README.md') 16 | short_description = 'A library for storing huge integeters efficiently' 17 | long_description = 'This python library helps store massive integers by using a gzipped-string representation in memory.' 
18 | 19 | setup( 20 | name='gzint', 21 | version='0.0.4', 22 | description=short_description, # parsed from first line of README.md 23 | long_description=long_description, # parsed from first section of README.md 24 | 25 | url='https://github.com/pirate/gzint', 26 | author='Nick Sweeting', 27 | author_email='gzint@sweeting.me', 28 | license='MIT', 29 | 30 | classifiers=[ 31 | 'Development Status :: 3 - Alpha', 32 | 'Intended Audience :: Developers', 33 | 'Topic :: Utilities', 34 | 'Programming Language :: Python :: 3 :: Only', 35 | 36 | 'License :: OSI Approved :: MIT License', 37 | 38 | 'Programming Language :: Python :: 3.5', 39 | ], 40 | keywords='int bigint integers memory gzip storage compression math', 41 | 42 | packages=['gzint'], 43 | test_suite='gzint.tests', 44 | install_requires=[], 45 | ) 46 | -------------------------------------------------------------------------------- /gzint/tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | try: 4 | from .main import HugeInt 5 | except SystemError: 6 | print('''Tests must be run using: 7 | python -m gzint.tests 8 | or 9 | python setup.py test''') 10 | raise SystemExit(1) 11 | 12 | 13 | class TestHugeInt(unittest.TestCase): 14 | 15 | # share one copy of these between all tests for speed reasons 16 | large_int = 10**1000000 17 | large = HugeInt(large_int) 18 | 19 | def test_init(self): 20 | zero = HugeInt() 21 | assert not zero._is_huge 22 | assert zero._value == 0 23 | assert str(zero) == '0' 24 | assert repr(zero) == '0' 25 | 26 | assert self.large._is_huge 27 | assert str(self.large) == str(self.large_int) 28 | assert repr(self.large) == '1000000000000000...(1000001)' 29 | 30 | def test_comparisons(self): 31 | assert HugeInt(0) == 0 32 | assert HugeInt(10) == 10 33 | assert HugeInt(10) == HugeInt(10) 34 | assert 10 == HugeInt(10) 35 | 36 | assert self.large == self.large 37 | assert self.large == self.large_int 38 | 39 | def test_identity(self): 
"""Library for managing truly massive numbers at variable precision by storing numbers as gzipped strings."""

import decimal
import operator
import sys
import zlib

assert sys.version_info >= (3, 5), 'Must be used on python >= 3.5.0'  # don't want to deal with longs, byte-strings, etc.


HUGE_NUM_THRESHOLD = 442948  # memory footprint in bytes before a number is compressed
HUGE_STR_THRESHOLD = 1000  # memory footprint in bytes before a stringified number is considered huge


def is_huge(value):
    """Return True when ``value`` is large enough to be treated as "huge".

    * ``HugeInt`` instances report the flag computed when they were created.
    * ``int``, ``decimal.Decimal`` and ``float`` values are huge once their
      ``__sizeof__()`` reaches ``HUGE_NUM_THRESHOLD`` bytes.
    * ``str`` and ``bytes`` values are huge once their ``__sizeof__()``
      reaches ``HUGE_STR_THRESHOLD`` bytes.
    * Anything else is never huge.
    """
    # isinstance (rather than a class-name comparison) so that subclasses of
    # HugeInt are recognised too; HugeInt is resolved at call time.
    if isinstance(value, HugeInt):
        return value._is_huge
    if isinstance(value, (int, decimal.Decimal, float)):
        return value.__sizeof__() >= HUGE_NUM_THRESHOLD
    if isinstance(value, (str, bytes)):
        return value.__sizeof__() >= HUGE_STR_THRESHOLD
    return False


def compress(value):
    """Return the zlib-compressed form of the bytes ``value``."""
    return zlib.compress(value)


def decompress(value):
    """Return the original bytes for data produced by ``compress``."""
    return zlib.decompress(value)


# Create fallback integer type for methods not yet supported by HugeInt.
# This is needed because operators like +, -, *, etc. don't fall back to __getattr__,
# the corresponding __methods__ must be available directly on the class via inheritance
_INT_FALLBACK_METHODS = (
    '__abs__', '__add__', '__and__', '__bool__', '__ceil__', '__eq__', '__float__',
    '__floor__', '__floordiv__', '__ge__', '__gt__', '__index__', '__int__',
    '__invert__', '__le__', '__lshift__', '__lt__', '__mod__', '__mul__', '__neg__',
    '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rfloordiv__',
    '__rlshift__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__',
    '__rrshift__', '__rshift__', '__rsub__', '__rtruediv__', '__rxor__', '__sub__',
    '__truediv__', '__trunc__', '__xor__',
)

# Methods whose protocol requires a real ``int`` result, never a HugeInt.
_PLAIN_RESULT_METHODS = ('__int__', '__index__')

# Operator functions keyed by the bare dunder name ('__add__' -> 'add').
_OPERATORS = {
    'add': operator.add, 'sub': operator.sub, 'mul': operator.mul,
    'truediv': operator.truediv, 'floordiv': operator.floordiv,
    'mod': operator.mod, 'pow': operator.pow,
    'lshift': operator.lshift, 'rshift': operator.rshift,
    'and': operator.and_, 'or': operator.or_, 'xor': operator.xor,
    'lt': operator.lt, 'le': operator.le, 'gt': operator.gt,
    'ge': operator.ge, 'eq': operator.eq,
}


def _resolve_operator(name):
    """Map a binary dunder name to ``(operator function, is_reflected)``.

    Returns ``(None, False)`` for names that are not binary operators
    (``__abs__``, ``__round__``, ...).
    """
    bare = name.strip('_')
    # Forward names are checked first so that 'rshift' is not mistaken for a
    # reflected 'shift'.
    if bare in _OPERATORS:
        return _OPERATORS[bare], False
    if bare.startswith('r') and bare[1:] in _OPERATORS:
        return _OPERATORS[bare[1:]], True
    return None, False


def _get_proxy_method(name):
    """Build a method that runs ``int.<name>`` on the decompressed value.

    HugeInt arguments are unwrapped to plain ints; every other argument is
    passed through untouched (so floats are not truncated and strings are not
    parsed). Plain ``int`` results are wrapped back into a ``HugeInt``, except
    for ``bool`` results and the methods in ``_PLAIN_RESULT_METHODS``.
    """
    binary_op, reflected = _resolve_operator(name)
    keep_plain = name in _PLAIN_RESULT_METHODS

    def _proxy_method(self, *args, **kwgs):
        value = self.to_int()
        plain_args = [arg.to_int() if isinstance(arg, HugeInt) else arg for arg in args]

        # get the result from the int.__method__
        result = getattr(value, name)(*plain_args, **kwgs)

        # int does not know the other operand (float, Decimal, ...): run the
        # regular operator protocol with the plain int so the other operand's
        # own implementation gets a chance, exactly as it would for an int.
        if result is NotImplemented and binary_op is not None and len(plain_args) == 1 and not kwgs:
            other = plain_args[0]
            result = binary_op(other, value) if reflected else binary_op(value, other)

        # if the result is an int, convert back to a HugeInt; bools (from
        # comparisons and __bool__) must stay bools
        if isinstance(result, int) and not isinstance(result, bool) and not keep_plain:
            return HugeInt(result)

        return result

    _proxy_method.__name__ = _proxy_method.__qualname__ = name  # Not a proper qualname, but oh well
    return _proxy_method


_IntFallback = type(
    '_IntFallback',
    (),
    {attr: _get_proxy_method(attr) for attr in _INT_FALLBACK_METHODS},
)


class HugeInt(_IntFallback):
    """Store truly massive numbers at full precision by saving them as gzipped strings in memory.

    Values for which ``is_huge`` is true are kept as the zlib-compressed
    base-10 digits of the number; smaller values are kept as a plain ``int``.
    The hash of the original ``int`` is cached so instances hash like the
    number they represent.

    NOTE: Python 3.11+ limits int <-> str conversions to
    ``sys.get_int_max_str_digits()`` digits (4300 by default); larger values
    need that limit raised before they can be compressed or decompressed.
    """

    # Attributes set by __init__; __getattr__ must never try to proxy these.
    _STATE_ATTRS = ('_hash', '_is_huge', '_value')

    def __init__(self, new_value=0):
        """Create a HugeInt from anything ``int()`` accepts (int, str, HugeInt, ...)."""
        int_new_value = int(new_value)
        self._hash = hash(int_new_value)
        self._is_huge = is_huge(int_new_value)

        if self._is_huge:
            # Always compress the canonical digits of the parsed int, not the
            # raw input: ' 100\n', '+100' and 100 must all be stored identically
            # or equal numbers would compare unequal.
            self._value = compress(str(int_new_value).encode('ascii'))
        else:
            self._value = int_new_value

    def __str__(self):
        """Return the full base-10 representation of the number."""
        if self._is_huge:
            return decompress(self._value).decode()
        return str(self._value)

    def __repr__(self):
        """Return a short form: the first 16 digits plus the digit count for huge values."""
        if self._is_huge:
            full_str = decompress(self._value)
            return full_str[:16].decode() + '...({})'.format(len(full_str))
        return str(self._value)

    def __eq__(self, other):
        """Compare with another HugeInt, an int, or any other object.

        The cached hash is only used to reject mismatches quickly: equal
        hashes do not imply equal numbers, so a hash match is always confirmed
        with a real comparison.
        """
        if isinstance(other, HugeInt):
            if self._hash != other._hash:
                return False
            if self._is_huge == other._is_huge:
                # same storage form: compressed bytes (or plain ints) compare directly
                return self._value == other._value
            return self.to_int() == other.to_int()
        if isinstance(other, int):
            if self._hash != hash(other):
                return False
            return self.to_int() == other
        return self.to_int() == other

    def __hash__(self):
        """Return the hash of the original ``int``."""
        return self._hash

    def to_int(self):
        """Return the value as a plain ``int``, decompressing it if needed."""
        if self._is_huge:
            return int(decompress(self._value))
        return self._value

    # TODO: properly implement all the int fallback methods directly on HugeInt
    def __getattr__(self, attr):
        """Proxy any attribute not found on HugeInt to the plain ``int`` value."""
        # __getattr__ only runs when normal lookup failed. If the instance
        # state itself is missing (e.g. an object created with __new__ by
        # copy/pickle), proxying would call to_int(), which needs that state,
        # and recurse forever.
        if attr in self._STATE_ATTRS:
            raise AttributeError(attr)
        return getattr(self.to_int(), attr)
7 | 8 | ## Quickstart: 9 | 10 | ```bash 11 | pip3 install gzint 12 | ``` 13 | 14 | ```python 15 | >>> from gzint import HugeInt 16 | 17 | >>> normal_int = 10**1000000 # huge, but compressible (lots of 0's) 18 | >>> huge_int = HugeInt(normal_int) 19 | 20 | # HugeInts are useful when needing to store lots of large numbers without running out of memory 21 | # Notice how the memory footprint of a normal int is much larger than the equivalent HugeInt 22 | >>> normal_int.__sizeof__() 23 | 442948 # almost 0.5mb!! 24 | >>> huge_int._value.__sizeof__() 25 | 1025 # only 1kb 26 | 27 | # HugeInts and normal ints are interchangeably comparable, and have the same hashes 28 | >>> HugeInt(5) == 5 29 | True 30 | >>> HugeInt(5) + 5 31 | 10 32 | >>> HugeInt(5) + HugeInt(5) 33 | 10 34 | >>> 5 in {HugeInt(5), 6, 7} # uses python's hashes of the original int for identity 35 | True 36 | 37 | # Of course, this is all silly if you know beforehand that you're only storing 10**1000000, you can just store the string '10**10^6' (57 bytes), and compute it later. 38 | # This applies to almost all compressible data, if you know beforehand what you're storing, picking the perfect compression method is easy. 39 | # The tricky part is applying general compression methods, because compression is expensive and it's not worth the CPU cost of trying methods sequentially until you find the right one. 40 | # gzip is a fairly simple compression algorithm for catching repeating data, I'm also planning on testing JPEG-style fft compression. 41 | ``` 42 | 43 | ## Theory: 44 | 45 | This library is not magic, I have not somehow figured out how to break the [pigeon-hole principle](https://en.wikipedia.org/wiki/Pigeonhole_principle). 
46 | It simply exploits the fact that most large numbers we work with in real life are not 100% random, and 47 | either contain repeating patterns (like lots of 0's) or can be represented compactly by using notations like 48 | scientific notation, factorial notation, [knuth's up-arrow notation](https://en.wikipedia.org/wiki/Knuth%27s_up-arrow_notation), etc. 49 | 50 | Do not bother trying to use this library if you're actually reading random data, 51 | it will only make your `int`s bigger. 52 | 53 | The alpha implementation works by compressing repeating patterns in the base-10 representation of `int`s, 54 | which works very well for large numbers with lots of repeating digits (in base-10). I'm working on 55 | adding other compression schemes, and automatically picking the one with the most memory savings (which may 56 | require adding threading to compress the int in several different ways concurrently). 57 | 58 | Another possible option is to try and compress all the `int`s used across an entire program, by storing some state 59 | every time a HugeInt is created, and seeing if patterns exist globally that can be compressed together. 60 | 61 | ## Docs: 62 | 63 | `HugeInt` is a type which aids in storing very large, but **compressible numbers** in memory in python >= 3.5. 64 | It sacrifices CPU time during initialization and math operations, for fast comparisons and at-rest memory efficiency. 65 | 66 | `HugeInt` implements the `int` interface, you can almost always treat it like a normal python `int`. 67 | It will fall back to creating the full `int` in memory if an operation is not supported on the compressed form (e.g. multiplication). 
68 | 69 | `HugeInt` provides these methods on top of `int`: 70 | 71 | ```python 72 | - HugeInt.__init__: Initialize a HugeInt from an `int` or str representation 73 | - HugeInt.__eq__: Efficiently compare a `HugeInt` with another `HugeInt` or `int` 74 | - HugeInt.__str__: Get the full `str` representation of the `HugeInt` 75 | - HugeInt.__repr__: Get a short representation of the `HugeInt` suitable for console display 76 | - HugeInt.__hash__: Get the `__hash__` of the uncompressed `int` 77 | - HugeInt.to_int: Get the `int` representation of the `HugeInt` 78 | ``` 79 | 80 | Because `HugeInt` stores a compressed representation of the number, fast, direct math operations are not possible. 81 | For the following operations, the number gets de-compressed, the operation performed using the `int` 82 | equivalent method, and then the result is re-compressed and returned as a `HugeInt` (which can be very slow). 83 | 84 | `__abs__`, `__add__`, `__and__`, `__ceil__`, `__floor__`, `__floordiv__`, `__int__`, `__invert__`, `__le__`, `__lshift__`, `__lt__`, `__mod__`, `__mul__`, `__neg__`, `__or__`, `__pos__`, `__pow__`, `__radd__`, `__rand__`, `__rfloordiv__`, `__rlshift__`, `__rmod__`, `__rmul__`, `__ror__`, `__round__`, `__rpow__`, `__rrshift__`, `__rshift__`, `__rsub__`, `__rtruediv__`, `__rxor__`, `__sub__`, `__truediv__`, `__trunc__`, `__xor__` 85 | 86 | **Example Use Case:** 87 | 88 | Read a file full of huge numbers, and check to see which ones occur more than once (in `O(n)` time). 
89 | 90 | ```python 91 | numbers_seen = set() 92 | 93 | for line in open('big_data.txt', 'r'): 94 | compressed_int = HugeInt(line) 95 | if compressed_int in numbers_seen: 96 | print('Found a familiar number:', repr(compressed_int)) 97 | numbers_seen.add(compressed_int) 98 | 99 | del line 100 | 101 | if 1000 in numbers_seen: 102 | print('Saw 1000') 103 | 104 | if HugeInt(10**1000000) in numbers_seen: 105 | print('Saw 10^1,000,000') 106 | ``` 107 | 108 | **Why `HugeInt` is slow to init:** 109 | 110 | You may notice that initializing big `HugeInt`s takes some time. This is because `HugeInt` uses 111 | the gzip "deflate" algorithm, and must perform an O(n) pass over the number, where n is the number of digits in base-10. 112 | Due to this initial cost, it's recommended to avoid using `HugeInt`s for applications where you will need to re-initialize 113 | many `HugeInt`s, or perform many math operations on `HugeInt`s in memory. 114 | 115 | Right now, only `__eq__` (`==`) and `__hash__` (`in`) are optimized to work directly on the compressed number, 116 | other operations will fall back to decompressing back to an `int` and using the slower `int` math methods, 117 | then recompressing the returned value. 118 | 119 | ## Development: 120 | 121 | ```bash 122 | git clone https://github.com/pirate/gzint.git # python3.5 is the only dependency (brew install python3) 123 | cd gzint 124 | python3.5 setup.py test # optional, check that tests are passing 125 | python3.5 setup.py install 126 | # all code is inside gzint/main.py 127 | ``` 128 | 129 | **TODOs:** 130 | 131 | 1. 
Implement more compression methods and allow users to manually choose which one, with a way to find the optimal one for a given number: 132 | - gzipped hex, binary, octal, or other base representations of the number 133 | - base + exponents 134 | - scientific notation 135 | - knuth's up-arrow notation 136 | - factorial notation 137 | - prime factor notation 138 | - other polynomial representations 139 | - python [rational number support](https://docs.python.org/3.6/library/numbers.html#numbers.Rational) 140 | 2. Fall back to storing the int uncompressed if compression ends up making it bigger 141 | 3. Speed up/parallelize the compression & decompression 142 | 4. See if more math operations can be performed directly on compressed `HugeInt`s without uncompressing first, depending on compression method 143 | 5. Use a cached_property to prevent decompressing the same HugeInt repeatedly during `int` operations (allow expiry eventually with timeout to get GC benefits...?) 144 | --------------------------------------------------------------------------------