├── diskarray ├── __init__.py ├── exception.py ├── strarray.py ├── command.py ├── vararray.py └── diskarray.py ├── test.py ├── setup.py ├── LICENSE ├── .gitignore ├── .travis.yml └── README.md /diskarray/__init__.py: -------------------------------------------------------------------------------- 1 | from .command import main 2 | from .diskarray import DiskArray 3 | from .vararray import DiskVarArray 4 | from .strarray import DiskStringArray 5 | -------------------------------------------------------------------------------- /diskarray/exception.py: -------------------------------------------------------------------------------- 1 | class DiskArrayException(Exception): 2 | pass 3 | 4 | 5 | class AppendNotSupported(DiskArrayException): 6 | def __init__(self, naxes): 7 | self.naxes = naxes 8 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import doctest 4 | import unittest 5 | 6 | from diskarray import diskarray, vararray 7 | 8 | 9 | def suitefn(): 10 | suite = unittest.TestSuite() 11 | suite.addTests(doctest.DocTestSuite(diskarray)) 12 | suite.addTests(doctest.DocTestSuite(vararray)) 13 | return suite 14 | 15 | 16 | if __name__ == "__main__": 17 | doctest.testmod(diskarray) 18 | doctest.testmod(vararray) 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | version = "0.1.9" 4 | setup( 5 | name="diskarray", 6 | version=version, 7 | description="A resizable and readable numpy array on disk", 8 | keywords="diskarray", 9 | author="Deep Compute, LLC", 10 | author_email="contact@deepcompute.com", 11 | url="https://github.com/deep-compute/diskarray", 12 | download_url="https://github.com/deep-compute/diskarray/tarball/%s" % version, 13 | license="MIT License", 14 | install_requires=["numpy>=1.14.3", "basescript>=0.2.9"], 15 | package_dir={"diskarray": "diskarray"}, 16 | packages=find_packages("."), 17 | include_package_data=True, 18 | test_suite="test.suitefn", 19 | entry_points={"console_scripts": ["diskarray = diskarray:main"]}, 20 | ) 21 | -------------------------------------------------------------------------------- /diskarray/strarray.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | 3 | import numpy as np 4 | 5 | from .vararray import DiskVarArray 6 | 7 | 8 | class DiskStringArray(DiskVarArray): 9 | # Index to word 10 | def __init__(self, dpath, mode="r+", growby=DiskVarArray.GROWBY, log=Logger): 11 | super(DiskStringArray, self).__init__( 12 | dpath, 13 | dtype=np.uint8, 14 | dtype_index=np.uint64, 15 | mode=mode, 16 | growby=growby, 17 | log=log, 18 | ) 19 | 20 | def __getitem__(self, idx): 21 | data = super(DiskStringArray, self).__getitem__(idx) 22 | return data.tostring() 23 | 24 | def append(self, v): 25 | v = np.array(list(v), dtype=np.uint8) 26 | return super(DiskStringArray, self).append(v) 27 | 28 | def extend(self, v): 29 | v = [np.array(list(x), dtype=np.uint8) for x in v] 30 | return super(DiskStringArray, self).extend(v) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Deep Compute, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | deploy: 2 | - provider: releases 3 | api_key: 4 | secure: WUC/4w25ZGvmDRItWZbqFVbRhgTqZIWeHye5enfHFKkrY5tn3RxmwKY3s0qHCna8mp5poe01cIe5eDKHuRoSoJTPsXbSkEp5yoj00mCTB1pByhc1axarmvFIhUiXt23QTi5b13ju6IsPll4/GAAqiAQydhBCDuN5wQEol3ilgEBkTvCVZE3e/AowJBNic/7EHS+UsfS7EOkjtCPMHzFycsUcjG9+1sm24nn7Bxq24iCEaeNZBieN2nSdNPzNUaW0qv07zgGPE87jL/5mek2DGH8gSN0qVNDuus1MCbjrPxDqz3M+LypA1YcspIXA+2geArfWYVvpYEvgF5c89s9M2jnn1UlA8A10s8ZRAUvE0kLsAUMMzQNvQklmV7Vfn/8fCWvsc5tUw4G4nP5K352q+/u4DOJ3YATIMT9uzRwbcLWPRP3UjdniLriY9oJAfImIgQRq5XC0pcbpATaswZgg3Ol6L0Aa/V+b2f0yFp6Hkyy3MNTBlSK2/zNtKvR6HxFC1YZzfXBpgLwzmNZdufxnfhwsiDobb/EjUu5fh6U6kGKk+kxEh3iqx1s6KcIKrMi47RMSmLr79OYKRMS/mBkSgmOFB3wC+B7xE/kuJ+MuecFpJWW0Ifdnm2mu++D8ASTKVzxHooxhv9zIyxZZ/iT1JAHjfku146KkYWZE5FTtkbs= 5 | name: diskarray-0.1.9 6 | tag_name: 0.1.9 7 | true: 8 | repo: deep-compute/diskarray 9 | - provider: pypi 10 | distributions: sdist bdist_wheel 11 | server: https://upload.pypi.org/legacy/ 12 | user: deepcompute 13 | password: 14 | secure: XScKeAhmF7gGThEjJCtur1RHlGUZ2n3FXukp7619YHmLoUzvszo8Wg+ZSJrdl3soEuSvVQdn2G3ngxAZdKmNhWVYrUKRdKL5iY4WyYHVujkOq+diVqCGHWbSmZJupyscgt1L/H8l+IohC1dnng/ThQuFp7Jbay68lM8LzS20f16JgSL6Xq+jRqqtBU3jALoqf9scnwuXG+Yj51YTQ9DmS37ctlLyzg3GEbICQB1dNaSZ3HtM1LB0/69++rhukzicm1Z9FcNEbdL9U7ohAgrI1+0mj/4xtURIrOwvlKjhjXUxf5S2RMe49xFq4KDRkXL8uwUEvRnskwXH6u0+mkAQRpMYFMygxcmiVhuhhelJD43RuO56o84IWQLAwje/RUeH1huePUNBV63tAXQC1uQ4yoaZs/DehtjDjgIkk/j4xdTnlwaN68icGrnWxZ5QCwZkz16OQ84HolTLGG9X1Fuqs7iYGI9GmBYfSG6FVp6H6E1Cakvc492gAVqIJKln6Y5u8a1SPpHp2jhhqzsMQo/fwqDTc8m59ENX1zxcIgtJjaWrgkJ4U+W4GBosKpevTJCLKzpirsfJ2Al/7E7rTEOpXpw3NNeeyben708/RiPbWVvhE6yZ7Z6XHYzIO+fF9gr0pRiqdL3uL3ASCUlNEBvw5j7adH57foeTf8mZkQqLyrU= 15 | true: 16 | branch: master 17 | tags: true 18 | install: 19 | - pip install . 20 | language: python 21 | python: 22 | - '3.5' 23 | script: 24 | - docker run -v $(pwd):/app deepcompute/black:python-black-latest --check . 25 | - python setup.py test 26 | -------------------------------------------------------------------------------- /diskarray/command.py: -------------------------------------------------------------------------------- 1 | import code 2 | 3 | from basescript import BaseScript 4 | import numpy as np 5 | 6 | from .diskarray import DiskArray 7 | 8 | 9 | class DiskArrayCommand(BaseScript): 10 | DESC = "DiskArray command-line tool" 11 | 12 | DEFAULT_CAPACITY = None 13 | DEFAULT_GROWBY = 10000 14 | DEFAULT_MODE = "r+" 15 | 16 | def interact(self): 17 | if self.args.capacity: 18 | capacity = eval(self.args.capacity) 19 | else: 20 | capacity = self.args.capacity 21 | 22 | fpath = self.args.fpath 23 | shape = eval(self.args.shape) 24 | growby = self.args.growby 25 | dtype = eval(self.args.dtype) 26 | mode = self.args.mode 27 | 28 | interact = DiskArray( 29 | fpath=fpath, 30 | shape=shape, 31 | capacity=capacity, 32 | growby=growby, 33 | dtype=dtype, 34 | mode=mode, 35 | ) 36 | 37 | namespace = dict(da=interact) 38 | code.interact("DiskArray Console", local=namespace) 39 | 40 | def define_subcommands(self, subcommands): 41 | super(DiskArrayCommand, self).define_subcommands(subcommands) 42 | 43 | interact_cmd = subcommands.add_parser("interact", help="DiskArray Console") 44 | interact_cmd.set_defaults(func=self.interact) 45 | interact_cmd.add_argument( 46 | "fpath", 47 | help="Input file which is used to store disk arrys.\ 48 | eg: /tmp/disk.array", 49 | ) 50 | interact_cmd.add_argument( 51 | "shape", 52 | help="shape is the size of the disk array.\ 53 | eg: '(0, 3)'", 54 | ) 55 | interact_cmd.add_argument( 56 | "dtype", 57 | help="data type of the disk array.\ 58 | eg: np.float32", 59 | ) 60 | interact_cmd.add_argument( 61 | "-c", 62 | "--capacity", 63 | default=self.DEFAULT_CAPACITY, 64 | type=str, 65 | help="capacity is the total capacity of the disk array.\ 66 | This is optional and default is shape value\ 67 | eg: --capacity '(10, 3)'", 68 | ) 69 | interact_cmd.add_argument( 70 | "-g", 71 | "--growby", 72 | default=self.DEFAULT_GROWBY, 73 | type=int, 74 | help="growby is used to increase the size of\ 75 | the disk array when it reaches to it's maximum limit.\ 76 | This is optional and default is 10000\ 77 | eg: --growby 200", 78 | ) 79 | interact_cmd.add_argument( 80 | "-m", 81 | "--mode", 82 | default=self.DEFAULT_MODE, 83 | type=str, 84 | help="mode is to open the disk array in that mode.\ 85 | Example modes are r+, r, w+ and c\ 86 | This is optional and default is r+", 87 | ) 88 | 89 | 90 | def main(): 91 | DiskArrayCommand().start() 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DiskArray 2 | 3 | A resizable and readable numpy array on disk. 4 | 5 | This module is built on numpy `memmap` used for accessing and modifying small segments of large files on disk, without reading the entire file into memory. 6 | 7 | This module also supports appending your numpy arrays to disk array at any time. 8 | 9 | ## Installation 10 | 11 | > Prerequisites: Python 12 | 13 | ```bash 14 | $ sudo pip install diskarray 15 | ``` 16 | 17 | ## Quick Example 18 | 19 | ```python 20 | >>> import numpy as np 21 | >>> from diskarray import DiskArray 22 | 23 | >>> data = np.array([[2 , 3, 4], [1, 2, 3]]) 24 | 25 | >>> da = DiskArray('/tmp/disk.array', shape=(0, 3), dtype=np.float32) 26 | 27 | >>> da.extend(data) 28 | 29 | >>> print(da[:]) 30 | ``` 31 | 32 | ## Usage 33 | 34 | `DiskArray` supports two methods, extend and append. 35 | 36 | `extend` is used to append arrays to disk array. 37 | 38 | `append` is used to append single array at a time. 39 | 40 | ### Importing 41 | 42 | #### Using extend 43 | 44 | Example1: 45 | 46 | ```python 47 | >>> import numpy as np 48 | >>> from diskarray import DiskArray 49 | 50 | >>> data = np.array([[2 , 3, 4], [1, 2, 3]]) 51 | 52 | # creating object to disk array 53 | >>> da = DiskArray('/tmp/disk.array', shape=(0, 3), capacity=(10, 3), growby=200, dtype=np.float32) 54 | 55 | # extend the data to disk array 56 | >>> da.extend(data) 57 | 58 | # Get the full array 59 | >>> print(da[:]) 60 | 61 | # Get the data which is in first row 62 | >>> print(da[1]) 63 | 64 | # Get the data from first row to third row 65 | >>> print(da[1:3]) 66 | 67 | # Get the data which is in 1st row 1st column 68 | >>> print(da[1][1]) 69 | ``` 70 | 71 | - `/tmp/disk.array` is the file which holds disk arrays. 72 | - `shape` is the size of the disk array. 73 | - `capacity` is the total capacity of the disk array. 74 | This is used because when we want to extend arrays which are larger than `shape` then DiskArray creates again memmap to the file which is costliear operation. 75 | So we are using `capacity` to directly create disk array with the size of `capacity` 76 | 77 | - `capacity` and `growby` are optional which takes `shape` as `capacity` and `growby` as `10000` when these are not given. 78 | 79 | Example2: 80 | 81 | ```python 82 | >>> import numpy as np 83 | >>> from diskarray import DiskArray 84 | 85 | >>> dtype = [('token', np.uint32), ('count', np.uint32), ('vec', np.float32)] 86 | 87 | >>> data = np.array([[(1, 0, 0.), (0, 2, 0.), (0, 2, 0.)], [(1, 0, 0.), (0, 2, 0.), (0, 2, 0.)]], dtype=dtype) 88 | 89 | >>> da = DiskArray('/tmp/disk.array', shape=(0, 3), capacity=(10, 3), dtype=dtype) 90 | 91 | >>> da.extend(data) 92 | 93 | # Get the full array 94 | >>> print(da[:]) 95 | 96 | # Get the count values at 1th row 97 | >>> print(da[1]['count']) 98 | 99 | # Get the token value at 1th row 2nd column 100 | >>> print(da[1][2]['token']) 101 | 102 | # Modify the vec value at 1th row 2nd column 103 | >>> da[1][2]['vec'] = 10.0 104 | ``` 105 | 106 | #### Using append 107 | 108 | Example: 109 | 110 | ```python 111 | >>> import numpy as np 112 | >>> from diskarray import DiskArray 113 | 114 | >>> data = np.array([[2 , 3, 4]) 115 | 116 | # creating object to disk array 117 | >>> da = DiskArray('/tmp/disk.array', shape=(0, 3), capacity=(10, 3), growby=200, dtype=np.float32) 118 | 119 | # append 1 dimensional array to disk array 120 | >>> da.append(data) 121 | >>> da.append(data + 1) 122 | 123 | # Get the full array 124 | >>> print(da[:]) 125 | 126 | # Get the data which is in first row 127 | >>> print(da[1]) 128 | 129 | # Get the data from first row to third row 130 | >>> print(da[1:3]) 131 | 132 | # Get the data which is in 1st row 1st column 133 | >>> print(da[1][1]) 134 | ``` 135 | 136 | `growby` is used to increase the size of disk array when it reaches to it's maximum limit. 137 | 138 | ### Interactive console 139 | 140 | ```bash 141 | # diskarray provides command to directly interact with it 142 | 143 | $ diskarray interact --capacity --growby --mode 144 | 145 | # is the input file which is used to store disk arrys. 146 | # is the size of the disk array. 147 | # is the data type of the disk array. 148 | # is the total capacity of the disk array. 149 | # is used to increase the size of the disk array when it reaches to it's maximum limit. 150 | # is to open the disk array in that mode. 151 | ``` 152 | 153 | Example: 154 | 155 | ```bash 156 | $ diskarray interact /tmp/test '(0, 3)' np.float32 --capacity '(10, 3)' --growby 5 --mode r+ 157 | DiskArray Console 158 | >>> import numpy as np 159 | >>> da.append(np.array([1, 2, 3])) 160 | ``` 161 | 162 | ## Running Tests 163 | 164 | ``` 165 | $ python setup.py test 166 | ``` 167 | -------------------------------------------------------------------------------- /diskarray/vararray.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from logging import Logger 4 | 5 | import numpy as np 6 | 7 | from .diskarray import DiskArray 8 | 9 | 10 | class DiskVarArray(object): 11 | GROWBY = DiskArray.GROWBY 12 | 13 | def __init__( 14 | self, dpath, dtype, dtype_index=np.uint64, mode="r+", growby=GROWBY, log=Logger 15 | ): 16 | """ 17 | >>> import numpy as np 18 | >>> from diskarray import DiskVarArray 19 | >>> d = DiskVarArray('/tmp/test1', dtype='uint32') 20 | >>> d # doctest:+ELLIPSIS 21 | 22 | """ 23 | 24 | self._dpath = dpath 25 | self._dtype = dtype 26 | self._dtype_index = dtype_index 27 | self._mode = mode 28 | self._growby = growby 29 | self.log = log 30 | 31 | if not os.path.exists(dpath): 32 | os.makedirs(dpath) 33 | 34 | self._data_fpath = os.path.join(dpath, "data") 35 | self.data = DiskArray( 36 | self._data_fpath, dtype=dtype, mode=mode, growby=growby, log=log 37 | ) 38 | 39 | self._index_fpath = os.path.join(dpath, "index") 40 | self.index = DiskArray( 41 | self._index_fpath, dtype=dtype_index, mode=mode, growby=growby, log=log 42 | ) 43 | 44 | def flush(self): 45 | self.data.flush() 46 | self.index.flush() 47 | 48 | @property 49 | def dtype(self): 50 | """ 51 | >>> import numpy as np 52 | >>> d = DiskVarArray('/tmp/test1', dtype='uint32') 53 | >>> d.dtype 54 | 'uint32' 55 | >>> shutil.rmtree('/tmp/test1', ignore_errors=True) 56 | """ 57 | return self._dtype 58 | 59 | @property 60 | def dtype_index(self): 61 | """ 62 | >>> import numpy as np 63 | >>> d = DiskVarArray('/tmp/test1', dtype='uint32') 64 | >>> d.dtype_index 65 | 66 | >>> shutil.rmtree('/tmp/test1', ignore_errors=True) 67 | """ 68 | return self._dtype_index 69 | 70 | def __getitem__(self, idx): 71 | """ 72 | >>> import numpy as np 73 | >>> d = DiskVarArray('/tmp/test1', dtype='uint32') 74 | >>> d.append([1, 2, 3, 4]) 75 | >>> d.__getitem__(0) 76 | memmap([1, 2, 3, 4], dtype=uint32) 77 | >>> shutil.rmtree('/tmp/test1', ignore_errors=True) 78 | """ 79 | sindex = self.index[idx] 80 | 81 | if idx == (len(self.index) - 1): 82 | eindex = len(self.data) 83 | else: 84 | eindex = self.index[idx + 1] 85 | 86 | return self.data[sindex:eindex] 87 | 88 | @property 89 | def num_elements(self): 90 | """ 91 | >>> import numpy as np 92 | >>> d = DiskVarArray('/tmp/test1', dtype='uint32') 93 | >>> d.append([1, 2, 3, 4]) 94 | >>> d.num_elements 95 | 4 96 | >>> shutil.rmtree('/tmp/test1', ignore_errors=True) 97 | """ 98 | return len(self.data) 99 | 100 | @property 101 | def num_lists(self): 102 | """ 103 | >>> import numpy as np 104 | >>> d = DiskVarArray('/tmp/test2', dtype='uint32') 105 | >>> d.append([1, 2, 3, 4]) 106 | >>> d.num_lists 107 | 1 108 | >>> d.append([5, 6, 7, 8]) 109 | >>> d.num_lists 110 | 2 111 | >>> shutil.rmtree('/tmp/test2', ignore_errors=True) 112 | """ 113 | return len(self.index) 114 | 115 | def __len__(self): 116 | return self.num_lists 117 | 118 | def append(self, v): 119 | """ 120 | >>> d = DiskVarArray('/tmp/test3', dtype='uint32') 121 | >>> d.append([1, 2, 3, 4]) 122 | >>> d.__getitem__(0) 123 | memmap([1, 2, 3, 4], dtype=uint32) 124 | >>> d.append([5, 6, 7, 8]) 125 | >>> d.__getitem__(1) 126 | memmap([5, 6, 7, 8], dtype=uint32) 127 | >>> shutil.rmtree('/tmp/test3', ignore_errors=True) 128 | """ 129 | self.index.append(len(self.data)) 130 | self.data.extend(v) 131 | 132 | def extend(self, v): 133 | lengths = np.cumsum([len(x) for x in v]) 134 | self.index.append(0) 135 | self.index.extend(lengths[:-1]) 136 | 137 | vals = np.concatenate(v) 138 | self.data.extend(vals) 139 | 140 | def destroy(self): 141 | """ 142 | >>> import numpy as np 143 | >>> d = DiskVarArray('/tmp/test4', dtype='uint32') 144 | >>> d.append([1, 2, 3, 4]) 145 | >>> d.destroy # doctest:+ELLIPSIS 146 | > 147 | >>> shutil.rmtree('/tmp/test4', ignore_errors=True) 148 | """ 149 | 150 | self.data.destroy() 151 | self.data = None 152 | 153 | self.index.destroy() 154 | self.index = None 155 | 156 | def close(self): 157 | self.data.close() 158 | self.index.close() 159 | -------------------------------------------------------------------------------- /diskarray/diskarray.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from functools import reduce 4 | from logging import Logger 5 | 6 | import numpy as np 7 | 8 | from .exception import AppendNotSupported 9 | 10 | 11 | class DiskArray(object): 12 | """ 13 | Stores binary data on disk as a memory mapped file 14 | using numpy.memmap. Allows for growing the disk data 15 | by appending and extending. 16 | 17 | Links: 18 | * https://en.wikipedia.org/wiki/Memory-mapped_file 19 | 20 | # FIXME: 21 | 1. Explain capacity and actual shape 22 | 2. Explain growby 23 | 3. Explain not having to specify shape for 1d arrays 24 | 4. Explain using structured arrays 25 | 5. Why memory mapping? What does it provide? 26 | 6. Why not use np.save and np.load? 27 | """ 28 | 29 | GROWBY = 10000 30 | 31 | def __init__( 32 | self, 33 | fpath, 34 | dtype, 35 | mode="r+", 36 | shape=None, 37 | capacity=None, 38 | growby=GROWBY, 39 | log=Logger, 40 | ): 41 | """ 42 | >>> import numpy as np 43 | >>> da = DiskArray('/tmp/test.array', shape=(0, 3), dtype=np.float32) 44 | >>> print(da[:]) 45 | [[0. 0. 0.]] 46 | """ 47 | 48 | itemsize = np.dtype(dtype).itemsize 49 | 50 | if not os.path.exists(fpath): 51 | if not shape: 52 | shape = (0,) 53 | # FIXME: what if capacity is defined? 54 | if not capacity: 55 | capacity = tuple([max(x, 1) for x in shape]) 56 | 57 | n_init_capacity = self._shape_bytes(capacity, itemsize) 58 | open(fpath, "w").write("\x00" * n_init_capacity) # touch file 59 | 60 | if not shape: 61 | n = int(os.path.getsize(fpath) / itemsize) 62 | shape = (n,) 63 | 64 | self._fpath = fpath 65 | self._shape = shape 66 | self._capacity_shape = capacity or shape 67 | self._dtype = dtype 68 | self._mode = mode 69 | self._growby = growby 70 | self.log = log 71 | 72 | self.data = None 73 | self._update_ndarray() 74 | 75 | def _update_ndarray(self): 76 | if self.data is not None: 77 | self.data.flush() 78 | 79 | self._create_ndarray() 80 | 81 | def _create_ndarray(self): 82 | self.data = np.memmap( 83 | self._fpath, shape=self._capacity_shape, dtype=self._dtype, mode=self._mode 84 | ) 85 | if self._shape is None: 86 | self._shape = self.data.shape 87 | 88 | def flush(self): 89 | self.data.flush() 90 | self._truncate_if_needed() 91 | 92 | def _shape_bytes(self, shape, dtype_bytes): 93 | return reduce((lambda x, y: x * y), shape) * dtype_bytes 94 | 95 | def _truncate_if_needed(self): 96 | fd = os.open(self._fpath, os.O_RDWR | os.O_CREAT) 97 | try: 98 | dtype_bytes = np.dtype(self._dtype).itemsize 99 | nbytes = self._shape_bytes(self._shape, dtype_bytes) 100 | os.ftruncate(fd, nbytes) 101 | self._capacity_shape = self._shape 102 | finally: 103 | os.close(fd) 104 | self._create_ndarray() 105 | 106 | @property 107 | def shape(self): 108 | return self._shape 109 | 110 | @property 111 | def capacity(self): 112 | return self._capacity_shape 113 | 114 | @property 115 | def dtype(self): 116 | return self._dtype 117 | 118 | def __getitem__(self, idx): 119 | return self.data[idx] 120 | 121 | def __setitem__(self, idx, v): 122 | self.data[idx] = v 123 | 124 | def __len__(self): 125 | return self._shape[0] 126 | 127 | def _incr_shape(self, shape, n): 128 | _s = list(shape) 129 | _s[0] += n 130 | return tuple(_s) 131 | 132 | def append(self, v): 133 | """ 134 | >>> import numpy as np 135 | >>> da = DiskArray('/tmp/test.array', shape=(0, 3), growby=3, dtype=np.float32) 136 | >>> print(da[:]) 137 | [] 138 | >>> data = np.array([[2,3,4], [1, 2, 3]]) 139 | >>> da.append(data[0]) 140 | >>> print(da[:]) 141 | [[2. 3. 4.] 142 | [0. 0. 0.] 143 | [0. 0. 0.]] 144 | """ 145 | 146 | # FIXME: for now we only support 147 | # append along axis 0 and only 148 | # for 1d and 2d arrays 149 | 150 | # FIXME: for now we only support 151 | # appending one item at a time 152 | 153 | nrows = self._shape[0] 154 | nrows_capacity = self._capacity_shape[0] 155 | 156 | if nrows == nrows_capacity: 157 | self._capacity_shape = self._incr_shape(self._capacity_shape, self._growby) 158 | self._update_ndarray() 159 | 160 | shapelen = len(self._shape) 161 | 162 | if shapelen not in (1, 2): 163 | raise AppendNotSupported(shapelen) 164 | 165 | self.data[nrows] = v 166 | self._shape = self._incr_shape(self._shape, 1) 167 | 168 | def extend(self, v): 169 | """ 170 | >>> import numpy as np 171 | >>> da = DiskArray('/tmp/test.array', shape=(0, 3), capacity=(10, 3), dtype=np.float32) 172 | >>> print(da[:]) 173 | [[2. 3. 4.] 174 | [0. 0. 0.] 175 | [0. 0. 0.] 176 | [0. 0. 0.] 177 | [0. 0. 0.] 178 | [0. 0. 0.] 179 | [0. 0. 0.] 180 | [0. 0. 0.] 181 | [0. 0. 0.] 182 | [0. 0. 0.]] 183 | >>> data = np.array([[2,3,4], [1, 2, 3]]) 184 | >>> da.extend(data) 185 | >>> print(da[:]) 186 | [[2. 3. 4.] 187 | [1. 2. 3.] 188 | [0. 0. 0.] 189 | [0. 0. 0.] 190 | [0. 0. 0.] 191 | [0. 0. 0.] 192 | [0. 0. 0.] 193 | [0. 0. 0.] 194 | [0. 0. 0.] 195 | [0. 0. 0.]] 196 | >>> os.remove('/tmp/test.array') 197 | """ 198 | 199 | nrows = self._shape[0] 200 | nrows_capacity = self._capacity_shape[0] 201 | remaining_capacity = nrows_capacity - nrows 202 | 203 | if remaining_capacity < len(v): 204 | diff = len(v) - remaining_capacity 205 | self._capacity_shape = self._incr_shape(self._capacity_shape, diff) 206 | self._update_ndarray() 207 | 208 | self.data[nrows : nrows + len(v)] = v 209 | self._shape = self._incr_shape(self._shape, len(v)) 210 | 211 | def grow(self, n): 212 | # FIXME: code 213 | pass 214 | 215 | def close(self): 216 | self.data._mmap.close() 217 | del self.data 218 | del self._fpath 219 | 220 | def truncate(self, n): 221 | # FIXME: code 222 | pass 223 | 224 | def destroy(self): 225 | self.data = None 226 | os.remove(self._fpath) 227 | --------------------------------------------------------------------------------