├── .coveragerc ├── .gitignore ├── .scrutinizer.yml ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── examples └── readme_examples.py ├── jagged ├── __init__.py ├── base.py ├── bcolz_backend.py ├── benchmarks │ ├── __init__.py │ └── utils.py ├── blosc_backend.py ├── bloscpack_backend.py ├── compression │ ├── __init__.py │ └── compressors.py ├── h5py_backend.py ├── joblib_backend.py ├── misc.py ├── mmap_backend.py ├── npy_backend.py ├── pickle_backend.py └── tests │ ├── __init__.py │ ├── fixtures.py │ └── test_raw_stores.py ├── setup.cfg └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | pragma: no cover 4 | def __repr__ 5 | if self.debug: 6 | if settings.DEBUG 7 | raise AssertionError 8 | raise NotImplementedError 9 | if __name__ == .__main__.: 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # "Compiled" python 2 | *.py[cod] 3 | 4 | # C extensions 5 | *.so 6 | *.o 7 | 8 | # Packages 9 | *.egg 10 | *.egg-info 11 | dist 12 | build 13 | eggs 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | 23 | # Installer logs 24 | pip-log.txt 25 | 26 | # Unit test / coverage reports 27 | .coverage 28 | .tox 29 | nosetests.xml 30 | 31 | # Translations 32 | *.mo 33 | 34 | # Mr Developer 35 | .mr.developer.cfg 36 | .project 37 | .pydevproject 38 | 39 | # Others 40 | .cache 41 | -------------------------------------------------------------------------------- /.scrutinizer.yml: -------------------------------------------------------------------------------- 1 | checks: 2 | python: 3 | code_rating: true 4 | duplicate_code: true 5 | variables_used_before_assignment: true 6 | variables_unused_wildcard_import: true 7 | variables_unused_variable: true 8 | variables_unused_import: true 9 | variables_unused_argument: true 10 | variables_unpacking_non_sequence: true 11 | variables_undefined_variable: true 12 | variables_undefined_loop_variable: true 13 | variables_undefined_all_variable: true 14 | variables_unbalanced_tuple_unpacking: true 15 | variables_redefined_outer_name: true 16 | variables_redefined_builtin: true 17 | variables_redefine_in_handler: true 18 | variables_no_name_in_module: true 19 | variables_invalid_all_object: true 20 | variables_global_variable_undefined: true 21 | variables_global_variable_not_assigned: true 22 | variables_global_statement: true 23 | variables_global_at_module_level: true 24 | typecheck_unexpected_keyword_arg: true 25 | typecheck_too_many_function_args: true 26 | typecheck_redundant_keyword_arg: true 27 | typecheck_not_callable: true 28 | typecheck_no_value_for_parameter: true 29 | typecheck_no_member: true 30 | typecheck_missing_kwoa: true 31 | typecheck_maybe_no_member: true 32 | typecheck_duplicate_keyword_arg: true 33 | typecheck_assignment_from_none: true 34 | typecheck_assignment_from_no_return: true 35 | string_unused_format_string_key: true 36 | string_truncated_format_string: true 37 | string_too_many_format_args: true 38 | string_too_few_format_args: true 39 | string_mixed_format_string: true 40 | string_missing_format_string_key: true 41 | string_format_needs_mapping: true 42 | string_constant_anomalous_unicode_escape_in_string: true 43 | string_constant_anomalous_backslash_in_string: true 44 | string_bad_str_strip_call: true 45 | string_bad_format_string_key: true 46 | 
string_bad_format_character: true 47 | open_mode_bad_open_mode: true 48 | newstyle_bad_super_call: true 49 | logging_unsupported_format: true 50 | logging_too_many_args: true 51 | logging_too_few_args: true 52 | logging_not_lazy: true 53 | logging_format_truncated: true 54 | miscellaneous_fixme: true 55 | imports_wildcard_import: true 56 | imports_relative_import: true 57 | imports_reimported: true 58 | imports_import_self: true 59 | imports_import_error: true 60 | imports_deprecated_module: true 61 | imports_cyclic_import: true 62 | format_unnecessary_semicolon: true 63 | format_trailing_whitespace: true 64 | format_superfluous_parens: true 65 | format_old_ne_operator: true 66 | format_multiple_statements: true 67 | format_mixed_indentation: true 68 | format_missing_final_newline: true 69 | format_lowercase_l_suffix: true 70 | format_line_too_long: 71 | max_length: '120' 72 | format_bad_whitespace: true 73 | format_bad_indentation: 74 | indentation: '4 spaces' 75 | format_backtick: true 76 | exceptions_raising_string: true 77 | exceptions_raising_non_exception: true 78 | exceptions_raising_bad_type: true 79 | exceptions_pointless_except: true 80 | exceptions_notimplemented_raised: true 81 | exceptions_catching_non_exception: true 82 | exceptions_broad_except: true 83 | exceptions_binary_op_exception: true 84 | exceptions_bare_except: true 85 | exceptions_bad_except_order: true 86 | design_interface_not_implemented: true 87 | design_abstract_class_not_used: true 88 | design_abstract_class_little_used: true 89 | classes_valid_slots: true 90 | classes_super_init_not_called: true 91 | classes_signature_differs: true 92 | classes_protected_access: true 93 | classes_non_parent_init_called: true 94 | classes_non_iterator_returned: true 95 | classes_no_self_use: true 96 | classes_no_self_argument: true 97 | classes_no_method_argument: true 98 | classes_no_init: true 99 | classes_missing_interface_method: true 100 | classes_method_hidden: true 101 | classes_interface_is_not_class: true 102 | classes_bad_staticmethod_argument: true 103 | classes_bad_mcs_method_argument: true 104 | classes_bad_context_manager: true 105 | classes_bad_mcs_classmethod_argument: true 106 | classes_bad_classmethod_argument: true 107 | classes_attribute_defined_outside_init: true 108 | classes_arguments_differ: true 109 | classes_access_member_before_definition: true 110 | classes_abstract_method: true 111 | basic_yield_outside_function: true 112 | basic_useless_else_on_loop: true 113 | basic_unreachable: true 114 | basic_unnecessary_pass: true 115 | basic_unnecessary_lambda: true 116 | basic_star_args: true 117 | basic_return_outside_function: true 118 | basic_return_in_init: true 119 | basic_return_arg_in_generator: true 120 | basic_pointless_string_statement: true 121 | basic_pointless_statement: true 122 | basic_old_raise_syntax: true 123 | basic_not_in_loop: true 124 | basic_nonexistent_operator: true 125 | basic_missing_reversed_argument: true 126 | basic_missing_module_attribute: true 127 | basic_missing_docstring: true 128 | basic_invalid_name: 129 | functions: '[a-z_][a-z0-9_]{2,30}$' 130 | variables: '[a-z_][a-z0-9_]{2,30}$' 131 | whitelisted_names: 'i,j,k,ex,Run,_' 132 | constants: '(([A-Z_][A-Z0-9_]*)|(__.*__))$' 133 | attributes: '[a-z_][a-z0-9_]{2,30}$' 134 | arguments: '[a-z_][a-z0-9_]{2,30}$' 135 | class_attributes: '([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$' 136 | inline_vars: '[A-Za-z_][A-Za-z0-9_]*$' 137 | classes: '[A-Z_][a-zA-Z0-9]+$' 138 | modules: '(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$' 139 | 
methods: '[a-z_][a-z0-9_]{2,30}$' 140 | basic_lost_exception: true 141 | basic_init_is_generator: true 142 | basic_function_redefined: true 143 | basic_expression_not_assigned: true 144 | basic_exec_used: true 145 | basic_empty_docstring: true 146 | basic_duplicate_key: true 147 | basic_duplicate_argument_name: true 148 | basic_dangerous_default_value: true 149 | basic_bad_reversed_sequence: true 150 | basic_assert_on_tuple: true 151 | basic_abstract_class_instantiated: true 152 | basic_eval_used: true 153 | 154 | tools: 155 | external_code_coverage: 156 | timeout: 300 # How long should we wait for code coverage (in seconds). 157 | runs: 1 # In how many runs have you split your tests? 158 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - '2.7' 5 | - '3.5' 6 | 7 | before_install: 8 | # From http://conda.pydata.org/docs/travis.html 9 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 10 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 11 | else 12 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 13 | fi 14 | - bash miniconda.sh -b -p $HOME/miniconda 15 | - export PATH="$HOME/miniconda/bin:$PATH" 16 | - hash -r 17 | - conda config --set always_yes yes --set changeps1 no 18 | - conda update -q conda 19 | - conda info -a 20 | - travis_retry conda create -n test python=$TRAVIS_PYTHON_VERSION pip pytest numpy future bcolz h5py pandas toolz 21 | - source activate test 22 | - travis_retry pip install arpeggio pytest-cov pytest-pep8 codecov scrutinizer-ocular 23 | - travis_retry pip install blosc bloscpack joblib psutil 24 | - travis_retry pip install git+https://github.com/sdvillal/whatami.git@ced628c07bd1#egg=whatami==4.0.git 25 | 26 | script: 27 | py.test -v -rs --doctest-modules --pep8 --cov jagged --cov-report term-missing jagged 28 | 29 | after_success: 30 | - codecov 31 | - ocular --data-file ".coverage" --config-file ".coveragerc" 32 | 33 | notifications: 34 | email: false 35 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2013-2014 The jagged developers. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the jagged Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 32 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include LICENSE.txt -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | jagged 2 | ====== 3 | 4 | Efficient storage of same-type, uneven-size arrays 5 | -------------------------------------------------- 6 | 7 | |Pypi Version| |Build Status| |Coverage Status| |Scrutinizer Status| 8 | 9 | Jagged_ is an ongoing amateur project exploring the storage panorama 10 | for datasets containing (large amounts of) arrays with the same type 11 | and number of columns, but varying number of rows. Examples of such 12 | datasets for which *jagged* has been used are collections of multivariate 13 | timeseries (short animal behaviour snippets) and collections of molecules 14 | (represented as varying length strings). 15 | 16 | Jagged aims to help analyzing data in the laptop and the cluster, in batch 17 | or interactively, providing a very lightweight store. Jagged provides fast 18 | retrieval of array subsets for many-GB datasets containing millions of rows. 19 | 20 | Requirements 21 | ------------ 22 | 23 | All the requirements are pip-installable and listed in in pypi. 24 | 25 | Jagged needs numpy_, whatami_ and python-future_. 26 | 27 | Jagged stores build on top of several optional high quality python libraries: c-blosc_, python-blosc_, 28 | bloscpack_, bcolz_ and joblib_. Testing relies on pytest_. 29 | 30 | Getting the right combination for blosc, python-blosc, bcolz and bloscpack can be a challenge 31 | (but worth the effort). At the moment (2015/09/02), we recommend using the latest released 32 | versions of c-blosc (1.7.0) in combination with the latest releases of python-blosc (1.2.7) 33 | and bloscpack (0.9.0). 34 | 35 | Jagged runs in python 2.7+ and 3.4+. At the moment it has been tested only on linux, but it should 36 | work on mac and windows as well. 37 | 38 | 39 | Installation 40 | ------------ 41 | 42 | It should suffice to use pip:: 43 | 44 | pip install jagged 45 | 46 | Showcase 47 | -------- 48 | 49 | Using jagged is simple. There are different implementations that provide 50 | two basic methods: **append** adds a new array to the store, **get** retrieves 51 | collections of arrays identified by their insertion order in the store. Usually 52 | the lifecycle of a jagged store is also simple: there is no explicit open, 53 | append and get calls can be interleaved at will and the only needed action 54 | to warrant consistency is to close after write, which can be achieved by calling 55 | **close**, by calling *get* or by using a with statement with the provided 56 | context manager. 57 | 58 | This is a `real life`_ small example combining jagged with indices and queries 59 | over real data. 
60 | 61 | Another synthetic example follows: 62 | 63 | .. code:: python 64 | 65 | from __future__ import print_function 66 | import os.path as op 67 | import shutil 68 | import numpy as np 69 | import pandas as pd 70 | import tempfile 71 | from jagged.mmap_backend import JaggedByMemMap 72 | from jagged.blosc_backend import JaggedByBlosc 73 | 74 | # A Jagged instance is all you need 75 | mmap_dir = tempfile.mkdtemp('mmap') 76 | jagged = JaggedByMemMap(op.expanduser(path=mmap_dir)) 77 | # You can drop here any JaggedRawStore implementation you want to 78 | 79 | # Generate a random dataset 80 | print('Creating a random dataset...') 81 | rng = np.random.RandomState(0) 82 | num_arrays = 1000 83 | max_length = 2000 84 | num_columns = 100 85 | originals = [rng.randn(rng.randint(0, max_length), num_columns) 86 | for _ in range(num_arrays)] 87 | 88 | # Add these to the store ("with" context is usually optional, but recommended) 89 | print('Populating the jagged store...') 90 | with jagged: 91 | indices = list(map(jagged.append, originals)) 92 | 93 | # Some jagged stores optimize queries retrieving arrays by their insertion order 94 | # Retrieval speed should not suffer much even with random queries 95 | shuffled_indices = rng.permutation(indices).tolist() 96 | shuffled_originals = [originals[i] for i in shuffled_indices] 97 | 98 | # What do we have in store? 99 | print('Number of arrays: %d, number of rows: %d' % (jagged.narrays, jagged.nrows)) 100 | # Out: Number of arrays: 200, number of rows: 193732 101 | print('Jagged shape=%r, dtype=%r, order=%r' % 102 | (jagged.shape, jagged.dtype, jagged.order)) 103 | # Out: Jagged shape=(193732, 50), dtype=dtype('float64'), order='C' 104 | 105 | # Check roundtrip 106 | roundtrippeds = jagged.get(shuffled_indices) 107 | for original, roundtripped in zip(shuffled_originals, roundtrippeds): 108 | assert np.array_equal(original, roundtripped) 109 | print('Roundtrip checks pass') 110 | 111 | # Jagged stores self-identified themselves (using whatami) 112 | print(jagged.what().id()) 113 | # Out: JaggedByMemMap(autoviews=True,contiguity=None) 114 | 115 | # Jagged stores can be iterated in chunks (see iter) 116 | for original, roundtripped in zip(originals, jagged): 117 | assert np.array_equal(original, roundtripped[0]) 118 | print('Roundtrip checks for iteration pass') 119 | 120 | # Some jagged stores allow to retrieve arbitrary rows without penalty 121 | # (i.e. without retrieving the whole containing array). 122 | # These are marked as "linear" in the store feature matrix. 123 | # You do so by passing a list of (base, size) segments. 124 | some_rows = jagged.get([[3, 22], [45, 1000]]) 125 | assert len(some_rows[1]) == 1000 126 | assert np.array_equal(some_rows[0], originals[0][3:25]) 127 | print('Roundtrip checks for row retrieval pass') 128 | 129 | # Some jagged stores allow to be lazy retrieving the arrays. 130 | # On top of that, the MemMap implementation allow memmapped arrays. 131 | # Can be handy to have long lists of views in memory 132 | # while letting the OS managing memory fetching and eviction for us. 133 | jbmm = JaggedByMemMap(op.expanduser(path=mmap_dir), 134 | autoviews=True, 135 | contiguity='auto') 136 | print('Retrieving %d arrays...' % (len(shuffled_indices) * 100)) 137 | many_arrays = jbmm.get(shuffled_indices * 100) 138 | # This will work also for pandas DataFrames as long as 139 | # "copy=True" is honored by the pandas constructor 140 | # that is, the dtype of the arrays is simple), 141 | print('Making %d dataframes...' 
% (len(shuffled_indices) * 100)) 142 | columns = pd.Index(np.arange(num_columns)) 143 | dfs = [pd.DataFrame(data=array, columns=columns, copy=False) 144 | for array in many_arrays] 145 | print('Checking roundtrip...') 146 | for original, roundtripped in zip(shuffled_originals * 100, dfs): 147 | assert np.array_equal(original, roundtripped) 148 | print('Roundtrip checks for lazy dataframes pass') 149 | 150 | # Jagged stores can be populated from other jagged stores 151 | blosc_dir = tempfile.mkdtemp('mmap') 152 | jbb = JaggedByBlosc(path=blosc_dir) 153 | print('Saving compressed (although these data are not compressable)...') 154 | jbb.append_from(jagged) 155 | for a_from_mmap, a_from_blosc in zip(jbb, jagged): 156 | assert np.array_equal(a_from_mmap, a_from_blosc) 157 | print(jbb.what().id()) 158 | print('Roundtrip checks for compressed arrays pass') 159 | # Out: JaggedByBlosc(compressor=BloscCompressor(cname='lz4hc', 160 | # level=5, 161 | # n_threads=1, 162 | # shuffle=True)) 163 | 164 | # We are done, cleanup 165 | shutil.rmtree(mmap_dir, ignore_errors=True) 166 | shutil.rmtree(blosc_dir, ignore_errors=True) 167 | 168 | 169 | Backends 170 | -------- 171 | 172 | Although rapidly changing, *jagged* already provides the following storage backends 173 | that can be considered as working and stable. Other backends are planned. 174 | 175 | +-------------------+------+-------+--------+------+-----+------+------+ 176 | | Backend | comp | chunk | column | mmap | lin | lazy | cont | 177 | +===================+======+=======+========+======+=====+======+======+ 178 | | JaggedByBlosc | X | | | X | | | | 179 | +-------------------+------+-------+--------+------+-----+------+------+ 180 | | JaggedByCarray | X | X | | | X | | X | 181 | +-------------------+------+-------+--------+------+-----+------+------+ 182 | | JaggedByH5Py | X | X | | | X | X | X | 183 | +-------------------+------+-------+--------+------+-----+------+------+ 184 | | JaggedByJoblib | X | X | | | | | | 185 | +-------------------+------+-------+--------+------+-----+------+------+ 186 | | JaggedByMemMap | | | | X | X | X | X | 187 | +-------------------+------+-------+--------+------+-----+------+------+ 188 | | JaggedByNPY | | | | | | | | 189 | +-------------------+------+-------+--------+------+-----+------+------+ 190 | | JaggedByBloscpack | X | | | | | | | 191 | +-------------------+------+-------+--------+------+-----+------+------+ 192 | | JaggedByPickle | X | X | | | | | | 193 | +-------------------+------+-------+--------+------+-----+------+------+ 194 | 195 | 196 | - comp: 197 | can be compressed 198 | - chunk: 199 | can be chunked 200 | - column: 201 | stores columns of the array contiguously (can be easily implemented by using a store per column) 202 | - mmap: 203 | can open a memmap to the data 204 | - lin: 205 | can retrieve any row without the need to retrieve the whole array it contains it 206 | - lazy: 207 | the arrays are not fetched immediatly; this can mean also that they can be managed 208 | as virtual-memory by the OS (JaggedByMemMap only) 209 | - cont: 210 | retrieved arrays can be forced to lie in contiguous memory segments 211 | 212 | 213 | Benchmarks 214 | ---------- 215 | 216 | What backend and parameters work best depends on whether the data is compressible or not, the 217 | sizes of the arrays and the kind of queries. 
We have a good idea of what works best for our data 218 | and query types and are working at providing a benchmarking framework, that can be useful if 219 | you can get a good sample of the data to store. Find here a preview_, results will be soon posted here. 220 | 221 | 222 | By-design constraints 223 | --------------------- 224 | 225 | Jagged would like to be simple: conceptually, to deploy and to use. 226 | 227 | Jagged is about retrieving full arrays. 228 | Focus is on fast retrieval of arbitrary batch queries. 229 | Batch queries over arrays appended closeby should be faster. 230 | Jagged is good for local caches or reducing the burden of 231 | network file systems. 232 | 233 | Jagged stores are append only. 234 | 235 | There is no transaction, replication or distribution or... 236 | It is all files in your local or network disks, written once, read many times. 237 | If you have complex data or requirements, there are many better options. 238 | If you have simple numerical arrays you want to load fast and store light, 239 | jagged might be worth a try. 240 | 241 | Not important efforts have been given yet to optimize 242 | (although some backends work quite smoothly). 243 | At the moment, everything is simple algorithms implemented in pure python. 244 | 245 | 246 | Links 247 | ----- 248 | 249 | This neat blogpost_ from Matthew Rocklin is highly recommended, as it delivers 250 | the promised *"vocabulary to talk about efficient tabular storage"*. Add perhaps 251 | "blocked" (as in "compression is done in cache-friendly sized blocks") and 252 | "chunked" (as in "retrieval is done in I/O-friendly sized chunks") to the lexicon. 253 | The castra_ project is worth a look. 254 | 255 | 256 | .. _Jagged: https://github.com/sdvillal/jagged 257 | .. |Pypi Version| image:: https://badge.fury.io/py/jagged.svg 258 | :target: http://badge.fury.io/py/jagged 259 | .. |Build Status| image:: https://travis-ci.org/sdvillal/jagged.svg?branch=master 260 | :target: https://travis-ci.org/sdvillal/jagged/branches 261 | .. |Coverage Status| image:: http://codecov.io/github/sdvillal/jagged/coverage.svg?branch=master 262 | :target: http://codecov.io/github/sdvillal/jagged?branch=master 263 | .. |Scrutinizer Status| image:: https://scrutinizer-ci.com/g/sdvillal/jagged/badges/quality-score.png?b=master 264 | :target: https://scrutinizer-ci.com/g/sdvillal/jagged/?branch=master 265 | .. _real life: https://github.com/strawlab/strawlab-examples/blob/master/strawlab_examples/euroscipy/euroscipy_example.py 266 | .. _preview: https://github.com/sdvillal/strawlab-examples/tree/master/strawlab_examples/benchmarks 267 | .. _numpy: http://www.numpy.org/ 268 | .. _whatami: http://www.github.com/sdvillal/whatami 269 | .. _python-future: http://python-future.org/ 270 | .. _c-blosc: https://github.com/Blosc/c-blosc 271 | .. _python-blosc: https://github.com/Blosc/python-blosc 272 | .. _bloscpack: https://github.com/Blosc/bloscpack 273 | .. _bcolz: https://github.com/Blosc/bcolz 274 | .. _joblib: https://pythonhosted.org/joblib/ 275 | .. _pytest: http://pytest.org 276 | .. _blogpost: http://matthewrocklin.com/blog/work/2015/08/28/Storage/ 277 | .. _castra: https://github.com/blaze/castra 278 | -------------------------------------------------------------------------------- /examples/readme_examples.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Examples copied verbatim in the readme. 3 | This should probably be a notebook. 
4 | """ 5 | from __future__ import print_function 6 | import os.path as op 7 | import shutil 8 | import numpy as np 9 | import pandas as pd 10 | import tempfile 11 | from jagged.mmap_backend import JaggedByMemMap 12 | from jagged.blosc_backend import JaggedByBlosc 13 | 14 | # A Jagged instance is all you need 15 | mmap_dir = tempfile.mkdtemp('mmap') 16 | jagged = JaggedByMemMap(op.expanduser(path=mmap_dir)) 17 | # You can drop here any JaggedRawStore implementation you want to 18 | 19 | # Generate a random dataset 20 | print('Creating a random dataset...') 21 | rng = np.random.RandomState(0) 22 | num_arrays = 1000 23 | max_length = 2000 24 | num_columns = 100 25 | originals = [rng.randn(rng.randint(0, max_length), num_columns) 26 | for _ in range(num_arrays)] 27 | 28 | # Add these to the store ("with" context is usually optional, but recommended) 29 | print('Populating the jagged store...') 30 | with jagged: 31 | indices = list(map(jagged.append, originals)) 32 | 33 | # Some jagged stores optimize queries retrieving arrays by their insertion order 34 | # Retrieval speed should not suffer much even with random queries 35 | shuffled_indices = rng.permutation(indices).tolist() 36 | shuffled_originals = [originals[i] for i in shuffled_indices] 37 | 38 | # What do we have in store? 39 | print('Number of arrays: %d, number of rows: %d' % (jagged.narrays, jagged.nrows)) 40 | # Out: Number of arrays: 200, number of rows: 193732 41 | print('Jagged shape=%r, dtype=%r, order=%r' % 42 | (jagged.shape, jagged.dtype, jagged.order)) 43 | # Out: Jagged shape=(193732, 50), dtype=dtype('float64'), order='C' 44 | 45 | # Check roundtrip 46 | roundtrippeds = jagged.get(shuffled_indices) 47 | for original, roundtripped in zip(shuffled_originals, roundtrippeds): 48 | assert np.array_equal(original, roundtripped) 49 | print('Roundtrip checks pass') 50 | 51 | # Jagged stores self-identified themselves (using whatami) 52 | print(jagged.what().id()) 53 | # Out: JaggedByMemMap(autoviews=True,contiguity=None) 54 | 55 | # Jagged stores can be iterated in chunks (see iter) 56 | for original, roundtripped in zip(originals, jagged): 57 | assert np.array_equal(original, roundtripped[0]) 58 | print('Roundtrip checks for iteration pass') 59 | 60 | # Some jagged stores allow to retrieve arbitrary rows without penalty 61 | # (i.e. without retrieving the whole containing array). 62 | # These are marked as "linear" in the store feature matrix. 63 | # You do so by passing a list of (base, size) segments. 64 | some_rows = jagged.get([[3, 22], [45, 1000]]) 65 | assert len(some_rows[1]) == 1000 66 | assert np.array_equal(some_rows[0], originals[0][3:25]) 67 | print('Roundtrip checks for row retrieval pass') 68 | 69 | # Some jagged stores allow to be lazy retrieving the arrays. 70 | # On top of that, the MemMap implementation allow memmapped arrays. 71 | # Can be handy to have long lists of views in memory 72 | # while letting the OS managing memory fetching and eviction for us. 73 | jbmm = JaggedByMemMap(op.expanduser(path=mmap_dir), 74 | autoviews=True, 75 | contiguity='auto') 76 | print('Retrieving %d arrays...' % (len(shuffled_indices) * 100)) 77 | many_arrays = jbmm.get(shuffled_indices * 100) 78 | # This will work also for pandas DataFrames as long as 79 | # "copy=True" is honored by the pandas constructor 80 | # that is, the dtype of the arrays is simple), 81 | print('Making %d dataframes...' 
% (len(shuffled_indices) * 100)) 82 | columns = pd.Index(np.arange(num_columns)) 83 | dfs = [pd.DataFrame(data=array, columns=columns, copy=False) 84 | for array in many_arrays] 85 | print('Checking roundtrip...') 86 | for original, roundtripped in zip(shuffled_originals * 100, dfs): 87 | assert np.array_equal(original, roundtripped) 88 | print('Roundtrip checks for lazy dataframes pass') 89 | 90 | # Jagged stores can be populated from other jagged stores 91 | blosc_dir = tempfile.mkdtemp('mmap') 92 | jbb = JaggedByBlosc(path=blosc_dir) 93 | print('Saving compressed (although these data are not compressable)...') 94 | jbb.append_from(jagged) 95 | for a_from_mmap, a_from_blosc in zip(jbb, jagged): 96 | assert np.array_equal(a_from_mmap, a_from_blosc) 97 | print(jbb.what().id()) 98 | print('Roundtrip checks for compressed arrays pass') 99 | # Out: JaggedByBlosc(compressor=BloscCompressor(cname='lz4hc', 100 | # level=5, 101 | # n_threads=1, 102 | # shuffle=True)) 103 | 104 | # We are done, cleanup 105 | shutil.rmtree(mmap_dir, ignore_errors=True) 106 | shutil.rmtree(blosc_dir, ignore_errors=True) 107 | -------------------------------------------------------------------------------- /jagged/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import absolute_import 3 | 4 | # --- Backend imports 5 | 6 | from .mmap_backend import JaggedByMemMap 7 | 8 | try: 9 | from .blosc_backend import JaggedByBlosc 10 | except ImportError: # pragma: no cover 11 | JaggedByBlosc = None 12 | 13 | try: 14 | from .bcolz_backend import JaggedByCarray 15 | except ImportError: # pragma: no cover 16 | JaggedByCarray = None 17 | 18 | try: 19 | from .h5py_backend import JaggedByH5Py 20 | except ImportError: # pragma: no cover 21 | JaggedByH5Py = None 22 | 23 | from .npy_backend import JaggedByNPY 24 | 25 | try: 26 | from .bloscpack_backend import JaggedByBloscpack 27 | except ImportError: # pragma: no cover 28 | JaggedByBloscpack = None 29 | 30 | from .pickle_backend import JaggedByPickle 31 | 32 | try: 33 | from .joblib_backend import JaggedByJoblib 34 | except ImportError: # pragma: no cover 35 | JaggedByJoblib = None 36 | 37 | 38 | # --- Version 39 | 40 | __version__ = '0.1.1-dev0' 41 | -------------------------------------------------------------------------------- /jagged/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Convenient (and somehow performing) storage of objects with homogeneous types but different lengths. 3 | 4 | "jagged" array providers have very simple, low level contracts: 5 | - Focus on reading performance, append only store. 
6 | - Use numpy arrays as the canonical data carriers 7 | - May or may not restrict the type of the stored elements 8 | - Retrieve only by providing indices *collections* 9 | No explicit support for slice notation 10 | - All clases are whatami whatables 11 | """ 12 | from __future__ import absolute_import, unicode_literals, print_function 13 | from future.utils import bytes_to_native_str 14 | from abc import ABCMeta 15 | from array import array 16 | from functools import partial 17 | import os.path as op 18 | from operator import itemgetter 19 | import json 20 | 21 | from future.builtins import range, map 22 | from toolz import merge, partition_all 23 | import numpy as np 24 | 25 | from jagged.misc import ensure_dir 26 | from whatami import whatable 27 | 28 | try: 29 | import cPickle as pickle 30 | except ImportError: # pragma: no cover 31 | import pickle 32 | 33 | 34 | # --- Journals (persitence of array lengths) 35 | 36 | 37 | def _int_or_0(v): 38 | if v is None: 39 | return 0 40 | return int(v) 41 | 42 | 43 | def _read_full_file(x, path): 44 | """Reads the full contentes of file path into array x.""" 45 | with open(path, 'rb') as reader: 46 | reader.seek(0, 2) 47 | size = reader.tell() 48 | reader.seek(0, 0) 49 | if size % x.itemsize != 0: 50 | raise Exception('Truncated file') 51 | x.fromfile(reader, size // x.itemsize) 52 | return x 53 | 54 | 55 | class JaggedJournal(object): 56 | """Keeps track and persists information about the sizes of added arrays.""" 57 | 58 | # a journal must be instantiated only when jagged knows its location 59 | # a journal can be shared by many jagged instances (e.g. when storing different columns by different jaggeds) 60 | 61 | def __init__(self, path): 62 | super(JaggedJournal, self).__init__() 63 | self._path = ensure_dir(path) 64 | # base and length of each added array 65 | self._lengths_file = op.join(self._path, 'lengths.array') 66 | self._lengths = self._read_lengths() 67 | self._bases = None 68 | # total number of rows and arrays 69 | self._sizes_file = op.join(self._path, 'size.json') 70 | self._numrows, self._numarrays = self._read_sizes() 71 | 72 | def append(self, data): 73 | """Appends the data array to the journal.""" 74 | self._add_length(data) 75 | self._add_sizes(data) 76 | 77 | # --- Num rows, num arrays (redundant with lengths, light and good for redundancy) 78 | 79 | def _add_sizes(self, data): 80 | """Adds to numrows and numarrays the sizes of the array data and immediatly persists them.""" 81 | self._numrows += len(data) 82 | self._numarrays += 1 83 | with open(self._sizes_file, 'w') as writer: 84 | json.dump({'numrows': self._numrows, 'numarrays': self._numarrays}, writer, indent=2) 85 | 86 | def _read_sizes(self): 87 | """Reads the current numrows and numarrays values from persistent storage. 88 | If there is no info stored, makes them 0. 
89 | """ 90 | if op.isfile(self._sizes_file): 91 | with open(self._sizes_file, 'r') as reader: 92 | sizes = json.load(reader) 93 | return _int_or_0(sizes['numrows']), _int_or_0(sizes['numarrays']) 94 | return 0, 0 95 | 96 | def numrows(self): 97 | """Returns the total number of rows in the jagged instance.""" 98 | return self._numrows 99 | 100 | def numarrays(self): 101 | """Returns the number of arrays in the jagged instance.""" 102 | return self._numarrays 103 | 104 | # --- Base and size of each array 105 | 106 | def _add_length(self, data): 107 | """Adds the length to the journal and immediatly persists it.""" 108 | self._lengths.append(len(data)) 109 | with open(self._lengths_file, 'ab') as writer: 110 | self._lengths[-1:].tofile(writer) 111 | 112 | def _read_lengths(self): 113 | """Reads the lengths from persistent storage, if it does not exist, returns an empty array.""" 114 | lengths = array(bytes_to_native_str(b'l')) 115 | if op.isfile(self._lengths_file): 116 | _read_full_file(lengths, self._lengths_file) 117 | return lengths 118 | 119 | def lengths(self): 120 | """Returns an array with the length of each array added to the journal.""" 121 | return self._lengths 122 | 123 | def bases(self): 124 | """Returns where each array would start if the storage is linear.""" 125 | if self._bases is None or len(self._bases) < len(self._lengths): 126 | self._bases = np.hstack(([0], np.cumsum(self._lengths))) 127 | return self._bases 128 | 129 | def start_end(self, index): 130 | """Returns the start and end of the array at index.""" 131 | base, size = self.base_size(index) 132 | return base, base + size 133 | 134 | def base_size(self, index): 135 | """Returns the base and size of the array at index.""" 136 | return self.bases()[index], self.lengths()[index] 137 | 138 | # --- Sanity checks 139 | 140 | def check_consistency(self): 141 | """Checks the internal consistency of the journal.""" 142 | assert len(self.lengths()) == len(self.bases()) 143 | assert len(self.lengths()) == self.numarrays() 144 | assert len(np.sum(self.lengths())) == self.numrows() 145 | 146 | # --- Raw stores 147 | 148 | 149 | @whatable(add_properties=False) 150 | class JaggedRawStore(object): 151 | """Persistent storage of objects of the same type but different length.""" 152 | 153 | __metaclass__ = ABCMeta 154 | 155 | def __init__(self, path, journal=None): 156 | super(JaggedRawStore, self).__init__() 157 | self._path = path 158 | if self._path is not None: 159 | ensure_dir(self._path) 160 | self._template = None # how the saved arrays look like 161 | self._journal = journal # sizes of the added arrays 162 | 163 | # --- Where this storage resides 164 | 165 | def path_or_fail(self): 166 | """Returns the path if set, otherwise raises an exception.""" 167 | if self._path is None: 168 | raise Exception('In-memory only arrays are not implemented for %s.' 
% self.what().id()) 169 | return self._path 170 | 171 | # --- Journal 172 | 173 | def journal(self): 174 | if self._journal is None: 175 | self._journal = JaggedJournal(op.join(self.path_or_fail(), 'meta', 'journal')) 176 | return self._journal 177 | 178 | # --- Template 179 | 180 | def template(self): 181 | template_dir = ensure_dir(op.join(self.path_or_fail(), 'meta', 'template')) 182 | template_path = op.join(template_dir, 'template.npy') 183 | if self._template is None: 184 | if op.isfile(template_path): 185 | self._template = np.load(template_path) 186 | return self._template 187 | 188 | def _write_template(self, data): 189 | template_dir = ensure_dir(op.join(self.path_or_fail(), 'meta', 'template')) 190 | template_path = op.join(template_dir, 'template.npy') 191 | np.save(template_path, data[:0]) 192 | 193 | def can_add(self, data): 194 | """Returns True iff data can be stored. 195 | This usually means it is of the same kind as previously stored arrays. 196 | """ 197 | # Obviously we could just store arbitrary arrays in some implementations (e.g. NPY) 198 | # But lets keep jagged contracts... 199 | template = self.template() 200 | if template is None: 201 | return True 202 | return (template.dtype >= data.dtype and 203 | data.shape[-1] == template.shape[-1] and 204 | np.isfortran(data) == np.isfortran(data)) 205 | 206 | # --- Lifecycle 207 | 208 | # N.B. at the moment, to make things simple, we only want write or read 209 | # We should ensure concurrency does not break this rule (only one writer xor many readers) 210 | # Of course we could use stuff like SWMR from hdf5 or role our own, more featureful and compled concurrency 211 | # Not a priority 212 | 213 | @property 214 | def is_writing(self): 215 | """Returns whether we can append more data using this jagged instance.""" 216 | raise NotImplementedError() 217 | 218 | @property 219 | def is_reading(self): 220 | """Returns whether we can append more data using this jagged instance.""" 221 | raise NotImplementedError() 222 | 223 | @property 224 | def is_open(self): 225 | """Returns whether we are currently open in any mode.""" 226 | raise NotImplementedError() 227 | 228 | def close(self): 229 | """Flushes buffers to permanent storage and closes the underlying backend.""" 230 | raise NotImplementedError() 231 | 232 | # --- Writing data 233 | 234 | def _open_write(self, data=None): 235 | """Opens in writing mode, returns None. 236 | 237 | Parameters 238 | ---------- 239 | data : numpy array like, default None 240 | data schema to use by the storage, needed if this is the first opening of the repository 241 | """ 242 | raise NotImplementedError() 243 | 244 | def append(self, data): 245 | """Appends new data to this storage. 246 | 247 | If the storage is empty, this will define the dtype of the store. 248 | 249 | Parameters 250 | ---------- 251 | data : numpy-array like 252 | The data to append, must have a compatible dtype with what was already added to the store. 
253 | 254 | Returns 255 | ------- 256 | An integer addressing the added array in the storage 257 | """ 258 | 259 | # at the moment we do not allow coordinate-less stores 260 | self.path_or_fail() 261 | 262 | # check data validity 263 | if any(s < 1 for s in data.shape[1:]): 264 | raise Exception('Cannot append data with sizes 0 in non-leading dimension (%s, %r)' % 265 | (self.what().id(), data.shape)) 266 | 267 | # check we can write 268 | if self.is_reading and not self.is_writing: 269 | self.close() 270 | 271 | # template 272 | if self.template() is None: 273 | self._write_template(data) 274 | assert self.can_add(data) 275 | 276 | # id log 277 | if not op.isfile(op.join(self.path_or_fail(), 'meta', 'whatid.txt')): 278 | ensure_dir(op.join(self.path_or_fail(), 'meta')) 279 | with open(op.join(self.path_or_fail(), 'meta', 'whatid.txt'), 'w') as writer: 280 | writer.write(self.what().id()) 281 | 282 | # open 283 | self._open_write(data) 284 | 285 | # write 286 | self._append_hook(data) 287 | 288 | # bookkeeping 289 | index = self.journal().numarrays() 290 | self.journal().append(data) 291 | 292 | # done 293 | return index 294 | 295 | def _append_hook(self, data): 296 | """Saves the data, returns nothing.""" 297 | raise NotImplementedError() 298 | 299 | def append_from(self, jagged, arrays_per_chunk=None): 300 | """Appends all the contents of jagged.""" 301 | for chunk in jagged.iter_arrays(arrays_per_chunk=arrays_per_chunk): 302 | for data in chunk: 303 | self.append(data) 304 | 305 | # --- Reading 306 | 307 | def _open_read(self): 308 | """Opens in reading mode, returns None.""" 309 | raise NotImplementedError() 310 | 311 | def _get_views(self, keys, columns): 312 | """Returns a list of arrays corresponding to the provided keys and columns.""" 313 | raise NotImplementedError() 314 | 315 | def get(self, keys=None, columns=None, factory=None): 316 | """Returns a list with the data specified in `keys` (and `columns`), possibly transformed by `factory`. 317 | 318 | Concrete implementations may warrant things like "all segments actually lie in congiguous regions in memory". 319 | 320 | Parameters 321 | ---------- 322 | keys : list of keys 323 | specifies which elements to retrieve; if None, all arrays are returned 324 | 325 | columns : list of integers, default None 326 | specifies which columns to retrieve; if None, retrieve all columns 327 | 328 | factory : factory(ndarray)->desired type, default None 329 | transforms each of the returned elements into a desired type (for example, a pandas DataFrame) 330 | another use can be to apply summary statistics 331 | 332 | Returns 333 | ------- 334 | A list with the retrieved elements, possibly transformed by factory. 
335 | """ 336 | 337 | # at the moment we do not allow coordinate-less stores 338 | self.path_or_fail() 339 | 340 | # flush if needed 341 | if self.is_writing and not self.is_reading: 342 | self.close() 343 | 344 | # open 345 | self._open_read() 346 | 347 | # read 348 | views = self._get_views(keys, columns) 349 | 350 | return views if factory is None else map(factory, views) 351 | 352 | # -- Iteration 353 | 354 | def iter_arrays(self, arrays_per_chunk=None): 355 | """Iterates over the arrays in this store.""" 356 | if arrays_per_chunk is None: 357 | for key in range(self.journal().numarrays()): 358 | yield self.get([key]) 359 | elif arrays_per_chunk <= 0: 360 | raise ValueError('arrays_per_chunk must be None or bigger than 0, it is %r' % arrays_per_chunk) 361 | else: 362 | for segments in partition_all(arrays_per_chunk, range(self.journal().numarrays())): 363 | yield self.get(segments) 364 | 365 | def __iter__(self, arrays_per_chunk=None): 366 | """Alias to iter_arrays.""" 367 | return self.iter_arrays(arrays_per_chunk=arrays_per_chunk) 368 | 369 | # def iter_rows(self, max_rows_per_chunk): 370 | # # Iterates segments in chunks with max_rows_per_chunk as upper bound 371 | # # (but will give at least one segment at a time) 372 | # # This can be more (e.g. SegmentRawStorage) or less involved (e.g. JaggedByNumpy) 373 | # # Useful to iterate with really controlled amount of memory 374 | # raise NotImplementedError() 375 | 376 | # --- Factories / curries / partials 377 | 378 | def copyconf(self, **params): 379 | """Returns a partial function that instantiates this type of store 380 | with changed default parameters. 381 | 382 | N.B. this default implementation is based on being able to retrieve all default parameters 383 | using the `what` method; override if that is not the case. 384 | 385 | Parameters 386 | ---------- 387 | params: **dict 388 | The parameters that will be fixed in the returned factory function. 389 | """ 390 | return whatable(partial(self.__class__, **merge(self.what().conf, params)), add_properties=False) 391 | 392 | # --- shape, dtype, order 393 | 394 | @property 395 | def shape(self): 396 | """Returns a tuple with the current size of the storage in each dimension.""" 397 | ncols = self.ncols 398 | return None if ncols is None else (self.nrows, ncols) 399 | 400 | @property 401 | def dtype(self): 402 | """Returns the data type of the store.""" 403 | template = self.template() 404 | return None if template is None else template.dtype 405 | 406 | @property 407 | def ndims(self): 408 | """Returns the number of dimensions.""" 409 | # Actually at the moment we only support ndims == 2 410 | shape = self.shape 411 | return len(self.shape) if shape is not None else None 412 | 413 | @property 414 | def ncols(self): 415 | """Returns the number of columns.""" 416 | template = self.template() 417 | return None if template is None else template.shape[1] 418 | 419 | @property 420 | def nrows(self): 421 | """Returns the number of rows in the store.""" 422 | return self.journal().numrows() 423 | 424 | @property 425 | def narrays(self): 426 | """Returns the number or arrays in the store.""" 427 | return self.journal().numarrays() 428 | 429 | @property 430 | def order(self): 431 | """Returns 'C' for row major, 'F' for column major.""" 432 | template = self.template() 433 | if template is None: 434 | return None 435 | return 'F' if np.isfortran(template) else 'C' 436 | 437 | # --- Context manager and other magics... 
438 | 439 | def __enter__(self): 440 | return self 441 | 442 | def __exit__(self, *_): 443 | self.close() 444 | 445 | def __len__(self): 446 | """Returns the size of the leading dimension.""" 447 | return self.shape[0] if self.shape is not None else 0 448 | 449 | # Also consider register to atexit as a parameter to the constructor 450 | 451 | 452 | # --- Linear stores (can address arbitrary row segments) 453 | 454 | 455 | class LinearRawStorage(JaggedRawStore): 456 | 457 | __metaclass__ = ABCMeta # no harm, lint stops complaining 458 | 459 | def __init__(self, path, journal=None, contiguity=None): 460 | """ 461 | A linear raw storage can access arbitrary rows using base (row index in the storage) and size 462 | (number of rows to retrieve). 463 | 464 | Parameters 465 | ---------- 466 | journal : must quack like JaggedJournal, default None 467 | see base class 468 | 469 | contiguity : string or None, default None 470 | indicates the type of contiguity sought for the results; for performance segments retrieval 471 | does not need to be done in any order 472 | - 'read': a best effort should be done to leave retrieved segments order-contiguous in memory; 473 | this can potentially speed up operations reading these data in the order specified by segments 474 | - 'write': a best effort should be done to write segments sequentially in memory; 475 | this can potentially speed up retrieval 476 | - 'auto': allow the backend to decide the return flavor; 477 | using this the backends can return "lazy" or "cached" arrays 478 | (for example, views on memmapped arrays or hdf5 datasets) 479 | - None: do not force any contiguity nor allow any strange return, just plain numpy arrays 480 | owning their own data; this is safest and usually well performing 481 | usually 'read' can be a good idea for analysis, and 'auto' can have memory saving benefits 482 | beware that forcing contiguity for speed might lead to memory leaks 483 | (the whole retrieved segments won't be released while any of them is reacheable) 484 | """ 485 | super(LinearRawStorage, self).__init__(path, journal=journal) 486 | self.contiguity = contiguity 487 | 488 | def _get_views(self, keys, columns): 489 | # get all segments if segments is None 490 | if keys is None: 491 | keys = range(self.journal().numarrays()) 492 | keys = [self.journal().base_size(key) if isinstance(key, int) else key for key in keys] 493 | 494 | if 0 == len(keys): 495 | return [] 496 | 497 | # retrieve data 498 | ne, nc = self.shape 499 | views = retrieve_contiguous(keys, columns, self._get_hook, self.dtype, ne, nc, self.contiguity) 500 | 501 | return views 502 | 503 | def _get_hook(self, base, size, columns, dest): 504 | raise NotImplementedError() 505 | 506 | def iter_rows(self, rows_per_chunk): 507 | """Reads rows_per_chunk rows at a time until all is read.""" 508 | base = 0 509 | total = len(self) 510 | while base < total: 511 | size = min(rows_per_chunk, total - base) 512 | yield self.get([(base, size)])[0] 513 | base += size 514 | 515 | 516 | def retrieve_contiguous(segments, columns, reader, dtype, ne, nc, contiguity): 517 | 518 | # Check for valid contiguity 519 | if contiguity not in ('read', 'write', 'auto', None): 520 | raise ValueError('Unknown contiguity scheme: %r' % contiguity) 521 | 522 | # Check query sanity and prepare contiguous query 523 | # dest_base tells where each query must go to in case of contiguity='read' 524 | # note that dest_base is not useful for unsorting in the presence of 0-length items (so we explicitly store order) 525 | dest_base = 0 
526 | query_dest = [] 527 | for order, (base, size) in enumerate(segments): 528 | if (base + size) > ne or base < 0: 529 | raise ValueError('Out of bounds query (base=%d, size=%d, maxsize=%d)' % (base, size, ne)) 530 | query_dest.append((order, base, dest_base, size)) 531 | dest_base += size 532 | total_size = dest_base 533 | 534 | nc = len(columns) if columns is not None else nc 535 | 536 | # Retrieve 537 | views = [] 538 | if contiguity == 'read': 539 | # Hope for one-malloc only, but beware of memory leaks 540 | dest = np.empty((total_size, nc), dtype=dtype) 541 | # Populate 542 | for order, base, dest_base, size in sorted(query_dest): 543 | view = dest[dest_base:dest_base+size] 544 | view = reader(base, size, columns, view) 545 | views.append((order, view)) 546 | elif contiguity == 'write': 547 | # Hope for one-malloc only, but beware of memory leaks 548 | dest = np.empty((total_size, nc), dtype=dtype) 549 | # Populate 550 | dest_base = 0 551 | for order, base, _, size in sorted(query_dest): 552 | view = dest[dest_base:dest_base+size] 553 | view = reader(base, size, columns, view) 554 | dest_base += size 555 | views.append((order, view)) 556 | elif contiguity == 'auto': 557 | for order, base, _, size in sorted(query_dest): 558 | view = reader(base, size, columns, None) 559 | views.append((order, view)) 560 | else: 561 | for order, base, _, size in sorted(query_dest): 562 | view = np.empty((size, nc), dtype=dtype) 563 | view = reader(base, size, columns, view) 564 | views.append((order, view)) 565 | 566 | # Unpack views while restoring original order 567 | return list(map(itemgetter(1), sorted(views, key=itemgetter(0)))) 568 | -------------------------------------------------------------------------------- /jagged/bcolz_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import bcolz 4 | from whatami import whatable 5 | import os.path as op 6 | from .base import LinearRawStorage 7 | 8 | 9 | class JaggedByCarray(LinearRawStorage): 10 | """ 11 | A Jagged store that uses in-disk `bcolz.carray` to store the data. 12 | 13 | This backend should be good for compressable data accessed sequentially or in batch range queries. 14 | Random access of small segments will suffer from a considerable performance degradation. 15 | 16 | Usually these stores are backed by many files, so access via network file systems or from spin disks can 17 | potentially be inefficient. 
18 | 19 | Parameters 20 | ---------- 21 | path : string 22 | the carray will/must reside here 23 | 24 | contiguity : string, default None 25 | see base class 26 | 27 | journal : must quack like JaggedJournal, default None 28 | see base class 29 | 30 | expectedlen : int, default None 31 | passed to the carray on creation, the expected number of rows in the store 32 | carray will use it to guess a good chunksize 33 | the actual size of each chunk will of course depend also on the number of columns 34 | must be None if `chunklen` is provided 35 | 36 | chunklen : int, default None 37 | passed to the carray on creation, the number of rows to store per chunk 38 | the actual size of each chunk will of course depend also on the number of columns 39 | must be None if `expectedlen` is provided 40 | 41 | cparams : `bcolz.cparams`, default bcolz.cparams(clevel=5, shuffle=False, cname='lz4hc') 42 | the compression configuration for bcolz; only used if the array is empty 43 | """ 44 | 45 | def __init__(self, 46 | path=None, 47 | journal=None, 48 | contiguity=None, 49 | # bcolz params 50 | expectedlen=None, 51 | chunklen=1024 ** 2 // 2, # 500K rows 52 | cparams=bcolz.cparams(clevel=5, shuffle=False, cname='lz4hc')): 53 | 54 | super(JaggedByCarray, self).__init__(path, journal=journal, contiguity=contiguity) 55 | 56 | self.expectedlen = expectedlen 57 | self.chunklen = chunklen 58 | self.cparams = whatable(cparams, add_properties=True) 59 | self._bcolz = None 60 | 61 | def _bcolz_dir(self): 62 | # Needs to be different than self._path or metainfo gets deleted 63 | return op.join(self.path_or_fail(), 'bcolz') 64 | 65 | # --- Write 66 | 67 | def _open_write(self, data=None): 68 | if self._bcolz is None: 69 | try: # append 70 | self._bcolz = \ 71 | bcolz.carray(None, 72 | rootdir=self._bcolz_dir(), 73 | mode='a', 74 | # bcolz conf in case mode='a' semantics change to create, otherwise innocuous 75 | chunklen=self.chunklen, 76 | expectedlen=self.expectedlen, 77 | cparams=self.cparams) 78 | except: # create 79 | self._bcolz = \ 80 | bcolz.carray(data[0:0], 81 | rootdir=self._bcolz_dir(), 82 | mode='w', 83 | chunklen=self.chunklen, 84 | expectedlen=self.expectedlen, 85 | cparams=self.cparams) 86 | 87 | def _append_hook(self, data): 88 | self._bcolz.append(data) 89 | 90 | # --- Read 91 | 92 | def _open_read(self): 93 | if self._bcolz is None: 94 | self._bcolz = bcolz.carray(None, rootdir=self._bcolz_dir(), mode='r') 95 | 96 | def _get_hook(self, base, size, columns, dest): 97 | if dest is not None and columns is None: 98 | # measure if this has any performance benefit, if so, asks for it to be public API 99 | self._bcolz._getrange(base, size, dest) 100 | return dest 101 | if columns is not None: 102 | view = self._bcolz[base:base+size, columns] 103 | else: 104 | view = self._bcolz[base:base+size] 105 | if dest is not None: 106 | dest[:] = view 107 | return dest 108 | return view 109 | 110 | # --- Lifecycle 111 | 112 | @property 113 | def is_writing(self): 114 | return self.is_open and self._bcolz.mode in ('w', 'a') 115 | 116 | @property 117 | def is_reading(self): 118 | return self.is_open and self._bcolz.mode == 'r' 119 | 120 | @property 121 | def is_open(self): 122 | return self._bcolz is not None 123 | 124 | def close(self): 125 | if self.is_writing: 126 | self._bcolz.flush() 127 | self._bcolz = None 128 | -------------------------------------------------------------------------------- /jagged/benchmarks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdvillal/jagged/e737eae08005ab20222b46668f54e6b5dcb8e04c/jagged/benchmarks/__init__.py -------------------------------------------------------------------------------- /jagged/benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Benchmarking utilities. 3 | Some of these are inspired by bloscpack / bloscpack-benchmarks. 4 | https://github.com/Blosc/bloscpack-benchmarking 5 | """ 6 | import os.path as op 7 | import socket 8 | import datetime 9 | import subprocess 10 | import json 11 | import os 12 | from jagged.misc import ensure_dir 13 | import psutil 14 | 15 | # 16 | # Timing is hard and we should at least use timeit 17 | # (something with support for calibration and repetition). 18 | # A great resource is also pytest benchmark 19 | # https://pypi.python.org/pypi/pytest-benchmark/2.5.0 20 | # https://bitbucket.org/haypo/misc/src/tip/python/benchmark.py 21 | # There are a bunch of benchmarker / timer etc. libraries in pypi 22 | # Do not forget about /usr/bin/time -v 23 | # 24 | 25 | 26 | def timestr(): 27 | return datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 28 | 29 | 30 | def hostname(): 31 | return socket.gethostname() 32 | 33 | 34 | def collect_sysinfo(dest=None): 35 | """ 36 | Collects basic information from the machine using several tools. 37 | This needs to run as root. 38 | Note that speeds are theoretical, not measured 39 | (specially peak network and network drives speeds should be measured). 40 | 41 | Prerequisites 42 | ------------- 43 | If in ubuntu: 44 | sudo apt-get install smartmontools inxi dmidecode 45 | If in arch: 46 | sudo pacman -S smartmontools inxi dmidecode 47 | 48 | What is run 49 | ----------- 50 | # Basic information about mount points 51 | mount > mount.info 52 | # Inxi reports 53 | inxi > inxi.info 54 | # Full dmidecode 55 | dmidecode > dmidecode.info 56 | # Network speed information 57 | dmesg | grep -i duplex > network-speed.info 58 | # SMART information 59 | sudo smartctl -a /dev/sda > smartctl-sda.info 60 | 61 | References 62 | ---------- 63 | http://www.binarytides.com/linux-commands-hardware-info/ 64 | http://www.cyberciti.biz/faq/linux-command-to-find-sata-harddisk-link-speed/ 65 | http://www.cyberciti.biz/faq/howto-setup-linux-lan-card-find-out-full-duplex-half-speed-or-mode/ 66 | http://www.cyberciti.biz/tips/linux-find-out-wireless-network-speed-signal-strength.html 67 | """ 68 | 69 | # 70 | # Any way of getting actual memory latencies, CAS...? 
71 |     # Also we could look at pure python libraries like dmidecode
72 |     #
73 | 
74 |     if dest is None:
75 |         dest = op.join(op.dirname(__file__), 'sysinfo')
76 |     dest = op.join(ensure_dir(op.join(dest, hostname())), timestr() + '.json')
77 | 
78 |     info = {  # .decode() so the values are text and json-serializable under both py2 and py3
79 |         'mount': subprocess.check_output('mount').decode('utf-8'),
80 |         'dmesg-eth': '\n'.join(line for line in subprocess.check_output('dmesg').decode('utf-8').splitlines() if 'duplex' in line),
81 |         'iwconfig': subprocess.check_output('iwconfig').decode('utf-8'),
82 |         'inxiF': subprocess.check_output(['inxi', '-c 0', '-F']).decode('utf-8'),
83 |         # add some more inxi stuff
84 |     }
85 | 
86 |     with open(dest, 'w') as writer:
87 |         json.dump(info, writer, indent=2, sort_keys=True)
88 | 
89 | 
90 | def du(path):
91 |     """Returns the size of the tree under path in bytes."""
92 |     return int(subprocess.check_output(['du', '-s', '-L', '-B1', path]).split()[0].decode('utf-8'))
93 | 
94 | 
95 | def drop_caches(path, drop_level=3, max_size='1000G', verbose=False):
96 |     #
97 |     # Some light reading
98 |     # http://www.linuxatemyram.com/play.html
99 |     # vmtouch
100 |     # https://aur.archlinux.org/packages/vmtouch/
101 |     # http://serverfault.com/questions/278454/is-it-possible-to-list-the-files-that-are-cached
102 |     # http://serverfault.com/questions/43383/caching-preloading-files-on-linux-into-ram
103 |     # fincore
104 |     # yaourt -S --noconfirm perl-file-sharedir-install
105 |     # yaourt -S --noconfirm fincore
106 |     # To drop system caches, one needs root;
107 |     # an option: add the program to sudoers so no password is required.
108 |     #
109 |     if 0 != os.system('vmtouch -e -f -q -m %s "%s"' % (max_size, path)):
110 |         if os.geteuid() == 0:
111 |             os.system('echo %d > /proc/sys/vm/drop_caches' % drop_level)
112 |             if verbose:
113 |                 print('Full system cache dropped because of %s' % path)
114 |         else:
115 |             raise RuntimeError('Need vmtouch or root permission to drop caches')
116 |     else:
117 |         if verbose:
118 |             print('All pages under %s evicted' % path)
119 | 
120 | 
121 | def sync():
122 |     """Flushes buffers to disk."""
123 |     os.system('sync')
124 | 
125 | 
126 | def available_ram():
127 |     return psutil.virtual_memory().available
128 | 
129 | #
130 | # We need to make sure that:
131 | #  - we go beyond microbenchmarks and look at relevant tasks,
132 | #    e.g. realtime visualisation or data exploration as opposed to batch
133 | #
134 | 
135 | #
136 | # Measure dataset complexity (e.g.
lempel ziv via compression) and report it 137 | # 138 | -------------------------------------------------------------------------------- /jagged/blosc_backend.py: -------------------------------------------------------------------------------- 1 | from mmap import mmap, ACCESS_READ 2 | from operator import itemgetter 3 | import os.path as op 4 | 5 | from future.builtins import range 6 | 7 | from jagged.base import JaggedRawStore, JaggedJournal 8 | from jagged.compression.compressors import BloscCompressor 9 | from whatami import What 10 | 11 | 12 | class JaggedByBlosc(JaggedRawStore): 13 | 14 | # Memmapped 15 | # Not chunked - hope to keep using bcolz for that 16 | 17 | def __init__(self, path=None, journal=None, compressor=BloscCompressor): 18 | super(JaggedByBlosc, self).__init__(path, journal=journal) 19 | self.compressor = compressor 20 | self._mm = None 21 | self._writing = None 22 | self._bjournal = None 23 | 24 | def _bytes_journal(self): 25 | if self._bjournal is None: 26 | self._bjournal = JaggedJournal(op.join(self.path_or_fail(), 'meta', 'bytes_journal')) 27 | return self._bjournal 28 | 29 | def _compressor(self): 30 | if not isinstance(self.compressor, BloscCompressor): 31 | self.compressor = self.compressor(dtype=self.dtype, 32 | shape=self.shape, 33 | order=self.order) 34 | return self.compressor 35 | 36 | # --- Custom what() to be available at any circumstance 37 | 38 | def what(self): 39 | try: 40 | return What(self.__class__.__name__, {'compressor': self.compressor()}) 41 | except TypeError: 42 | return What(self.__class__.__name__, {'compressor': self.compressor}) 43 | 44 | # --- Write 45 | 46 | def _open_write(self, data=None): 47 | self._mm = open(op.join(self.path_or_fail(), 'data'), 'ab') 48 | self._writing = True 49 | 50 | def _append_hook(self, data): 51 | compressor = self._compressor() 52 | compressed = compressor.compress(data) 53 | self._mm.write(compressed) 54 | self._bytes_journal().append(compressed) 55 | 56 | # --- Read 57 | 58 | def _open_read(self): 59 | self._mm = open(op.join(self.path_or_fail(), 'data'), 'r') 60 | self._mm = mmap(self._mm.fileno(), 0, access=ACCESS_READ) 61 | self._writing = False 62 | 63 | def _read_segment(self, base, size): 64 | return self._mm[base:base+size] 65 | 66 | def _get_views(self, keys, columns): 67 | 68 | if keys is None: 69 | keys = range(self.narrays) 70 | 71 | keys = [(key, order) for order, key in enumerate(keys)] 72 | 73 | compressor = self._compressor() 74 | views = [] 75 | for key, order in sorted(keys): 76 | base, size = self._bytes_journal().base_size(key) # cache these segments? 
77 | array = compressor.decompress(self._read_segment(base, size)) 78 | if columns is not None: 79 | array = array[:, tuple(columns)] 80 | views.append((array, order)) 81 | views = list(map(itemgetter(0), sorted(views, key=itemgetter(1)))) 82 | 83 | return views 84 | 85 | # --- Lifecycle 86 | 87 | @property 88 | def is_reading(self): 89 | return self.is_open and not self.is_writing 90 | 91 | @property 92 | def is_writing(self): 93 | return self.is_open and self._writing 94 | 95 | @property 96 | def is_open(self): 97 | return self._mm is not None 98 | 99 | def close(self): 100 | if self.is_open: 101 | self._mm.close() 102 | self._mm = None 103 | self._writing = None 104 | -------------------------------------------------------------------------------- /jagged/bloscpack_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os.path as op 3 | import bloscpack 4 | from bloscpack.defaults import DEFAULT_CHUNK_SIZE 5 | from jagged.npy_backend import JaggedByNPY 6 | from bloscpack.numpy_io import pack_ndarray_file, unpack_ndarray_file 7 | 8 | 9 | class JaggedByBloscpack(JaggedByNPY): 10 | 11 | def __init__(self, 12 | path=None, 13 | journal=None, 14 | # blosc 15 | clevel=5, 16 | shuffle=True, 17 | cname='lz4hc', 18 | # bloscpack 19 | chunk_size=DEFAULT_CHUNK_SIZE, 20 | offsets=False, 21 | checksum='None'): 22 | super(JaggedByBloscpack, self).__init__(path, journal=journal) 23 | self.clevel = clevel 24 | self.shuffle = shuffle 25 | self.cname = cname 26 | self.offsets = offsets 27 | self.checksum = checksum 28 | self.chunk_size = chunk_size 29 | self._bp_args = None 30 | self._blosc_args = None 31 | 32 | def _dest_file(self, index): 33 | return op.join(self._shards[index % 256], '%d.blp' % index) 34 | 35 | def _read_one(self, key): 36 | return unpack_ndarray_file(self._dest_file(key)) 37 | 38 | def _append_hook(self, data): 39 | if self._bp_args is None: 40 | self._bp_args = bloscpack.BloscpackArgs(offsets=self.offsets, 41 | checksum=self.checksum) 42 | if self._blosc_args is None: 43 | self._blosc_args = bloscpack.BloscArgs(typesize=self.dtype.itemsize, 44 | clevel=self.clevel, 45 | shuffle=self.shuffle, 46 | cname=self.cname) 47 | pack_ndarray_file(data, self._dest_file(self.narrays), 48 | chunk_size=self.chunk_size, 49 | blosc_args=self._blosc_args, 50 | bloscpack_args=self._bp_args) 51 | -------------------------------------------------------------------------------- /jagged/compression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdvillal/jagged/e737eae08005ab20222b46668f54e6b5dcb8e04c/jagged/compression/__init__.py -------------------------------------------------------------------------------- /jagged/compression/compressors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from whatami import whatable 4 | 5 | try: 6 | import blosc 7 | except ImportError: # pragma: no cover 8 | blosc = None 9 | 10 | 11 | # --- Consistent compressors API 12 | 13 | 14 | @whatable 15 | class Compressor(object): 16 | 17 | def compress(self, data): 18 | raise NotImplementedError() 19 | 20 | def decompress(self, cdata): 21 | raise NotImplementedError() 22 | 23 | def uncompress(self, cdata): # pragma: no cover 24 | return self.decompress(cdata) 25 | 26 | 27 | # --- Pimping Blosc to compress our arrays 28 | 29 | class BloscCompressor(Compressor): 30 | 31 | # This has quite an overhead beyond 
compression ATM
32 | 
33 |     def __init__(self, shuffle=True, level=5, cname='lz4hc', n_threads=1, dtype=None, shape=None, order=None):
34 |         super(BloscCompressor, self).__init__()
35 |         self.shuffle = shuffle
36 |         self.level = level
37 |         self.cname = cname
38 |         self.n_threads = n_threads
39 |         self._dtype = dtype
40 |         self._shape = shape
41 |         self._order = order
42 | 
43 |     def compress(self, x):
44 |         blosc.set_nthreads(self.n_threads)  # mmmm global, put in a context to reset
45 |         x = np.ascontiguousarray(x)  # LOOK AT THIS
46 |         shape, dtype, order = x.shape, x.dtype, ('F' if np.isfortran(x) else 'C')
47 |         if self._dtype is None:
48 |             self._shape, self._dtype, self._order = shape, dtype, order
49 |         else:
50 |             assert order == self._order
51 |             assert dtype == self._dtype
52 |             assert len(self._shape) == 1 or shape[1] == self._shape[1]
53 |         return blosc.compress_ptr(x.__array_interface__['data'][0],
54 |                                   x.size, x.dtype.itemsize,
55 |                                   shuffle=self.shuffle, cname=self.cname, clevel=self.level)
56 | 
57 |     def decompress(self, cx):
58 |         blosc.set_nthreads(self.n_threads)  # mmmm global, put in a context to reset
59 |         x = blosc.decompress(cx)
60 |         x = np.frombuffer(x, dtype=self._dtype)  # beware, gets an immutable array
61 |         if self._order == 'F':
62 |             x = np.asfortranarray(x)  # correct? makes a copy and screws things up?
63 |         if len(self._shape) > 1:
64 |             x = x.reshape(-1, self._shape[1])
65 |         return x
66 | 
--------------------------------------------------------------------------------
/jagged/h5py_backend.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os.path as op
3 | 
4 | import numpy as np
5 | import h5py
6 | 
7 | from jagged.base import LinearRawStorage
8 | 
9 | 
10 | class JaggedByH5Py(LinearRawStorage):
11 | 
12 |     def __init__(self,
13 |                  path=None,
14 |                  journal=None,
15 |                  contiguity=None,
16 |                  # hdf params
17 |                  dset_name='data',
18 |                  chunklen=None,
19 |                  compression=None,
20 |                  compression_opts=None,
21 |                  shuffle=False,
22 |                  checksum=False):
23 |         super(JaggedByH5Py, self).__init__(path, journal=journal, contiguity=contiguity)
24 | 
25 |         self._dset_name = dset_name
26 | 
27 |         if self._path is not None:
28 |             self._h5_path = op.join(self._path, 'data.h5')
29 |         self._h5 = None
30 |         self._dset = None
31 | 
32 |         self.chunklen = chunklen
33 |         self.compression = compression
34 |         self.compression_opts = compression_opts
35 |         self.shuffle = shuffle
36 |         self.checksum = checksum
37 | 
38 |     # --- Read
39 | 
40 |     def _open_read(self):
41 |         if self._h5 is None:
42 |             self._h5 = h5py.File(self._h5_path, mode='r')
43 |             self._dset = self._h5[self._dset_name]
44 | 
45 |     def _get_hook(self, base, size, columns, dest):
46 | 
47 |         # h5py does not handle this case gracefully
48 |         if size == 0:
49 |             if dest is not None:
50 |                 return dest
51 |             nc = len(columns) if columns is not None else self._dset.shape[-1]
52 |             return np.empty((0, nc), dtype=self._dset.dtype)
53 | 
54 |         # easy case, no column subset requested
55 |         if columns is None:
56 |             if dest is None:
57 |                 return self._dset[base:base+size]  # should we force a full read with [:]?
add to benchmark 58 | else: 59 | self._dset.read_direct(dest, source_sel=np.s_[base:base+size]) 60 | return dest 61 | 62 | # N.B.: tuple(columns) to force 2d if columns happens to be a one-element list 63 | # column-subset is requested 64 | # h5py only supports increasing order indices in fancy indexing 65 | # https://github.com/h5py/h5py/issues/368 66 | # https://github.com/h5py/h5py/issues/368 67 | # (boiling down to issues with hdf5 hyperslabs) 68 | 69 | if not np.any(np.diff(columns) < 1): 70 | if dest is not None: 71 | self._dset.read_direct(dest, source_sel=np.s_[base:base+size, tuple(columns)]) 72 | return dest 73 | else: 74 | return self._dset[base:base+size, tuple(columns)] 75 | 76 | # better slow than unsupported... 77 | columns, inverse = np.unique(columns, return_inverse=True) 78 | if dest is not None: 79 | dest[:] = self._dset[base:base+size, tuple(columns)][:, inverse] 80 | return dest 81 | else: 82 | return self._dset[base:base+size, tuple(columns)][:, inverse] 83 | 84 | # --- Write 85 | 86 | def _open_write(self, data=None): 87 | if self._h5 is None: 88 | self._h5 = h5py.File(self._h5_path, mode='a') 89 | if 'data' not in self._h5: 90 | # http://docs.h5py.org/en/latest/high/dataset.html 91 | chunks = None 92 | if self.chunklen is not None: 93 | chunks = (self.chunklen,) + (data.shape[1:] if data.ndim > 1 else ()) 94 | self._dset = self._h5.create_dataset(self._dset_name, 95 | dtype=data.dtype, 96 | shape=(0, data.shape[1]), 97 | maxshape=(None, data.shape[1]), 98 | chunks=chunks, 99 | compression=self.compression, 100 | compression_opts=self.compression_opts, 101 | shuffle=self.shuffle, 102 | fletcher32=self.checksum) 103 | else: 104 | self._dset = self._h5[self._dset_name] 105 | 106 | def _append_hook(self, data): 107 | base = len(self) 108 | size = len(data) 109 | self._dset.resize(base + size, axis=0) 110 | self._dset[base:(base+size)] = data 111 | 112 | # --- Lifecycle 113 | 114 | @property 115 | def is_writing(self): 116 | return self.is_open and self._h5.mode != 'r' 117 | 118 | @property 119 | def is_reading(self): 120 | return self.is_open and self._h5.mode == 'r' 121 | 122 | @property 123 | def is_open(self): 124 | return self._h5 is not None 125 | 126 | def close(self): 127 | if self._h5 is not None: 128 | self._h5.close() 129 | self._h5 = None 130 | -------------------------------------------------------------------------------- /jagged/joblib_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import joblib 3 | from jagged.pickle_backend import JaggedByPickle 4 | 5 | 6 | class JaggedByJoblib(JaggedByPickle): 7 | 8 | def __init__(self, path=None, journal=None, arrays_per_chunk=1000, compress=False): 9 | super(JaggedByJoblib, self).__init__(path, journal, arrays_per_chunk, compress) 10 | 11 | def _load(self, path): 12 | self._cache = joblib.load(path) 13 | 14 | def _dump(self, path): 15 | compress = 0 if not self.compress else (5 if self.compress is True else self.compress) 16 | joblib.dump(self._cache, path, compress=compress) # cache_size 17 | -------------------------------------------------------------------------------- /jagged/misc.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """A jumble of seemingly useful stuff.""" 3 | from __future__ import unicode_literals 4 | from itertools import chain 5 | from contextlib import contextmanager 6 | import numbers 7 | import os 8 | import os.path as op 9 | import numpy as np 10 | 11 | 12 | def 
home(): # pragma: no cover 13 | """Returns current user home dir.""" 14 | return op.expanduser('~') # Valid in both py2 and py3 15 | 16 | 17 | def ensure_writable_dir(path): # pragma: no cover 18 | """Ensures that a path is a writable directory.""" 19 | def check_path(path): 20 | if not op.isdir(path): 21 | raise Exception('%s exists but it is not a directory' % path) 22 | if not os.access(path, os.W_OK): 23 | raise Exception('%s is a directory but it is not writable' % path) 24 | if op.exists(path): 25 | check_path(path) 26 | else: 27 | try: 28 | os.makedirs(path) 29 | except Exception: 30 | if op.exists(path): # Simpler than using a file lock to work on multithreading... 31 | check_path(path) 32 | else: 33 | raise 34 | return path 35 | 36 | 37 | def ensure_dir(path): # pragma: no cover 38 | return ensure_writable_dir(path) 39 | 40 | 41 | @contextmanager 42 | def cd(newdir): 43 | prevdir = os.getcwd() 44 | os.chdir(op.expanduser(newdir)) 45 | try: 46 | yield 47 | finally: 48 | os.chdir(prevdir) 49 | 50 | 51 | # --- Intervals 52 | 53 | def crossings(x, threshold=0, after=False): 54 | """Returns the indices of the elements before or after crossing a threshold. 55 | 56 | N.B. touching the threshold itself is considered a cross. 57 | 58 | Parameters 59 | ---------- 60 | x: array 61 | The data 62 | 63 | threshold: float, default 0 64 | Where crossing happens. 65 | 66 | after: bool, default False 67 | If True, the indices represent the elements after the cross, if False the elements before the cross. 68 | 69 | Returns 70 | ------- 71 | The indices where crosses happen. 72 | 73 | Examples 74 | -------- 75 | 76 | >>> print(crossings(np.array([0, 1, -1, -1, 1, -1]))) 77 | [0 1 3 4] 78 | >>> print(crossings(np.array([0, 1, -1, -1, 1, -1]), after=True)) 79 | [1 2 4 5] 80 | >>> print(crossings(np.array([0, 0, 0]))) 81 | [] 82 | >>> print(crossings(np.array([0, 3, -3, -3, 1]), threshold=1)) 83 | [0 1 3] 84 | >>> print(crossings(np.array([0, 3, -3, -3]), threshold=-2.5)) 85 | [1] 86 | >>> print(crossings(np.array([[0, 3], [-3, -3]]), threshold=-2.5)) # doctest: +IGNORE_EXCEPTION_DETAIL 87 | Traceback (most recent call last): 88 | Exception: Only 1D arrays, please (you gave me 2 dimensions) 89 | """ 90 | if len(x.shape) > 1: 91 | raise Exception('Only 1D arrays, please (you gave me %d dimensions)' % len(x.shape)) 92 | where_crosses = np.where(np.diff(np.sign(x - threshold)))[0] 93 | if after: 94 | return where_crosses + 1 95 | return where_crosses 96 | 97 | 98 | def find_intervals(x): 99 | """ 100 | Finds the intervals in which x is True or non-zero. 101 | 102 | 103 | Returns 104 | ------- 105 | Pairs of indices representing the intervals in which x is True or nonzero. 106 | The pairs represent valid python intervals, lower point included, upper point excluded. 
107 | 
108 | 
109 |     Examples
110 |     --------
111 |     >>> find_intervals([])
112 |     []
113 |     >>> find_intervals([1])
114 |     [(0, 1)]
115 |     >>> find_intervals([0, 1])
116 |     [(1, 2)]
117 |     >>> find_intervals([0, 0, 1, 1, 0, 0, 1, 1, 0])
118 |     [(2, 4), (6, 8)]
119 |     >>> find_intervals([0, 0, 0])
120 |     []
121 |     >>> find_intervals([1, 1, 1])
122 |     [(0, 3)]
123 |     >>> find_intervals([True, True, True])
124 |     [(0, 3)]
125 |     >>> find_intervals([1, 1, 1, 0])
126 |     [(0, 3)]
127 |     """
128 |     # These ugly six lines are here because:
129 |     #  - we allow passing lists but we need numpy arrays
130 |     #  - we want to allow both boolean (True, False) arrays and numeric arrays
131 |     #  - we want to use the crossings function, which only accepts numeric arrays
132 |     if not isinstance(x, np.ndarray):
133 |         x = np.array(x)
134 |     if not x.dtype == bool:
135 |         x = x != 0
136 |     zeros_ones = np.zeros_like(x, dtype=int)
137 |     zeros_ones[x] = 1
138 | 
139 |     # Find where we change from being in an interval to not being in an interval
140 |     starts_ends = list(crossings(zeros_ones, after=True))
141 | 
142 |     # Do we start already in an interval?
143 |     if len(zeros_ones) > 0 and 1 == zeros_ones[0]:
144 |         starts_ends = [0] + starts_ends
145 | 
146 |     # Do we end in an interval?
147 |     if len(zeros_ones) > 0 and 1 == zeros_ones[-1]:
148 |         starts_ends = starts_ends + [len(x)]
149 | 
150 |     assert len(starts_ends) % 2 == 0
151 | 
152 |     starts = starts_ends[0::2]
153 |     ends = starts_ends[1::2]
154 |     return list(zip(starts, ends))
155 | 
156 | 
157 | def is_valid_segment(ss, relative_to=None):
158 |     if not isinstance(ss, (tuple, list)):
159 |         return False
160 |     if not len(ss) == 2:
161 |         return False
162 |     ss_base, ss_size = ss
163 |     if not (isinstance(ss_base, numbers.Integral) and isinstance(ss_size, numbers.Integral)):
164 |         return False
165 |     if relative_to is not None:
166 |         base, size = relative_to
167 |         if ss_base < 0 or (base + ss_base + ss_size) > (base + size):
168 |             return False
169 |     return True
170 | 
171 | 
172 | def bool2segments(ss, size):
173 |     ssa = np.array(ss)
174 |     if ssa.dtype.kind == 'b' and ssa.ndim == 1 and len(ssa) == size:
175 |         return [(start, end - start) for start, end in find_intervals(ssa)]
176 |     return None
177 | 
178 | 
179 | def subsegments(segment, *subs):
180 |     """Make subsegments relative to the start of a base segment, checking for boundaries.
181 | 
182 |     Parameters
183 |     ----------
184 |     segment : tuple (base, size)
185 |         The segment to which relative subsegments are being specified
186 | 
187 |     subs : list of (base, size) tuples or boolean arrays specifying subsegments
188 |         These can be either something like (3, 8) (ss_base, ss_size), or boolean lists/arrays
189 |         It is assumed that ss_base here is an offset from `segment` base
190 | 
191 |     Returns
192 |     -------
193 |     A list of subsegments [(base, size)], each lying within the boundaries of `segment`.
194 | 195 | Examples 196 | -------- 197 | >>> subsegments((5, 100)) 198 | [] 199 | >>> subsegments((5, 100), (11, 14)) 200 | [(16, 14)] 201 | >>> subsegments((5, 100), (11, 14), (3, 88)) 202 | [(16, 14), (8, 88)] 203 | >>> subsegments((0, 5), [True, True, False, True, True]) 204 | [(0, 2), (3, 2)] 205 | >>> subsegments((0, 5), [False] * 5) 206 | [] 207 | >>> subsegments((0, 5), [True] * 5) 208 | [(0, 5)] 209 | >>> subsegments((0, 5), np.array([True] * 5)) 210 | [(0, 5)] 211 | >>> subsegments((0, 5), [True, True, False, True, True], (2, 2)) 212 | [(0, 2), (3, 2), (2, 2)] 213 | >>> subsegments((0, 100), (90, 11)) # doctest: +IGNORE_EXCEPTION_DETAIL 214 | Traceback (most recent call last): 215 | ValueError: (90, 11) is not a valid subsegment specification for (0, 100) 216 | >>> subsegments((0, 100), (-3, 8)) # doctest: +IGNORE_EXCEPTION_DETAIL 217 | Traceback (most recent call last): 218 | ValueError: (90, 11) is not a valid subsegment specification for (0, 100) 219 | >>> subsegments((0, 100), ('a', 8)) # doctest: +IGNORE_EXCEPTION_DETAIL 220 | Traceback (most recent call last): 221 | ValueError: ('a', 8) is not a valid subsegment specification for (0, 100) 222 | >>> subsegments((0, 100), 'crazyyou') # doctest: +IGNORE_EXCEPTION_DETAIL 223 | Traceback (most recent call last): 224 | ValueError: 'crazyyou' is not a valid subsegment specification for (0, 100) 225 | """ 226 | 227 | # This implementation is slow, but seemingly correct; I do not think it will bottleneck 228 | 229 | base, size = segment 230 | 231 | def bool_and_valid(ss): 232 | if is_valid_segment(ss, relative_to=segment): 233 | return [ss] 234 | ss_from_bool = bool2segments(ss, size) 235 | if ss_from_bool is not None and all(is_valid_segment(ss, relative_to=segment) for ss in ss_from_bool): 236 | return ss_from_bool 237 | raise ValueError('%r is not a valid subsegment specification for %r' % (ss, segment)) 238 | 239 | return [(base + ss_base, ss_size) for ss_base, ss_size in 240 | chain.from_iterable(bool_and_valid(ss) for ss in subs)] 241 | -------------------------------------------------------------------------------- /jagged/mmap_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Backend using python/numpy mmap bindings.""" 3 | from __future__ import absolute_import, unicode_literals, print_function, division 4 | import os.path as op 5 | from future.utils import PY3 6 | import numpy as np 7 | from jagged.base import LinearRawStorage 8 | 9 | 10 | class JaggedByMemMap(LinearRawStorage): 11 | """Provides numpy arrays as views of an underlying memmapped array.""" 12 | 13 | def __init__(self, path=None, journal=None, contiguity=None, autoviews=True): 14 | super(JaggedByMemMap, self).__init__(path, journal=journal, contiguity=contiguity) 15 | 16 | if self._path is not None: 17 | self._mmpath = op.join(self._path, 'data.mm') 18 | 19 | self._mm = None # numpy memmap for reading / file handler for writing 20 | 21 | self.autoviews = autoviews 22 | 23 | # --- Read 24 | 25 | def _open_read(self): 26 | if self._mm is None: 27 | self._mm = np.memmap(self._mmpath, 28 | dtype=self.dtype, shape=self.shape, order=self.order, 29 | mode='r') 30 | self._check_sizes() 31 | 32 | def _get_hook(self, base, size, columns, dest): 33 | view = self._mm[base:base+size] 34 | if columns is not None: 35 | view = view[:, tuple(columns)] 36 | if dest is None: 37 | return view.copy() if not self.autoviews else view 38 | dest[:] = view 39 | return dest 40 | 41 | # --- Write 42 | 43 | def 
_open_write(self, data=None):
44 |         self._mm = open(self._mmpath, mode='a')  # appended to via .buffer (py3) / str(buffer) (py2) below
45 | 
46 |     def _append_hook(self, data):
47 |         self._mm.buffer.write(data.data) if PY3 else self._mm.write(str(data.data))
48 | 
49 |     # --- Lifecycle
50 | 
51 |     @property
52 |     def is_writing(self):
53 |         return self.is_open and not self.is_reading
54 | 
55 |     @property
56 |     def is_reading(self):
57 |         return isinstance(self._mm, np.memmap)
58 | 
59 |     @property
60 |     def is_open(self):
61 |         return self._mm is not None
62 | 
63 |     def close(self):
64 |         if self.is_writing:
65 |             self._mm.close()
66 |         self._mm = None
67 | 
68 |     # --- Storage for underlying array shape, dtype, row/column order
69 | 
70 |     def _len_by_filelen(self):
71 |         """Helps to check the sanity of the array."""
72 |         mmsize_bytes = op.getsize(self._mmpath)
73 |         row_size_bytes = self.shape[1] * self.dtype.itemsize
74 |         num_rows = mmsize_bytes // row_size_bytes
75 |         leftovers = mmsize_bytes % row_size_bytes
76 |         return num_rows, leftovers
77 | 
78 |     def _check_sizes(self):
79 |         if op.isfile(self._mmpath) and self.shape is not None:
80 |             num_rows, leftovers = self._len_by_filelen()
81 |             if 0 != leftovers:
82 |                 raise Exception('the memmap file has incomplete data '
83 |                                 '(%d leftover bytes from a partially written array).'
84 |                                 '(are you missing transactions?)' % leftovers)
85 |             if num_rows != self.shape[0]:
86 |                 raise Exception('the number of rows inferred by file size '
87 |                                 'does not coincide with the length of the store '
88 |                                 '(%d != %d)' % (num_rows, self.shape[0]))
89 | 
90 | #
91 | # Remember that resize for numpy mmap objects never resizes the file under the hood.
92 | # Numpy mmap ndarray subclass code is simple and neat; it can be read in no time. Get back there.
93 | #
94 | # Document that when using mode 'auto', everything returned is a view to the large memmapped array.
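#
# A sketch (not in the original module) of what `autoviews` buys us; `path` is a
# hypothetical store directory populated beforehand, and `key` a key returned by
# a previous append():
#
#   from jagged import JaggedByMemMap
#   with JaggedByMemMap(path, autoviews=True) as jr:
#       v, = jr.get([key])  # a read-only view into the memmap, no copying
#   # with autoviews=False (and dest=None), _get_hook returns a copy instead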
95 | # 96 | -------------------------------------------------------------------------------- /jagged/npy_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from future.builtins import range 3 | from itertools import chain 4 | import os.path as op 5 | import os 6 | import numpy as np 7 | from jagged.base import JaggedRawStore 8 | from jagged.misc import ensure_dir 9 | 10 | 11 | class JaggedByNPY(JaggedRawStore): 12 | """Stores each array in an individual .npy file.""" 13 | 14 | def __init__(self, path=None, journal=None): 15 | super(JaggedByNPY, self).__init__(path, journal=journal) 16 | self._shards = None 17 | if path is not None: 18 | self._all_shards() 19 | 20 | # We can do this memory map (see np.load) 21 | 22 | def _all_shards(self): 23 | if self._shards is None: 24 | self._shards = [ensure_dir(op.join(self.path_or_fail(), str(shard))) for shard in range(256)] 25 | return self._shards 26 | # random note, 256 is the last cached int in cpython 27 | 28 | def _dest_file(self, index): 29 | return op.join(self._shards[index % 256], '%d.npy' % index) 30 | 31 | def _infer_numarrays(self): 32 | numarrays = 0 33 | for shard in self._shards: 34 | numarrays = max(chain([numarrays], (int(fn[:-4]) + 1 for fn in os.listdir(shard)))) 35 | return numarrays 36 | 37 | def check_numarrays(self): 38 | assert self._infer_numarrays() == self.journal().numarrays() 39 | 40 | # --- Write 41 | 42 | def _open_write(self, data=None): 43 | pass 44 | 45 | def _append_hook(self, data): 46 | np.save(self._dest_file(self.narrays), data) 47 | 48 | # --- Read 49 | 50 | def _open_read(self): 51 | pass 52 | 53 | def _read_one(self, key): 54 | return np.load(self._dest_file(key)) 55 | 56 | def _get_one(self, key, columns): 57 | data = self._read_one(key) 58 | if columns is not None: 59 | data = data[:, tuple(columns)] 60 | return data 61 | 62 | def _get_views(self, keys, columns): 63 | if keys is None: 64 | return list(self._get_one(key, columns) for key in range(self.journal().numarrays())) 65 | return [self._get_one(key, columns) for key in keys] 66 | 67 | # --- Lifecycle 68 | 69 | @property 70 | def is_writing(self): 71 | return True 72 | 73 | @property 74 | def is_reading(self): 75 | return True 76 | 77 | @property 78 | def is_open(self): 79 | return True 80 | 81 | def close(self): 82 | pass 83 | -------------------------------------------------------------------------------- /jagged/pickle_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import gzip 3 | from operator import itemgetter 4 | from jagged.base import JaggedRawStore 5 | import os.path as op 6 | try: 7 | import cPickle as pickle 8 | except ImportError: # pragma: no cover 9 | import pickle 10 | 11 | 12 | class JaggedByPickle(JaggedRawStore): 13 | """A chunked store based on pickle.""" 14 | 15 | def __init__(self, path=None, journal=None, arrays_per_chunk=1000, compress=False): 16 | super(JaggedByPickle, self).__init__(path, journal) 17 | self.arrays_per_chunk = arrays_per_chunk 18 | self.compress = compress 19 | self._cache = [] 20 | self._cached_pickle_num = None 21 | self._writing = None 22 | 23 | # --- Pickles 24 | 25 | def _dump(self, path): 26 | with gzip.open(path, 'wb') if self.compress else open(path, 'wb') as writer: 27 | pickle.dump(self._cache, writer, protocol=2) 28 | # protocol=2 instead of highest to maintain py2 compat of the store 29 | 30 | def _load(self, path): 31 | with gzip.open(path, 'rb') if self.compress 
else open(path, 'rb') as reader: 32 | self._cache = pickle.load(reader) 33 | 34 | def _pickle_num(self, index): 35 | return index // self.arrays_per_chunk 36 | 37 | def _pickle_file(self, index): 38 | path = op.join(self.path_or_fail(), '%d.pkl' % self._pickle_num(index)) 39 | return (path + '.gz') if self.compress else path 40 | 41 | def _save_pickle(self): 42 | if self.is_writing: 43 | self._dump(self._pickle_file(self.narrays)) 44 | 45 | def _read_pickle(self, index): 46 | pickle_num = self._pickle_num(index) 47 | if self._cached_pickle_num != pickle_num: 48 | try: 49 | self._load(self._pickle_file(index)) 50 | except IOError: 51 | self._cache = [] 52 | self._cached_pickle_num = pickle_num 53 | 54 | # --- Cache 55 | 56 | def _cache_full(self): 57 | return self._cache is not None and len(self._cache) == self.arrays_per_chunk 58 | 59 | # --- Read 60 | 61 | def _open_read(self): 62 | self._writing = False 63 | 64 | def _get_views(self, keys, columns): 65 | if keys is None: 66 | keys = range(self.narrays) 67 | 68 | keys = [(key, order) for order, key in enumerate(keys)] 69 | 70 | views = [] 71 | for key, order in sorted(keys): 72 | if not 0 <= key < self.narrays: 73 | raise ValueError('Key not in storage: %d' % key) 74 | self._read_pickle(key) 75 | array = self._cache[key % self.arrays_per_chunk] 76 | if columns is not None: 77 | array = array[:, tuple(columns)] 78 | views.append((array, order)) 79 | views = list(map(itemgetter(0), sorted(views, key=itemgetter(1)))) 80 | 81 | return views 82 | 83 | # --- Write 84 | 85 | def _open_write(self, data=None): 86 | self._writing = True 87 | self._read_pickle(self.narrays) 88 | 89 | def _append_hook(self, data): 90 | self._cache.append(data.copy()) 91 | if self._cache_full(): 92 | self._save_pickle() 93 | self._cache = [] 94 | self._cached_pickle_num += 1 95 | 96 | # --- Lifecycle 97 | 98 | @property 99 | def is_open(self): 100 | return self._writing is not None 101 | 102 | @property 103 | def is_writing(self): 104 | return self.is_open and self._writing 105 | 106 | @property 107 | def is_reading(self): 108 | return self.is_open and not self._writing 109 | 110 | def close(self): 111 | self._save_pickle() 112 | self._cache = None 113 | self._cached_pickle_num = None 114 | self._writing = None 115 | 116 | # 117 | # In general it would use protocol 2, so pickles can be read also in py2 118 | # We can be as clever as we want with caches: many read caches with LRU, one write cache... 
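#
# A worked example of the chunk layout above (a sketch, with the default
# arrays_per_chunk=1000): arrays 0..999 live in 0.pkl, arrays 1000..1999 in
# 1.pkl, and so on; reading key 1500 loads 1.pkl into the cache and indexes it
# at position 1500 % 1000 == 500, e.g. (`path` is hypothetical):
#
#   jr = JaggedByPickle(path, arrays_per_chunk=1000)
#   jr._pickle_num(1500)         # -> 1   (which .pkl file holds the array)
#   1500 % jr.arrays_per_chunk   # -> 500 (position inside the loaded chunk)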
119 | #
120 | 
--------------------------------------------------------------------------------
/jagged/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdvillal/jagged/e737eae08005ab20222b46668f54e6b5dcb8e04c/jagged/tests/__init__.py
--------------------------------------------------------------------------------
/jagged/tests/fixtures.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function, absolute_import, unicode_literals
3 | 
4 | import numpy as np
5 | import pytest
6 | 
7 | from functools import partial
8 | 
9 | from jagged import JaggedByCarray, JaggedByBlosc, JaggedByH5Py, JaggedByJoblib, \
10 |     JaggedByMemMap, JaggedByNPY, JaggedByBloscpack, JaggedByPickle
11 | 
12 | RAW_STORES = []
13 | 
14 | LINEAR_RAW_STORES = (
15 |     ('jr=carray', JaggedByCarray),
16 |     ('jr=carraychunks', partial(JaggedByCarray, chunklen=100) if JaggedByCarray else None),
17 |     ('jr=h5py', JaggedByH5Py),
18 |     ('jr=h5pychunks', partial(JaggedByH5Py, chunklen=100) if JaggedByH5Py else None),
19 |     ('jr=mmap', JaggedByMemMap),
20 | )
21 | 
22 | for contiguity in ('read', 'write', None, 'auto'):
23 |     for name, store in LINEAR_RAW_STORES:
24 |         RAW_STORES.append((name + '#' + 'cont=%s' % contiguity, partial(store, contiguity=contiguity)))
25 | 
26 | RAW_STORES.extend([
27 |     ('jr=npy', JaggedByNPY),
28 |     ('jr=blp', JaggedByBloscpack),
29 |     ('jr=blosc-mm', JaggedByBlosc),
30 |     ('jr=pickle', JaggedByPickle),
31 |     ('jr=joblib', JaggedByJoblib),
32 | ])
33 | 
34 | # pytest.importorskip won't cut it here...
35 | 
36 | 
37 | def store_skip(store):  # pragma: no cover
38 |     """Skips a store if its dependencies are not available."""
39 |     name, store = store
40 |     if store is None:
41 |         return pytest.mark.skipif(True, reason='%s skipped: its backend dependencies are not installed' % name)
42 |     return store
43 | 
44 | 
45 | stores = list(map(store_skip, RAW_STORES))
46 | names = [name for name, _ in RAW_STORES]
47 | 
48 | 
49 | @pytest.yield_fixture(params=stores, ids=names)
50 | def jagged_raw(request, tmpdir):
51 |     jr = request.param
52 |     dest = tmpdir.join(jr().what().id()).ensure_dir()
53 |     try:
54 |         yield jr, str(dest)
55 |     finally:
56 |         dest.remove(ignore_errors=True)
57 | 
58 | 
59 | @pytest.fixture(params=(1, 2, 10), ids=('ncol=1', 'ncol=2', 'ncol=10'))
60 | def ncol(request):
61 |     return request.param
62 | 
63 | 
64 | @pytest.fixture(params=('cols=all', 'cols=all-exp', 'cols=first', 'cols=last',
65 |                         'cols=even', 'cols=inverse', 'cols=mixed'),
66 |                 ids=('cols=all', 'cols=all-exp', 'cols=first', 'cols=last',
67 |                      'cols=even', 'cols=inverse', 'cols=mixed'))
68 | def columns(request, ncol):
69 |     if request.param == 'cols=all':
70 |         return None
71 |     elif request.param == 'cols=all-exp':
72 |         return range(ncol)
73 |     elif request.param == 'cols=first':
74 |         return [0]
75 |     elif request.param == 'cols=last':
76 |         return [ncol - 1]  # (we should support python negative indexing syntax)
77 |     elif request.param == 'cols=even':
78 |         return list(range(0, ncol, 2))
79 |     elif request.param == 'cols=inverse':
80 |         return list(range(ncol)[::-1])
81 |     elif request.param == 'cols=mixed':
82 |         return list(range(ncol)[::-1]) + list(range(0, ncol, 2))
83 |     else:  # pragma: no cover
84 |         raise ValueError('Unknown column spec %r' % request.param)
85 | 
86 | 
87 | @pytest.fixture(params=(0, 1), ids=['rng=0', 'rng=1'])
88 | def rng(request):
89 |     return np.random.RandomState(request.param)
90 | 
91 | 
92 | 
@pytest.fixture 93 | def dataset(ncol, rng): 94 | sizes = [0] + rng.randint(low=0, high=500, size=10).tolist() 95 | rng.shuffle(sizes) 96 | originals = [rng.rand(size, ncol) for size in sizes] 97 | return rng, originals, ncol 98 | 99 | 100 | @pytest.fixture 101 | def mock_jagged_raw(dataset): 102 | 103 | # unbox the fixture 104 | rng, originals, ncol = dataset 105 | 106 | # reader 107 | jagged = np.vstack(originals) 108 | 109 | def reader(base, size, columns, dest): 110 | view = jagged[base:(base+size)] 111 | if columns is not None: 112 | view = view[:, tuple(columns)] 113 | if dest is None: 114 | return view 115 | dest[:] = view 116 | return dest 117 | 118 | # shape 119 | ne, nc = jagged.shape 120 | 121 | # segments 122 | base = 0 123 | segments = [] 124 | for o in originals: 125 | segments.append((base, len(o))) 126 | base += len(o) 127 | 128 | return originals, ne, nc, originals[0].dtype, segments, reader, rng 129 | 130 | 131 | @pytest.fixture(params=('read', 'write', None, 'auto'), 132 | ids=('cont=read', 'cont=write', 'cont=none', 'cont=auto')) 133 | def contiguity(request): 134 | return request.param 135 | -------------------------------------------------------------------------------- /jagged/tests/test_raw_stores.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Tests the raw storers.""" 3 | from __future__ import print_function, absolute_import, unicode_literals 4 | from future.builtins import zip 5 | import os.path as op 6 | import bcolz 7 | from .fixtures import * 8 | from jagged.base import retrieve_contiguous 9 | from jagged.misc import ensure_dir 10 | 11 | 12 | # -- lifecycle tests 13 | 14 | def test_lifecycle(jagged_raw): 15 | jagged_raw, path = jagged_raw 16 | data0 = np.zeros((2, 10)) 17 | data1 = np.ones((3, 10)) 18 | expected = np.vstack((data0, data1)) 19 | with jagged_raw(path=path) as jr: 20 | # before writing, everything is unknown 21 | assert jr.shape is None 22 | assert jr.ndims is None 23 | assert jr.dtype is None 24 | # first write-up 25 | jr.append(data0) 26 | assert jr.shape == data0.shape 27 | assert jr.dtype == data0.dtype 28 | assert jr.ndims == data0.ndim 29 | assert jr.narrays == 1 30 | assert len(jr) == len(data0) 31 | # first read 32 | assert np.allclose(data0, jr.get()[0]) 33 | # even if we close it... 34 | jr.close() 35 | # we can now know shapes and the like 36 | assert jr.shape == data0.shape 37 | assert jr.dtype == data0.dtype 38 | assert jr.ndims == data0.ndim 39 | # we can reread... 
40 |         assert np.allclose(data0, jr.get()[0])
41 |         # we can know shapes and the like
42 |         assert jr.shape == data0.shape
43 |         assert jr.dtype == data0.dtype
44 |         assert jr.ndims == data0.ndim
45 |         # we can append more
46 |         jr.append(data1)
47 |         assert jr.shape == expected.shape
48 |         assert jr.dtype == expected.dtype
49 |         assert jr.ndims == expected.ndim
50 |         assert jr.narrays == 2
51 |         assert len(jr) == len(data0) + len(data1)
52 |         # and the data will be properly appended
53 |         assert np.allclose(data0, jr.get()[0])
54 |         assert np.allclose(data1, jr.get()[1])
55 | 
56 | 
57 | # -- Tests retrieve contiguous
58 | 
59 | def test_retrieve_contiguous(mock_jagged_raw, columns, contiguity):
60 | 
61 |     originals, ne, nc, dtype, segments, reader, rng = mock_jagged_raw
62 | 
63 |     if columns is not None:
64 |         originals = [o[:, tuple(columns)] for o in originals]
65 | 
66 |     # sanity checks for wrong inputs
67 |     with pytest.raises(ValueError) as excinfo:
68 |         retrieve_contiguous(segments, columns, reader, dtype, ne, nc, 'wrong')
69 |     assert 'Unknown contiguity scheme:' in str(excinfo.value)
70 | 
71 |     with pytest.raises(ValueError) as excinfo:
72 |         retrieve_contiguous([(-1, 1)], columns, reader, dtype, ne, nc, contiguity)
73 |     assert 'Out of bounds query (base=-1, size=1' in str(excinfo.value)
74 | 
75 |     with pytest.raises(ValueError) as excinfo:
76 |         retrieve_contiguous([(0, 100000)], columns, reader, dtype, ne, nc, contiguity)
77 |     assert 'Out of bounds query (base=0, size=100000' in str(excinfo.value)
78 | 
79 |     # insertion order
80 |     views = retrieve_contiguous(segments, columns, reader, dtype, ne, nc, contiguity)
81 |     for o, v in zip(originals, views):
82 |         assert np.allclose(o, v)
83 | 
84 |     # random order
85 |     o_s = list(zip(originals, segments))
86 |     rng.shuffle(o_s)
87 |     originals, segments = zip(*o_s)
88 |     views = retrieve_contiguous(segments, columns, reader, dtype, ne, nc, contiguity)
89 |     for o, v in zip(originals, views):
90 |         assert np.allclose(o, v)
91 | 
92 | 
93 | # -- roundtrip tests
94 | 
95 | def test_roundtrip(jagged_raw, dataset, columns):
96 |     jagged_raw, path = jagged_raw
97 |     jagged_raw = partial(jagged_raw, path=path)
98 |     rng, originals, ncol = dataset
99 | 
100 |     # Write
101 |     keys = []
102 |     with jagged_raw() as jr:
103 |         total = 0
104 |         assert jr.dtype is None
105 |         assert jr.shape is None
106 |         for original in originals:
107 |             key = jr.append(original)
108 |             assert jr.is_writing
109 |             keys.append(key)
110 |             total += len(original)
111 |             assert len(jr) == total
112 |         assert jr.dtype == originals[0].dtype
113 |         assert jr.shape == (total, ncol)
114 | 
115 |     # Read
116 |     def test_read(originals, keys):
117 | 
118 |         if columns is not None:
119 |             originals = [o[:, columns] for o in originals]
120 | 
121 |         # test read, one by one
122 |         with jagged_raw() as jr:
123 |             for original, key in zip(originals, keys):
124 |                 roundtripped = jr.get([key], columns=columns)[0]
125 |                 assert np.allclose(original, roundtripped)
126 | 
127 |         # test read, in a batch
128 |         with jagged_raw() as jr:
129 |             for original, roundtripped in zip(originals, jr.get(keys, columns=columns)):
130 |                 assert np.allclose(original, roundtripped)
131 | 
132 |         # read all
133 |         # with jagged_raw() as jr:
134 |         #     for original, roundtripped in zip(originals, jr.get(columns=columns)):
135 |         #         original = original if columns is None else original[:, columns]
136 |         #         assert np.allclose(original, roundtripped)
137 | 
138 |     # read in insertion order
139 |     test_read(originals, keys)
140 | 
141 |     # read in random order
142 |     or_s = list(zip(originals, 
keys))
143 |     rng.shuffle(or_s)
144 |     originals, keys = zip(*or_s)
145 |     test_read(originals, keys)
146 | 
147 | 
148 | # --- Test self-identification
149 | 
150 | def test_whatid():
151 | 
152 |     assert "JaggedByCarray(chunklen=1000," \
153 |            "contiguity=None," \
154 |            "cparams=cparams(clevel=3,cname='zlib',quantize=0,shuffle=False)," \
155 |            "expectedlen=None)" \
156 |            == JaggedByCarray(chunklen=1000,
157 |                              cparams=bcolz.cparams(clevel=3, cname='zlib', shuffle=False),
158 |                              expectedlen=None).what().id()
159 |     assert "JaggedByH5Py(checksum=False," \
160 |            "chunklen=1000," \
161 |            "compression='lzf'," \
162 |            "compression_opts=0," \
163 |            "contiguity=None," \
164 |            "shuffle=True)" \
165 |            == JaggedByH5Py(chunklen=1000,
166 |                            compression='lzf',
167 |                            compression_opts=0,
168 |                            shuffle=True).what().id()
169 | 
170 | 
171 | # --- Test factories / curries / partials
172 | 
173 | def test_copyconf(jagged_raw):
174 |     jagged_raw, path = jagged_raw
175 |     assert jagged_raw().what().id() == jagged_raw().copyconf()().what().id(), \
176 |         'factory without parameters should give the same config as the constructor'
177 | 
178 | 
179 | # --- Misc tests
180 | 
181 | def test_nonvalid_appends(jagged_raw):
182 |     jagged_raw, path = jagged_raw
183 |     with jagged_raw(path=path) as jr:
184 |         with pytest.raises(Exception) as excinfo:
185 |             jr.append(np.zeros((10, 0)))
186 |         assert 'Cannot append data with sizes 0 in non-leading dimension' in str(excinfo.value)
187 | 
188 | 
189 | def test_no_inmemory_storage(jagged_raw):
190 |     # maybe one day we allow these...
191 |     jagged_raw, path = jagged_raw
192 |     with jagged_raw(path=None) as jr:
193 |         with pytest.raises(Exception) as excinfo:
194 |             jr.append(np.zeros((1, 1)))
195 |         assert 'In-memory only arrays are not implemented' in str(excinfo.value)
196 | 
197 | 
198 | def test_copy_from(jagged_raw):
199 |     jagged_raw, path = jagged_raw
200 |     path0 = ensure_dir(op.join(path, 'test0'))
201 |     path1 = ensure_dir(op.join(path, 'test1'))
202 |     with jagged_raw(path0) as jr0, jagged_raw(path1) as jr1:
203 |         jr0.append(np.zeros((2, 10)))
204 |         jr0.append(np.ones((3, 10)))
205 |         jr1.append_from(jr0)
206 |         assert np.allclose(jr0.get()[0], jr1.get()[0])
207 | 
208 | 
209 | def test_chunked_copy_from(jagged_raw):
210 |     jagged_raw, path = jagged_raw
211 |     path0 = ensure_dir(op.join(path, 'test0'))
212 |     path1 = ensure_dir(op.join(path, 'test1'))
213 |     with jagged_raw(path0) as jr0, jagged_raw(path1) as jr1:
214 |         for _ in range(10):
215 |             jr0.append(np.zeros((2, 10)))
216 |             jr0.append(np.ones((3, 10)))
217 |         jr1.append_from(jr0, arrays_per_chunk=2)
218 |         assert np.allclose(jr0.get()[0], jr1.get()[0])
219 |         with pytest.raises(ValueError) as excinfo:
220 |             jr1.append_from(jr0, arrays_per_chunk=-1)
221 |         assert 'arrays_per_chunk must be None or bigger than 0, it is -1' in str(excinfo.value)
222 | 
223 | 
224 | def test_mmap_check_sizes(tmpdir):
225 |     dest = str(tmpdir)
226 |     x = np.empty((5, 2), dtype=np.int32)
227 |     with JaggedByMemMap(dest) as jbm:
228 |         jbm.append(x)
229 |         mmf = jbm._mmpath
230 |     # write row-sized junk
231 |     with open(mmf, 'a') as writer:
232 |         writer.write(str('junk' * 10))
233 |     with JaggedByMemMap(dest) as jbm:
234 |         with pytest.raises(Exception) as excinfo:
235 |             jbm.get([(0, 2)])
236 |         assert 'the number of rows inferred by file size does not coincide' in str(excinfo.value)
237 |     # write junk that looks like the leftovers of an aborted write
238 |     with open(mmf, 'a') as writer:
239 |         writer.write(str('jagged'))
240 |     with JaggedByMemMap(dest) as jbm:
241 |         with pytest.raises(Exception) as
excinfo: 242 | jbm.get([(0, 2)]) 243 | assert 'the memmap file has incomplete data' in str(excinfo.value) 244 | # make the memmap way too small 245 | with open(mmf, 'w') as writer: 246 | writer.write(str('jagged')) 247 | with pytest.raises(Exception) as excinfo: 248 | jbm.get([(0, 2)]) 249 | assert 'mmap length is greater than file size' in str(excinfo.value) 250 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | 4 | [pytest] 5 | python_files=*.py 6 | pep8maxlinelength = 120 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | # Authors: Santi Villalba 5 | # Licence: BSD 3 clause 6 | 7 | try: 8 | from setuptools import setup 9 | except ImportError: 10 | from distutils.core import setup 11 | 12 | import jagged 13 | 14 | setup( 15 | name='jagged', 16 | license='BSD 3 clause', 17 | description='Simple tricks for efficient loading or merging collections of unevenly sized elements', 18 | long_description=open('README.rst').read().replace('|Build Status| |Coverage Status| |Scrutinizer Status|', ''), 19 | version=jagged.__version__, 20 | url='https://github.com/sdvillal/jagged', 21 | author='Santi Villalba', 22 | author_email='sdvillal@gmail.com', 23 | packages=['jagged', 24 | 'jagged.compression', 25 | 'jagged.benchmarks', 26 | 'jagged.tests'], 27 | classifiers=[ 28 | 'Intended Audience :: Science/Research', 29 | 'Intended Audience :: Developers', 30 | 'Topic :: Software Development', 31 | 'Topic :: Scientific/Engineering', 32 | 'License :: OSI Approved', 33 | 'Programming Language :: Python :: 2', 34 | 'Programming Language :: Python :: 2.7', 35 | 'Programming Language :: Python :: 3', 36 | 'Programming Language :: Python :: 3.4', 37 | 'Operating System :: Unix', 38 | ], 39 | install_requires=['future', 40 | 'numpy', 41 | 'whatami>=4.0.0', 42 | 'toolz'], 43 | tests_require=['pytest'], 44 | extras_require={ 45 | 'blosc': ['blosc'], 46 | 'bloscpack': ['bloscpack'], 47 | 'bcolz': ['bcolz'], 48 | 'h5py': ['h5py'], 49 | 'joblib': ['joblib'], 50 | 'benchmarks': ['psutil'] 51 | } 52 | ) 53 | --------------------------------------------------------------------------------
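To close, a small end-to-end sketch (not a file in the repository) tying the pieces together; it assumes the chosen backend's extra from extras_require above is installed and that `dest` is a hypothetical writable directory:

# e.g. pip install jagged[h5py]
import numpy as np
from jagged import JaggedByH5Py

def store_and_fetch(dest, arrays):
    # append jagged arrays (all with the same number of columns)...
    with JaggedByH5Py(path=dest) as jr:
        keys = [jr.append(a) for a in arrays]
    # ...then fetch them back, optionally selecting a column subset
    with JaggedByH5Py(path=dest) as jr:
        return jr.get(keys, columns=None)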