├── .coveragerc ├── .gitignore ├── .scrutinizer.yml ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── examples └── readme_examples.py ├── jagged ├── __init__.py ├── base.py ├── bcolz_backend.py ├── benchmarks │ ├── __init__.py │ └── utils.py ├── blosc_backend.py ├── bloscpack_backend.py ├── compression │ ├── __init__.py │ └── compressors.py ├── h5py_backend.py ├── joblib_backend.py ├── misc.py ├── mmap_backend.py ├── npy_backend.py ├── pickle_backend.py └── tests │ ├── __init__.py │ ├── fixtures.py │ └── test_raw_stores.py ├── setup.cfg └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | pragma: no cover 4 | def __repr__ 5 | if self.debug: 6 | if settings.DEBUG 7 | raise AssertionError 8 | raise NotImplementedError 9 | if __name__ == .__main__.: 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # "Compiled" python 2 | *.py[cod] 3 | 4 | # C extensions 5 | *.so 6 | *.o 7 | 8 | # Packages 9 | *.egg 10 | *.egg-info 11 | dist 12 | build 13 | eggs 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | 23 | # Installer logs 24 | pip-log.txt 25 | 26 | # Unit test / coverage reports 27 | .coverage 28 | .tox 29 | nosetests.xml 30 | 31 | # Translations 32 | *.mo 33 | 34 | # Mr Developer 35 | .mr.developer.cfg 36 | .project 37 | .pydevproject 38 | 39 | # Others 40 | .cache 41 | -------------------------------------------------------------------------------- /.scrutinizer.yml: -------------------------------------------------------------------------------- 1 | checks: 2 | python: 3 | code_rating: true 4 | duplicate_code: true 5 | variables_used_before_assignment: true 6 | variables_unused_wildcard_import: true 7 | variables_unused_variable: true 8 | variables_unused_import: true 9 | variables_unused_argument: true 10 | variables_unpacking_non_sequence: true 11 | variables_undefined_variable: true 12 | variables_undefined_loop_variable: true 13 | variables_undefined_all_variable: true 14 | variables_unbalanced_tuple_unpacking: true 15 | variables_redefined_outer_name: true 16 | variables_redefined_builtin: true 17 | variables_redefine_in_handler: true 18 | variables_no_name_in_module: true 19 | variables_invalid_all_object: true 20 | variables_global_variable_undefined: true 21 | variables_global_variable_not_assigned: true 22 | variables_global_statement: true 23 | variables_global_at_module_level: true 24 | typecheck_unexpected_keyword_arg: true 25 | typecheck_too_many_function_args: true 26 | typecheck_redundant_keyword_arg: true 27 | typecheck_not_callable: true 28 | typecheck_no_value_for_parameter: true 29 | typecheck_no_member: true 30 | typecheck_missing_kwoa: true 31 | typecheck_maybe_no_member: true 32 | typecheck_duplicate_keyword_arg: true 33 | typecheck_assignment_from_none: true 34 | typecheck_assignment_from_no_return: true 35 | string_unused_format_string_key: true 36 | string_truncated_format_string: true 37 | string_too_many_format_args: true 38 | string_too_few_format_args: true 39 | string_mixed_format_string: true 40 | string_missing_format_string_key: true 41 | string_format_needs_mapping: true 42 | string_constant_anomalous_unicode_escape_in_string: true 43 | string_constant_anomalous_backslash_in_string: true 44 | string_bad_str_strip_call: true 45 | string_bad_format_string_key: true 46 | 
string_bad_format_character: true 47 | open_mode_bad_open_mode: true 48 | newstyle_bad_super_call: true 49 | logging_unsupported_format: true 50 | logging_too_many_args: true 51 | logging_too_few_args: true 52 | logging_not_lazy: true 53 | logging_format_truncated: true 54 | miscellaneous_fixme: true 55 | imports_wildcard_import: true 56 | imports_relative_import: true 57 | imports_reimported: true 58 | imports_import_self: true 59 | imports_import_error: true 60 | imports_deprecated_module: true 61 | imports_cyclic_import: true 62 | format_unnecessary_semicolon: true 63 | format_trailing_whitespace: true 64 | format_superfluous_parens: true 65 | format_old_ne_operator: true 66 | format_multiple_statements: true 67 | format_mixed_indentation: true 68 | format_missing_final_newline: true 69 | format_lowercase_l_suffix: true 70 | format_line_too_long: 71 | max_length: '120' 72 | format_bad_whitespace: true 73 | format_bad_indentation: 74 | indentation: '4 spaces' 75 | format_backtick: true 76 | exceptions_raising_string: true 77 | exceptions_raising_non_exception: true 78 | exceptions_raising_bad_type: true 79 | exceptions_pointless_except: true 80 | exceptions_notimplemented_raised: true 81 | exceptions_catching_non_exception: true 82 | exceptions_broad_except: true 83 | exceptions_binary_op_exception: true 84 | exceptions_bare_except: true 85 | exceptions_bad_except_order: true 86 | design_interface_not_implemented: true 87 | design_abstract_class_not_used: true 88 | design_abstract_class_little_used: true 89 | classes_valid_slots: true 90 | classes_super_init_not_called: true 91 | classes_signature_differs: true 92 | classes_protected_access: true 93 | classes_non_parent_init_called: true 94 | classes_non_iterator_returned: true 95 | classes_no_self_use: true 96 | classes_no_self_argument: true 97 | classes_no_method_argument: true 98 | classes_no_init: true 99 | classes_missing_interface_method: true 100 | classes_method_hidden: true 101 | classes_interface_is_not_class: true 102 | classes_bad_staticmethod_argument: true 103 | classes_bad_mcs_method_argument: true 104 | classes_bad_context_manager: true 105 | classes_bad_mcs_classmethod_argument: true 106 | classes_bad_classmethod_argument: true 107 | classes_attribute_defined_outside_init: true 108 | classes_arguments_differ: true 109 | classes_access_member_before_definition: true 110 | classes_abstract_method: true 111 | basic_yield_outside_function: true 112 | basic_useless_else_on_loop: true 113 | basic_unreachable: true 114 | basic_unnecessary_pass: true 115 | basic_unnecessary_lambda: true 116 | basic_star_args: true 117 | basic_return_outside_function: true 118 | basic_return_in_init: true 119 | basic_return_arg_in_generator: true 120 | basic_pointless_string_statement: true 121 | basic_pointless_statement: true 122 | basic_old_raise_syntax: true 123 | basic_not_in_loop: true 124 | basic_nonexistent_operator: true 125 | basic_missing_reversed_argument: true 126 | basic_missing_module_attribute: true 127 | basic_missing_docstring: true 128 | basic_invalid_name: 129 | functions: '[a-z_][a-z0-9_]{2,30}$' 130 | variables: '[a-z_][a-z0-9_]{2,30}$' 131 | whitelisted_names: 'i,j,k,ex,Run,_' 132 | constants: '(([A-Z_][A-Z0-9_]*)|(__.*__))$' 133 | attributes: '[a-z_][a-z0-9_]{2,30}$' 134 | arguments: '[a-z_][a-z0-9_]{2,30}$' 135 | class_attributes: '([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$' 136 | inline_vars: '[A-Za-z_][A-Za-z0-9_]*$' 137 | classes: '[A-Z_][a-zA-Z0-9]+$' 138 | modules: '(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$' 139 | 
methods: '[a-z_][a-z0-9_]{2,30}$' 140 | basic_lost_exception: true 141 | basic_init_is_generator: true 142 | basic_function_redefined: true 143 | basic_expression_not_assigned: true 144 | basic_exec_used: true 145 | basic_empty_docstring: true 146 | basic_duplicate_key: true 147 | basic_duplicate_argument_name: true 148 | basic_dangerous_default_value: true 149 | basic_bad_reversed_sequence: true 150 | basic_assert_on_tuple: true 151 | basic_abstract_class_instantiated: true 152 | basic_eval_used: true 153 | 154 | tools: 155 | external_code_coverage: 156 | timeout: 300 # How long should we wait for code coverage (in seconds). 157 | runs: 1 # In how many runs have you split your tests? 158 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - '2.7' 5 | - '3.5' 6 | 7 | before_install: 8 | # From http://conda.pydata.org/docs/travis.html 9 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 10 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 11 | else 12 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 13 | fi 14 | - bash miniconda.sh -b -p $HOME/miniconda 15 | - export PATH="$HOME/miniconda/bin:$PATH" 16 | - hash -r 17 | - conda config --set always_yes yes --set changeps1 no 18 | - conda update -q conda 19 | - conda info -a 20 | - travis_retry conda create -n test python=$TRAVIS_PYTHON_VERSION pip pytest numpy future bcolz h5py pandas toolz 21 | - source activate test 22 | - travis_retry pip install arpeggio pytest-cov pytest-pep8 codecov scrutinizer-ocular 23 | - travis_retry pip install blosc bloscpack joblib psutil 24 | - travis_retry pip install git+https://github.com/sdvillal/whatami.git@ced628c07bd1#egg=whatami==4.0.git 25 | 26 | script: 27 | py.test -v -rs --doctest-modules --pep8 --cov jagged --cov-report term-missing jagged 28 | 29 | after_success: 30 | - codecov 31 | - ocular --data-file ".coverage" --config-file ".coveragerc" 32 | 33 | notifications: 34 | email: false 35 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2013-2014 The jagged developers. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the jagged Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 32 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include LICENSE.txt -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | jagged 2 | ====== 3 | 4 | Efficient storage of same-type, uneven-size arrays 5 | -------------------------------------------------- 6 | 7 | |Pypi Version| |Build Status| |Coverage Status| |Scrutinizer Status| 8 | 9 | Jagged_ is an ongoing amateur project exploring the storage panorama 10 | for datasets containing (large amounts of) arrays with the same type 11 | and number of columns, but varying number of rows. Examples of such 12 | datasets for which *jagged* has been used are collections of multivariate 13 | timeseries (short animal behaviour snippets) and collections of molecules 14 | (represented as varying length strings). 15 | 16 | Jagged aims to help analyzing data in the laptop and the cluster, in batch 17 | or interactively, providing a very lightweight store. Jagged provides fast 18 | retrieval of array subsets for many-GB datasets containing millions of rows. 19 | 20 | Requirements 21 | ------------ 22 | 23 | All the requirements are pip-installable and listed in in pypi. 24 | 25 | Jagged needs numpy_, whatami_ and python-future_. 26 | 27 | Jagged stores build on top of several optional high quality python libraries: c-blosc_, python-blosc_, 28 | bloscpack_, bcolz_ and joblib_. Testing relies on pytest_. 29 | 30 | Getting the right combination for blosc, python-blosc, bcolz and bloscpack can be a challenge 31 | (but worth the effort). At the moment (2015/09/02), we recommend using the latest released 32 | versions of c-blosc (1.7.0) in combination with the latest releases of python-blosc (1.2.7) 33 | and bloscpack (0.9.0). 34 | 35 | Jagged runs in python 2.7+ and 3.4+. At the moment it has been tested only on linux, but it should 36 | work on mac and windows as well. 37 | 38 | 39 | Installation 40 | ------------ 41 | 42 | It should suffice to use pip:: 43 | 44 | pip install jagged 45 | 46 | Showcase 47 | -------- 48 | 49 | Using jagged is simple. There are different implementations that provide 50 | two basic methods: **append** adds a new array to the store, **get** retrieves 51 | collections of arrays identified by their insertion order in the store. Usually 52 | the lifecycle of a jagged store is also simple: there is no explicit open, 53 | append and get calls can be interleaved at will and the only needed action 54 | to warrant consistency is to close after write, which can be achieved by calling 55 | **close**, by calling *get* or by using a with statement with the provided 56 | context manager. 57 | 58 | This is a `real life`_ small example combining jagged with indices and queries 59 | over real data. 
60 | 61 | Another synthetic example follows: 62 | 63 | .. code:: python 64 | 65 | from __future__ import print_function 66 | import os.path as op 67 | import shutil 68 | import numpy as np 69 | import pandas as pd 70 | import tempfile 71 | from jagged.mmap_backend import JaggedByMemMap 72 | from jagged.blosc_backend import JaggedByBlosc 73 | 74 | # A Jagged instance is all you need 75 | mmap_dir = tempfile.mkdtemp('mmap') 76 | jagged = JaggedByMemMap(op.expanduser(path=mmap_dir)) 77 | # You can drop here any JaggedRawStore implementation you want to 78 | 79 | # Generate a random dataset 80 | print('Creating a random dataset...') 81 | rng = np.random.RandomState(0) 82 | num_arrays = 1000 83 | max_length = 2000 84 | num_columns = 100 85 | originals = [rng.randn(rng.randint(0, max_length), num_columns) 86 | for _ in range(num_arrays)] 87 | 88 | # Add these to the store ("with" context is usually optional, but recommended) 89 | print('Populating the jagged store...') 90 | with jagged: 91 | indices = list(map(jagged.append, originals)) 92 | 93 | # Some jagged stores optimize queries retrieving arrays by their insertion order 94 | # Retrieval speed should not suffer much even with random queries 95 | shuffled_indices = rng.permutation(indices).tolist() 96 | shuffled_originals = [originals[i] for i in shuffled_indices] 97 | 98 | # What do we have in store? 99 | print('Number of arrays: %d, number of rows: %d' % (jagged.narrays, jagged.nrows)) 100 | # Out: Number of arrays: 200, number of rows: 193732 101 | print('Jagged shape=%r, dtype=%r, order=%r' % 102 | (jagged.shape, jagged.dtype, jagged.order)) 103 | # Out: Jagged shape=(193732, 50), dtype=dtype('float64'), order='C' 104 | 105 | # Check roundtrip 106 | roundtrippeds = jagged.get(shuffled_indices) 107 | for original, roundtripped in zip(shuffled_originals, roundtrippeds): 108 | assert np.array_equal(original, roundtripped) 109 | print('Roundtrip checks pass') 110 | 111 | # Jagged stores self-identified themselves (using whatami) 112 | print(jagged.what().id()) 113 | # Out: JaggedByMemMap(autoviews=True,contiguity=None) 114 | 115 | # Jagged stores can be iterated in chunks (see iter) 116 | for original, roundtripped in zip(originals, jagged): 117 | assert np.array_equal(original, roundtripped[0]) 118 | print('Roundtrip checks for iteration pass') 119 | 120 | # Some jagged stores allow to retrieve arbitrary rows without penalty 121 | # (i.e. without retrieving the whole containing array). 122 | # These are marked as "linear" in the store feature matrix. 123 | # You do so by passing a list of (base, size) segments. 124 | some_rows = jagged.get([[3, 22], [45, 1000]]) 125 | assert len(some_rows[1]) == 1000 126 | assert np.array_equal(some_rows[0], originals[0][3:25]) 127 | print('Roundtrip checks for row retrieval pass') 128 | 129 | # Some jagged stores allow to be lazy retrieving the arrays. 130 | # On top of that, the MemMap implementation allow memmapped arrays. 131 | # Can be handy to have long lists of views in memory 132 | # while letting the OS managing memory fetching and eviction for us. 133 | jbmm = JaggedByMemMap(op.expanduser(path=mmap_dir), 134 | autoviews=True, 135 | contiguity='auto') 136 | print('Retrieving %d arrays...' % (len(shuffled_indices) * 100)) 137 | many_arrays = jbmm.get(shuffled_indices * 100) 138 | # This will work also for pandas DataFrames as long as 139 | # "copy=True" is honored by the pandas constructor 140 | # that is, the dtype of the arrays is simple), 141 | print('Making %d dataframes...' 
% (len(shuffled_indices) * 100)) 142 | columns = pd.Index(np.arange(num_columns)) 143 | dfs = [pd.DataFrame(data=array, columns=columns, copy=False) 144 | for array in many_arrays] 145 | print('Checking roundtrip...') 146 | for original, roundtripped in zip(shuffled_originals * 100, dfs): 147 | assert np.array_equal(original, roundtripped) 148 | print('Roundtrip checks for lazy dataframes pass') 149 | 150 | # Jagged stores can be populated from other jagged stores 151 | blosc_dir = tempfile.mkdtemp('mmap') 152 | jbb = JaggedByBlosc(path=blosc_dir) 153 | print('Saving compressed (although these data are not compressable)...') 154 | jbb.append_from(jagged) 155 | for a_from_mmap, a_from_blosc in zip(jbb, jagged): 156 | assert np.array_equal(a_from_mmap, a_from_blosc) 157 | print(jbb.what().id()) 158 | print('Roundtrip checks for compressed arrays pass') 159 | # Out: JaggedByBlosc(compressor=BloscCompressor(cname='lz4hc', 160 | # level=5, 161 | # n_threads=1, 162 | # shuffle=True)) 163 | 164 | # We are done, cleanup 165 | shutil.rmtree(mmap_dir, ignore_errors=True) 166 | shutil.rmtree(blosc_dir, ignore_errors=True) 167 | 168 | 169 | Backends 170 | -------- 171 | 172 | Although rapidly changing, *jagged* already provides the following storage backends 173 | that can be considered as working and stable. Other backends are planned. 174 | 175 | +-------------------+------+-------+--------+------+-----+------+------+ 176 | | Backend | comp | chunk | column | mmap | lin | lazy | cont | 177 | +===================+======+=======+========+======+=====+======+======+ 178 | | JaggedByBlosc | X | | | X | | | | 179 | +-------------------+------+-------+--------+------+-----+------+------+ 180 | | JaggedByCarray | X | X | | | X | | X | 181 | +-------------------+------+-------+--------+------+-----+------+------+ 182 | | JaggedByH5Py | X | X | | | X | X | X | 183 | +-------------------+------+-------+--------+------+-----+------+------+ 184 | | JaggedByJoblib | X | X | | | | | | 185 | +-------------------+------+-------+--------+------+-----+------+------+ 186 | | JaggedByMemMap | | | | X | X | X | X | 187 | +-------------------+------+-------+--------+------+-----+------+------+ 188 | | JaggedByNPY | | | | | | | | 189 | +-------------------+------+-------+--------+------+-----+------+------+ 190 | | JaggedByBloscpack | X | | | | | | | 191 | +-------------------+------+-------+--------+------+-----+------+------+ 192 | | JaggedByPickle | X | X | | | | | | 193 | +-------------------+------+-------+--------+------+-----+------+------+ 194 | 195 | 196 | - comp: 197 | can be compressed 198 | - chunk: 199 | can be chunked 200 | - column: 201 | stores columns of the array contiguously (can be easily implemented by using a store per column) 202 | - mmap: 203 | can open a memmap to the data 204 | - lin: 205 | can retrieve any row without the need to retrieve the whole array it contains it 206 | - lazy: 207 | the arrays are not fetched immediatly; this can mean also that they can be managed 208 | as virtual-memory by the OS (JaggedByMemMap only) 209 | - cont: 210 | retrieved arrays can be forced to lie in contiguous memory segments 211 | 212 | 213 | Benchmarks 214 | ---------- 215 | 216 | What backend and parameters work best depends on whether the data is compressible or not, the 217 | sizes of the arrays and the kind of queries. 
We have a good idea of what works best for our data 218 | and query types and are working at providing a benchmarking framework, that can be useful if 219 | you can get a good sample of the data to store. Find here a preview_, results will be soon posted here. 220 | 221 | 222 | By-design constraints 223 | --------------------- 224 | 225 | Jagged would like to be simple: conceptually, to deploy and to use. 226 | 227 | Jagged is about retrieving full arrays. 228 | Focus is on fast retrieval of arbitrary batch queries. 229 | Batch queries over arrays appended closeby should be faster. 230 | Jagged is good for local caches or reducing the burden of 231 | network file systems. 232 | 233 | Jagged stores are append only. 234 | 235 | There is no transaction, replication or distribution or... 236 | It is all files in your local or network disks, written once, read many times. 237 | If you have complex data or requirements, there are many better options. 238 | If you have simple numerical arrays you want to load fast and store light, 239 | jagged might be worth a try. 240 | 241 | Not important efforts have been given yet to optimize 242 | (although some backends work quite smoothly). 243 | At the moment, everything is simple algorithms implemented in pure python. 244 | 245 | 246 | Links 247 | ----- 248 | 249 | This neat blogpost_ from Matthew Rocklin is highly recommended, as it delivers 250 | the promised *"vocabulary to talk about efficient tabular storage"*. Add perhaps 251 | "blocked" (as in "compression is done in cache-friendly sized blocks") and 252 | "chunked" (as in "retrieval is done in I/O-friendly sized chunks") to the lexicon. 253 | The castra_ project is worth a look. 254 | 255 | 256 | .. _Jagged: https://github.com/sdvillal/jagged 257 | .. |Pypi Version| image:: https://badge.fury.io/py/jagged.svg 258 | :target: http://badge.fury.io/py/jagged 259 | .. |Build Status| image:: https://travis-ci.org/sdvillal/jagged.svg?branch=master 260 | :target: https://travis-ci.org/sdvillal/jagged/branches 261 | .. |Coverage Status| image:: http://codecov.io/github/sdvillal/jagged/coverage.svg?branch=master 262 | :target: http://codecov.io/github/sdvillal/jagged?branch=master 263 | .. |Scrutinizer Status| image:: https://scrutinizer-ci.com/g/sdvillal/jagged/badges/quality-score.png?b=master 264 | :target: https://scrutinizer-ci.com/g/sdvillal/jagged/?branch=master 265 | .. _real life: https://github.com/strawlab/strawlab-examples/blob/master/strawlab_examples/euroscipy/euroscipy_example.py 266 | .. _preview: https://github.com/sdvillal/strawlab-examples/tree/master/strawlab_examples/benchmarks 267 | .. _numpy: http://www.numpy.org/ 268 | .. _whatami: http://www.github.com/sdvillal/whatami 269 | .. _python-future: http://python-future.org/ 270 | .. _c-blosc: https://github.com/Blosc/c-blosc 271 | .. _python-blosc: https://github.com/Blosc/python-blosc 272 | .. _bloscpack: https://github.com/Blosc/bloscpack 273 | .. _bcolz: https://github.com/Blosc/bcolz 274 | .. _joblib: https://pythonhosted.org/joblib/ 275 | .. _pytest: http://pytest.org 276 | .. _blogpost: http://matthewrocklin.com/blog/work/2015/08/28/Storage/ 277 | .. _castra: https://github.com/blaze/castra 278 | -------------------------------------------------------------------------------- /examples/readme_examples.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Examples copied verbatim in the readme. 3 | This should probably be a notebook. 
4 | """ 5 | from __future__ import print_function 6 | import os.path as op 7 | import shutil 8 | import numpy as np 9 | import pandas as pd 10 | import tempfile 11 | from jagged.mmap_backend import JaggedByMemMap 12 | from jagged.blosc_backend import JaggedByBlosc 13 | 14 | # A Jagged instance is all you need 15 | mmap_dir = tempfile.mkdtemp('mmap') 16 | jagged = JaggedByMemMap(op.expanduser(path=mmap_dir)) 17 | # You can drop here any JaggedRawStore implementation you want to 18 | 19 | # Generate a random dataset 20 | print('Creating a random dataset...') 21 | rng = np.random.RandomState(0) 22 | num_arrays = 1000 23 | max_length = 2000 24 | num_columns = 100 25 | originals = [rng.randn(rng.randint(0, max_length), num_columns) 26 | for _ in range(num_arrays)] 27 | 28 | # Add these to the store ("with" context is usually optional, but recommended) 29 | print('Populating the jagged store...') 30 | with jagged: 31 | indices = list(map(jagged.append, originals)) 32 | 33 | # Some jagged stores optimize queries retrieving arrays by their insertion order 34 | # Retrieval speed should not suffer much even with random queries 35 | shuffled_indices = rng.permutation(indices).tolist() 36 | shuffled_originals = [originals[i] for i in shuffled_indices] 37 | 38 | # What do we have in store? 39 | print('Number of arrays: %d, number of rows: %d' % (jagged.narrays, jagged.nrows)) 40 | # Out: Number of arrays: 200, number of rows: 193732 41 | print('Jagged shape=%r, dtype=%r, order=%r' % 42 | (jagged.shape, jagged.dtype, jagged.order)) 43 | # Out: Jagged shape=(193732, 50), dtype=dtype('float64'), order='C' 44 | 45 | # Check roundtrip 46 | roundtrippeds = jagged.get(shuffled_indices) 47 | for original, roundtripped in zip(shuffled_originals, roundtrippeds): 48 | assert np.array_equal(original, roundtripped) 49 | print('Roundtrip checks pass') 50 | 51 | # Jagged stores self-identified themselves (using whatami) 52 | print(jagged.what().id()) 53 | # Out: JaggedByMemMap(autoviews=True,contiguity=None) 54 | 55 | # Jagged stores can be iterated in chunks (see iter) 56 | for original, roundtripped in zip(originals, jagged): 57 | assert np.array_equal(original, roundtripped[0]) 58 | print('Roundtrip checks for iteration pass') 59 | 60 | # Some jagged stores allow to retrieve arbitrary rows without penalty 61 | # (i.e. without retrieving the whole containing array). 62 | # These are marked as "linear" in the store feature matrix. 63 | # You do so by passing a list of (base, size) segments. 64 | some_rows = jagged.get([[3, 22], [45, 1000]]) 65 | assert len(some_rows[1]) == 1000 66 | assert np.array_equal(some_rows[0], originals[0][3:25]) 67 | print('Roundtrip checks for row retrieval pass') 68 | 69 | # Some jagged stores allow to be lazy retrieving the arrays. 70 | # On top of that, the MemMap implementation allow memmapped arrays. 71 | # Can be handy to have long lists of views in memory 72 | # while letting the OS managing memory fetching and eviction for us. 73 | jbmm = JaggedByMemMap(op.expanduser(path=mmap_dir), 74 | autoviews=True, 75 | contiguity='auto') 76 | print('Retrieving %d arrays...' % (len(shuffled_indices) * 100)) 77 | many_arrays = jbmm.get(shuffled_indices * 100) 78 | # This will work also for pandas DataFrames as long as 79 | # "copy=True" is honored by the pandas constructor 80 | # that is, the dtype of the arrays is simple), 81 | print('Making %d dataframes...' 
% (len(shuffled_indices) * 100)) 82 | columns = pd.Index(np.arange(num_columns)) 83 | dfs = [pd.DataFrame(data=array, columns=columns, copy=False) 84 | for array in many_arrays] 85 | print('Checking roundtrip...') 86 | for original, roundtripped in zip(shuffled_originals * 100, dfs): 87 | assert np.array_equal(original, roundtripped) 88 | print('Roundtrip checks for lazy dataframes pass') 89 | 90 | # Jagged stores can be populated from other jagged stores 91 | blosc_dir = tempfile.mkdtemp('mmap') 92 | jbb = JaggedByBlosc(path=blosc_dir) 93 | print('Saving compressed (although these data are not compressable)...') 94 | jbb.append_from(jagged) 95 | for a_from_mmap, a_from_blosc in zip(jbb, jagged): 96 | assert np.array_equal(a_from_mmap, a_from_blosc) 97 | print(jbb.what().id()) 98 | print('Roundtrip checks for compressed arrays pass') 99 | # Out: JaggedByBlosc(compressor=BloscCompressor(cname='lz4hc', 100 | # level=5, 101 | # n_threads=1, 102 | # shuffle=True)) 103 | 104 | # We are done, cleanup 105 | shutil.rmtree(mmap_dir, ignore_errors=True) 106 | shutil.rmtree(blosc_dir, ignore_errors=True) 107 | -------------------------------------------------------------------------------- /jagged/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import absolute_import 3 | 4 | # --- Backend imports 5 | 6 | from .mmap_backend import JaggedByMemMap 7 | 8 | try: 9 | from .blosc_backend import JaggedByBlosc 10 | except ImportError: # pragma: no cover 11 | JaggedByBlosc = None 12 | 13 | try: 14 | from .bcolz_backend import JaggedByCarray 15 | except ImportError: # pragma: no cover 16 | JaggedByCarray = None 17 | 18 | try: 19 | from .h5py_backend import JaggedByH5Py 20 | except ImportError: # pragma: no cover 21 | JaggedByH5Py = None 22 | 23 | from .npy_backend import JaggedByNPY 24 | 25 | try: 26 | from .bloscpack_backend import JaggedByBloscpack 27 | except ImportError: # pragma: no cover 28 | JaggedByBloscpack = None 29 | 30 | from .pickle_backend import JaggedByPickle 31 | 32 | try: 33 | from .joblib_backend import JaggedByJoblib 34 | except ImportError: # pragma: no cover 35 | JaggedByJoblib = None 36 | 37 | 38 | # --- Version 39 | 40 | __version__ = '0.1.1-dev0' 41 | -------------------------------------------------------------------------------- /jagged/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Convenient (and somehow performing) storage of objects with homogeneous types but different lengths. 3 | 4 | "jagged" array providers have very simple, low level contracts: 5 | - Focus on reading performance, append only store. 
6 | - Use numpy arrays as the canonical data carriers 7 | - May or may not restrict the type of the stored elements 8 | - Retrieve only by providing indices *collections* 9 | No explicit support for slice notation 10 | - All clases are whatami whatables 11 | """ 12 | from __future__ import absolute_import, unicode_literals, print_function 13 | from future.utils import bytes_to_native_str 14 | from abc import ABCMeta 15 | from array import array 16 | from functools import partial 17 | import os.path as op 18 | from operator import itemgetter 19 | import json 20 | 21 | from future.builtins import range, map 22 | from toolz import merge, partition_all 23 | import numpy as np 24 | 25 | from jagged.misc import ensure_dir 26 | from whatami import whatable 27 | 28 | try: 29 | import cPickle as pickle 30 | except ImportError: # pragma: no cover 31 | import pickle 32 | 33 | 34 | # --- Journals (persitence of array lengths) 35 | 36 | 37 | def _int_or_0(v): 38 | if v is None: 39 | return 0 40 | return int(v) 41 | 42 | 43 | def _read_full_file(x, path): 44 | """Reads the full contentes of file path into array x.""" 45 | with open(path, 'rb') as reader: 46 | reader.seek(0, 2) 47 | size = reader.tell() 48 | reader.seek(0, 0) 49 | if size % x.itemsize != 0: 50 | raise Exception('Truncated file') 51 | x.fromfile(reader, size // x.itemsize) 52 | return x 53 | 54 | 55 | class JaggedJournal(object): 56 | """Keeps track and persists information about the sizes of added arrays.""" 57 | 58 | # a journal must be instantiated only when jagged knows its location 59 | # a journal can be shared by many jagged instances (e.g. when storing different columns by different jaggeds) 60 | 61 | def __init__(self, path): 62 | super(JaggedJournal, self).__init__() 63 | self._path = ensure_dir(path) 64 | # base and length of each added array 65 | self._lengths_file = op.join(self._path, 'lengths.array') 66 | self._lengths = self._read_lengths() 67 | self._bases = None 68 | # total number of rows and arrays 69 | self._sizes_file = op.join(self._path, 'size.json') 70 | self._numrows, self._numarrays = self._read_sizes() 71 | 72 | def append(self, data): 73 | """Appends the data array to the journal.""" 74 | self._add_length(data) 75 | self._add_sizes(data) 76 | 77 | # --- Num rows, num arrays (redundant with lengths, light and good for redundancy) 78 | 79 | def _add_sizes(self, data): 80 | """Adds to numrows and numarrays the sizes of the array data and immediatly persists them.""" 81 | self._numrows += len(data) 82 | self._numarrays += 1 83 | with open(self._sizes_file, 'w') as writer: 84 | json.dump({'numrows': self._numrows, 'numarrays': self._numarrays}, writer, indent=2) 85 | 86 | def _read_sizes(self): 87 | """Reads the current numrows and numarrays values from persistent storage. 88 | If there is no info stored, makes them 0. 
89 | """ 90 | if op.isfile(self._sizes_file): 91 | with open(self._sizes_file, 'r') as reader: 92 | sizes = json.load(reader) 93 | return _int_or_0(sizes['numrows']), _int_or_0(sizes['numarrays']) 94 | return 0, 0 95 | 96 | def numrows(self): 97 | """Returns the total number of rows in the jagged instance.""" 98 | return self._numrows 99 | 100 | def numarrays(self): 101 | """Returns the number of arrays in the jagged instance.""" 102 | return self._numarrays 103 | 104 | # --- Base and size of each array 105 | 106 | def _add_length(self, data): 107 | """Adds the length to the journal and immediatly persists it.""" 108 | self._lengths.append(len(data)) 109 | with open(self._lengths_file, 'ab') as writer: 110 | self._lengths[-1:].tofile(writer) 111 | 112 | def _read_lengths(self): 113 | """Reads the lengths from persistent storage, if it does not exist, returns an empty array.""" 114 | lengths = array(bytes_to_native_str(b'l')) 115 | if op.isfile(self._lengths_file): 116 | _read_full_file(lengths, self._lengths_file) 117 | return lengths 118 | 119 | def lengths(self): 120 | """Returns an array with the length of each array added to the journal.""" 121 | return self._lengths 122 | 123 | def bases(self): 124 | """Returns where each array would start if the storage is linear.""" 125 | if self._bases is None or len(self._bases) < len(self._lengths): 126 | self._bases = np.hstack(([0], np.cumsum(self._lengths))) 127 | return self._bases 128 | 129 | def start_end(self, index): 130 | """Returns the start and end of the array at index.""" 131 | base, size = self.base_size(index) 132 | return base, base + size 133 | 134 | def base_size(self, index): 135 | """Returns the base and size of the array at index.""" 136 | return self.bases()[index], self.lengths()[index] 137 | 138 | # --- Sanity checks 139 | 140 | def check_consistency(self): 141 | """Checks the internal consistency of the journal.""" 142 | assert len(self.lengths()) == len(self.bases()) 143 | assert len(self.lengths()) == self.numarrays() 144 | assert len(np.sum(self.lengths())) == self.numrows() 145 | 146 | # --- Raw stores 147 | 148 | 149 | @whatable(add_properties=False) 150 | class JaggedRawStore(object): 151 | """Persistent storage of objects of the same type but different length.""" 152 | 153 | __metaclass__ = ABCMeta 154 | 155 | def __init__(self, path, journal=None): 156 | super(JaggedRawStore, self).__init__() 157 | self._path = path 158 | if self._path is not None: 159 | ensure_dir(self._path) 160 | self._template = None # how the saved arrays look like 161 | self._journal = journal # sizes of the added arrays 162 | 163 | # --- Where this storage resides 164 | 165 | def path_or_fail(self): 166 | """Returns the path if set, otherwise raises an exception.""" 167 | if self._path is None: 168 | raise Exception('In-memory only arrays are not implemented for %s.' 
% self.what().id()) 169 | return self._path 170 | 171 | # --- Journal 172 | 173 | def journal(self): 174 | if self._journal is None: 175 | self._journal = JaggedJournal(op.join(self.path_or_fail(), 'meta', 'journal')) 176 | return self._journal 177 | 178 | # --- Template 179 | 180 | def template(self): 181 | template_dir = ensure_dir(op.join(self.path_or_fail(), 'meta', 'template')) 182 | template_path = op.join(template_dir, 'template.npy') 183 | if self._template is None: 184 | if op.isfile(template_path): 185 | self._template = np.load(template_path) 186 | return self._template 187 | 188 | def _write_template(self, data): 189 | template_dir = ensure_dir(op.join(self.path_or_fail(), 'meta', 'template')) 190 | template_path = op.join(template_dir, 'template.npy') 191 | np.save(template_path, data[:0]) 192 | 193 | def can_add(self, data): 194 | """Returns True iff data can be stored. 195 | This usually means it is of the same kind as previously stored arrays. 196 | """ 197 | # Obviously we could just store arbitrary arrays in some implementations (e.g. NPY) 198 | # But lets keep jagged contracts... 199 | template = self.template() 200 | if template is None: 201 | return True 202 | return (template.dtype >= data.dtype and 203 | data.shape[-1] == template.shape[-1] and 204 | np.isfortran(data) == np.isfortran(data)) 205 | 206 | # --- Lifecycle 207 | 208 | # N.B. at the moment, to make things simple, we only want write or read 209 | # We should ensure concurrency does not break this rule (only one writer xor many readers) 210 | # Of course we could use stuff like SWMR from hdf5 or role our own, more featureful and compled concurrency 211 | # Not a priority 212 | 213 | @property 214 | def is_writing(self): 215 | """Returns whether we can append more data using this jagged instance.""" 216 | raise NotImplementedError() 217 | 218 | @property 219 | def is_reading(self): 220 | """Returns whether we can append more data using this jagged instance.""" 221 | raise NotImplementedError() 222 | 223 | @property 224 | def is_open(self): 225 | """Returns whether we are currently open in any mode.""" 226 | raise NotImplementedError() 227 | 228 | def close(self): 229 | """Flushes buffers to permanent storage and closes the underlying backend.""" 230 | raise NotImplementedError() 231 | 232 | # --- Writing data 233 | 234 | def _open_write(self, data=None): 235 | """Opens in writing mode, returns None. 236 | 237 | Parameters 238 | ---------- 239 | data : numpy array like, default None 240 | data schema to use by the storage, needed if this is the first opening of the repository 241 | """ 242 | raise NotImplementedError() 243 | 244 | def append(self, data): 245 | """Appends new data to this storage. 246 | 247 | If the storage is empty, this will define the dtype of the store. 248 | 249 | Parameters 250 | ---------- 251 | data : numpy-array like 252 | The data to append, must have a compatible dtype with what was already added to the store. 
253 | 254 | Returns 255 | ------- 256 | An integer addressing the added array in the storage 257 | """ 258 | 259 | # at the moment we do not allow coordinate-less stores 260 | self.path_or_fail() 261 | 262 | # check data validity 263 | if any(s < 1 for s in data.shape[1:]): 264 | raise Exception('Cannot append data with sizes 0 in non-leading dimension (%s, %r)' % 265 | (self.what().id(), data.shape)) 266 | 267 | # check we can write 268 | if self.is_reading and not self.is_writing: 269 | self.close() 270 | 271 | # template 272 | if self.template() is None: 273 | self._write_template(data) 274 | assert self.can_add(data) 275 | 276 | # id log 277 | if not op.isfile(op.join(self.path_or_fail(), 'meta', 'whatid.txt')): 278 | ensure_dir(op.join(self.path_or_fail(), 'meta')) 279 | with open(op.join(self.path_or_fail(), 'meta', 'whatid.txt'), 'w') as writer: 280 | writer.write(self.what().id()) 281 | 282 | # open 283 | self._open_write(data) 284 | 285 | # write 286 | self._append_hook(data) 287 | 288 | # bookkeeping 289 | index = self.journal().numarrays() 290 | self.journal().append(data) 291 | 292 | # done 293 | return index 294 | 295 | def _append_hook(self, data): 296 | """Saves the data, returns nothing.""" 297 | raise NotImplementedError() 298 | 299 | def append_from(self, jagged, arrays_per_chunk=None): 300 | """Appends all the contents of jagged.""" 301 | for chunk in jagged.iter_arrays(arrays_per_chunk=arrays_per_chunk): 302 | for data in chunk: 303 | self.append(data) 304 | 305 | # --- Reading 306 | 307 | def _open_read(self): 308 | """Opens in reading mode, returns None.""" 309 | raise NotImplementedError() 310 | 311 | def _get_views(self, keys, columns): 312 | """Returns a list of arrays corresponding to the provided keys and columns.""" 313 | raise NotImplementedError() 314 | 315 | def get(self, keys=None, columns=None, factory=None): 316 | """Returns a list with the data specified in `keys` (and `columns`), possibly transformed by `factory`. 317 | 318 | Concrete implementations may warrant things like "all segments actually lie in congiguous regions in memory". 319 | 320 | Parameters 321 | ---------- 322 | keys : list of keys 323 | specifies which elements to retrieve; if None, all arrays are returned 324 | 325 | columns : list of integers, default None 326 | specifies which columns to retrieve; if None, retrieve all columns 327 | 328 | factory : factory(ndarray)->desired type, default None 329 | transforms each of the returned elements into a desired type (for example, a pandas DataFrame) 330 | another use can be to apply summary statistics 331 | 332 | Returns 333 | ------- 334 | A list with the retrieved elements, possibly transformed by factory. 
335 | """ 336 | 337 | # at the moment we do not allow coordinate-less stores 338 | self.path_or_fail() 339 | 340 | # flush if needed 341 | if self.is_writing and not self.is_reading: 342 | self.close() 343 | 344 | # open 345 | self._open_read() 346 | 347 | # read 348 | views = self._get_views(keys, columns) 349 | 350 | return views if factory is None else map(factory, views) 351 | 352 | # -- Iteration 353 | 354 | def iter_arrays(self, arrays_per_chunk=None): 355 | """Iterates over the arrays in this store.""" 356 | if arrays_per_chunk is None: 357 | for key in range(self.journal().numarrays()): 358 | yield self.get([key]) 359 | elif arrays_per_chunk <= 0: 360 | raise ValueError('arrays_per_chunk must be None or bigger than 0, it is %r' % arrays_per_chunk) 361 | else: 362 | for segments in partition_all(arrays_per_chunk, range(self.journal().numarrays())): 363 | yield self.get(segments) 364 | 365 | def __iter__(self, arrays_per_chunk=None): 366 | """Alias to iter_arrays.""" 367 | return self.iter_arrays(arrays_per_chunk=arrays_per_chunk) 368 | 369 | # def iter_rows(self, max_rows_per_chunk): 370 | # # Iterates segments in chunks with max_rows_per_chunk as upper bound 371 | # # (but will give at least one segment at a time) 372 | # # This can be more (e.g. SegmentRawStorage) or less involved (e.g. JaggedByNumpy) 373 | # # Useful to iterate with really controlled amount of memory 374 | # raise NotImplementedError() 375 | 376 | # --- Factories / curries / partials 377 | 378 | def copyconf(self, **params): 379 | """Returns a partial function that instantiates this type of store 380 | with changed default parameters. 381 | 382 | N.B. this default implementation is based on being able to retrieve all default parameters 383 | using the `what` method; override if that is not the case. 384 | 385 | Parameters 386 | ---------- 387 | params: **dict 388 | The parameters that will be fixed in the returned factory function. 389 | """ 390 | return whatable(partial(self.__class__, **merge(self.what().conf, params)), add_properties=False) 391 | 392 | # --- shape, dtype, order 393 | 394 | @property 395 | def shape(self): 396 | """Returns a tuple with the current size of the storage in each dimension.""" 397 | ncols = self.ncols 398 | return None if ncols is None else (self.nrows, ncols) 399 | 400 | @property 401 | def dtype(self): 402 | """Returns the data type of the store.""" 403 | template = self.template() 404 | return None if template is None else template.dtype 405 | 406 | @property 407 | def ndims(self): 408 | """Returns the number of dimensions.""" 409 | # Actually at the moment we only support ndims == 2 410 | shape = self.shape 411 | return len(self.shape) if shape is not None else None 412 | 413 | @property 414 | def ncols(self): 415 | """Returns the number of columns.""" 416 | template = self.template() 417 | return None if template is None else template.shape[1] 418 | 419 | @property 420 | def nrows(self): 421 | """Returns the number of rows in the store.""" 422 | return self.journal().numrows() 423 | 424 | @property 425 | def narrays(self): 426 | """Returns the number or arrays in the store.""" 427 | return self.journal().numarrays() 428 | 429 | @property 430 | def order(self): 431 | """Returns 'C' for row major, 'F' for column major.""" 432 | template = self.template() 433 | if template is None: 434 | return None 435 | return 'F' if np.isfortran(template) else 'C' 436 | 437 | # --- Context manager and other magics... 
438 | 439 | def __enter__(self): 440 | return self 441 | 442 | def __exit__(self, *_): 443 | self.close() 444 | 445 | def __len__(self): 446 | """Returns the size of the leading dimension.""" 447 | return self.shape[0] if self.shape is not None else 0 448 | 449 | # Also consider register to atexit as a parameter to the constructor 450 | 451 | 452 | # --- Linear stores (can address arbitrary row segments) 453 | 454 | 455 | class LinearRawStorage(JaggedRawStore): 456 | 457 | __metaclass__ = ABCMeta # no harm, lint stops complaining 458 | 459 | def __init__(self, path, journal=None, contiguity=None): 460 | """ 461 | A linear raw storage can access arbitrary rows using base (row index in the storage) and size 462 | (number of rows to retrieve). 463 | 464 | Parameters 465 | ---------- 466 | journal : must quack like JaggedJournal, default None 467 | see base class 468 | 469 | contiguity : string or None, default None 470 | indicates the type of contiguity sought for the results; for performance segments retrieval 471 | does not need to be done in any order 472 | - 'read': a best effort should be done to leave retrieved segments order-contiguous in memory; 473 | this can potentially speed up operations reading these data in the order specified by segments 474 | - 'write': a best effort should be done to write segments sequentially in memory; 475 | this can potentially speed up retrieval 476 | - 'auto': allow the backend to decide the return flavor; 477 | using this the backends can return "lazy" or "cached" arrays 478 | (for example, views on memmapped arrays or hdf5 datasets) 479 | - None: do not force any contiguity nor allow any strange return, just plain numpy arrays 480 | owning their own data; this is safest and usually well performing 481 | usually 'read' can be a good idea for analysis, and 'auto' can have memory saving benefits 482 | beware that forcing contiguity for speed might lead to memory leaks 483 | (the whole retrieved segments won't be released while any of them is reacheable) 484 | """ 485 | super(LinearRawStorage, self).__init__(path, journal=journal) 486 | self.contiguity = contiguity 487 | 488 | def _get_views(self, keys, columns): 489 | # get all segments if segments is None 490 | if keys is None: 491 | keys = range(self.journal().numarrays()) 492 | keys = [self.journal().base_size(key) if isinstance(key, int) else key for key in keys] 493 | 494 | if 0 == len(keys): 495 | return [] 496 | 497 | # retrieve data 498 | ne, nc = self.shape 499 | views = retrieve_contiguous(keys, columns, self._get_hook, self.dtype, ne, nc, self.contiguity) 500 | 501 | return views 502 | 503 | def _get_hook(self, base, size, columns, dest): 504 | raise NotImplementedError() 505 | 506 | def iter_rows(self, rows_per_chunk): 507 | """Reads rows_per_chunk rows at a time until all is read.""" 508 | base = 0 509 | total = len(self) 510 | while base < total: 511 | size = min(rows_per_chunk, total - base) 512 | yield self.get([(base, size)])[0] 513 | base += size 514 | 515 | 516 | def retrieve_contiguous(segments, columns, reader, dtype, ne, nc, contiguity): 517 | 518 | # Check for valid contiguity 519 | if contiguity not in ('read', 'write', 'auto', None): 520 | raise ValueError('Unknown contiguity scheme: %r' % contiguity) 521 | 522 | # Check query sanity and prepare contiguous query 523 | # dest_base tells where each query must go to in case of contiguity='read' 524 | # note that dest_base is not useful for unsorting in the presence of 0-length items (so we explicitly store order) 525 | dest_base = 0 
526 | query_dest = [] 527 | for order, (base, size) in enumerate(segments): 528 | if (base + size) > ne or base < 0: 529 | raise ValueError('Out of bounds query (base=%d, size=%d, maxsize=%d)' % (base, size, ne)) 530 | query_dest.append((order, base, dest_base, size)) 531 | dest_base += size 532 | total_size = dest_base 533 | 534 | nc = len(columns) if columns is not None else nc 535 | 536 | # Retrieve 537 | views = [] 538 | if contiguity == 'read': 539 | # Hope for one-malloc only, but beware of memory leaks 540 | dest = np.empty((total_size, nc), dtype=dtype) 541 | # Populate 542 | for order, base, dest_base, size in sorted(query_dest): 543 | view = dest[dest_base:dest_base+size] 544 | view = reader(base, size, columns, view) 545 | views.append((order, view)) 546 | elif contiguity == 'write': 547 | # Hope for one-malloc only, but beware of memory leaks 548 | dest = np.empty((total_size, nc), dtype=dtype) 549 | # Populate 550 | dest_base = 0 551 | for order, base, _, size in sorted(query_dest): 552 | view = dest[dest_base:dest_base+size] 553 | view = reader(base, size, columns, view) 554 | dest_base += size 555 | views.append((order, view)) 556 | elif contiguity == 'auto': 557 | for order, base, _, size in sorted(query_dest): 558 | view = reader(base, size, columns, None) 559 | views.append((order, view)) 560 | else: 561 | for order, base, _, size in sorted(query_dest): 562 | view = np.empty((size, nc), dtype=dtype) 563 | view = reader(base, size, columns, view) 564 | views.append((order, view)) 565 | 566 | # Unpack views while restoring original order 567 | return list(map(itemgetter(1), sorted(views, key=itemgetter(0)))) 568 | -------------------------------------------------------------------------------- /jagged/bcolz_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import bcolz 4 | from whatami import whatable 5 | import os.path as op 6 | from .base import LinearRawStorage 7 | 8 | 9 | class JaggedByCarray(LinearRawStorage): 10 | """ 11 | A Jagged store that uses in-disk `bcolz.carray` to store the data. 12 | 13 | This backend should be good for compressable data accessed sequentially or in batch range queries. 14 | Random access of small segments will suffer from a considerable performance degradation. 15 | 16 | Usually these stores are backed by many files, so access via network file systems or from spin disks can 17 | potentially be inefficient. 
18 | 19 | Parameters 20 | ---------- 21 | path : string 22 | the carray will/must reside here 23 | 24 | contiguity : string, default None 25 | see base class 26 | 27 | journal : must quack like JaggedJournal, default None 28 | see base class 29 | 30 | expectedlen : int, default None 31 | passed to the carray on creation, the expected number of rows in the store 32 | carray will use it to guess a good chunksize 33 | the actual size of each chunk will of course depend also on the number of columns 34 | must be None if `chunklen` is provided 35 | 36 | chunklen : int, default None 37 | passed to the carray on creation, the number of rows to store per chunk 38 | the actual size of each chunk will of course depend also on the number of columns 39 | must be None if `expectedlen` is provided 40 | 41 | cparams : `bcolz.cparams`, default bcolz.cparams(clevel=5, shuffle=False, cname='lz4hc') 42 | the compression configuration for bcolz; only used if the array is empty 43 | """ 44 | 45 | def __init__(self, 46 | path=None, 47 | journal=None, 48 | contiguity=None, 49 | # bcolz params 50 | expectedlen=None, 51 | chunklen=1024 ** 2 // 2, # 500K rows 52 | cparams=bcolz.cparams(clevel=5, shuffle=False, cname='lz4hc')): 53 | 54 | super(JaggedByCarray, self).__init__(path, journal=journal, contiguity=contiguity) 55 | 56 | self.expectedlen = expectedlen 57 | self.chunklen = chunklen 58 | self.cparams = whatable(cparams, add_properties=True) 59 | self._bcolz = None 60 | 61 | def _bcolz_dir(self): 62 | # Needs to be different than self._path or metainfo gets deleted 63 | return op.join(self.path_or_fail(), 'bcolz') 64 | 65 | # --- Write 66 | 67 | def _open_write(self, data=None): 68 | if self._bcolz is None: 69 | try: # append 70 | self._bcolz = \ 71 | bcolz.carray(None, 72 | rootdir=self._bcolz_dir(), 73 | mode='a', 74 | # bcolz conf in case mode='a' semantics change to create, otherwise innocuous 75 | chunklen=self.chunklen, 76 | expectedlen=self.expectedlen, 77 | cparams=self.cparams) 78 | except: # create 79 | self._bcolz = \ 80 | bcolz.carray(data[0:0], 81 | rootdir=self._bcolz_dir(), 82 | mode='w', 83 | chunklen=self.chunklen, 84 | expectedlen=self.expectedlen, 85 | cparams=self.cparams) 86 | 87 | def _append_hook(self, data): 88 | self._bcolz.append(data) 89 | 90 | # --- Read 91 | 92 | def _open_read(self): 93 | if self._bcolz is None: 94 | self._bcolz = bcolz.carray(None, rootdir=self._bcolz_dir(), mode='r') 95 | 96 | def _get_hook(self, base, size, columns, dest): 97 | if dest is not None and columns is None: 98 | # measure if this has any performance benefit, if so, asks for it to be public API 99 | self._bcolz._getrange(base, size, dest) 100 | return dest 101 | if columns is not None: 102 | view = self._bcolz[base:base+size, columns] 103 | else: 104 | view = self._bcolz[base:base+size] 105 | if dest is not None: 106 | dest[:] = view 107 | return dest 108 | return view 109 | 110 | # --- Lifecycle 111 | 112 | @property 113 | def is_writing(self): 114 | return self.is_open and self._bcolz.mode in ('w', 'a') 115 | 116 | @property 117 | def is_reading(self): 118 | return self.is_open and self._bcolz.mode == 'r' 119 | 120 | @property 121 | def is_open(self): 122 | return self._bcolz is not None 123 | 124 | def close(self): 125 | if self.is_writing: 126 | self._bcolz.flush() 127 | self._bcolz = None 128 | -------------------------------------------------------------------------------- /jagged/benchmarks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdvillal/jagged/e737eae08005ab20222b46668f54e6b5dcb8e04c/jagged/benchmarks/__init__.py -------------------------------------------------------------------------------- /jagged/benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Benchmarking utilities. 3 | Some of these are inspired by bloscpack / bloscpack-benchmarks. 4 | https://github.com/Blosc/bloscpack-benchmarking 5 | """ 6 | import os.path as op 7 | import socket 8 | import datetime 9 | import subprocess 10 | import json 11 | import os 12 | from jagged.misc import ensure_dir 13 | import psutil 14 | 15 | # 16 | # Timing is hard and we should at least use timeit 17 | # (something with support for calibration and repetition). 18 | # A great resource is also pytest benchmark 19 | # https://pypi.python.org/pypi/pytest-benchmark/2.5.0 20 | # https://bitbucket.org/haypo/misc/src/tip/python/benchmark.py 21 | # There are a bunch of benchmarker / timer etc. libraries in pypi 22 | # Do not forget about /usr/bin/time -v 23 | # 24 | 25 | 26 | def timestr(): 27 | return datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 28 | 29 | 30 | def hostname(): 31 | return socket.gethostname() 32 | 33 | 34 | def collect_sysinfo(dest=None): 35 | """ 36 | Collects basic information from the machine using several tools. 37 | This needs to run as root. 38 | Note that speeds are theoretical, not measured 39 | (specially peak network and network drives speeds should be measured). 40 | 41 | Prerequisites 42 | ------------- 43 | If in ubuntu: 44 | sudo apt-get install smartmontools inxi dmidecode 45 | If in arch: 46 | sudo pacman -S smartmontools inxi dmidecode 47 | 48 | What is run 49 | ----------- 50 | # Basic information about mount points 51 | mount > mount.info 52 | # Inxi reports 53 | inxi > inxi.info 54 | # Full dmidecode 55 | dmidecode > dmidecode.info 56 | # Network speed information 57 | dmesg | grep -i duplex > network-speed.info 58 | # SMART information 59 | sudo smartctl -a /dev/sda > smartctl-sda.info 60 | 61 | References 62 | ---------- 63 | http://www.binarytides.com/linux-commands-hardware-info/ 64 | http://www.cyberciti.biz/faq/linux-command-to-find-sata-harddisk-link-speed/ 65 | http://www.cyberciti.biz/faq/howto-setup-linux-lan-card-find-out-full-duplex-half-speed-or-mode/ 66 | http://www.cyberciti.biz/tips/linux-find-out-wireless-network-speed-signal-strength.html 67 | """ 68 | 69 | # 70 | # Any way of getting actual memory latencies, CAS...? 
71 |     # Also we could look at pure python libraries like dmidecode
72 |     #
73 | 
74 |     if dest is None:
75 |         dest = op.join(op.dirname(__file__), 'sysinfo')
76 |     dest = op.join(ensure_dir(op.join(dest, hostname())), timestr() + '.json')
77 | 
78 |     info = {  # .decode() so the values are text and json-serializable under both py2 and py3
79 |         'mount': subprocess.check_output('mount').decode('utf-8'),
80 |         'dmesg-eth': '\n'.join(line for line in subprocess.check_output('dmesg').decode('utf-8').splitlines() if 'duplex' in line),
81 |         'iwconfig': subprocess.check_output('iwconfig').decode('utf-8'),
82 |         'inxiF': subprocess.check_output(['inxi', '-c 0', '-F']).decode('utf-8'),
83 |         # add some more inxi stuff
84 |     }
85 | 
86 |     with open(dest, 'w') as writer:
87 |         json.dump(info, writer, indent=2, sort_keys=True)
88 | 
89 | 
90 | def du(path):
91 |     """Returns the size of the tree under path in bytes."""
92 |     return int(subprocess.check_output(['du', '-s', '-L', '-B1', path]).split()[0].decode('utf-8'))
93 | 
94 | 
95 | def drop_caches(path, drop_level=3, max_size='1000G', verbose=False):
96 |     #
97 |     # Some light reading
98 |     # http://www.linuxatemyram.com/play.html
99 |     # vmtouch
100 |     # https://aur.archlinux.org/packages/vmtouch/
101 |     # http://serverfault.com/questions/278454/is-it-possible-to-list-the-files-that-are-cached
102 |     # http://serverfault.com/questions/43383/caching-preloading-files-on-linux-into-ram
103 |     # fincore
104 |     # yaourt -S --noconfirm perl-file-sharedir-install
105 |     # yaourt -S --noconfirm fincore
106 |     # To drop system caches, one needs root;
107 |     # an option: add the program to sudoers so no password is required.
108 |     #
109 |     if 0 != os.system('vmtouch -e -f -q -m %s "%s"' % (max_size, path)):
110 |         if os.geteuid() == 0:
111 |             os.system('echo %d > /proc/sys/vm/drop_caches' % drop_level)
112 |             if verbose:
113 |                 print('Full system cache dropped because of %s' % path)
114 |         else:
115 |             raise RuntimeError('Need vmtouch or root permission to drop caches')
116 |     else:
117 |         if verbose:
118 |             print('All pages under %s evicted' % path)
119 | 
120 | 
121 | def sync():
122 |     """Flushes buffers to disk."""
123 |     os.system('sync')
124 | 
125 | 
126 | def available_ram():
127 |     return psutil.virtual_memory().available
128 | 
129 | #
130 | # We need to make sure that:
131 | #  - we go beyond microbenchmarks and look at relevant tasks,
132 | #    e.g. realtime visualisation or data exploration as opposed to batch
133 | #
134 | 
135 | #
136 | # Measure dataset complexity (e.g.
lempel ziv via compression) and report it 137 | # 138 | -------------------------------------------------------------------------------- /jagged/blosc_backend.py: -------------------------------------------------------------------------------- 1 | from mmap import mmap, ACCESS_READ 2 | from operator import itemgetter 3 | import os.path as op 4 | 5 | from future.builtins import range 6 | 7 | from jagged.base import JaggedRawStore, JaggedJournal 8 | from jagged.compression.compressors import BloscCompressor 9 | from whatami import What 10 | 11 | 12 | class JaggedByBlosc(JaggedRawStore): 13 | 14 | # Memmapped 15 | # Not chunked - hope to keep using bcolz for that 16 | 17 | def __init__(self, path=None, journal=None, compressor=BloscCompressor): 18 | super(JaggedByBlosc, self).__init__(path, journal=journal) 19 | self.compressor = compressor 20 | self._mm = None 21 | self._writing = None 22 | self._bjournal = None 23 | 24 | def _bytes_journal(self): 25 | if self._bjournal is None: 26 | self._bjournal = JaggedJournal(op.join(self.path_or_fail(), 'meta', 'bytes_journal')) 27 | return self._bjournal 28 | 29 | def _compressor(self): 30 | if not isinstance(self.compressor, BloscCompressor): 31 | self.compressor = self.compressor(dtype=self.dtype, 32 | shape=self.shape, 33 | order=self.order) 34 | return self.compressor 35 | 36 | # --- Custom what() to be available at any circumstance 37 | 38 | def what(self): 39 | try: 40 | return What(self.__class__.__name__, {'compressor': self.compressor()}) 41 | except TypeError: 42 | return What(self.__class__.__name__, {'compressor': self.compressor}) 43 | 44 | # --- Write 45 | 46 | def _open_write(self, data=None): 47 | self._mm = open(op.join(self.path_or_fail(), 'data'), 'ab') 48 | self._writing = True 49 | 50 | def _append_hook(self, data): 51 | compressor = self._compressor() 52 | compressed = compressor.compress(data) 53 | self._mm.write(compressed) 54 | self._bytes_journal().append(compressed) 55 | 56 | # --- Read 57 | 58 | def _open_read(self): 59 | self._mm = open(op.join(self.path_or_fail(), 'data'), 'r') 60 | self._mm = mmap(self._mm.fileno(), 0, access=ACCESS_READ) 61 | self._writing = False 62 | 63 | def _read_segment(self, base, size): 64 | return self._mm[base:base+size] 65 | 66 | def _get_views(self, keys, columns): 67 | 68 | if keys is None: 69 | keys = range(self.narrays) 70 | 71 | keys = [(key, order) for order, key in enumerate(keys)] 72 | 73 | compressor = self._compressor() 74 | views = [] 75 | for key, order in sorted(keys): 76 | base, size = self._bytes_journal().base_size(key) # cache these segments? 
77 | array = compressor.decompress(self._read_segment(base, size)) 78 | if columns is not None: 79 | array = array[:, tuple(columns)] 80 | views.append((array, order)) 81 | views = list(map(itemgetter(0), sorted(views, key=itemgetter(1)))) 82 | 83 | return views 84 | 85 | # --- Lifecycle 86 | 87 | @property 88 | def is_reading(self): 89 | return self.is_open and not self.is_writing 90 | 91 | @property 92 | def is_writing(self): 93 | return self.is_open and self._writing 94 | 95 | @property 96 | def is_open(self): 97 | return self._mm is not None 98 | 99 | def close(self): 100 | if self.is_open: 101 | self._mm.close() 102 | self._mm = None 103 | self._writing = None 104 | -------------------------------------------------------------------------------- /jagged/bloscpack_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os.path as op 3 | import bloscpack 4 | from bloscpack.defaults import DEFAULT_CHUNK_SIZE 5 | from jagged.npy_backend import JaggedByNPY 6 | from bloscpack.numpy_io import pack_ndarray_file, unpack_ndarray_file 7 | 8 | 9 | class JaggedByBloscpack(JaggedByNPY): 10 | 11 | def __init__(self, 12 | path=None, 13 | journal=None, 14 | # blosc 15 | clevel=5, 16 | shuffle=True, 17 | cname='lz4hc', 18 | # bloscpack 19 | chunk_size=DEFAULT_CHUNK_SIZE, 20 | offsets=False, 21 | checksum='None'): 22 | super(JaggedByBloscpack, self).__init__(path, journal=journal) 23 | self.clevel = clevel 24 | self.shuffle = shuffle 25 | self.cname = cname 26 | self.offsets = offsets 27 | self.checksum = checksum 28 | self.chunk_size = chunk_size 29 | self._bp_args = None 30 | self._blosc_args = None 31 | 32 | def _dest_file(self, index): 33 | return op.join(self._shards[index % 256], '%d.blp' % index) 34 | 35 | def _read_one(self, key): 36 | return unpack_ndarray_file(self._dest_file(key)) 37 | 38 | def _append_hook(self, data): 39 | if self._bp_args is None: 40 | self._bp_args = bloscpack.BloscpackArgs(offsets=self.offsets, 41 | checksum=self.checksum) 42 | if self._blosc_args is None: 43 | self._blosc_args = bloscpack.BloscArgs(typesize=self.dtype.itemsize, 44 | clevel=self.clevel, 45 | shuffle=self.shuffle, 46 | cname=self.cname) 47 | pack_ndarray_file(data, self._dest_file(self.narrays), 48 | chunk_size=self.chunk_size, 49 | blosc_args=self._blosc_args, 50 | bloscpack_args=self._bp_args) 51 | -------------------------------------------------------------------------------- /jagged/compression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdvillal/jagged/e737eae08005ab20222b46668f54e6b5dcb8e04c/jagged/compression/__init__.py -------------------------------------------------------------------------------- /jagged/compression/compressors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from whatami import whatable 4 | 5 | try: 6 | import blosc 7 | except ImportError: # pragma: no cover 8 | blosc = None 9 | 10 | 11 | # --- Consistent compressors API 12 | 13 | 14 | @whatable 15 | class Compressor(object): 16 | 17 | def compress(self, data): 18 | raise NotImplementedError() 19 | 20 | def decompress(self, cdata): 21 | raise NotImplementedError() 22 | 23 | def uncompress(self, cdata): # pragma: no cover 24 | return self.decompress(cdata) 25 | 26 | 27 | # --- Pimping Blosc to compress our arrays 28 | 29 | class BloscCompressor(Compressor): 30 | 31 | # This has quite an overhead beyond 
compression ATM
32 | 
33 |     def __init__(self, shuffle=True, level=5, cname='lz4hc', n_threads=1, dtype=None, shape=None, order=None):
34 |         super(BloscCompressor, self).__init__()
35 |         self.shuffle = shuffle
36 |         self.level = level
37 |         self.cname = cname
38 |         self.n_threads = n_threads
39 |         self._dtype = dtype
40 |         self._shape = shape
41 |         self._order = order
42 | 
43 |     def compress(self, x):
44 |         blosc.set_nthreads(self.n_threads)  # mmmm global, put in a context to reset
45 |         x = np.ascontiguousarray(x)  # LOOK AT THIS
46 |         shape, dtype, order = x.shape, x.dtype, ('F' if np.isfortran(x) else 'C')
47 |         if self._dtype is None:
48 |             self._shape, self._dtype, self._order = shape, dtype, order
49 |         else:
50 |             assert order == self._order
51 |             assert dtype == self._dtype
52 |             assert len(self._shape) == 1 or shape[1] == self._shape[1]
53 |         return blosc.compress_ptr(x.__array_interface__['data'][0],
54 |                                   x.size, x.dtype.itemsize,
55 |                                   shuffle=self.shuffle, cname=self.cname, clevel=self.level)
56 | 
57 |     def decompress(self, cx):
58 |         blosc.set_nthreads(self.n_threads)  # mmmm global, put in a context to reset
59 |         x = blosc.decompress(cx)
60 |         x = np.frombuffer(x, dtype=self._dtype)  # beware, gets an immutable array
61 |         if self._order == 'F':
62 |             x = np.asfortranarray(x)  # correct? makes a copy and screws things up?
63 |         if len(self._shape) > 1:
64 |             x = x.reshape(-1, self._shape[1])
65 |         return x
66 | 
--------------------------------------------------------------------------------
/jagged/h5py_backend.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os.path as op
3 | 
4 | import numpy as np
5 | import h5py
6 | 
7 | from jagged.base import LinearRawStorage
8 | 
9 | 
10 | class JaggedByH5Py(LinearRawStorage):
11 | 
12 |     def __init__(self,
13 |                  path=None,
14 |                  journal=None,
15 |                  contiguity=None,
16 |                  # hdf params
17 |                  dset_name='data',
18 |                  chunklen=None,
19 |                  compression=None,
20 |                  compression_opts=None,
21 |                  shuffle=False,
22 |                  checksum=False):
23 |         super(JaggedByH5Py, self).__init__(path, journal=journal, contiguity=contiguity)
24 | 
25 |         self._dset_name = dset_name
26 | 
27 |         if self._path is not None:
28 |             self._h5_path = op.join(self._path, 'data.h5')
29 |         self._h5 = None
30 |         self._dset = None
31 | 
32 |         self.chunklen = chunklen
33 |         self.compression = compression
34 |         self.compression_opts = compression_opts
35 |         self.shuffle = shuffle
36 |         self.checksum = checksum
37 | 
38 |     # --- Read
39 | 
40 |     def _open_read(self):
41 |         if self._h5 is None:
42 |             self._h5 = h5py.File(self._h5_path, mode='r')
43 |             self._dset = self._h5[self._dset_name]
44 | 
45 |     def _get_hook(self, base, size, columns, dest):
46 | 
47 |         # h5py does not handle this case gracefully
48 |         if size == 0:
49 |             if dest is not None:
50 |                 return dest
51 |             nc = len(columns) if columns is not None else self._dset.shape[-1]
52 |             return np.empty((0, nc), dtype=self._dset.dtype)
53 | 
54 |         # easy case, no column subset requested
55 |         if columns is None:
56 |             if dest is None:
57 |                 return self._dset[base:base+size]  # should we force a full read with [:]?
add to benchmark 58 | else: 59 | self._dset.read_direct(dest, source_sel=np.s_[base:base+size]) 60 | return dest 61 | 62 | # N.B.: tuple(columns) to force 2d if columns happens to be a one-element list 63 | # column-subset is requested 64 | # h5py only supports increasing order indices in fancy indexing 65 | # https://github.com/h5py/h5py/issues/368 66 | # https://github.com/h5py/h5py/issues/368 67 | # (boiling down to issues with hdf5 hyperslabs) 68 | 69 | if not np.any(np.diff(columns) < 1): 70 | if dest is not None: 71 | self._dset.read_direct(dest, source_sel=np.s_[base:base+size, tuple(columns)]) 72 | return dest 73 | else: 74 | return self._dset[base:base+size, tuple(columns)] 75 | 76 | # better slow than unsupported... 77 | columns, inverse = np.unique(columns, return_inverse=True) 78 | if dest is not None: 79 | dest[:] = self._dset[base:base+size, tuple(columns)][:, inverse] 80 | return dest 81 | else: 82 | return self._dset[base:base+size, tuple(columns)][:, inverse] 83 | 84 | # --- Write 85 | 86 | def _open_write(self, data=None): 87 | if self._h5 is None: 88 | self._h5 = h5py.File(self._h5_path, mode='a') 89 | if 'data' not in self._h5: 90 | # http://docs.h5py.org/en/latest/high/dataset.html 91 | chunks = None 92 | if self.chunklen is not None: 93 | chunks = (self.chunklen,) + (data.shape[1:] if data.ndim > 1 else ()) 94 | self._dset = self._h5.create_dataset(self._dset_name, 95 | dtype=data.dtype, 96 | shape=(0, data.shape[1]), 97 | maxshape=(None, data.shape[1]), 98 | chunks=chunks, 99 | compression=self.compression, 100 | compression_opts=self.compression_opts, 101 | shuffle=self.shuffle, 102 | fletcher32=self.checksum) 103 | else: 104 | self._dset = self._h5[self._dset_name] 105 | 106 | def _append_hook(self, data): 107 | base = len(self) 108 | size = len(data) 109 | self._dset.resize(base + size, axis=0) 110 | self._dset[base:(base+size)] = data 111 | 112 | # --- Lifecycle 113 | 114 | @property 115 | def is_writing(self): 116 | return self.is_open and self._h5.mode != 'r' 117 | 118 | @property 119 | def is_reading(self): 120 | return self.is_open and self._h5.mode == 'r' 121 | 122 | @property 123 | def is_open(self): 124 | return self._h5 is not None 125 | 126 | def close(self): 127 | if self._h5 is not None: 128 | self._h5.close() 129 | self._h5 = None 130 | -------------------------------------------------------------------------------- /jagged/joblib_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import joblib 3 | from jagged.pickle_backend import JaggedByPickle 4 | 5 | 6 | class JaggedByJoblib(JaggedByPickle): 7 | 8 | def __init__(self, path=None, journal=None, arrays_per_chunk=1000, compress=False): 9 | super(JaggedByJoblib, self).__init__(path, journal, arrays_per_chunk, compress) 10 | 11 | def _load(self, path): 12 | self._cache = joblib.load(path) 13 | 14 | def _dump(self, path): 15 | compress = 0 if not self.compress else (5 if self.compress is True else self.compress) 16 | joblib.dump(self._cache, path, compress=compress) # cache_size 17 | -------------------------------------------------------------------------------- /jagged/misc.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """A jumble of seemingly useful stuff.""" 3 | from __future__ import unicode_literals 4 | from itertools import chain 5 | from contextlib import contextmanager 6 | import numbers 7 | import os 8 | import os.path as op 9 | import numpy as np 10 | 11 | 12 | def 
home(): # pragma: no cover 13 | """Returns current user home dir.""" 14 | return op.expanduser('~') # Valid in both py2 and py3 15 | 16 | 17 | def ensure_writable_dir(path): # pragma: no cover 18 | """Ensures that a path is a writable directory.""" 19 | def check_path(path): 20 | if not op.isdir(path): 21 | raise Exception('%s exists but it is not a directory' % path) 22 | if not os.access(path, os.W_OK): 23 | raise Exception('%s is a directory but it is not writable' % path) 24 | if op.exists(path): 25 | check_path(path) 26 | else: 27 | try: 28 | os.makedirs(path) 29 | except Exception: 30 | if op.exists(path): # Simpler than using a file lock to work on multithreading... 31 | check_path(path) 32 | else: 33 | raise 34 | return path 35 | 36 | 37 | def ensure_dir(path): # pragma: no cover 38 | return ensure_writable_dir(path) 39 | 40 | 41 | @contextmanager 42 | def cd(newdir): 43 | prevdir = os.getcwd() 44 | os.chdir(op.expanduser(newdir)) 45 | try: 46 | yield 47 | finally: 48 | os.chdir(prevdir) 49 | 50 | 51 | # --- Intervals 52 | 53 | def crossings(x, threshold=0, after=False): 54 | """Returns the indices of the elements before or after crossing a threshold. 55 | 56 | N.B. touching the threshold itself is considered a cross. 57 | 58 | Parameters 59 | ---------- 60 | x: array 61 | The data 62 | 63 | threshold: float, default 0 64 | Where crossing happens. 65 | 66 | after: bool, default False 67 | If True, the indices represent the elements after the cross, if False the elements before the cross. 68 | 69 | Returns 70 | ------- 71 | The indices where crosses happen. 72 | 73 | Examples 74 | -------- 75 | 76 | >>> print(crossings(np.array([0, 1, -1, -1, 1, -1]))) 77 | [0 1 3 4] 78 | >>> print(crossings(np.array([0, 1, -1, -1, 1, -1]), after=True)) 79 | [1 2 4 5] 80 | >>> print(crossings(np.array([0, 0, 0]))) 81 | [] 82 | >>> print(crossings(np.array([0, 3, -3, -3, 1]), threshold=1)) 83 | [0 1 3] 84 | >>> print(crossings(np.array([0, 3, -3, -3]), threshold=-2.5)) 85 | [1] 86 | >>> print(crossings(np.array([[0, 3], [-3, -3]]), threshold=-2.5)) # doctest: +IGNORE_EXCEPTION_DETAIL 87 | Traceback (most recent call last): 88 | Exception: Only 1D arrays, please (you gave me 2 dimensions) 89 | """ 90 | if len(x.shape) > 1: 91 | raise Exception('Only 1D arrays, please (you gave me %d dimensions)' % len(x.shape)) 92 | where_crosses = np.where(np.diff(np.sign(x - threshold)))[0] 93 | if after: 94 | return where_crosses + 1 95 | return where_crosses 96 | 97 | 98 | def find_intervals(x): 99 | """ 100 | Finds the intervals in which x is True or non-zero. 101 | 102 | 103 | Returns 104 | ------- 105 | Pairs of indices representing the intervals in which x is True or nonzero. 106 | The pairs represent valid python intervals, lower point included, upper point excluded. 
107 | 
108 | 
109 |     Examples
110 |     --------
111 |     >>> find_intervals([])
112 |     []
113 |     >>> find_intervals([1])
114 |     [(0, 1)]
115 |     >>> find_intervals([0, 1])
116 |     [(1, 2)]
117 |     >>> find_intervals([0, 0, 1, 1, 0, 0, 1, 1, 0])
118 |     [(2, 4), (6, 8)]
119 |     >>> find_intervals([0, 0, 0])
120 |     []
121 |     >>> find_intervals([1, 1, 1])
122 |     [(0, 3)]
123 |     >>> find_intervals([True, True, True])
124 |     [(0, 3)]
125 |     >>> find_intervals([1, 1, 1, 0])
126 |     [(0, 3)]
127 |     """
128 |     # These ugly six lines are here because:
129 |     #  - we allow passing lists but we need numpy arrays
130 |     #  - we want to allow both boolean (True, False) arrays and numeric arrays
131 |     #  - we want to use the crossings function, which only accepts numeric arrays
132 |     if not isinstance(x, np.ndarray):
133 |         x = np.array(x)
134 |     if not x.dtype == bool:
135 |         x = x != 0
136 |     zeros_ones = np.zeros_like(x, dtype=int)
137 |     zeros_ones[x] = 1
138 | 
139 |     # Find where we change from being in an interval to not being in an interval
140 |     starts_ends = list(crossings(zeros_ones, after=True))
141 | 
142 |     # Do we start already in an interval?
143 |     if len(zeros_ones) > 0 and 1 == zeros_ones[0]:
144 |         starts_ends = [0] + starts_ends
145 | 
146 |     # Do we end in an interval?
147 |     if len(zeros_ones) > 0 and 1 == zeros_ones[-1]:
148 |         starts_ends = starts_ends + [len(x)]
149 | 
150 |     assert len(starts_ends) % 2 == 0
151 | 
152 |     starts = starts_ends[0::2]
153 |     ends = starts_ends[1::2]
154 |     return list(zip(starts, ends))
155 | 
156 | 
157 | def is_valid_segment(ss, relative_to=None):
158 |     if not isinstance(ss, (tuple, list)):
159 |         return False
160 |     if not len(ss) == 2:
161 |         return False
162 |     ss_base, ss_size = ss
163 |     if not (isinstance(ss_base, numbers.Integral) and isinstance(ss_size, numbers.Integral)):
164 |         return False
165 |     if relative_to is not None:
166 |         base, size = relative_to
167 |         if ss_base < 0 or (base + ss_base + ss_size) > (base + size):
168 |             return False
169 |     return True
170 | 
171 | 
172 | def bool2segments(ss, size):
173 |     ssa = np.array(ss)
174 |     if ssa.dtype.kind == 'b' and ssa.ndim == 1 and len(ssa) == size:
175 |         return [(start, end - start) for start, end in find_intervals(ssa)]
176 |     return None
177 | 
178 | 
179 | def subsegments(segment, *subs):
180 |     """Make subsegments relative to the start of a base segment, checking for boundaries.
181 | 
182 |     Parameters
183 |     ----------
184 |     segment : tuple (base, size)
185 |         The segment to which relative subsegments are being specified
186 | 
187 |     subs : list of (base, size) tuples or boolean arrays specifying subsegments
188 |         These can be either something like (3, 8) (ss_base, ss_size), or boolean lists/arrays
189 |         It is assumed that ss_base here is an offset from `segment` base
190 | 
191 |     Returns
192 |     -------
193 |     A list of subsegments [(base, size)], each lying within the boundaries of `segment`.
194 | 195 | Examples 196 | -------- 197 | >>> subsegments((5, 100)) 198 | [] 199 | >>> subsegments((5, 100), (11, 14)) 200 | [(16, 14)] 201 | >>> subsegments((5, 100), (11, 14), (3, 88)) 202 | [(16, 14), (8, 88)] 203 | >>> subsegments((0, 5), [True, True, False, True, True]) 204 | [(0, 2), (3, 2)] 205 | >>> subsegments((0, 5), [False] * 5) 206 | [] 207 | >>> subsegments((0, 5), [True] * 5) 208 | [(0, 5)] 209 | >>> subsegments((0, 5), np.array([True] * 5)) 210 | [(0, 5)] 211 | >>> subsegments((0, 5), [True, True, False, True, True], (2, 2)) 212 | [(0, 2), (3, 2), (2, 2)] 213 | >>> subsegments((0, 100), (90, 11)) # doctest: +IGNORE_EXCEPTION_DETAIL 214 | Traceback (most recent call last): 215 | ValueError: (90, 11) is not a valid subsegment specification for (0, 100) 216 | >>> subsegments((0, 100), (-3, 8)) # doctest: +IGNORE_EXCEPTION_DETAIL 217 | Traceback (most recent call last): 218 | ValueError: (90, 11) is not a valid subsegment specification for (0, 100) 219 | >>> subsegments((0, 100), ('a', 8)) # doctest: +IGNORE_EXCEPTION_DETAIL 220 | Traceback (most recent call last): 221 | ValueError: ('a', 8) is not a valid subsegment specification for (0, 100) 222 | >>> subsegments((0, 100), 'crazyyou') # doctest: +IGNORE_EXCEPTION_DETAIL 223 | Traceback (most recent call last): 224 | ValueError: 'crazyyou' is not a valid subsegment specification for (0, 100) 225 | """ 226 | 227 | # This implementation is slow, but seemingly correct; I do not think it will bottleneck 228 | 229 | base, size = segment 230 | 231 | def bool_and_valid(ss): 232 | if is_valid_segment(ss, relative_to=segment): 233 | return [ss] 234 | ss_from_bool = bool2segments(ss, size) 235 | if ss_from_bool is not None and all(is_valid_segment(ss, relative_to=segment) for ss in ss_from_bool): 236 | return ss_from_bool 237 | raise ValueError('%r is not a valid subsegment specification for %r' % (ss, segment)) 238 | 239 | return [(base + ss_base, ss_size) for ss_base, ss_size in 240 | chain.from_iterable(bool_and_valid(ss) for ss in subs)] 241 | -------------------------------------------------------------------------------- /jagged/mmap_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Backend using python/numpy mmap bindings.""" 3 | from __future__ import absolute_import, unicode_literals, print_function, division 4 | import os.path as op 5 | from future.utils import PY3 6 | import numpy as np 7 | from jagged.base import LinearRawStorage 8 | 9 | 10 | class JaggedByMemMap(LinearRawStorage): 11 | """Provides numpy arrays as views of an underlying memmapped array.""" 12 | 13 | def __init__(self, path=None, journal=None, contiguity=None, autoviews=True): 14 | super(JaggedByMemMap, self).__init__(path, journal=journal, contiguity=contiguity) 15 | 16 | if self._path is not None: 17 | self._mmpath = op.join(self._path, 'data.mm') 18 | 19 | self._mm = None # numpy memmap for reading / file handler for writing 20 | 21 | self.autoviews = autoviews 22 | 23 | # --- Read 24 | 25 | def _open_read(self): 26 | if self._mm is None: 27 | self._mm = np.memmap(self._mmpath, 28 | dtype=self.dtype, shape=self.shape, order=self.order, 29 | mode='r') 30 | self._check_sizes() 31 | 32 | def _get_hook(self, base, size, columns, dest): 33 | view = self._mm[base:base+size] 34 | if columns is not None: 35 | view = view[:, tuple(columns)] 36 | if dest is None: 37 | return view.copy() if not self.autoviews else view 38 | dest[:] = view 39 | return dest 40 | 41 | # --- Write 42 | 43 | def 
_open_write(self, data=None):
44 |         self._mm = open(self._mmpath, mode='a')  # appended to via .buffer (py3) / str(buffer) (py2) below
45 | 
46 |     def _append_hook(self, data):
47 |         self._mm.buffer.write(data.data) if PY3 else self._mm.write(str(data.data))
48 | 
49 |     # --- Lifecycle
50 | 
51 |     @property
52 |     def is_writing(self):
53 |         return self.is_open and not self.is_reading
54 | 
55 |     @property
56 |     def is_reading(self):
57 |         return isinstance(self._mm, np.memmap)
58 | 
59 |     @property
60 |     def is_open(self):
61 |         return self._mm is not None
62 | 
63 |     def close(self):
64 |         if self.is_writing:
65 |             self._mm.close()
66 |         self._mm = None
67 | 
68 |     # --- Storage for underlying array shape, dtype, row/column order
69 | 
70 |     def _len_by_filelen(self):
71 |         """Helps to check the sanity of the array."""
72 |         mmsize_bytes = op.getsize(self._mmpath)
73 |         row_size_bytes = self.shape[1] * self.dtype.itemsize
74 |         num_rows = mmsize_bytes // row_size_bytes
75 |         leftovers = mmsize_bytes % row_size_bytes
76 |         return num_rows, leftovers
77 | 
78 |     def _check_sizes(self):
79 |         if op.isfile(self._mmpath) and self.shape is not None:
80 |             num_rows, leftovers = self._len_by_filelen()
81 |             if 0 != leftovers:
82 |                 raise Exception('the memmap file has incomplete data '
83 |                                 '(%d leftover bytes from a partially written array).'
84 |                                 '(are you missing transactions?)' % leftovers)
85 |             if num_rows != self.shape[0]:
86 |                 raise Exception('the number of rows inferred by file size '
87 |                                 'does not coincide with the length of the store '
88 |                                 '(%d != %d)' % (num_rows, self.shape[0]))
89 | 
90 | #
91 | # Remember that resize for numpy mmap objects never resizes the file under the hood.
92 | # Numpy mmap ndarray subclass code is simple and neat; it can be read in no time. Get back there.
93 | #
94 | # Document that when using mode 'auto', everything returned is a view to the large memmapped array.
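#
# A sketch (not in the original module) of what `autoviews` buys us; `path` is a
# hypothetical store directory populated beforehand, and `key` a key returned by
# a previous append():
#
#   from jagged import JaggedByMemMap
#   with JaggedByMemMap(path, autoviews=True) as jr:
#       v, = jr.get([key])  # a read-only view into the memmap, no copying
#   # with autoviews=False (and dest=None), _get_hook returns a copy instead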
95 | # 96 | -------------------------------------------------------------------------------- /jagged/npy_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from future.builtins import range 3 | from itertools import chain 4 | import os.path as op 5 | import os 6 | import numpy as np 7 | from jagged.base import JaggedRawStore 8 | from jagged.misc import ensure_dir 9 | 10 | 11 | class JaggedByNPY(JaggedRawStore): 12 | """Stores each array in an individual .npy file.""" 13 | 14 | def __init__(self, path=None, journal=None): 15 | super(JaggedByNPY, self).__init__(path, journal=journal) 16 | self._shards = None 17 | if path is not None: 18 | self._all_shards() 19 | 20 | # We can do this memory map (see np.load) 21 | 22 | def _all_shards(self): 23 | if self._shards is None: 24 | self._shards = [ensure_dir(op.join(self.path_or_fail(), str(shard))) for shard in range(256)] 25 | return self._shards 26 | # random note, 256 is the last cached int in cpython 27 | 28 | def _dest_file(self, index): 29 | return op.join(self._shards[index % 256], '%d.npy' % index) 30 | 31 | def _infer_numarrays(self): 32 | numarrays = 0 33 | for shard in self._shards: 34 | numarrays = max(chain([numarrays], (int(fn[:-4]) + 1 for fn in os.listdir(shard)))) 35 | return numarrays 36 | 37 | def check_numarrays(self): 38 | assert self._infer_numarrays() == self.journal().numarrays() 39 | 40 | # --- Write 41 | 42 | def _open_write(self, data=None): 43 | pass 44 | 45 | def _append_hook(self, data): 46 | np.save(self._dest_file(self.narrays), data) 47 | 48 | # --- Read 49 | 50 | def _open_read(self): 51 | pass 52 | 53 | def _read_one(self, key): 54 | return np.load(self._dest_file(key)) 55 | 56 | def _get_one(self, key, columns): 57 | data = self._read_one(key) 58 | if columns is not None: 59 | data = data[:, tuple(columns)] 60 | return data 61 | 62 | def _get_views(self, keys, columns): 63 | if keys is None: 64 | return list(self._get_one(key, columns) for key in range(self.journal().numarrays())) 65 | return [self._get_one(key, columns) for key in keys] 66 | 67 | # --- Lifecycle 68 | 69 | @property 70 | def is_writing(self): 71 | return True 72 | 73 | @property 74 | def is_reading(self): 75 | return True 76 | 77 | @property 78 | def is_open(self): 79 | return True 80 | 81 | def close(self): 82 | pass 83 | -------------------------------------------------------------------------------- /jagged/pickle_backend.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import gzip 3 | from operator import itemgetter 4 | from jagged.base import JaggedRawStore 5 | import os.path as op 6 | try: 7 | import cPickle as pickle 8 | except ImportError: # pragma: no cover 9 | import pickle 10 | 11 | 12 | class JaggedByPickle(JaggedRawStore): 13 | """A chunked store based on pickle.""" 14 | 15 | def __init__(self, path=None, journal=None, arrays_per_chunk=1000, compress=False): 16 | super(JaggedByPickle, self).__init__(path, journal) 17 | self.arrays_per_chunk = arrays_per_chunk 18 | self.compress = compress 19 | self._cache = [] 20 | self._cached_pickle_num = None 21 | self._writing = None 22 | 23 | # --- Pickles 24 | 25 | def _dump(self, path): 26 | with gzip.open(path, 'wb') if self.compress else open(path, 'wb') as writer: 27 | pickle.dump(self._cache, writer, protocol=2) 28 | # protocol=2 instead of highest to maintain py2 compat of the store 29 | 30 | def _load(self, path): 31 | with gzip.open(path, 'rb') if self.compress 
else open(path, 'rb') as reader: 32 | self._cache = pickle.load(reader) 33 | 34 | def _pickle_num(self, index): 35 | return index // self.arrays_per_chunk 36 | 37 | def _pickle_file(self, index): 38 | path = op.join(self.path_or_fail(), '%d.pkl' % self._pickle_num(index)) 39 | return (path + '.gz') if self.compress else path 40 | 41 | def _save_pickle(self): 42 | if self.is_writing: 43 | self._dump(self._pickle_file(self.narrays)) 44 | 45 | def _read_pickle(self, index): 46 | pickle_num = self._pickle_num(index) 47 | if self._cached_pickle_num != pickle_num: 48 | try: 49 | self._load(self._pickle_file(index)) 50 | except IOError: 51 | self._cache = [] 52 | self._cached_pickle_num = pickle_num 53 | 54 | # --- Cache 55 | 56 | def _cache_full(self): 57 | return self._cache is not None and len(self._cache) == self.arrays_per_chunk 58 | 59 | # --- Read 60 | 61 | def _open_read(self): 62 | self._writing = False 63 | 64 | def _get_views(self, keys, columns): 65 | if keys is None: 66 | keys = range(self.narrays) 67 | 68 | keys = [(key, order) for order, key in enumerate(keys)] 69 | 70 | views = [] 71 | for key, order in sorted(keys): 72 | if not 0 <= key < self.narrays: 73 | raise ValueError('Key not in storage: %d' % key) 74 | self._read_pickle(key) 75 | array = self._cache[key % self.arrays_per_chunk] 76 | if columns is not None: 77 | array = array[:, tuple(columns)] 78 | views.append((array, order)) 79 | views = list(map(itemgetter(0), sorted(views, key=itemgetter(1)))) 80 | 81 | return views 82 | 83 | # --- Write 84 | 85 | def _open_write(self, data=None): 86 | self._writing = True 87 | self._read_pickle(self.narrays) 88 | 89 | def _append_hook(self, data): 90 | self._cache.append(data.copy()) 91 | if self._cache_full(): 92 | self._save_pickle() 93 | self._cache = [] 94 | self._cached_pickle_num += 1 95 | 96 | # --- Lifecycle 97 | 98 | @property 99 | def is_open(self): 100 | return self._writing is not None 101 | 102 | @property 103 | def is_writing(self): 104 | return self.is_open and self._writing 105 | 106 | @property 107 | def is_reading(self): 108 | return self.is_open and not self._writing 109 | 110 | def close(self): 111 | self._save_pickle() 112 | self._cache = None 113 | self._cached_pickle_num = None 114 | self._writing = None 115 | 116 | # 117 | # In general it would use protocol 2, so pickles can be read also in py2 118 | # We can be as clever as we want with caches: many read caches with LRU, one write cache... 
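#
# A worked example of the chunk layout above (a sketch, with the default
# arrays_per_chunk=1000): arrays 0..999 live in 0.pkl, arrays 1000..1999 in
# 1.pkl, and so on; reading key 1500 loads 1.pkl into the cache and indexes it
# at position 1500 % 1000 == 500, e.g. (`path` is hypothetical):
#
#   jr = JaggedByPickle(path, arrays_per_chunk=1000)
#   jr._pickle_num(1500)         # -> 1   (which .pkl file holds the array)
#   1500 % jr.arrays_per_chunk   # -> 500 (position inside the loaded chunk)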
119 | #
120 | 
--------------------------------------------------------------------------------
/jagged/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdvillal/jagged/e737eae08005ab20222b46668f54e6b5dcb8e04c/jagged/tests/__init__.py
--------------------------------------------------------------------------------
/jagged/tests/fixtures.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function, absolute_import, unicode_literals
3 | 
4 | import numpy as np
5 | import pytest
6 | 
7 | from functools import partial
8 | 
9 | from jagged import JaggedByCarray, JaggedByBlosc, JaggedByH5Py, JaggedByJoblib, \
10 |     JaggedByMemMap, JaggedByNPY, JaggedByBloscpack, JaggedByPickle
11 | 
12 | RAW_STORES = []
13 | 
14 | LINEAR_RAW_STORES = (
15 |     ('jr=carray', JaggedByCarray),
16 |     ('jr=carraychunks', partial(JaggedByCarray, chunklen=100) if JaggedByCarray else None),
17 |     ('jr=h5py', JaggedByH5Py),
18 |     ('jr=h5pychunks', partial(JaggedByH5Py, chunklen=100) if JaggedByH5Py else None),
19 |     ('jr=mmap', JaggedByMemMap),
20 | )
21 | 
22 | for contiguity in ('read', 'write', None, 'auto'):
23 |     for name, store in LINEAR_RAW_STORES:
24 |         RAW_STORES.append((name + '#' + 'cont=%s' % contiguity, partial(store, contiguity=contiguity)))
25 | 
26 | RAW_STORES.extend([
27 |     ('jr=npy', JaggedByNPY),
28 |     ('jr=blp', JaggedByBloscpack),
29 |     ('jr=blosc-mm', JaggedByBlosc),
30 |     ('jr=pickle', JaggedByPickle),
31 |     ('jr=joblib', JaggedByJoblib),
32 | ])
33 | 
34 | # pytest.importorskip won't cut it here...
35 | 
36 | 
37 | def store_skip(store):  # pragma: no cover
38 |     """Skips a store if its dependencies are not available."""
39 |     name, store = store
40 |     if store is None:
41 |         return pytest.mark.skipif(True, reason='%s skipped: its backend dependencies are not installed' % name)
42 |     return store
43 | 
44 | 
45 | stores = list(map(store_skip, RAW_STORES))
46 | names = [name for name, _ in RAW_STORES]
47 | 
48 | 
49 | @pytest.yield_fixture(params=stores, ids=names)
50 | def jagged_raw(request, tmpdir):
51 |     jr = request.param
52 |     dest = tmpdir.join(jr().what().id()).ensure_dir()
53 |     try:
54 |         yield jr, str(dest)
55 |     finally:
56 |         dest.remove(ignore_errors=True)
57 | 
58 | 
59 | @pytest.fixture(params=(1, 2, 10), ids=('ncol=1', 'ncol=2', 'ncol=10'))
60 | def ncol(request):
61 |     return request.param
62 | 
63 | 
64 | @pytest.fixture(params=('cols=all', 'cols=all-exp', 'cols=first', 'cols=last',
65 |                         'cols=even', 'cols=inverse', 'cols=mixed'),
66 |                 ids=('cols=all', 'cols=all-exp', 'cols=first', 'cols=last',
67 |                      'cols=even', 'cols=inverse', 'cols=mixed'))
68 | def columns(request, ncol):
69 |     if request.param == 'cols=all':
70 |         return None
71 |     elif request.param == 'cols=all-exp':
72 |         return range(ncol)
73 |     elif request.param == 'cols=first':
74 |         return [0]
75 |     elif request.param == 'cols=last':
76 |         return [ncol - 1]  # (we should support python negative indexing syntax)
77 |     elif request.param == 'cols=even':
78 |         return list(range(0, ncol, 2))
79 |     elif request.param == 'cols=inverse':
80 |         return list(range(ncol)[::-1])
81 |     elif request.param == 'cols=mixed':
82 |         return list(range(ncol)[::-1]) + list(range(0, ncol, 2))
83 |     else:  # pragma: no cover
84 |         raise ValueError('Unknown column spec %r' % request.param)
85 | 
86 | 
87 | @pytest.fixture(params=(0, 1), ids=['rng=0', 'rng=1'])
88 | def rng(request):
89 |     return np.random.RandomState(request.param)
90 | 
91 | 
92 | 
@pytest.fixture 93 | def dataset(ncol, rng): 94 | sizes = [0] + rng.randint(low=0, high=500, size=10).tolist() 95 | rng.shuffle(sizes) 96 | originals = [rng.rand(size, ncol) for size in sizes] 97 | return rng, originals, ncol 98 | 99 | 100 | @pytest.fixture 101 | def mock_jagged_raw(dataset): 102 | 103 | # unbox the fixture 104 | rng, originals, ncol = dataset 105 | 106 | # reader 107 | jagged = np.vstack(originals) 108 | 109 | def reader(base, size, columns, dest): 110 | view = jagged[base:(base+size)] 111 | if columns is not None: 112 | view = view[:, tuple(columns)] 113 | if dest is None: 114 | return view 115 | dest[:] = view 116 | return dest 117 | 118 | # shape 119 | ne, nc = jagged.shape 120 | 121 | # segments 122 | base = 0 123 | segments = [] 124 | for o in originals: 125 | segments.append((base, len(o))) 126 | base += len(o) 127 | 128 | return originals, ne, nc, originals[0].dtype, segments, reader, rng 129 | 130 | 131 | @pytest.fixture(params=('read', 'write', None, 'auto'), 132 | ids=('cont=read', 'cont=write', 'cont=none', 'cont=auto')) 133 | def contiguity(request): 134 | return request.param 135 | -------------------------------------------------------------------------------- /jagged/tests/test_raw_stores.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Tests the raw storers.""" 3 | from __future__ import print_function, absolute_import, unicode_literals 4 | from future.builtins import zip 5 | import os.path as op 6 | import bcolz 7 | from .fixtures import * 8 | from jagged.base import retrieve_contiguous 9 | from jagged.misc import ensure_dir 10 | 11 | 12 | # -- lifecycle tests 13 | 14 | def test_lifecycle(jagged_raw): 15 | jagged_raw, path = jagged_raw 16 | data0 = np.zeros((2, 10)) 17 | data1 = np.ones((3, 10)) 18 | expected = np.vstack((data0, data1)) 19 | with jagged_raw(path=path) as jr: 20 | # before writing, everything is unknown 21 | assert jr.shape is None 22 | assert jr.ndims is None 23 | assert jr.dtype is None 24 | # first write-up 25 | jr.append(data0) 26 | assert jr.shape == data0.shape 27 | assert jr.dtype == data0.dtype 28 | assert jr.ndims == data0.ndim 29 | assert jr.narrays == 1 30 | assert len(jr) == len(data0) 31 | # first read 32 | assert np.allclose(data0, jr.get()[0]) 33 | # even if we close it... 34 | jr.close() 35 | # we can now know shapes and the like 36 | assert jr.shape == data0.shape 37 | assert jr.dtype == data0.dtype 38 | assert jr.ndims == data0.ndim 39 | # we can reread... 
40 |         assert np.allclose(data0, jr.get()[0])
41 |         # we can know shapes and the like
42 |         assert jr.shape == data0.shape
43 |         assert jr.dtype == data0.dtype
44 |         assert jr.ndims == data0.ndim
45 |         # we can append more
46 |         jr.append(data1)
47 |         assert jr.shape == expected.shape
48 |         assert jr.dtype == expected.dtype
49 |         assert jr.ndims == expected.ndim
50 |         assert jr.narrays == 2
51 |         assert len(jr) == len(data0) + len(data1)
52 |         # and the data will be properly appended
53 |         assert np.allclose(data0, jr.get()[0])
54 |         assert np.allclose(data1, jr.get()[1])
55 | 
56 | 
57 | # -- Tests retrieve contiguous
58 | 
59 | def test_retrieve_contiguous(mock_jagged_raw, columns, contiguity):
60 | 
61 |     originals, ne, nc, dtype, segments, reader, rng = mock_jagged_raw
62 | 
63 |     if columns is not None:
64 |         originals = [o[:, tuple(columns)] for o in originals]
65 | 
66 |     # sanity checks for wrong inputs
67 |     with pytest.raises(ValueError) as excinfo:
68 |         retrieve_contiguous(segments, columns, reader, dtype, ne, nc, 'wrong')
69 |     assert 'Unknown contiguity scheme:' in str(excinfo.value)
70 | 
71 |     with pytest.raises(ValueError) as excinfo:
72 |         retrieve_contiguous([(-1, 1)], columns, reader, dtype, ne, nc, contiguity)
73 |     assert 'Out of bounds query (base=-1, size=1' in str(excinfo.value)
74 | 
75 |     with pytest.raises(ValueError) as excinfo:
76 |         retrieve_contiguous([(0, 100000)], columns, reader, dtype, ne, nc, contiguity)
77 |     assert 'Out of bounds query (base=0, size=100000' in str(excinfo.value)
78 | 
79 |     # insertion order
80 |     views = retrieve_contiguous(segments, columns, reader, dtype, ne, nc, contiguity)
81 |     for o, v in zip(originals, views):
82 |         assert np.allclose(o, v)
83 | 
84 |     # random order
85 |     o_s = list(zip(originals, segments))
86 |     rng.shuffle(o_s)
87 |     originals, segments = zip(*o_s)
88 |     views = retrieve_contiguous(segments, columns, reader, dtype, ne, nc, contiguity)
89 |     for o, v in zip(originals, views):
90 |         assert np.allclose(o, v)
91 | 
92 | 
93 | # -- roundtrip tests
94 | 
95 | def test_roundtrip(jagged_raw, dataset, columns):
96 |     jagged_raw, path = jagged_raw
97 |     jagged_raw = partial(jagged_raw, path=path)
98 |     rng, originals, ncol = dataset
99 | 
100 |     # Write
101 |     keys = []
102 |     with jagged_raw() as jr:
103 |         total = 0
104 |         assert jr.dtype is None
105 |         assert jr.shape is None
106 |         for original in originals:
107 |             key = jr.append(original)
108 |             assert jr.is_writing
109 |             keys.append(key)
110 |             total += len(original)
111 |             assert len(jr) == total
112 |         assert jr.dtype == originals[0].dtype
113 |         assert jr.shape == (total, ncol)
114 | 
115 |     # Read
116 |     def test_read(originals, keys):
117 | 
118 |         if columns is not None:
119 |             originals = [o[:, columns] for o in originals]
120 | 
121 |         # test read, one by one
122 |         with jagged_raw() as jr:
123 |             for original, key in zip(originals, keys):
124 |                 roundtripped = jr.get([key], columns=columns)[0]
125 |                 assert np.allclose(original, roundtripped)
126 | 
127 |         # test read, in a batch
128 |         with jagged_raw() as jr:
129 |             for original, roundtripped in zip(originals, jr.get(keys, columns=columns)):
130 |                 assert np.allclose(original, roundtripped)
131 | 
132 |         # read all
133 |         # with jagged_raw() as jr:
134 |         #     for original, roundtripped in zip(originals, jr.get(columns=columns)):
135 |         #         original = original if columns is None else original[:, columns]
136 |         #         assert np.allclose(original, roundtripped)
137 | 
138 |     # read in insertion order
139 |     test_read(originals, keys)
140 | 
141 |     # read in random order
142 |     or_s = list(zip(originals, 
keys))
143 |     rng.shuffle(or_s)
144 |     originals, keys = zip(*or_s)
145 |     test_read(originals, keys)
146 | 
147 | 
148 | # --- Test self-identification
149 | 
150 | def test_whatid():
151 | 
152 |     assert "JaggedByCarray(chunklen=1000," \
153 |            "contiguity=None," \
154 |            "cparams=cparams(clevel=3,cname='zlib',quantize=0,shuffle=False)," \
155 |            "expectedlen=None)" \
156 |            == JaggedByCarray(chunklen=1000,
157 |                              cparams=bcolz.cparams(clevel=3, cname='zlib', shuffle=False),
158 |                              expectedlen=None).what().id()
159 |     assert "JaggedByH5Py(checksum=False," \
160 |            "chunklen=1000," \
161 |            "compression='lzf'," \
162 |            "compression_opts=0," \
163 |            "contiguity=None," \
164 |            "shuffle=True)" \
165 |            == JaggedByH5Py(chunklen=1000,
166 |                            compression='lzf',
167 |                            compression_opts=0,
168 |                            shuffle=True).what().id()
169 | 
170 | 
171 | # --- Test factories / curries / partials
172 | 
173 | def test_copyconf(jagged_raw):
174 |     jagged_raw, path = jagged_raw
175 |     assert jagged_raw().what().id() == jagged_raw().copyconf()().what().id(), \
176 |         'factory without parameters should give the same config as the constructor'
177 | 
178 | 
179 | # --- Misc tests
180 | 
181 | def test_nonvalid_appends(jagged_raw):
182 |     jagged_raw, path = jagged_raw
183 |     with jagged_raw(path=path) as jr:
184 |         with pytest.raises(Exception) as excinfo:
185 |             jr.append(np.zeros((10, 0)))
186 |         assert 'Cannot append data with sizes 0 in non-leading dimension' in str(excinfo.value)
187 | 
188 | 
189 | def test_no_inmemory_storage(jagged_raw):
190 |     # maybe one day we allow these...
191 |     jagged_raw, path = jagged_raw
192 |     with jagged_raw(path=None) as jr:
193 |         with pytest.raises(Exception) as excinfo:
194 |             jr.append(np.zeros((1, 1)))
195 |         assert 'In-memory only arrays are not implemented' in str(excinfo.value)
196 | 
197 | 
198 | def test_copy_from(jagged_raw):
199 |     jagged_raw, path = jagged_raw
200 |     path0 = ensure_dir(op.join(path, 'test0'))
201 |     path1 = ensure_dir(op.join(path, 'test1'))
202 |     with jagged_raw(path0) as jr0, jagged_raw(path1) as jr1:
203 |         jr0.append(np.zeros((2, 10)))
204 |         jr0.append(np.ones((3, 10)))
205 |         jr1.append_from(jr0)
206 |         assert np.allclose(jr0.get()[0], jr1.get()[0])
207 | 
208 | 
209 | def test_chunked_copy_from(jagged_raw):
210 |     jagged_raw, path = jagged_raw
211 |     path0 = ensure_dir(op.join(path, 'test0'))
212 |     path1 = ensure_dir(op.join(path, 'test1'))
213 |     with jagged_raw(path0) as jr0, jagged_raw(path1) as jr1:
214 |         for _ in range(10):
215 |             jr0.append(np.zeros((2, 10)))
216 |             jr0.append(np.ones((3, 10)))
217 |         jr1.append_from(jr0, arrays_per_chunk=2)
218 |         assert np.allclose(jr0.get()[0], jr1.get()[0])
219 |         with pytest.raises(ValueError) as excinfo:
220 |             jr1.append_from(jr0, arrays_per_chunk=-1)
221 |         assert 'arrays_per_chunk must be None or bigger than 0, it is -1' in str(excinfo.value)
222 | 
223 | 
224 | def test_mmap_check_sizes(tmpdir):
225 |     dest = str(tmpdir)
226 |     x = np.empty((5, 2), dtype=np.int32)
227 |     with JaggedByMemMap(dest) as jbm:
228 |         jbm.append(x)
229 |         mmf = jbm._mmpath
230 |     # write row-sized junk
231 |     with open(mmf, 'a') as writer:
232 |         writer.write(str('junk' * 10))
233 |     with JaggedByMemMap(dest) as jbm:
234 |         with pytest.raises(Exception) as excinfo:
235 |             jbm.get([(0, 2)])
236 |         assert 'the number of rows inferred by file size does not coincide' in str(excinfo.value)
237 |     # write junk that looks like the leftovers of an aborted write
238 |     with open(mmf, 'a') as writer:
239 |         writer.write(str('jagged'))
240 |     with JaggedByMemMap(dest) as jbm:
241 |         with pytest.raises(Exception) as
excinfo: 242 | jbm.get([(0, 2)]) 243 | assert 'the memmap file has incomplete data' in str(excinfo.value) 244 | # make the memmap way too small 245 | with open(mmf, 'w') as writer: 246 | writer.write(str('jagged')) 247 | with pytest.raises(Exception) as excinfo: 248 | jbm.get([(0, 2)]) 249 | assert 'mmap length is greater than file size' in str(excinfo.value) 250 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | 4 | [pytest] 5 | python_files=*.py 6 | pep8maxlinelength = 120 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | # Authors: Santi Villalba 5 | # Licence: BSD 3 clause 6 | 7 | try: 8 | from setuptools import setup 9 | except ImportError: 10 | from distutils.core import setup 11 | 12 | import jagged 13 | 14 | setup( 15 | name='jagged', 16 | license='BSD 3 clause', 17 | description='Simple tricks for efficient loading or merging collections of unevenly sized elements', 18 | long_description=open('README.rst').read().replace('|Build Status| |Coverage Status| |Scrutinizer Status|', ''), 19 | version=jagged.__version__, 20 | url='https://github.com/sdvillal/jagged', 21 | author='Santi Villalba', 22 | author_email='sdvillal@gmail.com', 23 | packages=['jagged', 24 | 'jagged.compression', 25 | 'jagged.benchmarks', 26 | 'jagged.tests'], 27 | classifiers=[ 28 | 'Intended Audience :: Science/Research', 29 | 'Intended Audience :: Developers', 30 | 'Topic :: Software Development', 31 | 'Topic :: Scientific/Engineering', 32 | 'License :: OSI Approved', 33 | 'Programming Language :: Python :: 2', 34 | 'Programming Language :: Python :: 2.7', 35 | 'Programming Language :: Python :: 3', 36 | 'Programming Language :: Python :: 3.4', 37 | 'Operating System :: Unix', 38 | ], 39 | install_requires=['future', 40 | 'numpy', 41 | 'whatami>=4.0.0', 42 | 'toolz'], 43 | tests_require=['pytest'], 44 | extras_require={ 45 | 'blosc': ['blosc'], 46 | 'bloscpack': ['bloscpack'], 47 | 'bcolz': ['bcolz'], 48 | 'h5py': ['h5py'], 49 | 'joblib': ['joblib'], 50 | 'benchmarks': ['psutil'] 51 | } 52 | ) 53 | --------------------------------------------------------------------------------
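To close, a small end-to-end sketch (not a file in the repository) tying the pieces together; it assumes the chosen backend's extra from extras_require above is installed and that `dest` is a hypothetical writable directory:

# e.g. pip install jagged[h5py]
import numpy as np
from jagged import JaggedByH5Py

def store_and_fetch(dest, arrays):
    # append jagged arrays (all with the same number of columns)...
    with JaggedByH5Py(path=dest) as jr:
        keys = [jr.append(a) for a in arrays]
    # ...then fetch them back, optionally selecting a column subset
    with JaggedByH5Py(path=dest) as jr:
        return jr.get(keys, columns=None)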