├── requirements.txt
├── setup.cfg
├── .gitignore
├── tests
│   ├── hickle_1_1_0.hkl
│   ├── hickle_1_3_0.hkl
│   └── test_hickle.py
├── .travis.yml
├── setup.py
├── LICENSE
├── README.md
├── hickle_legacy.py
└── hickle.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py
2 | numpy
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .pypirc
3 | build/
4 | dist/
5 | .DS_Store
6 | .idea
--------------------------------------------------------------------------------
/tests/hickle_1_1_0.hkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/hickle/master/tests/hickle_1_1_0.hkl
--------------------------------------------------------------------------------
/tests/hickle_1_3_0.hkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leriomaggio/hickle/master/tests/hickle_1_3_0.hkl
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.6"
4 |   - "2.7"
5 | 
6 | # command to install dependencies
7 | install:
8 |   - sudo apt-get update -qq
9 |   - sudo apt-get install -qq libhdf5-serial-dev
10 |   - pip install unittest2
11 |   - pip install -r requirements.txt
12 |   - pip install .
13 | 
14 | # command to run tests
15 | script: python tests/test_hickle.py
16 | branches:
17 |   only:
18 |     - master
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # To increment version
2 | #   git tag x.y.z
3 | #   git push --tags
4 | #   python setup.py sdist upload
5 | from distutils.core import setup
6 | setup(name = 'hickle',
7 |       version = '2.0.5',
8 |       description = 'Hickle - a HDF5 based version of pickle',
9 |       author = 'Danny Price',
10 |       author_email = 'dan@thetelegraphic.com',
11 |       url = 'http://github.com/telegraphic/hickle',
12 |       download_url='https://github.com/telegraphic/hickle/archive/2.0.5.tar.gz',
13 |       platforms = 'Cross platform (Linux, Mac OSX, Windows)',
14 |       keywords = ['pickle', 'hdf5', 'data storage', 'data export'],
15 |       py_modules = ['hickle', 'hickle_legacy'],
16 |       install_requires=['numpy', 'h5py']
17 |       )
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014 Danny Price and contributors
2 | http://github.com/telegraphic/hickle
3 | 
4 | Permission is hereby granted, free of charge, to any person obtaining a copy
5 | of this software and associated documentation files (the "Software"), to deal
6 | in the Software without restriction, including without limitation the rights
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 | SOFTWARE.
21 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/telegraphic/hickle.svg?branch=master)](https://travis-ci.org/telegraphic/hickle)
2 | 
3 | Hickle
4 | ======
5 | 
6 | Hickle is an HDF5-based clone of Pickle, with a twist. Instead of serializing to a pickle file,
7 | Hickle dumps to an HDF5 file. It is designed to be a "drop-in" replacement for pickle (for common data objects). That is: it is a neat little way of dumping python variables to file. Hickle is fast, and allows for transparent compression of your data (LZF / GZIP).
8 | 
9 | Why use Hickle?
10 | ---------------
11 | 
12 | While hickle is designed to be a drop-in replacement for pickle (and json), it works very differently.
13 | Instead of serializing / json-izing, it stores the data using the excellent h5py module.
14 | 
15 | The main reasons to use hickle are:
16 | 
17 | 1. It's faster than pickle and cPickle.
18 | 2. It stores data in HDF5.
19 | 3. You can easily compress your data.
20 | 
21 | The main reasons not to use hickle are:
22 | 
23 | 1. 
You don't want to store your data in HDF5. While hickle can serialize arbitrary python objects, this functionality is provided only for convenience, and you're probably better off just using the pickle module.
24 | 2. You want to convert your data to JSON. For this, use json or uJson.
25 | 
26 | So, if you want your data in HDF5, or if your pickling is taking too long, give hickle a try. Hickle is particularly good at storing large numpy arrays, thanks to h5py running under the hood.
27 | 
28 | 
29 | Performance comparison
30 | ----------------------
31 | 
32 | Hickle runs a lot faster than pickle with its default settings, and a little faster than pickle with `protocol=2` set:
33 | 
34 | ```Python
35 | In [1]: import numpy as np
36 | 
37 | In [2]: x = np.random.random((2000, 2000))
38 | 
39 | In [3]: import pickle
40 | 
41 | In [4]: f = open('foo.pkl', 'w')
42 | 
43 | In [5]: %time pickle.dump(x, f) # slow by default
44 | CPU times: user 2 s, sys: 274 ms, total: 2.27 s
45 | Wall time: 2.74 s
46 | 
47 | In [6]: f = open('foo.pkl', 'w')
48 | 
49 | In [7]: %time pickle.dump(x, f, protocol=2) # actually very fast
50 | CPU times: user 18.8 ms, sys: 36 ms, total: 54.8 ms
51 | Wall time: 55.6 ms
52 | 
53 | In [8]: import hickle
54 | 
55 | In [9]: f = open('foo.hkl', 'w')
56 | 
57 | In [10]: %time hickle.dump(x, f) # a bit faster
58 | dumping to file
59 | CPU times: user 764 µs, sys: 35.6 ms, total: 36.4 ms
60 | Wall time: 36.2 ms
61 | ```
62 | 
63 | So if you do continue to use pickle, add the `protocol=2` keyword (thanks @mrocklin for pointing this out).
64 | 
65 | For storing python dictionaries of lists, hickle beats the python json encoder, but is slower than uJson. For a dictionary with 64 entries, each containing a 4096-length list of random numbers, the times are:
66 | 
67 | 
68 |     json took 2633.263 ms
69 |     uJson took 138.482 ms
70 |     hickle took 232.181 ms
71 | 
72 | 
73 | It should be noted that these comparisons are of course not fair: storing in HDF5 will not help you convert something into JSON, nor will it help you serialize a string. But for quick storage of the contents of a python variable, it's a pretty good option.
74 | 
75 | Installation guidelines (for Linux and Mac OS)
76 | ----------------------------------------------
77 | 
78 | ### Easy method
79 | Install with `pip` by running `pip install hickle` from the command line.
80 | 
81 | ### Manual install
82 | 
83 | 1. You should have Python 2.7 or above installed
84 | 
85 | 2. Install h5py
86 | (Official page: http://docs.h5py.org/en/latest/build.html)
87 | 
88 | 3. Install hdf5
89 | (Official page: http://www.hdfgroup.org/ftp/HDF5/current/src/unpacked/release_docs/INSTALL)
90 | 
91 | 4. Download `hickle`:
92 | via terminal: `git clone https://github.com/telegraphic/hickle.git`
93 | via manual download: go to https://github.com/telegraphic/hickle and on the right-hand side you will find a `Download ZIP` button
94 | 
95 | 5. cd to your downloaded `hickle` directory
96 | 
97 | 6. 
Then run the following command in the `hickle` directory:
98 | `python setup.py install`
99 | 
100 | 
101 | Usage example
102 | -------------
103 | 
104 | Hickle is nice and easy to use, and should look very familiar to those of you who have pickled before:
105 | 
106 | ```python
107 | import os
108 | import hickle as hkl
109 | import numpy as np
110 | 
111 | # Create a numpy array of data
112 | array_obj = np.ones(32768, dtype='float32')
113 | 
114 | # Dump to file
115 | hkl.dump(array_obj, 'test.hkl', mode='w')
116 | 
117 | # Dump data, with compression
118 | hkl.dump(array_obj, 'test_gzip.hkl', mode='w', compression='gzip')
119 | 
120 | # Compare filesizes
121 | print 'uncompressed: %i bytes' % os.path.getsize('test.hkl')
122 | print 'compressed:   %i bytes' % os.path.getsize('test_gzip.hkl')
123 | 
124 | # Load data
125 | array_hkl = hkl.load('test_gzip.hkl')
126 | 
127 | # Check the two are the same
128 | assert array_hkl.dtype == array_obj.dtype
129 | assert np.allclose(array_hkl, array_obj)
130 | ```
131 | 
132 | Compression options
133 | -------------------
134 | 
135 | hickle passes keyword arguments on to h5py, so you can do things like:
136 | ```python
137 | hkl.dump(array_obj, 'test_lzf.hkl', mode='w', compression='lzf', scaleoffset=0,
138 |          chunks=(100, 100), shuffle=True, fletcher32=True)
139 | ```
140 | Have a look at http://docs.h5py.org/en/latest/high/dataset.html for an explanation
141 | of these keywords.
142 | 
--------------------------------------------------------------------------------
/hickle_legacy.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | # hickle_legacy.py
4 | 
5 | Created by Danny Price 2012-05-28.
6 | 
7 | Hickle is an HDF5-based clone of Pickle. Instead of serializing to a
8 | pickle file, Hickle dumps to an HDF5 file. It is designed to be as similar
9 | to pickle in usage as possible.
10 | 
11 | ## Notes
12 | 
13 | This is a legacy handler, for hickle v1 files.
14 | If V2 reading fails, this will be called as a fail-over.
15 | 
16 | """
17 | 
18 | import os
19 | import exceptions
20 | import numpy as np
21 | import h5py as h5
22 | from types import NoneType
23 | 
24 | __version__ = "1.3.0"
25 | __author__ = "Danny Price"
26 | 
27 | ####################
28 | ## Error handling ##
29 | ####################
30 | 
31 | 
32 | class FileError(exceptions.Exception):
33 |     """ An exception raised if the file is fishy"""
34 | 
35 |     def __init__(self):
36 |         return
37 | 
38 |     def __str__(self):
39 |         return "Error: cannot open file. Please pass either a filename string, a file object, or a h5py.File"
40 | 
41 | 
42 | class NoMatchError(exceptions.Exception):
43 |     """ An exception raised if the object type is not understood (or supported)"""
44 | 
45 |     def __init__(self):
46 |         return
47 | 
48 |     def __str__(self):
49 |         return "Error: this type of python object cannot be converted into a hickle."
50 | 
51 | 
52 | class ToDoError(exceptions.Exception):
53 |     """ An exception raised for non-implemented functionality"""
54 | 
55 |     def __init__(self):
56 |         return
57 | 
58 |     def __str__(self):
59 |         return "Error: this functionality hasn't been implemented yet."
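# Note: the sketch below is illustrative only and is not part of the original
# module. It shows what the wrapper classes that follow are for: file_opener()
# (defined further down) tags the returned file with a track_times flag, and
# the wrappers force that flag onto every dataset and subgroup created through
# the file, so repeated dumps of the same data can hash identically.
#
#     h5f = file_opener('example.hkl', mode='w', track_times=False)
#     h5f.create_dataset('data', data=[1, 2, 3])    # track_times=False injected
#     grp = h5f.create_group('group')               # wrapper class propagated
#     grp.create_dataset('more_data', data=[4, 5])  # still track_times=False
#     h5f.close()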
60 | 61 | 62 | class H5GroupWrapper(h5.Group): 63 | def create_dataset(self, *args, **kwargs): 64 | kwargs['track_times'] = getattr(self, 'track_times', True) 65 | return super(H5GroupWrapper, self).create_dataset(*args, **kwargs) 66 | 67 | def create_group(self, *args, **kwargs): 68 | group = super(H5GroupWrapper, self).create_group(*args, **kwargs) 69 | group.__class__ = H5GroupWrapper 70 | group.track_times = getattr(self, 'track_times', True) 71 | return group 72 | 73 | 74 | class H5FileWrapper(h5.File): 75 | def create_dataset(self, *args, **kwargs): 76 | kwargs['track_times'] = getattr(self, 'track_times', True) 77 | return super(H5FileWrapper, self).create_dataset(*args, **kwargs) 78 | 79 | def create_group(self, *args, **kwargs): 80 | group = super(H5FileWrapper, self).create_group(*args, **kwargs) 81 | group.__class__ = H5GroupWrapper 82 | group.track_times = getattr(self, 'track_times', True) 83 | return group 84 | 85 | 86 | def file_opener(f, mode='r', track_times=True): 87 | """ A file opener helper function with some error handling. 88 | 89 | This can open files through a file object, a h5py file, or just the filename. 90 | """ 91 | # Were we handed a file object or just a file name string? 92 | if type(f) is file: 93 | filename, mode = f.name, f.mode 94 | f.close() 95 | h5f = h5.File(filename, mode) 96 | 97 | elif type(f) is h5._hl.files.File: 98 | h5f = f 99 | elif type(f) is str: 100 | filename = f 101 | h5f = h5.File(filename, mode) 102 | else: 103 | raise FileError 104 | 105 | h5f.__class__ = H5FileWrapper 106 | h5f.track_times = track_times 107 | return h5f 108 | 109 | 110 | ############# 111 | ## dumpers ## 112 | ############# 113 | 114 | def dump_ndarray(obj, h5f, **kwargs): 115 | """ dumps an ndarray object to h5py file""" 116 | h5f.create_dataset('data', data=obj, **kwargs) 117 | h5f.create_dataset('type', data=['ndarray']) 118 | 119 | 120 | def dump_np_dtype(obj, h5f, **kwargs): 121 | """ dumps an np dtype object to h5py file""" 122 | h5f.create_dataset('data', data=obj) 123 | h5f.create_dataset('type', data=['np_dtype']) 124 | 125 | 126 | def dump_np_dtype_dict(obj, h5f, **kwargs): 127 | """ dumps an np dtype object within a group""" 128 | h5f.create_dataset('data', data=obj) 129 | h5f.create_dataset('_data', data=['np_dtype']) 130 | 131 | 132 | def dump_masked(obj, h5f, **kwargs): 133 | """ dumps an ndarray object to h5py file""" 134 | h5f.create_dataset('data', data=obj, **kwargs) 135 | h5f.create_dataset('mask', data=obj.mask, **kwargs) 136 | h5f.create_dataset('type', data=['masked']) 137 | 138 | 139 | def dump_list(obj, h5f, **kwargs): 140 | """ dumps a list object to h5py file""" 141 | 142 | # Check if there are any numpy arrays in the list 143 | contains_numpy = any(isinstance(el, np.ndarray) for el in obj) 144 | 145 | if contains_numpy: 146 | _dump_list_np(obj, h5f, **kwargs) 147 | else: 148 | h5f.create_dataset('data', data=obj, **kwargs) 149 | h5f.create_dataset('type', data=['list']) 150 | 151 | 152 | def _dump_list_np(obj, h5f, **kwargs): 153 | """ Dump a list of numpy objects to file """ 154 | 155 | np_group = h5f.create_group('data') 156 | h5f.create_dataset('type', data=['np_list']) 157 | 158 | ii = 0 159 | for np_item in obj: 160 | np_group.create_dataset("%s" % ii, data=np_item, **kwargs) 161 | ii += 1 162 | 163 | 164 | def dump_tuple(obj, h5f, **kwargs): 165 | """ dumps a list object to h5py file""" 166 | 167 | # Check if there are any numpy arrays in the list 168 | contains_numpy = any(isinstance(el, np.ndarray) for el in obj) 169 | 170 | if 
contains_numpy: 171 | _dump_tuple_np(obj, h5f, **kwargs) 172 | else: 173 | h5f.create_dataset('data', data=obj, **kwargs) 174 | h5f.create_dataset('type', data=['tuple']) 175 | 176 | 177 | def _dump_tuple_np(obj, h5f, **kwargs): 178 | """ Dump a tuple of numpy objects to file """ 179 | 180 | np_group = h5f.create_group('data') 181 | h5f.create_dataset('type', data=['np_tuple']) 182 | 183 | ii = 0 184 | for np_item in obj: 185 | np_group.create_dataset("%s" % ii, data=np_item, **kwargs) 186 | ii += 1 187 | 188 | 189 | def dump_set(obj, h5f, **kwargs): 190 | """ dumps a set object to h5py file""" 191 | obj = list(obj) 192 | h5f.create_dataset('data', data=obj, **kwargs) 193 | h5f.create_dataset('type', data=['set']) 194 | 195 | 196 | def dump_string(obj, h5f, **kwargs): 197 | """ dumps a list object to h5py file""" 198 | h5f.create_dataset('data', data=[obj], **kwargs) 199 | h5f.create_dataset('type', data=['string']) 200 | 201 | 202 | def dump_none(obj, h5f, **kwargs): 203 | """ Dump None type to file """ 204 | h5f.create_dataset('data', data=[0], **kwargs) 205 | h5f.create_dataset('type', data=['none']) 206 | 207 | 208 | def dump_unicode(obj, h5f, **kwargs): 209 | """ dumps a list object to h5py file""" 210 | dt = h5.special_dtype(vlen=unicode) 211 | ll = len(obj) 212 | dset = h5f.create_dataset('data', shape=(ll, ), dtype=dt, **kwargs) 213 | dset[:ll] = obj 214 | h5f.create_dataset('type', data=['unicode']) 215 | 216 | 217 | def _dump_dict(dd, hgroup, **kwargs): 218 | for key in dd: 219 | if type(dd[key]) in (str, int, float, unicode, bool): 220 | # Figure out type to be stored 221 | types = {str: 'str', int: 'int', float: 'float', 222 | unicode: 'unicode', bool: 'bool', NoneType: 'none'} 223 | _key = types.get(type(dd[key])) 224 | 225 | # Store along with dtype info 226 | if _key == 'unicode': 227 | dd[key] = str(dd[key]) 228 | 229 | hgroup.create_dataset("%s" % key, data=[dd[key]], **kwargs) 230 | hgroup.create_dataset("_%s" % key, data=[_key]) 231 | 232 | elif type(dd[key]) in (type(np.array([1])), type(np.ma.array([1]))): 233 | 234 | if hasattr(dd[key], 'mask'): 235 | hgroup.create_dataset("_%s" % key, data=["masked"]) 236 | hgroup.create_dataset("%s" % key, data=dd[key].data, **kwargs) 237 | hgroup.create_dataset("_%s_mask" % key, data=dd[key].mask, **kwargs) 238 | else: 239 | hgroup.create_dataset("_%s" % key, data=["ndarray"]) 240 | hgroup.create_dataset("%s" % key, data=dd[key], **kwargs) 241 | 242 | elif type(dd[key]) is list: 243 | hgroup.create_dataset("%s" % key, data=dd[key], **kwargs) 244 | hgroup.create_dataset("_%s" % key, data=["list"]) 245 | 246 | elif type(dd[key]) is tuple: 247 | hgroup.create_dataset("%s" % key, data=dd[key], **kwargs) 248 | hgroup.create_dataset("_%s" % key, data=["tuple"]) 249 | 250 | elif type(dd[key]) is set: 251 | hgroup.create_dataset("%s" % key, data=list(dd[key]), **kwargs) 252 | hgroup.create_dataset("_%s" % key, data=["set"]) 253 | 254 | elif isinstance(dd[key], dict): 255 | new_group = hgroup.create_group("%s" % key) 256 | _dump_dict(dd[key], new_group, **kwargs) 257 | 258 | elif type(dd[key]) is NoneType: 259 | hgroup.create_dataset("%s" % key, data=[0], **kwargs) 260 | hgroup.create_dataset("_%s" % key, data=["none"]) 261 | 262 | else: 263 | if type(dd[key]).__module__ == np.__name__: 264 | #print type(dd[key]) 265 | hgroup.create_dataset("%s" % key, data=dd[key]) 266 | hgroup.create_dataset("_%s" % key, data=["np_dtype"]) 267 | #new_group = hgroup.create_group("%s" % key) 268 | #dump_np_dtype_dict(dd[key], new_group) 269 | else: 270 | 
raise NoMatchError 271 | 272 | 273 | def dump_dict(obj, h5f='', **kwargs): 274 | """ dumps a dictionary to h5py file """ 275 | h5f.create_dataset('type', data=['dict']) 276 | hgroup = h5f.create_group('data') 277 | _dump_dict(obj, hgroup, **kwargs) 278 | 279 | 280 | def no_match(obj, h5f, *args, **kwargs): 281 | """ If no match is made, raise an exception """ 282 | import cPickle 283 | 284 | pickled_obj = cPickle.dumps(obj) 285 | h5f.create_dataset('type', data=['pickle']) 286 | h5f.create_dataset('data', data=[pickled_obj]) 287 | 288 | print "Warning: %s type not understood, data have been serialized" % type(obj) 289 | #raise NoMatchError 290 | 291 | 292 | def dumper_lookup(obj): 293 | """ What type of object are we trying to pickle? 294 | 295 | This is a python dictionary based equivalent of a case statement. 296 | It returns the correct helper function for a given data type. 297 | """ 298 | t = type(obj) 299 | 300 | types = { 301 | list: dump_list, 302 | tuple: dump_tuple, 303 | set: dump_set, 304 | dict: dump_dict, 305 | str: dump_string, 306 | unicode: dump_unicode, 307 | NoneType: dump_none, 308 | np.ndarray: dump_ndarray, 309 | np.ma.core.MaskedArray: dump_masked, 310 | np.float16: dump_np_dtype, 311 | np.float32: dump_np_dtype, 312 | np.float64: dump_np_dtype, 313 | np.int8: dump_np_dtype, 314 | np.int16: dump_np_dtype, 315 | np.int32: dump_np_dtype, 316 | np.int64: dump_np_dtype, 317 | np.uint8: dump_np_dtype, 318 | np.uint16: dump_np_dtype, 319 | np.uint32: dump_np_dtype, 320 | np.uint64: dump_np_dtype, 321 | np.complex64: dump_np_dtype, 322 | np.complex128: dump_np_dtype, 323 | } 324 | 325 | match = types.get(t, no_match) 326 | return match 327 | 328 | 329 | def dump(obj, file, mode='w', track_times=True, **kwargs): 330 | """ Write a pickled representation of obj to the open file object file. 331 | 332 | Parameters 333 | ---------- 334 | obj: object 335 | python object o store in a Hickle 336 | file: file object, filename string, or h5py.File object 337 | file in which to store the object. A h5py.File or a filename is also acceptable. 338 | mode: string 339 | optional argument, 'r' (read only), 'w' (write) or 'a' (append). Ignored if file is a file object. 340 | compression: str 341 | optional argument. Applies compression to dataset. Options: None, gzip, lzf (+ szip, if installed) 342 | track_times: bool 343 | optional argument. If set to False, repeated hickling will produce identical files. 344 | """ 345 | 346 | try: 347 | # See what kind of object to dump 348 | dumper = dumper_lookup(obj) 349 | # Open the file 350 | h5f = file_opener(file, mode, track_times) 351 | print "dumping %s to file %s" % (type(obj), repr(h5f)) 352 | dumper(obj, h5f, **kwargs) 353 | h5f.close() 354 | except NoMatchError: 355 | fname = h5f.filename 356 | h5f.close() 357 | try: 358 | os.remove(fname) 359 | except: 360 | print "Warning: dump failed. Could not remove %s" % fname 361 | finally: 362 | raise NoMatchError 363 | 364 | 365 | ############# 366 | ## loaders ## 367 | ############# 368 | 369 | def load(file, safe=True): 370 | """ Load a hickle file and reconstruct a python object 371 | 372 | Parameters 373 | ---------- 374 | file: file object, h5py.File, or filename string 375 | 376 | safe (bool): Disable automatic depickling of arbitrary python objects. 377 | DO NOT set this to False unless the file is from a trusted source. 
(see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
379 |     """
380 | 
381 |     try:
382 |         h5f = file_opener(file)
383 |         dtype = h5f["type"][0]
384 | 
385 |         if dtype == 'dict':
386 |             group = h5f["data"]
387 |             data = load_dict(group)
388 |         elif dtype == 'pickle':
389 |             data = load_pickle(h5f, safe)
390 |         elif dtype == 'np_list':
391 |             group = h5f["data"]
392 |             data = load_np_list(group)
393 |         elif dtype == 'np_tuple':
394 |             group = h5f["data"]
395 |             data = load_np_tuple(group)
396 |         elif dtype == 'masked':
397 |             data = np.ma.array(h5f["data"][:], mask=h5f["mask"][:])
398 |         elif dtype == 'none':
399 |             data = None
400 |         else:
401 |             if dtype in ('string', 'unicode'):
402 |                 data = h5f["data"][0]
403 |             else:
404 |                 try:
405 |                     data = h5f["data"][:]
406 |                 except ValueError:
407 |                     data = h5f["data"]
408 |             types = {
409 |                 'list': list,
410 |                 'set': set,
411 |                 'unicode': unicode,
412 |                 'string': str,
413 |                 'ndarray': load_ndarray,
414 |                 'np_dtype': load_np_dtype
415 |             }
416 | 
417 |             mod = types.get(dtype, no_match)
418 |             data = mod(data)
419 |     finally:
420 |         if 'h5f' in locals():
421 |             h5f.close()
422 |     return data
423 | 
424 | 
425 | def load_pickle(h5f, safe=True):
426 |     """ Deserialize and load a pickled object within a hickle file
427 | 
428 |     WARNING: Pickle can execute arbitrary code; never depickle data from an untrusted source.
429 | 
430 |     Parameters
431 |     ----------
432 |     h5f: h5py.File object
433 | 
434 |     safe (bool): Disable automatic depickling of arbitrary python objects.
435 |     DO NOT set this to False unless the file is from a trusted source.
436 |     (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
437 |     """
438 | 
439 |     if not safe:
440 |         import cPickle
441 | 
442 |         data = h5f["data"][:]
443 |         data = cPickle.loads(data[0])
444 |         return data
445 |     else:
446 |         print "\nWarning: Object is of an unknown type, and has not been loaded"
447 |         print "         for security reasons (it could be malicious code). If"
448 |         print "         you wish to continue, manually set safe=False\n"
449 | 
450 | 
451 | def load_np_list(group):
452 |     """ load a numpy list """
453 |     np_list = []
454 |     for key in sorted(group.keys()):
455 |         data = group[key][:]
456 |         np_list.append(data)
457 |     return np_list
458 | 
459 | 
460 | def load_np_tuple(group):
461 |     """ load a tuple containing numpy arrays """
462 |     return tuple(load_np_list(group))
463 | 
464 | 
465 | def load_ndarray(arr):
466 |     """ Load a numpy array """
467 |     # Nothing to be done!
468 | return arr 469 | 470 | 471 | def load_np_dtype(arr): 472 | """ Load a numpy array """ 473 | # Just return first value 474 | return arr.value 475 | 476 | 477 | def load_dict(group): 478 | """ Load dictionary """ 479 | 480 | dd = {} 481 | for key in group.keys(): 482 | if isinstance(group[key], h5._hl.group.Group): 483 | new_group = group[key] 484 | dd[key] = load_dict(new_group) 485 | elif not key.startswith("_"): 486 | _key = "_%s" % key 487 | 488 | if group[_key][0] == 'np_dtype': 489 | dd[key] = group[key].value 490 | elif group[_key][0] in ('str', 'int', 'float', 'unicode', 'bool'): 491 | dd[key] = group[key][0] 492 | elif group[_key][0] == 'masked': 493 | key_ma = "_%s_mask" % key 494 | dd[key] = np.ma.array(group[key][:], mask=group[key_ma]) 495 | else: 496 | dd[key] = group[key][:] 497 | 498 | # Convert numpy constructs back to string 499 | dtype = group[_key][0] 500 | types = {'str': str, 'int': int, 'float': float, 501 | 'unicode': unicode, 'bool': bool, 'list': list, 'none' : NoneType} 502 | try: 503 | mod = types.get(dtype) 504 | if dtype == 'none': 505 | dd[key] = None 506 | else: 507 | dd[key] = mod(dd[key]) 508 | except: 509 | pass 510 | return dd 511 | 512 | 513 | def load_large(file): 514 | """ Load a large hickle file (returns the h5py object not the data) 515 | 516 | Parameters 517 | ---------- 518 | file: file object, h5py.File, or filename string 519 | """ 520 | 521 | h5f = file_opener(file) 522 | return h5f -------------------------------------------------------------------------------- /tests/test_hickle.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | # test_hickle.py 5 | 6 | Unit tests for hickle module. 7 | 8 | """ 9 | 10 | import os 11 | from hickle import * 12 | import hickle 13 | import hashlib 14 | import time 15 | 16 | import h5py 17 | import numpy as np 18 | from pprint import pprint 19 | 20 | NESTED_DICT = { 21 | "level1_1": { 22 | "level2_1": [1, 2, 3], 23 | "level2_2": [4, 5, 6] 24 | }, 25 | "level1_2": { 26 | "level2_1": [1, 2, 3], 27 | "level2_2": [4, 5, 6] 28 | }, 29 | "level1_3": { 30 | "level2_1": { 31 | "level3_1": [1, 2, 3], 32 | "level3_2": [4, 5, 6] 33 | }, 34 | "level2_2": [4, 5, 6] 35 | } 36 | } 37 | 38 | DUMP_CACHE = [] # Used in test_track_times() 39 | 40 | 41 | def test_string(): 42 | """ Dumping and loading a string """ 43 | filename, mode = 'test.h5', 'w' 44 | string_obj = "The quick brown fox jumps over the lazy dog" 45 | dump(string_obj, filename, mode) 46 | string_hkl = load(filename) 47 | #print "Initial list: %s"%list_obj 48 | #print "Unhickled data: %s"%list_hkl 49 | try: 50 | assert type(string_obj) == type(string_hkl) == str 51 | assert string_obj == string_hkl 52 | os.remove(filename) 53 | except AssertionError: 54 | os.remove(filename) 55 | raise 56 | 57 | 58 | def test_unicode(): 59 | """ Dumping and loading a unicode string """ 60 | filename, mode = 'test.h5', 'w' 61 | u = unichr(233) + unichr(0x0bf2) + unichr(3972) + unichr(6000) 62 | dump(u, filename, mode) 63 | u_hkl = load(filename) 64 | 65 | try: 66 | assert type(u) == type(u_hkl) == unicode 67 | assert u == u_hkl 68 | # For those interested, uncomment below to see what those codes are: 69 | # for i, c in enumerate(u_hkl): 70 | # print i, '%04x' % ord(c), unicodedata.category(c), 71 | # print unicodedata.name(c) 72 | except AssertionError: 73 | os.remove(filename) 74 | raise 75 | 76 | 77 | def test_list(): 78 | """ Dumping and loading a list """ 79 | filename, mode = 
'test.h5', 'w' 80 | list_obj = [1, 2, 3, 4, 5] 81 | dump(list_obj, filename, mode) 82 | list_hkl = load(filename) 83 | #print "Initial list: %s"%list_obj 84 | #print "Unhickled data: %s"%list_hkl 85 | try: 86 | assert type(list_obj) == type(list_hkl) == list 87 | assert list_obj == list_hkl 88 | import h5py 89 | a = h5py.File(filename) 90 | 91 | os.remove(filename) 92 | except AssertionError: 93 | print "ERR:", list_obj, list_hkl 94 | import h5py 95 | os.remove(filename) 96 | raise 97 | 98 | 99 | def test_set(): 100 | """ Dumping and loading a list """ 101 | filename, mode = 'test.h5', 'w' 102 | list_obj = set([1, 0, 3, 4.5, 11.2]) 103 | dump(list_obj, filename, mode) 104 | list_hkl = load(filename) 105 | #print "Initial list: %s"%list_obj 106 | #print "Unhickled data: %s"%list_hkl 107 | try: 108 | assert type(list_obj) == type(list_hkl) == set 109 | assert list_obj == list_hkl 110 | os.remove(filename) 111 | except AssertionError: 112 | os.remove(filename) 113 | raise 114 | 115 | 116 | def test_numpy(): 117 | """ Dumping and loading numpy array """ 118 | filename, mode = 'test.h5', 'w' 119 | dtypes = ['float32', 'float64', 'complex64', 'complex128'] 120 | 121 | for dt in dtypes: 122 | array_obj = np.ones(8, dtype=dt) 123 | dump(array_obj, filename, mode) 124 | array_hkl = load(filename) 125 | try: 126 | assert array_hkl.dtype == array_obj.dtype 127 | assert np.all((array_hkl, array_obj)) 128 | os.remove(filename) 129 | except AssertionError: 130 | os.remove(filename) 131 | print array_hkl 132 | print array_obj 133 | raise 134 | 135 | 136 | def test_masked(): 137 | """ Test masked numpy array """ 138 | filename, mode = 'test.h5', 'w' 139 | a = np.ma.array([1,2,3,4], dtype='float32', mask=[0,1,0,0]) 140 | 141 | dump(a, filename, mode) 142 | a_hkl = load(filename) 143 | 144 | try: 145 | assert a_hkl.dtype == a.dtype 146 | assert np.all((a_hkl, a)) 147 | os.remove(filename) 148 | except AssertionError: 149 | os.remove(filename) 150 | print a_hkl 151 | print a 152 | raise 153 | 154 | 155 | def test_dict(): 156 | """ Test dictionary dumping and loading """ 157 | filename, mode = 'test.h5', 'w' 158 | 159 | dd = { 160 | 'name' : 'Danny', 161 | 'age' : 28, 162 | 'height' : 6.1, 163 | 'dork' : True, 164 | 'nums' : [1, 2, 3], 165 | 'narr' : np.array([1,2,3]), 166 | #'unic' : u'dan[at]thetelegraphic.com' 167 | } 168 | 169 | 170 | dump(dd, filename, mode) 171 | dd_hkl = load(filename) 172 | 173 | for k in dd.keys(): 174 | try: 175 | assert k in dd_hkl.keys() 176 | 177 | if type(dd[k]) is type(np.array([1])): 178 | assert np.all((dd[k], dd_hkl[k])) 179 | else: 180 | #assert dd_hkl[k] == dd[k] 181 | pass 182 | assert type(dd_hkl[k]) == type(dd[k]) 183 | except AssertionError: 184 | print k 185 | print dd_hkl[k] 186 | print dd[k] 187 | print type(dd_hkl[k]), type(dd[k]) 188 | os.remove(filename) 189 | raise 190 | os.remove(filename) 191 | 192 | 193 | def test_compression(): 194 | """ Test compression on datasets""" 195 | 196 | filename, mode = 'test.h5', 'w' 197 | dtypes = ['int32', 'float32', 'float64', 'complex64', 'complex128'] 198 | 199 | comps = [None, 'gzip', 'lzf'] 200 | 201 | for dt in dtypes: 202 | for cc in comps: 203 | array_obj = np.ones(32768, dtype=dt) 204 | dump(array_obj, filename, mode, compression=cc) 205 | print cc, os.path.getsize(filename) 206 | array_hkl = load(filename) 207 | try: 208 | assert array_hkl.dtype == array_obj.dtype 209 | assert np.all((array_hkl, array_obj)) 210 | os.remove(filename) 211 | except AssertionError: 212 | os.remove(filename) 213 | print array_hkl 214 | 
print array_obj 215 | raise 216 | 217 | 218 | def test_dict_int_key(): 219 | """ Test for dictionaries with integer keys """ 220 | filename, mode = 'test.h5', 'w' 221 | 222 | dd = { 223 | 0: "test", 224 | 1: "test2" 225 | } 226 | 227 | dump(dd, filename, mode) 228 | dd_hkl = load(filename) 229 | 230 | 231 | os.remove(filename) 232 | 233 | 234 | def test_dict_nested(): 235 | """ Test for dictionaries with integer keys """ 236 | filename, mode = 'test.h5', 'w' 237 | 238 | dd = NESTED_DICT 239 | 240 | dump(dd, filename, mode) 241 | dd_hkl = load(filename) 242 | 243 | ll_hkl = dd_hkl["level1_3"]["level2_1"]["level3_1"] 244 | ll = dd["level1_3"]["level2_1"]["level3_1"] 245 | assert ll == ll_hkl 246 | os.remove(filename) 247 | 248 | 249 | def test_masked_dict(): 250 | """ Test dictionaries with masked arrays """ 251 | 252 | filename, mode = 'test.h5', 'w' 253 | 254 | dd = { 255 | "data" : np.ma.array([1,2,3], mask=[True, False, False]), 256 | "data2" : np.array([1,2,3,4,5]) 257 | } 258 | 259 | dump(dd, filename, mode) 260 | dd_hkl = load(filename) 261 | 262 | for k in dd.keys(): 263 | try: 264 | assert k in dd_hkl.keys() 265 | if type(dd[k]) is type(np.array([1])): 266 | assert np.all((dd[k], dd_hkl[k])) 267 | elif type(dd[k]) is type(np.ma.array([1])): 268 | print dd[k].data 269 | print dd_hkl[k].data 270 | assert np.allclose(dd[k].data, dd_hkl[k].data) 271 | assert np.allclose(dd[k].mask, dd_hkl[k].mask) 272 | 273 | assert type(dd_hkl[k]) == type(dd[k]) 274 | 275 | except AssertionError: 276 | print k 277 | print dd_hkl[k] 278 | print dd[k] 279 | print type(dd_hkl[k]), type(dd[k]) 280 | os.remove(filename) 281 | raise 282 | os.remove(filename) 283 | 284 | 285 | def test_nomatch(): 286 | """ Test for non-supported data types. 287 | 288 | Note: don't remember what I was trying to do with this test. 289 | Ignoring it for now. 
290 | """ 291 | filename, mode = 'nomatch.h5', 'w' 292 | 293 | dd = Exception('Nothing to see here') 294 | no_match = False 295 | dump(dd, filename, mode) 296 | 297 | #dd_hkl = load(filename) 298 | dd_hkl = load(filename, safe=False) 299 | 300 | assert type(dd_hkl) == type(dd) == Exception 301 | os.remove(filename) 302 | 303 | 304 | def test_np_float(): 305 | """ Test for singular np dtypes """ 306 | filename, mode = 'np_float.h5', 'w' 307 | 308 | dtype_list = (np.float16, np.float32, np.float64, 309 | np.complex64, np.complex128, 310 | np.int8, np.int16, np.int32, np.int64, 311 | np.uint8, np.uint16, np.uint32, np.uint64) 312 | 313 | for dt in dtype_list: 314 | 315 | dd = dt(1) 316 | dump(dd, filename, mode) 317 | dd_hkl = load(filename) 318 | assert dd == dd_hkl 319 | assert dd.dtype == dd_hkl.dtype 320 | os.remove(filename) 321 | 322 | dd = {} 323 | for dt in dtype_list: 324 | dd[str(dt)] = dt(1.0) 325 | dump(dd, filename, mode) 326 | dd_hkl = load(filename) 327 | 328 | print dd 329 | for dt in dtype_list: 330 | assert dd[str(dt)] == dd_hkl[str(dt)] 331 | 332 | os.remove(filename) 333 | 334 | 335 | def md5sum(filename, blocksize=65536): 336 | """ Compute MD5 sum for a given file """ 337 | hash = hashlib.md5() 338 | 339 | with open(filename, "r+b") as f: 340 | for block in iter(lambda: f.read(blocksize), ""): 341 | hash.update(block) 342 | return hash.hexdigest() 343 | 344 | 345 | def caching_dump(obj, filename, *args, **kwargs): 346 | """ Save arguments of all dump calls """ 347 | DUMP_CACHE.append((obj, filename, args, kwargs)) 348 | return hickle_dump(obj, filename, *args, **kwargs) 349 | 350 | 351 | def test_track_times(): 352 | """ Verify that track_times = False produces identical files """ 353 | hashes = [] 354 | for obj, filename, mode, kwargs in DUMP_CACHE: 355 | if isinstance(filename, hickle.H5FileWrapper): 356 | filename = str(filename.file_name) 357 | kwargs['track_times'] = False 358 | caching_dump(obj, filename, mode, **kwargs) 359 | hashes.append(md5sum(filename)) 360 | os.remove(filename) 361 | 362 | time.sleep(1) 363 | 364 | for hash1, (obj, filename, mode, kwargs) in zip(hashes, DUMP_CACHE): 365 | if isinstance(filename, hickle.H5FileWrapper): 366 | filename = str(filename.file_name) 367 | caching_dump(obj, filename, mode, **kwargs) 368 | hash2 = md5sum(filename) 369 | print hash1, hash2 370 | try: 371 | assert hash1 == hash2 372 | os.remove(filename) 373 | except AssertionError: 374 | os.remove(filename) 375 | raise 376 | 377 | 378 | def test_comp_kwargs(): 379 | """ Test compression with some kwargs for shuffle and chunking """ 380 | 381 | filename, mode = 'test.h5', 'w' 382 | dtypes = ['int32', 'float32', 'float64', 'complex64', 'complex128'] 383 | 384 | comps = [None, 'gzip', 'lzf'] 385 | chunks = [(100, 100), (250, 250)] 386 | shuffles = [True, False] 387 | scaleoffsets = [0, 1, 2] 388 | 389 | for dt in dtypes: 390 | for cc in comps: 391 | for ch in chunks: 392 | for sh in shuffles: 393 | for so in scaleoffsets: 394 | kwargs = { 395 | 'compression' : cc, 396 | 'dtype': dt, 397 | 'chunks': ch, 398 | 'shuffle': sh, 399 | 'scaleoffset': so 400 | } 401 | #array_obj = np.random.random_integers(low=-8192, high=8192, size=(1000, 1000)).astype(dt) 402 | array_obj = NESTED_DICT 403 | dump(array_obj, filename, mode, compression=cc) 404 | print kwargs, os.path.getsize(filename) 405 | array_hkl = load(filename) 406 | try: 407 | os.remove(filename) 408 | except AssertionError: 409 | os.remove(filename) 410 | print array_hkl 411 | print array_obj 412 | raise 413 | 414 | 415 | 
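# Illustrative sketch (not one of the original tests): because hickle forwards
# keyword arguments straight to h5py, compression settings can be verified by
# inspecting the resulting dataset with h5py. The 'data_0' key below reflects
# hickle v2's internal layout and is an assumption here.
#
#     def check_compression_forwarded():
#         array_obj = np.ones(32768, dtype='float32')
#         dump(array_obj, 'test.h5', 'w', compression='gzip')
#         f = h5py.File('test.h5', 'r')
#         assert f['data_0'].compression == 'gzip'
#         f.close()
#         os.remove('test.h5')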
def test_list_numpy(): 416 | """ Test converting a list of numpy arrays """ 417 | 418 | filename, mode = 'test.h5', 'w' 419 | 420 | a = np.ones(1024) 421 | b = np.zeros(1000) 422 | c = [a, b] 423 | 424 | dump(c, filename, mode) 425 | dd_hkl = load(filename) 426 | 427 | print dd_hkl 428 | 429 | assert isinstance(dd_hkl, list) 430 | assert isinstance(dd_hkl[0], np.ndarray) 431 | 432 | 433 | os.remove(filename) 434 | 435 | 436 | def test_tuple_numpy(): 437 | """ Test converting a list of numpy arrays """ 438 | 439 | filename, mode = 'test.h5', 'w' 440 | 441 | a = np.ones(1024) 442 | b = np.zeros(1000) 443 | c = (a, b, a) 444 | 445 | dump(c, filename, mode) 446 | dd_hkl = load(filename) 447 | 448 | print dd_hkl 449 | 450 | assert isinstance(dd_hkl, tuple) 451 | assert isinstance(dd_hkl[0], np.ndarray) 452 | 453 | 454 | os.remove(filename) 455 | 456 | 457 | def test_none(): 458 | """ Test None type hickling """ 459 | 460 | filename, mode = 'test.h5', 'w' 461 | 462 | a = None 463 | 464 | dump(a, filename, mode) 465 | dd_hkl = load(filename) 466 | print a 467 | print dd_hkl 468 | 469 | assert isinstance(dd_hkl, NoneType) 470 | 471 | os.remove(filename) 472 | 473 | 474 | def test_dict_none(): 475 | """ Test None type hickling """ 476 | 477 | filename, mode = 'test.h5', 'w' 478 | 479 | a = {'a': 1, 'b' : None} 480 | 481 | dump(a, filename, mode) 482 | dd_hkl = load(filename) 483 | print a 484 | print dd_hkl 485 | 486 | assert isinstance(a['b'], NoneType) 487 | 488 | os.remove(filename) 489 | 490 | 491 | def test_file_open_close(): 492 | """ https://github.com/telegraphic/hickle/issues/20 """ 493 | try: 494 | import h5py 495 | f = h5py.File('test.hdf', 'w') 496 | a = np.arange(5) 497 | 498 | dump(a, 'test.hkl') 499 | dump(a, 'test.hkl') 500 | 501 | dump(a, f, mode='w') 502 | try: 503 | dump(a, f, mode='w') 504 | except ClosedFileError: 505 | print "Tests: Closed file exception caught" 506 | 507 | finally: 508 | os.remove('test.hdf') 509 | os.remove('test.hkl') 510 | 511 | 512 | def run_file_cleanup(): 513 | """ Clean up temp files """ 514 | for filename in ('test.hdf', 'test.hkl', 'test.h5'): 515 | try: 516 | os.remove(filename) 517 | except OSError: 518 | pass 519 | 520 | 521 | def test_list_long_type(): 522 | """ Check long comes back out as a long """ 523 | filename, mode = 'test.h5', 'w' 524 | list_obj = [1L, 2L, 3L, 4L, 5L] 525 | dump(list_obj, filename, mode) 526 | list_hkl = load(filename) 527 | #print "Initial list: %s"%list_obj 528 | #print "Unhickled data: %s"%list_hkl 529 | try: 530 | assert type(list_obj) == type(list_hkl) == list 531 | assert list_obj == list_hkl 532 | assert type(list_obj[0]) == type(list_hkl[0]) 533 | 534 | os.remove(filename) 535 | except AssertionError: 536 | print "ERR:", list_obj, list_hkl 537 | import h5py 538 | a = h5py.File(filename) 539 | print a.keys() 540 | print a['data'].keys() 541 | os.remove(filename) 542 | raise 543 | 544 | 545 | def test_list_order(): 546 | """ https://github.com/telegraphic/hickle/issues/26 """ 547 | d = [np.arange(n + 1) for n in range(20)] 548 | hickle.dump(d, 'test.h5') 549 | d_hkl = hickle.load('test.h5') 550 | 551 | try: 552 | for ii, xx in enumerate(d): 553 | assert d[ii].shape == d_hkl[ii].shape 554 | for ii, xx in enumerate(d): 555 | assert np.allclose(d[ii], d_hkl[ii]) 556 | except AssertionError: 557 | print d[ii], d_hkl[ii] 558 | raise 559 | 560 | 561 | def test_embedded_array(): 562 | """ See https://github.com/telegraphic/hickle/issues/24 """ 563 | 564 | d_orig = [[np.array([10., 20.]), np.array([10, 20, 30])], 
[np.array([10, 2]), np.array([1.])]] 565 | hickle.dump(d_orig, 'test.h5') 566 | d_hkl = hickle.load('test.h5') 567 | 568 | for ii, xx in enumerate(d_orig): 569 | for jj, yy in enumerate(xx): 570 | assert np.allclose(d_orig[ii][jj], d_hkl[ii][jj]) 571 | 572 | print d_hkl 573 | print d_orig 574 | 575 | 576 | ################ 577 | ## NEW TESTS ## 578 | ################ 579 | 580 | 581 | def generate_nested(): 582 | a = [1, 2, 3] 583 | b = [a, a, a] 584 | c = [a, b, 's'] 585 | d = [a, b, c, c, a] 586 | e = [d, d, d, d, 1] 587 | f = {'a' : a, 'b' : b, 'e' : e} 588 | g = {'f' : f, 'a' : e, 'd': d} 589 | h = {'h': g, 'g' : f} 590 | z = [f, a, b, c, d, e, f, g, h, g, h] 591 | a = np.array([1, 2, 3, 4]) 592 | b = set([1, 2, 3, 4, 5]) 593 | c = (1, 2, 3, 4, 5) 594 | d = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8]) 595 | z = {'a': a, 'b': b, 'c': c, 'd': d, 'z': z} 596 | return z 597 | 598 | 599 | def test_is_iterable(): 600 | a = [1, 2, 3] 601 | b = 1 602 | 603 | assert check_is_iterable(a) == True 604 | assert check_is_iterable(b) == False 605 | 606 | 607 | def test_check_iterable_item_type(): 608 | 609 | a = [1, 2, 3] 610 | b = [a, a, a] 611 | c = [a, b, 's'] 612 | 613 | type_a = check_iterable_item_type(a) 614 | type_b = check_iterable_item_type(b) 615 | type_c = check_iterable_item_type(c) 616 | 617 | assert type_a is int 618 | assert type_b is list 619 | assert type_c == False 620 | 621 | 622 | def test_dump_nested(): 623 | """ Dump a complicated nested object to HDF5 624 | """ 625 | z = generate_nested() 626 | dump(z, 'test.hkl', mode='w') 627 | 628 | 629 | def test_load(): 630 | 631 | a = set([1, 2, 3, 4]) 632 | b = set([5, 6, 7, 8]) 633 | c = set([9, 10, 11, 12]) 634 | z = (a, b, c) 635 | z = [z, z] 636 | z = (z, z, z, z, z) 637 | 638 | print "Original:" 639 | pprint(z) 640 | dump(z, 'test.hkl', mode='w') 641 | 642 | print "\nReconstructed:" 643 | z = load('test.hkl') 644 | pprint(z) 645 | 646 | 647 | def test_sort_keys(): 648 | keys = ['data_0', 'data_1', 'data_2', 'data_3', 'data_10'] 649 | keys_sorted = ['data_0', 'data_1', 'data_2', 'data_3', 'data_10'] 650 | assert sort_keys(keys) == keys_sorted 651 | 652 | 653 | def test_ndarray(): 654 | 655 | a = np.array([1,2,3]) 656 | b = np.array([2,3,4]) 657 | z = (a, b) 658 | 659 | print "Original:" 660 | pprint(z) 661 | dump(z, 'test.hkl', mode='w') 662 | 663 | print "\nReconstructed:" 664 | z = load('test.hkl') 665 | pprint(z) 666 | 667 | 668 | def test_ndarray_masked(): 669 | 670 | a = np.ma.array([1,2,3]) 671 | b = np.ma.array([2,3,4], mask=[True, False, True]) 672 | z = (a, b) 673 | 674 | print "Original:" 675 | pprint(z) 676 | dump(z, 'test.hkl', mode='w') 677 | 678 | print "\nReconstructed:" 679 | z = load('test.hkl') 680 | pprint(z) 681 | 682 | 683 | def test_simple_dict(): 684 | a = {'key1': 1, 'key2': 2} 685 | 686 | dump(a, 'test.hkl') 687 | z = load('test.hkl') 688 | 689 | pprint(a) 690 | pprint(z) 691 | 692 | 693 | def test_complex_dict(): 694 | a = {'akey': 1, 'akey2': 2} 695 | b = {'bkey': 2.0, 'bkey3': long(3.0)} 696 | c = {'ckey': "hello", "ckey2": "hi there"} 697 | z = {'zkey1': a, 'zkey2': b, 'zkey3': c} 698 | 699 | print "Original:" 700 | pprint(z) 701 | dump(z, 'test.hkl', mode='w') 702 | 703 | print "\nReconstructed:" 704 | z = load('test.hkl') 705 | pprint(z) 706 | 707 | 708 | def test_unicode(): 709 | a = u"unicode test" 710 | dump(a, 'test.hkl', mode='w') 711 | 712 | z = load('test.hkl') 713 | assert a == z 714 | assert type(a) == type(z) == unicode 715 | pprint(z) 716 | 717 | 718 | def test_legacy_hickles(): 719 | 720 | try: 
721 | a = load("hickle_1_1_0.hkl") 722 | b = load("hickle_1_3_0.hkl") 723 | 724 | import h5py 725 | d = h5py.File("hickle_1_1_0.hkl")["data"]["a"][:] 726 | d2 = h5py.File("hickle_1_3_0.hkl")["data"]["a"][:] 727 | assert np.allclose(d, a["a"]) 728 | assert np.allclose(d2, b["a"]) 729 | 730 | except IOError: 731 | # For travis-CI 732 | a = load("tests/hickle_1_1_0.hkl") 733 | b = load("tests/hickle_1_3_0.hkl") 734 | 735 | print a 736 | print b 737 | 738 | 739 | def test_multi_hickle(): 740 | import os 741 | a = {'a': 123, 'b': [1, 2, 4]} 742 | 743 | if os.path.exists("test.hkl"): 744 | os.remove("test.hkl") 745 | dump(a, "test.hkl", path="/test", mode="w") 746 | dump(a, "test.hkl", path="/test2", mode="r+") 747 | dump(a, "test.hkl", path="/test3", mode="r+") 748 | dump(a, "test.hkl", path="/test4", mode="r+") 749 | 750 | a = load("test.hkl", path="/test") 751 | b = load("test.hkl", path="/test2") 752 | c = load("test.hkl", path="/test3") 753 | d = load("test.hkl", path="/test4") 754 | os.remove("test.hkl") 755 | 756 | def test_complex(): 757 | """ Test complex value dtype is handled correctly 758 | 759 | https://github.com/telegraphic/hickle/issues/29 """ 760 | 761 | data = {"A":1.5, "B":1.5 + 1j, "C":np.linspace(0,1,4) + 2j} 762 | dump(data, "test.hkl") 763 | data2 = load("test.hkl") 764 | for key in data.keys(): 765 | assert type(data[key]) == type(data2[key]) 766 | 767 | 768 | if __name__ == '__main__': 769 | """ Some tests and examples """ 770 | test_complex() 771 | test_file_open_close() 772 | test_dict_none() 773 | test_none() 774 | test_unicode() 775 | test_string() 776 | test_masked_dict() 777 | test_list() 778 | test_set() 779 | test_numpy() 780 | test_dict() 781 | test_compression() 782 | test_masked() 783 | test_dict_nested() 784 | test_comp_kwargs() 785 | test_list_numpy() 786 | test_tuple_numpy() 787 | test_track_times() 788 | test_list_order() 789 | test_embedded_array() 790 | test_np_float() 791 | 792 | # NEW TESTS 793 | test_legacy_hickles() 794 | test_is_iterable() 795 | test_check_iterable_item_type() 796 | test_dump_nested() 797 | test_load() 798 | test_sort_keys() 799 | test_ndarray() 800 | test_ndarray_masked() 801 | test_simple_dict() 802 | test_complex_dict() 803 | test_unicode() 804 | test_multi_hickle() 805 | 806 | #FAILING TESTS: 807 | #test_nomatch() 808 | #test_dict_int_key() 809 | #test_list_long_type() 810 | 811 | # Cleanup 812 | run_file_cleanup() 813 | print "ALL TESTS PASSED!" -------------------------------------------------------------------------------- /hickle.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | # hickle.py 4 | 5 | Created by Danny Price 2016-02-03. 6 | 7 | Hickle is a HDF5 based clone of Pickle. Instead of serializing to a pickle 8 | file, Hickle dumps to a HDF5 file. It is designed to be as similar to pickle in 9 | usage as possible, providing a load() and dump() function. 10 | 11 | ## Notes 12 | 13 | Hickle has two main advantages over Pickle: 14 | 1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler 15 | reads the entire pickle thing and loads it into memory. In comparison, HDF5 16 | files are designed for large datasets. Things are only loaded when accessed. 17 | 18 | 2) CROSS PLATFORM SUPPORT. Attempting to unpickle a pickle pickled on Windows 19 | on Linux and vice versa is likely to fail with errors like "Insecure string 20 | pickle". HDF5 files will load fine, as long as both machines have 21 | h5py installed. 
22 | 23 | """ 24 | 25 | import os 26 | import numpy as np 27 | import h5py as h5 28 | import re 29 | 30 | try: 31 | from exceptions import Exception 32 | from types import NoneType 33 | except ImportError: 34 | pass # above imports will fail in python3 35 | 36 | import warnings 37 | __version__ = "2.0.4" 38 | __author__ = "Danny Price" 39 | 40 | 41 | ################## 42 | # Error handling # 43 | ################## 44 | 45 | class FileError(Exception): 46 | """ An exception raised if the file is fishy """ 47 | def __init__(self): 48 | return 49 | 50 | def __str__(self): 51 | return ("Cannot open file. Please pass either a filename " 52 | "string, a file object, or a h5py.File") 53 | 54 | 55 | class ClosedFileError(Exception): 56 | """ An exception raised if the file is fishy """ 57 | def __init__(self): 58 | return 59 | 60 | def __str__(self): 61 | return ("HDF5 file has been closed. Please pass either " 62 | "a filename string, a file object, or an open h5py.File") 63 | 64 | 65 | class NoMatchError(Exception): 66 | """ An exception raised if the object type is not understood (or 67 | supported)""" 68 | def __init__(self): 69 | return 70 | 71 | def __str__(self): 72 | return ("Error: this type of python object cannot be converted into a " 73 | "hickle.") 74 | 75 | 76 | class ToDoError(Exception): 77 | """ An exception raised for non-implemented functionality""" 78 | def __init__(self): 79 | return 80 | 81 | def __str__(self): 82 | return "Error: this functionality hasn't been implemented yet." 83 | 84 | 85 | ###################### 86 | # H5PY file wrappers # 87 | ###################### 88 | 89 | class H5GroupWrapper(h5.Group): 90 | """ Group wrapper that provides a track_times kwarg. 91 | 92 | track_times is a boolean flag that can be set to False, so that two 93 | files created at different times will have identical MD5 hashes. 94 | """ 95 | def create_dataset(self, *args, **kwargs): 96 | kwargs['track_times'] = getattr(self, 'track_times', True) 97 | return super(H5GroupWrapper, self).create_dataset(*args, **kwargs) 98 | 99 | def create_group(self, *args, **kwargs): 100 | group = super(H5GroupWrapper, self).create_group(*args, **kwargs) 101 | group.__class__ = H5GroupWrapper 102 | group.track_times = getattr(self, 'track_times', True) 103 | return group 104 | 105 | 106 | class H5FileWrapper(h5.File): 107 | """ Wrapper for h5py File that provides a track_times kwarg. 108 | 109 | track_times is a boolean flag that can be set to False, so that two 110 | files created at different times will have identical MD5 hashes. 111 | """ 112 | def create_dataset(self, *args, **kwargs): 113 | kwargs['track_times'] = getattr(self, 'track_times', True) 114 | return super(H5FileWrapper, self).create_dataset(*args, **kwargs) 115 | 116 | def create_group(self, *args, **kwargs): 117 | group = super(H5FileWrapper, self).create_group(*args, **kwargs) 118 | group.__class__ = H5GroupWrapper 119 | group.track_times = getattr(self, 'track_times', True) 120 | return group 121 | 122 | 123 | def file_opener(f, mode='r', track_times=True): 124 | """ A file opener helper function with some error handling. This can open 125 | files through a file object, a h5py file, or just the filename. 126 | 127 | Args: 128 | f (file, h5py.File, or string): File-identifier, e.g. filename or file object. 129 | mode (str): File open mode. Only required if opening by filename string. 130 | track_times (bool): Track time in HDF5; turn off if you want hickling at 131 | different times to produce identical files (e.g. for MD5 hash check). 
132 | 133 | """ 134 | # Were we handed a file object or just a file name string? 135 | if isinstance(f, file): 136 | filename, mode = f.name, f.mode 137 | f.close() 138 | h5f = h5.File(filename, mode) 139 | elif isinstance(f, str) or isinstance(f, unicode): 140 | filename = f 141 | h5f = h5.File(filename, mode) 142 | elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File): 143 | try: 144 | filename = f.filename 145 | except ValueError: 146 | raise ClosedFileError() 147 | h5f = f 148 | else: 149 | print(type(f)) 150 | raise FileError 151 | 152 | h5f.__class__ = H5FileWrapper 153 | h5f.track_times = track_times 154 | return h5f 155 | 156 | 157 | ########### 158 | # DUMPERS # 159 | ########### 160 | 161 | def check_is_iterable(py_obj): 162 | """ Check whether a python object is iterable. 163 | 164 | Note: this treats unicode and string as NON ITERABLE 165 | 166 | Args: 167 | py_obj: python object to test 168 | 169 | Returns: 170 | iter_ok (bool): True if item is iterable, False is item is not 171 | """ 172 | if type(py_obj) in (str, unicode): 173 | return False 174 | try: 175 | iter(py_obj) 176 | return True 177 | except TypeError: 178 | return False 179 | 180 | 181 | def check_iterable_item_type(iter_obj): 182 | """ Check if all items within an iterable are the same type. 183 | 184 | Args: 185 | iter_obj: iterable object 186 | 187 | Returns: 188 | iter_type: type of item contained within the iterable. If 189 | the iterable has many types, a boolean False is returned instead. 190 | 191 | References: 192 | http://stackoverflow.com/questions/13252333/python-check-if-all-elements-of-a-list-are-the-same-type 193 | """ 194 | iseq = iter(iter_obj) 195 | first_type = type(next(iseq)) 196 | return first_type if all((type(x) is first_type) for x in iseq) else False 197 | 198 | 199 | def check_is_numpy_array(py_obj): 200 | """ Check if a python object is a numpy array (masked or regular) 201 | 202 | Args: 203 | py_obj: python object to check whether it is a numpy array 204 | 205 | Returns 206 | is_numpy (bool): Returns True if it is a numpy array, else False if it isn't 207 | """ 208 | 209 | is_numpy = type(py_obj) in (type(np.array([1])), type(np.ma.array([1]))) 210 | 211 | return is_numpy 212 | 213 | 214 | def _dump(py_obj, h_group, call_id=0, **kwargs): 215 | """ Dump a python object to a group within a HDF5 file. 216 | 217 | This function is called recursively by the main dump() function. 218 | 219 | Args: 220 | py_obj: python object to dump. 221 | h_group (h5.File.group): group to dump data into. 222 | call_id (int): index to identify object's relative location in the iterable. 223 | """ 224 | 225 | dumpable_dtypes = set([bool, int, float, long, complex, str, unicode]) 226 | 227 | # Firstly, check if item is a numpy array. If so, just dump it. 228 | if check_is_numpy_array(py_obj): 229 | create_hkl_dataset(py_obj, h_group, call_id, **kwargs) 230 | 231 | # next, check if item is iterable 232 | elif check_is_iterable(py_obj): 233 | item_type = check_iterable_item_type(py_obj) 234 | 235 | # item_type == False implies multiple types. Create a dataset 236 | if item_type is False: 237 | h_subgroup = create_hkl_group(py_obj, h_group, call_id) 238 | for ii, py_subobj in enumerate(py_obj): 239 | _dump(py_subobj, h_subgroup, call_id=ii, **kwargs) 240 | 241 | # otherwise, subitems have same type. Check if subtype is an iterable 242 | # (e.g. list of lists), or not (e.g. list of ints, which should be treated 243 | # as a single dataset). 
244 |         else:
245 |             if item_type in dumpable_dtypes:
246 |                 create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
247 |             else:
248 |                 h_subgroup = create_hkl_group(py_obj, h_group, call_id)
249 |                 for ii, py_subobj in enumerate(py_obj):
250 |                     #print py_subobj, h_subgroup, ii
251 |                     _dump(py_subobj, h_subgroup, call_id=ii, **kwargs)
252 | 
253 |     # item is not iterable, so create a dataset for it
254 |     else:
255 |         create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
256 | 
257 | 
258 | def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs):
259 |     """ Write a pickled representation of obj to the open file object file.
260 | 
261 |     Args:
262 |         py_obj (object): python object to store in a Hickle
263 |         file_obj: file object, filename string, or h5py.File object
264 |             in which to store the object. A h5py.File or a filename is also
265 |             acceptable.
266 |         mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append).
267 |             Ignored if file is a file object.
268 |         compression (str): optional argument. Applies compression to dataset. Options: None, gzip,
269 |             lzf (+ szip, if installed)
270 |         track_times (bool): optional argument. If set to False, repeated hickling will produce
271 |             identical files.
272 |         path (str): path within hdf5 file to save data to. Defaults to root /
273 |     """
274 | 
275 |     try:
276 |         # Open the file
277 |         h5f = file_opener(file_obj, mode, track_times)
278 |         h5f.attrs["CLASS"] = 'hickle'
279 |         h5f.attrs["VERSION"] = 2
280 |         h5f.attrs["type"] = ['hickle']
281 | 
282 |         h_root_group = h5f.get(path)
283 | 
284 |         if h_root_group is None:
285 |             h_root_group = h5f.create_group(path)
286 |             h_root_group.attrs["type"] = ['hickle']
287 | 
288 |         _dump(py_obj, h_root_group, **kwargs)
289 |         h5f.close()
290 |     except NoMatchError:
291 |         fname = h5f.filename
292 |         h5f.close()
293 |         try:
294 |             os.remove(fname)
295 |         except OSError:
296 |             warnings.warn("Dump failed. Could not remove %s" % fname)
297 |         finally:
298 |             raise NoMatchError
299 | 
300 | 
301 | def create_dataset_lookup(py_obj):
302 |     """ What type of object are we trying to pickle? This is a python
303 |     dictionary based equivalent of a case statement. It returns the correct
304 |     helper function for a given data type.
305 | 306 | Args: 307 | py_obj: python object to look-up what function to use to dump to disk 308 | 309 | Returns: 310 | match: function that should be used to dump data to a new dataset 311 | """ 312 | t = type(py_obj) 313 | 314 | types = { 315 | dict: create_dict_dataset, 316 | list: create_listlike_dataset, 317 | tuple: create_listlike_dataset, 318 | set: create_listlike_dataset, 319 | str: create_stringlike_dataset, 320 | unicode: create_stringlike_dataset, 321 | int: create_python_dtype_dataset, 322 | float: create_python_dtype_dataset, 323 | long: create_python_dtype_dataset, 324 | bool: create_python_dtype_dataset, 325 | complex: create_python_dtype_dataset, 326 | NoneType: create_none_dataset, 327 | np.ndarray: create_np_array_dataset, 328 | np.ma.core.MaskedArray: create_np_array_dataset, 329 | np.float16: create_np_dtype_dataset, 330 | np.float32: create_np_dtype_dataset, 331 | np.float64: create_np_dtype_dataset, 332 | np.int8: create_np_dtype_dataset, 333 | np.int16: create_np_dtype_dataset, 334 | np.int32: create_np_dtype_dataset, 335 | np.int64: create_np_dtype_dataset, 336 | np.uint8: create_np_dtype_dataset, 337 | np.uint16: create_np_dtype_dataset, 338 | np.uint32: create_np_dtype_dataset, 339 | np.uint64: create_np_dtype_dataset, 340 | np.complex64: create_np_dtype_dataset, 341 | np.complex128: create_np_dtype_dataset 342 | } 343 | 344 | match = types.get(t, no_match) 345 | return match 346 | 347 | 348 | def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs): 349 | """ Create a dataset within the hickle HDF5 file 350 | 351 | Args: 352 | py_obj: python object to dump. 353 | h_group (h5.File.group): group to dump data into. 354 | call_id (int): index to identify object's relative location in the iterable. 355 | 356 | """ 357 | #lookup dataset creator type based on python object type 358 | create_dataset = create_dataset_lookup(py_obj) 359 | 360 | # do the creation 361 | create_dataset(py_obj, h_group, call_id, **kwargs) 362 | 363 | 364 | def create_hkl_group(py_obj, h_group, call_id=0): 365 | """ Create a new group within the hickle file 366 | 367 | Args: 368 | h_group (h5.File.group): group to dump data into. 369 | call_id (int): index to identify object's relative location in the iterable. 370 | 371 | """ 372 | h_subgroup = h_group.create_group('data_%i' % call_id) 373 | h_subgroup.attrs["type"] = [str(type(py_obj))] 374 | return h_subgroup 375 | 376 | 377 | def create_listlike_dataset(py_obj, h_group, call_id=0, **kwargs): 378 | """ Dumper for list, set, tuple 379 | 380 | Args: 381 | py_obj: python object to dump; should be list-like 382 | h_group (h5.File.group): group to dump data into. 383 | call_id (int): index to identify object's relative location in the iterable. 384 | """ 385 | dtype = str(type(py_obj)) 386 | obj = list(py_obj) 387 | d = h_group.create_dataset('data_%i' % call_id, data=obj, **kwargs) 388 | d.attrs["type"] = [dtype] 389 | 390 | 391 | def create_np_dtype_dataset(py_obj, h_group, call_id=0, **kwargs): 392 | """ dumps an np dtype object to h5py file 393 | 394 | Args: 395 | py_obj: python object to dump; should be a numpy scalar, e.g. np.float16(1) 396 | h_group (h5.File.group): group to dump data into. 397 | call_id (int): index to identify object's relative location in the iterable. 
def create_np_dtype_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Dumps a numpy scalar to a h5py file

    Args:
        py_obj: python object to dump; should be a numpy scalar, e.g. np.float16(1)
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs)
    d.attrs["type"] = ['np_dtype']
    d.attrs["np_dtype"] = str(d.dtype)


def create_python_dtype_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Dumps a python scalar to a h5py file

    Args:
        py_obj: python object to dump; should be a python type (int, float, bool etc)
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    d = h_group.create_dataset('data_%i' % call_id, data=py_obj,
                               dtype=type(py_obj), **kwargs)
    d.attrs["type"] = ['python_dtype']
    d.attrs['python_subdtype'] = str(type(py_obj))


def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Creates a data group for each key in a dictionary

    Args:
        py_obj: python object to dump; should be a dictionary
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    h_dictgroup = h_group.create_group('data_%i' % call_id)
    h_dictgroup.attrs["type"] = ['dict']
    for key, py_subobj in py_obj.items():
        h_subgroup = h_dictgroup.create_group(key)
        h_subgroup.attrs["type"] = ['dict_item']
        _dump(py_subobj, h_subgroup, call_id=0, **kwargs)
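# Layout sketch for dicts (illustrative; 'out.hkl' is a hypothetical file):
#
#   hkl.dump({'x': 1, 'y': [1, 2]}, 'out.hkl')
#
#   # resulting HDF5 tree:
#   #   /data_0            group,   type=['dict']
#   #   /data_0/x          group,   type=['dict_item']
#   #   /data_0/x/data_0   dataset, type=['python_dtype']
#   #   /data_0/y          group,   type=['dict_item']
#   #   /data_0/y/data_0   dataset, type=["<type 'list'>"]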
def create_np_array_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Dumps an ndarray object to a h5py file

    Args:
        py_obj: python object to dump; should be a numpy array or np.ma.array (masked)
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    if isinstance(py_obj, type(np.ma.array([1]))):
        # masked arrays are stored as a data / mask dataset pair
        d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs)
        m = h_group.create_dataset('data_%i_mask' % call_id, data=py_obj.mask, **kwargs)
        d.attrs["type"] = ['ndarray_masked_data']
        m.attrs["type"] = ['ndarray_masked_mask']
    else:
        d = h_group.create_dataset('data_%i' % call_id, data=py_obj, **kwargs)
        d.attrs["type"] = ['ndarray']


def create_stringlike_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Dumps a string-like object to a h5py file

    Args:
        py_obj: python object to dump; should be string-like (unicode or string)
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    if isinstance(py_obj, str):
        d = h_group.create_dataset('data_%i' % call_id, data=[py_obj], **kwargs)
        d.attrs["type"] = ['string']
    else:
        dt = h5.special_dtype(vlen=unicode)
        dset = h_group.create_dataset('data_%i' % call_id, shape=(1, ), dtype=dt, **kwargs)
        dset[0] = py_obj
        dset.attrs['type'] = ['unicode']


def create_none_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Dump None type to file

    Args:
        py_obj: python object to dump; must be None object
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    d = h_group.create_dataset('data_%i' % call_id, data=[0], **kwargs)
    d.attrs["type"] = ['none']


def no_match(py_obj, h_group, call_id=0, **kwargs):
    """ Fallback dumper: if no match is made, pickle the object and warn

    Args:
        py_obj: python object to dump; default if item is not matched.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    import cPickle

    pickled_obj = cPickle.dumps(py_obj)
    d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj])
    d.attrs["type"] = ['pickle']

    warnings.warn("%s type not understood, data have been "
                  "serialized" % type(py_obj))


#############
## LOADERS ##
#############

class PyContainer(list):
    """ A group-like object into which to load datasets.

    In order to build up a tree-like structure, we need to be able
    to load datasets into a container with an append() method.
    Python tuples and sets do not allow this. This class provides
    a list-like object that can be converted into a list, tuple, set or dict.
    """
    def __init__(self):
        super(PyContainer, self).__init__()
        self.container_type = None
        self.name = None

    def convert(self):
        """ Convert from PyContainer to python core data type.

        Returns: self, either as a list, tuple, set or dict
        """
        if self.container_type == "<type 'list'>":
            return list(self)
        if self.container_type == "<type 'tuple'>":
            return tuple(self)
        if self.container_type == "<type 'set'>":
            return set(self)
        if self.container_type == "dict":
            keys = [str(item.name.split('/')[-1]) for item in self]
            items = [item[0] for item in self]
            return dict(zip(keys, items))
        else:
            return self
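# Conversion sketch (illustrative; the type strings are the Python 2 repr
# values written by create_hkl_group / create_listlike_dataset above):
#
#   pc = PyContainer()
#   pc.container_type = "<type 'tuple'>"
#   pc.extend([1, 2, 3])
#   pc.convert()   # -> (1, 2, 3)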
def load(fileobj, path='/', safe=True):
    """ Load a hickle file and reconstruct a python object

    Args:
        fileobj: file object, h5py.File, or filename string
        safe (bool): Disable automatic depickling of arbitrary python objects.
            DO NOT set this to False unless the file is from a trusted source.
            (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
        path (str): path within hdf5 file to load data from. Defaults to root /.
    """

    try:
        h5f = file_opener(fileobj)

        h_root_group = h5f.get(path)
        try:
            assert 'CLASS' in h5f.attrs.keys()
            assert 'VERSION' in h5f.attrs.keys()
            py_container = PyContainer()
            py_container.container_type = 'hickle'
            py_container = _load(py_container, h_root_group)
            return py_container[0][0]
        except AssertionError:
            # no CLASS / VERSION attributes: fall back to the legacy loader
            import hickle_legacy
            return hickle_legacy.load(fileobj, safe)
    finally:
        if 'h5f' in locals():
            h5f.close()


def load_dataset(h_node):
    """ Load a dataset, converting into its correct python type

    Args:
        h_node (h5py dataset): h5py dataset object to read

    Returns:
        data: reconstructed python object from loaded data
    """
    py_type = h_node.attrs["type"][0]

    if h_node.shape == ():
        data = h_node.value
    else:
        data = h_node[:]

    if py_type == "<type 'list'>":
        return list(data)
    elif py_type == "<type 'tuple'>":
        return tuple(data)
    elif py_type == "<type 'set'>":
        return set(data)
    elif py_type == "np_dtype":
        subtype = h_node.attrs["np_dtype"]
        data = np.array(data, dtype=subtype)
        return data
    elif py_type == 'ndarray':
        return np.array(data)
    elif py_type == 'ndarray_masked_data':
        try:
            mask_path = h_node.name + "_mask"
            h_root = h_node.parent
            mask = h_root.get(mask_path)[:]
        except (IndexError, ValueError):
            mask = h_root.get(mask_path)
        data = np.ma.array(data, mask=mask)
        return data
    elif py_type == 'python_dtype':
        subtype = h_node.attrs["python_subdtype"]
        type_dict = {
            "<type 'int'>": int,
            "<type 'float'>": float,
            "<type 'long'>": long,
            "<type 'bool'>": bool,
            "<type 'complex'>": complex
        }
        tcast = type_dict.get(subtype)
        return tcast(data)
    elif py_type == 'string':
        return str(data[0])
    elif py_type == 'unicode':
        return unicode(data[0])
    elif py_type == 'none':
        return None
    else:
        # unknown type: report it and return the raw data
        print(h_node.name, py_type, h_node.attrs.keys())
        return data


def sort_keys(key_list):
    """ Take a list of strings and sort it by integer value within string

    Args:
        key_list (list): List of keys

    Returns:
        key_list_sorted (list): List of keys, sorted by integer
    """
    to_int = lambda x: int(re.search(r'\d+', x).group(0))
    keys_by_int = sorted([(to_int(key), key) for key in key_list])
    return [ii[1] for ii in keys_by_int]
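# Example (illustrative): keys are ordered by their embedded integer rather
# than lexicographically, so items are reloaded in the order they were dumped:
#
#   sort_keys(['data_0', 'data_10', 'data_2'])
#   # -> ['data_0', 'data_2', 'data_10']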
def _load(py_container, h_group):
    """ Load a hickle file

    Recursive function to load hdf5 data into a PyContainer()

    Args:
        py_container (PyContainer): Python container to load data into
        h_group (h5 group or dataset): h5py object, group or dataset, to spider
            and load all datasets.
    """

    group_dtype = h5._hl.group.Group
    dataset_dtype = h5._hl.dataset.Dataset

    # either a file, group, or dataset
    if isinstance(h_group, H5FileWrapper) or isinstance(h_group, group_dtype):
        py_subcontainer = PyContainer()
        py_subcontainer.container_type = h_group.attrs['type'][0]
        py_subcontainer.name = h_group.name

        if py_subcontainer.container_type != 'dict':
            h_keys = sort_keys(h_group.keys())
        else:
            h_keys = h_group.keys()

        for h_name in h_keys:
            h_node = h_group[h_name]
            py_subcontainer = _load(py_subcontainer, h_node)

        sub_data = py_subcontainer.convert()
        py_container.append(sub_data)

    else:
        # must be a dataset
        subdata = load_dataset(h_group)
        py_container.append(subdata)

    return py_container
--------------------------------------------------------------------------------