"""Convert Matlab v7.3 '.mat' files (HDF5 format) into Python objects.

This module transforms Matlab7.3 HDF5 '.mat' files into a Python
dictionary of arrays and strings (and some leftover).

Copyright 2012, Emanuele Olivetti

BSD License, 3 clauses.

Repository layout: README, mat73_to_pickle.py

README (convert_matlab73_hdf5)
==============================

Convert Matlab v7.3 '.mat' files (i.e. HDF5 file format) into Python's
pickle/numpy format. This recent Matlab file format is unsupported by
SciPy's scipy.io.loadmat function. See notes here:
http://docs.scipy.org/doc/scipy/reference/generated/scipy.io.loadmat.html

This script opens the Matlab file in HDF5 format, recursively navigates
the hierarchical structures and follows the pointers down to the final
data, usually in the form of numbers, arrays and strings. Once a leaf is
reached it heuristically converts the data to Python data types and
puts them in a (Python) dictionary following the HDF5 semantic
structure.

This code works well for MEG (magnetoencephalography) files saved from
recent Matlab versions. You can find examples of such files here:
ftp://ftp.fcdonders.nl/pub/biomag2012/

USAGE

    python mat73_to_pickle.py <filename.mat>

The converted data is written next to the input file, with the
extension replaced by '.pickle'.
"""

import numpy as np

try:
    import h5py
except ImportError:
    # h5py is only needed to open a file (see main()); the converters
    # below are duck-typed and only rely on 'keys', 'shape' and 'dtype'.
    h5py = None

# Maps a dtype name to the set of HDF5 paths where it was encountered.
dtypes = {}

# Name of the HDF5 group where Matlab stores the targets of references.
_REFS_GROUP = u'#refs#'

# What str() returns for an HDF5 object reference.
# NOTE(review): the comparison literal was empty in the received source,
# presumably stripped as markup during extraction; restored to h5py's
# repr of an object reference - confirm against the upstream file.
_HDF5_REFERENCE_REPR = '<HDF5 object reference>'

# Arrays above this size (in bytes) are downcast or rejected.
_MAX_ARRAY_BYTES = 2e9


def string(seq):
    """Convert a sequence of integers into a single string.

    Each item may be a plain integer or a one-element array (which is
    what iterating over an Nx1 dataset yields).

    Raises ValueError when an item holds more than one value or when a
    code is outside range(256). The latter reproduces the limit of
    Python 2's chr(), which callers use as the "this is not a string"
    signal.
    """
    chars = []
    for item in seq:
        # .item() raises ValueError for arrays with more than one element.
        code = np.asarray(item).item()
        if not 0 <= code < 256:
            raise ValueError("character code not in range(256): %r" % (code,))
        chars.append(chr(code))
    return ''.join(chars)


def add_dtype_name(f, name):
    """Keep track of all dtypes and names in the HDF5 file using it.

    Records 'name' in the module-level 'dtypes' dictionary under the
    key f.dtype.name.
    """
    dtypes.setdefault(f.dtype.name, set()).add(name)


def _is_reference(item):
    """Tell whether 'item' is an HDF5 object reference (by its str())."""
    return str(item) == _HDF5_REFERENCE_REPR


def _load_array(f, name):
    """Load the numeric dataset 'f' as a squeezed numpy array.

    If the data would need more than _MAX_ARRAY_BYTES, float64 data is
    recast to float32 and anything else raises MemoryError.
    """
    dtype = f.dtype
    # int64 accumulator: the element count may not fit a 32 bit integer.
    nbytes = int(np.prod(f.shape, dtype=np.int64)) * dtype.itemsize
    if nbytes > _MAX_ARRAY_BYTES:
        print("WARNING: The array %s requires > 2Gb" % (name,))
        if dtype.char != 'd':
            raise MemoryError("array %s is too large to be loaded" % (name,))
        print("\t Recasting %s to float32" % (dtype,))
        dtype = np.float32
    return np.array(f, dtype=dtype).squeeze()


def _load_text(f, name):
    """Decode the uint16 dataset 'f' as a string, else return an array.

    Matlab7.3 stores strings as arrays of uint16, so a uint16 dataset
    is first tried as a string; if string() rejects it, the squeezed
    numpy array is returned instead.
    """
    try:
        return string(f)
    except (TypeError, ValueError):  # it wasn't a string...
        print("WARNING: %s :" % (name,))
        print("\t %s" % (f,))
        print("\t CONVERSION TO STRING FAILED, USING ARRAY!")
        values = np.array(f).squeeze()
        print("\t %s" % (values,))
        return values


def _collect_cells(f, follow):
    """Return the converted cells of the 2D object dataset 'f' as a list.

    References are resolved through the callable 'follow'; any other
    cell is turned into a squeezed array. Cells are visited row by row.
    """
    container = []
    # we assume all matlab arrays are 2D arrays...
    for i in range(f.shape[0]):
        for item in f[i]:  # read each row once
            if _is_reference(item):
                container.append(follow(item))
            else:
                container.append(np.array(item).squeeze())
    return container


def _pack(container, name):
    """Turn a list of converted cells into a squeezed numpy array.

    When the cells cannot be combined into a regular array, they are
    stored one per element in a 1D object array.
    """
    try:
        return np.array(container).squeeze()
    except ValueError:
        print("WARNING: %s :" % (name,))
        print("\t %s" % (container,))
        print("\t CANNOT CONVERT INTO NON-OBJECT ARRAY")
        # Filling element by element never tries to broadcast the cells.
        packed = np.empty(len(container), dtype=object)
        for index, item in enumerate(container):
            packed[index] = item
        return packed.squeeze()


def recursive_dict(f, root=None, name='root'):
    """This function recursively navigates the HDF5 structure from
    node 'f' and tries to unpack the data structure by guessing their
    content from dtype, shape etc.. It returns a dictionary of
    strings, arrays and some leftovers. 'root' is the root node of the
    HDF5 structure, i.e. what h5py.File() returns. 'name' is the path
    of 'f', used in warnings and in the 'dtypes' bookkeeping.

    Note that this function works well on the Matlab7.3 datasets on
    which it was tested, but in general it might be wrong and it might
    crash. The motivation is that it has to guess the content of
    substructures so it might fail. One source of headache seems to be
    Matlab7.3 format that represents strings as array of 'uint16' so
    not using the string datatype. For this reason it is not possible
    to discriminate strings from arrays of integers without using
    heuristics.

    Raises NotImplementedError if 'f' has neither 'keys' nor 'shape',
    and MemoryError for arrays that are too large to be loaded.
    """
    if root is None:
        root = f
    if hasattr(f, 'keys'):  # a group: convert every member but '#refs#'
        return dict((k, recursive_dict(f[k], root, name=name + '->' + k))
                    for k in f.keys() if k != _REFS_GROUP)
    if not hasattr(f, 'shape'):
        raise NotImplementedError("cannot convert %s: %r" % (name, f))

    add_dtype_name(f, name)
    kind = f.dtype.name
    if kind == 'uint16':  # this may be a string for Matlab
        return _load_text(f, name)
    if kind == 'object':  # 2D array of HDF5 object references or objects
        def follow(ref):
            return recursive_dict(root[ref], root, name=name)
        return _pack(_collect_cells(f, follow), name)
    return _load_array(f, name)  # this is a numpy array


class Node(object):
    """This class creates nested objects that represent the HDF5
    structure of the Matlab v7.3 '.mat' file so that, for example, the
    structure can be easily navigated through TAB-completion in
    ipython.

    The structure is parsed only when the object is built without
    'name' and without 'root', i.e. as Node(f); otherwise an empty
    node with the given name is created.

    Note that 'f' and 'root' are not saved in the object as member
    attributes. This is done on purpose because I experienced some
    difficulties when pickling the Node object containing 'f' and
    'root', i.e. HDF5 objects. Moreover the final object is cleaner
    and contains the minimum necessary things.

    TODO:
    - add nice __repr__()
    - add reference to parent object in order to be able to
      reconstruct the position of a Node in the HDF5 hierarchy, which
      is useful for debugging and catching issues in conversions.
    """
    def __init__(self, f=None, name=None, root=None):
        recursive = name is None and root is None
        if name is None:
            name = 'root'
        if root is None:
            root = f
        self.__name = name
        if recursive:
            print("Recursively parsing %s" % (f,))
            self.__recursive(f, root)

    def __recursive(self, f, root):
        """Convert 'f'; groups populate self and return None, datasets
        return the converted value."""
        if hasattr(f, 'keys'):
            for k in f.keys():
                if k == _REFS_GROUP:
                    continue  # skip reference store
                child = Node(name=k)
                value = child.__recursive(f[k], root)
                # A group yields None: keep the populated child node.
                setattr(self, k, child if value is None else value)
            return None
        if not hasattr(f, 'shape'):
            raise NotImplementedError(
                "cannot convert %s: %r" % (self.__name, f))

        kind = f.dtype.name
        if kind == 'uint16':  # this may be a string for Matlab
            return _load_text(f, self.__name)
        if kind == 'object':  # 2D array of HDF5 object references or objects
            def follow(ref):
                child = Node(name=str(ref))
                value = child.__recursive(root[ref], root)
                return child if value is None else value
            return _pack(_collect_cells(f, follow), self.__name)
        return _load_array(f, self.__name)  # this is a numpy array


def main(argv=None):
    """Convert the '.mat' file named by the last argument into a pickle.

    'argv' defaults to sys.argv. The output file is the input filename
    with its extension replaced by '.pickle'.
    """
    import os
    import sys
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit("usage: python mat73_to_pickle.py <filename.mat>")
    if h5py is None:
        raise ImportError("h5py is required to read Matlab v7.3 files")

    filename = argv[-1]
    print("Loading %s" % (filename,))
    with h5py.File(filename, mode='r') as f:
        data = recursive_dict(f)
        # alternatively:
        # data = Node(f)

    filename = os.path.splitext(filename)[0] + ".pickle"
    print("Saving %s" % (filename,))
    # Pickle streams are binary: the file must be opened in 'wb' mode.
    with open(filename, 'wb') as out:
        pickle.dump(data, out, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    main()