├── __init__.py ├── mozi ├── __init__.py ├── layers │ ├── __init__.py │ ├── embedding.py │ ├── preprocessor.py │ ├── template.py │ ├── misc.py │ ├── vae.py │ ├── linear.py │ ├── noise.py │ ├── alexnet.py │ ├── activation.py │ ├── normalization.py │ ├── convolution.py │ └── recurrent.py ├── utils │ ├── __init__.py │ ├── cnn_utils.py │ ├── theano_utils.py │ ├── check_memory.py │ ├── train_object_utils.py │ ├── progbar.py │ ├── mnist_utils.py │ ├── image.py │ └── utils.py ├── datasets │ ├── __init__.py │ ├── cifar100.py │ ├── cifar10.py │ ├── mnist.py │ ├── dataset_noise.py │ ├── imdb.py │ ├── voc.py │ ├── iterator.py │ ├── dataset.py │ └── preprocessor.py ├── env.py ├── cost.py ├── weight_init.py ├── model.py ├── learning_method.py ├── log.py └── train_object.py ├── .gitignore ├── setup.cfg ├── MANIFEST.in ├── setup.py ├── LICENSE ├── example ├── voc_alexnet.py ├── mnist_vae.py ├── mnist_dae.py ├── mnist_mlp.py ├── cifar10_cnn.py ├── datablocks_example.py └── imdb_bilstm.py ├── doc ├── vae.md ├── dae.md └── cnn.md └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mozi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *pyc 2 | -------------------------------------------------------------------------------- /mozi/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mozi/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mozi/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENCE 2 | recursive-include mozi *.py 3 | recursive-include doc *.md 4 | -------------------------------------------------------------------------------- /mozi/utils/cnn_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def valid(x, y, kernel, stride): 4 | return ((x-kernel)/stride + 1, (y-kernel)/stride + 1) 5 | 6 | 7 | def full(x, y, kernel, stride): 8 | return ((x+kernel)/stride - 1, (y+kernel)/stride - 1) 9 | 10 | 11 | def spp_outdim(input_channels, levels): 12 | outdim = 0 13 | for lvl in levels: 14 | outdim += lvl**2 * input_channels 15 | return outdim 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | 5 | setup( 6 | name='mozi', 7 | version='2.0.3', 8 | author=u'Wu Zhen Zhou', 9 | author_email='hyciswu@gmail.com', 10 | install_requires=['numpy>=1.7.1', 
'scipy>=0.11', 11 | 'six>=1.9.0', 'scikit-learn>=0.17', 'pandas>=0.17', 12 | 'matplotlib>=1.5', 'Theano>=0.8'], 13 | url='https://github.com/hycis/Mozi', 14 | license='The MIT License (MIT), see LICENSE', 15 | description='Deep learning package based on Theano for building all kinds of models', 16 | long_description=open('README.md').read(), 17 | packages=find_packages(), 18 | zip_safe=False, 19 | include_package_data=True 20 | ) 21 | -------------------------------------------------------------------------------- /mozi/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def setenv(): 4 | NNdir = os.path.dirname(os.path.realpath(__file__)) 5 | NNdir = os.path.dirname(NNdir) 6 | 7 | # directory to save all the datasets 8 | if not os.getenv('MOZI_DATA_PATH'): 9 | os.environ['MOZI_DATA_PATH'] = NNdir + '/data' 10 | 11 | # directory for saving the database that is used for logging the results 12 | if not os.getenv('MOZI_DATABASE_PATH'): 13 | os.environ['MOZI_DATABASE_PATH'] = NNdir + '/database' 14 | 15 | # directory to save all the trained models and outputs 16 | if not os.getenv('MOZI_SAVE_PATH'): 17 | os.environ['MOZI_SAVE_PATH'] = NNdir + '/save' 18 | 19 | print('MOZI_DATA_PATH = ' + os.environ['MOZI_DATA_PATH']) 20 | print('MOZI_SAVE_PATH = ' + os.environ['MOZI_SAVE_PATH']) 21 | print('MOZI_DATABASE_PATH = ' + os.environ['MOZI_DATABASE_PATH']) 22 | -------------------------------------------------------------------------------- /mozi/utils/theano_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | floatX = theano.config.floatX 7 | ''' 8 | from keras 9 | ''' 10 | 11 | def asfloatX(X): 12 | return np.asarray(X, dtype=floatX) 13 | 14 | def sharedX(value, dtype=floatX, name=None, borrow=False, **kwargs): 15 | return theano.shared(np.asarray(value, dtype=dtype), name=name, borrow=borrow, **kwargs) 16 | 17 | def shared_zeros(shape, dtype=floatX, name=None, **kwargs): 18 | return sharedX(np.zeros(shape), dtype=dtype, name=name, **kwargs) 19 | 20 | def shared_scalar(val=0., dtype=floatX, name=None, **kwargs): 21 | return theano.shared(np.cast[dtype](val), name=name, **kwargs) 22 | 23 | def shared_ones(shape, dtype=floatX, name=None, **kwargs): 24 | return sharedX(np.ones(shape), dtype=dtype, name=name, **kwargs) 25 | 26 | def alloc_zeros_matrix(*dims): 27 | return T.alloc(np.cast[floatX](0.), *dims) 28 | -------------------------------------------------------------------------------- /mozi/layers/embedding.py: -------------------------------------------------------------------------------- 1 | 2 | from mozi.layers.template import Template 3 | from mozi.utils.theano_utils import sharedX 4 | from mozi.weight_init import UniformWeight 5 | 6 | class Embedding(Template): 7 | ''' 8 | Turn positive integers (indexes) into dense vectors of fixed size. 9 | e.g.
[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] 10 | 11 | @input_dim: size of vocabulary (highest input integer + 1) 12 | @output_dim: size of dense representation 13 | ''' 14 | def __init__(self, input_dim, output_dim, init=UniformWeight(scale=0.1), weights=None): 15 | 16 | self.input_dim = input_dim 17 | self.output_dim = output_dim 18 | if weights is None: 19 | self.W = init((input_dim, output_dim)) 20 | else: 21 | self.W = sharedX(weights) 22 | self.params = [self.W] 23 | 24 | def _train_fprop(self, state_below): 25 | state_below = state_below.astype('int32') 26 | return self.W[state_below] 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Zhenzhou Wu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /mozi/layers/preprocessor.py: -------------------------------------------------------------------------------- 1 | 2 | import theano.tensor as T 3 | from mozi.layers.template import Template 4 | 5 | class Scale(Template): 6 | 7 | """ 8 | Scale the input into a range 9 | 10 | Parameters 11 | ---------- 12 | X : ndarray, 2-dimensional 13 | numpy matrix with examples indexed on the first axis and 14 | features indexed on the second. 15 | 16 | global_max : real 17 | the maximum value of the whole dataset. If not provided, global_max is set to X.max() 18 | 19 | global_min : real 20 | the minimum value of the whole dataset. 
If not provided, global_min is set to X.min() 21 | 22 | scale_range : size 2 list 23 | set the upper bound and lower bound after scaling 24 | 25 | buffer : float 26 | the buffer on the upper lower bound such that [L+buffer, U-buffer] 27 | """ 28 | 29 | 30 | def __init__(self, global_max, global_min, scale_range=[-1,1], buffer=0.1): 31 | 32 | self.scale_range = scale_range 33 | self.buffer = buffer 34 | self.max = global_max 35 | self.min = global_min 36 | assert scale_range[0] + buffer < scale_range[1] - buffer, \ 37 | 'the lower bound is larger than the upper bound' 38 | self.params = [] 39 | 40 | 41 | def _train_fprop(self, state_below): 42 | width = self.max - self.min 43 | scale = (self.scale_range[1] - self.scale_range[0] - 2 * self.buffer) / width 44 | state_below = scale * (state_below - self.min) 45 | state_below = state_below + self.scale_range[0] + self.buffer 46 | return state_below 47 | -------------------------------------------------------------------------------- /example/voc_alexnet.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | import theano.tensor as T 5 | 6 | from mozi.datasets.voc import VOC 7 | from mozi.model import Sequential 8 | from mozi.layers.alexnet import Alexnet 9 | from mozi.log import Log 10 | from mozi.train_object import TrainObject 11 | from mozi.cost import error, entropy 12 | from mozi.learning_method import SGD 13 | from mozi.env import setenv 14 | 15 | import os 16 | import theano 17 | 18 | 19 | def train(): 20 | 21 | data = VOC(batch_size=32, train_valid_test_ratio=[5,1,1]) 22 | model = Sequential(input_var=T.tensor4(), output_var=T.matrix()) 23 | model.add(Alexnet(input_shape=(3,222,222), output_dim=11)) 24 | # build learning method 25 | learning_method = SGD(learning_rate=0.01, momentum=0.9, 26 | lr_decay_factor=0.9, decay_batch=5000) 27 | # put everything into the train object 28 | train_object = TrainObject(model = model, 29 | log = None, 30 | dataset = data, 31 | train_cost = error, 32 | valid_cost = error, 33 | learning_method = learning_method, 34 | stop_criteria = {'max_epoch' : 10, 35 | 'epoch_look_back' : 5, 36 | 'percent_decrease' : 0.01} 37 | ) 38 | # finally run the code 39 | train_object.setup() 40 | train_object.run() 41 | 42 | if __name__ == '__main__': 43 | setenv() 44 | train() 45 | -------------------------------------------------------------------------------- /doc/vae.md: -------------------------------------------------------------------------------- 1 | Variational Autoencoder 2 | ===== 3 | You can try the Variational Autoencoder [Example](../example/mnist_vae.py) running on Mnist. 
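The `SGVB_bin` cost used below is the per-example variational lower bound for binary (Bernoulli) outputs, as implemented in `mozi/cost.py`:

```
L = sum_j[ y_j*log(y'_j) + (1-y_j)*log(1-y'_j) ]
    + 0.5 * sum_k[ 1 + 2*logsig_k - miu_k^2 - exp(2*logsig_k) ]
```

where the reconstruction `y'` and the encoder outputs `miu` and `logsig` are the three values returned by the `VariationalAutoencoder` layer, and the result is averaged over the batch.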
4 | ```python 5 | import os 6 | 7 | import theano 8 | import theano.tensor as T 9 | import numpy as np 10 | 11 | from mozi.datasets.mnist import Mnist 12 | from mozi.model import Sequential 13 | from mozi.layers.vae import VariationalAutoencoder 14 | from mozi.log import Log 15 | from mozi.train_object import TrainObject 16 | from mozi.cost import SGVB_bin 17 | from mozi.learning_method import SGD 18 | 19 | # build dataset 20 | data = Mnist(batch_size=100, binary=False, train_valid_test_ratio=[5,1,1]) 21 | # for autoencoder, the output will be equal to input 22 | data.set_train(X=data.get_train().X, y=data.get_train().X) 23 | data.set_valid(X=data.get_valid().X, y=data.get_valid().X) 24 | 25 | # build model 26 | model = Sequential(input_var=T.matrix(), output_var=T.matrix()) 27 | model.add(VariationalAutoencoder(input_dim=28*28, bottlenet_dim=200, z_dim=20)) 28 | 29 | # build learning method 30 | learning_method = SGD(learning_rate=0.0001, momentum=0.9, 31 | lr_decay_factor=0.9, decay_batch=10000) 32 | 33 | # put everything into the train object 34 | train_object = TrainObject(model = model, 35 | log = None, 36 | dataset = data, 37 | train_cost = SGVB_bin, 38 | valid_cost = SGVB_bin, 39 | learning_method = learning_method, 40 | stop_criteria = {'max_epoch' : 10, 41 | 'epoch_look_back' : 5, 42 | 'percent_decrease' : 0.01} 43 | ) 44 | # finally run the code 45 | train_object.setup() 46 | train_object.run() 47 | ``` 48 | -------------------------------------------------------------------------------- /mozi/datasets/cifar100.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logger = logging.getLogger(__name__) 3 | import os 4 | import cPickle 5 | import numpy as np 6 | import theano 7 | floatX = theano.config.floatX 8 | 9 | from mozi.utils.utils import get_file, make_one_hot 10 | from mozi.datasets.dataset import SingleBlock 11 | 12 | class Cifar100(SingleBlock): 13 | 14 | def __init__(self, flatten=False, fine_label=True, **kwargs): 15 | ''' 16 | PARAM: 17 | fine_label: True (100 classes) False (20 classes) 18 | ''' 19 | 20 | im_dir = os.environ['MOZI_DATA_PATH'] + '/cifar100/' 21 | path = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz' 22 | im_dir = get_file(fpath="{}/cifar-100-python.tar.gz".format(im_dir), origin=path, untar=True) 23 | 24 | self.img_shape = (3,32,32) 25 | self.img_size = np.prod(self.img_shape) 26 | 27 | fnames = ['train', 'test'] 28 | 29 | X = [] 30 | y = [] 31 | for fname in fnames: 32 | data_path = "{}/{}".format(im_dir, fname) 33 | with open(data_path) as fin: 34 | data_batch = cPickle.load(fin) 35 | if flatten: 36 | X.extend(data_batch['data'].reshape((len(data_batch['data']), self.img_size))) 37 | else: 38 | X.extend(data_batch['data'].reshape((len(data_batch['data']),)+self.img_shape)) 39 | if fine_label: 40 | y.extend(data_batch['fine_labels']) 41 | self.n_classes = 100 42 | else: 43 | y.extend(data_batch['coarse_labels']) 44 | self.n_classes = 20 45 | 46 | X_npy = np.array(X, dtype=floatX) 47 | X_npy /= 255.0 48 | y_npy = make_one_hot(y, onehot_size=self.n_classes) 49 | 50 | super(Cifar100, self).__init__(X=X_npy, y=y_npy, **kwargs) 51 | -------------------------------------------------------------------------------- /mozi/datasets/cifar10.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logger = logging.getLogger(__name__) 3 | import os 4 | import cPickle 5 | import numpy as np 6 | import theano 7 | floatX = theano.config.floatX 8 | 9 | 
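# Cifar10 below downloads the pickled CIFAR-10 batches, rescales the pixels
# to [0, 1], one-hot encodes the labels and shuffles the examples before
# handing everything to SingleBlock.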
from mozi.utils.utils import get_file, make_one_hot 10 | from mozi.datasets.dataset import SingleBlock 11 | 12 | class Cifar10(SingleBlock): 13 | 14 | def __init__(self, flatten=False, **kwargs): 15 | 16 | im_dir = os.environ['MOZI_DATA_PATH'] + '/cifar10/' 17 | path = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' 18 | im_dir = get_file(fpath="{}/cifar-10-python.tar.gz".format(im_dir), origin=path, untar=True) 19 | self.label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 20 | 'dog', 'frog','horse','ship','truck'] 21 | 22 | self.img_shape = (3,32,32) 23 | self.img_size = np.prod(self.img_shape) 24 | self.n_classes = 10 25 | fnames = ['data_batch_%i' % i for i in range(1,6)] + ['test_batch'] 26 | 27 | X = [] 28 | y = [] 29 | for fname in fnames: 30 | data_path = "{}/{}".format(im_dir, fname) 31 | with open(data_path) as fin: 32 | data_batch = cPickle.load(fin) 33 | if flatten: 34 | X.extend(data_batch['data'].reshape((len(data_batch['data']), self.img_size))) 35 | else: 36 | X.extend(data_batch['data'].reshape((len(data_batch['data']),)+self.img_shape)) 37 | y.extend(data_batch['labels']) 38 | 39 | 40 | X_npy = np.array(X, dtype=floatX) 41 | X_npy /= 255.0 42 | y_npy = make_one_hot(y, onehot_size=self.n_classes) 43 | ridx = np.arange(len(X_npy)) 44 | np.random.shuffle(ridx) 45 | X_npy = X_npy[ridx] 46 | y_npy = y_npy[ridx] 47 | 48 | super(Cifar10, self).__init__(X=X_npy, y=y_npy, **kwargs) 49 | -------------------------------------------------------------------------------- /mozi/datasets/mnist.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logger = logging.getLogger(__name__) 3 | import os 4 | import numpy as np 5 | import theano 6 | 7 | from mozi.utils.mnist_utils import read_mnist_images, read_mnist_labels, get_mnist_file 8 | from mozi.datasets.dataset import SingleBlock, DataBlocks 9 | 10 | 11 | class Mnist(SingleBlock): 12 | 13 | def __init__(self, binary=True, **kwargs): 14 | 15 | im_dir = os.environ['MOZI_DATA_PATH'] + '/mnist/' 16 | 17 | url = 'http://yann.lecun.com/exdb/mnist' 18 | 19 | paths = [] 20 | for fname in ['train-images-idx3-ubyte', 'train-labels-idx1-ubyte', 21 | 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte']: 22 | path = get_mnist_file('{}/{}'.format(im_dir,fname), origin='{}/{}.gz'.format(url,fname)) 23 | paths.append(path) 24 | 25 | train_X = read_mnist_images(paths[0], dtype='float32') 26 | train_y = read_mnist_labels(paths[1]) 27 | 28 | test_X = read_mnist_images(paths[2], dtype='float32') 29 | test_y = read_mnist_labels(paths[3]) 30 | 31 | train_X = train_X.reshape(train_X.shape[0], train_X.shape[1] * train_X.shape[2]) 32 | test_X = test_X.reshape(test_X.shape[0], test_X.shape[1] * test_X.shape[2]) 33 | 34 | X = np.concatenate((train_X, test_X), axis=0) 35 | 36 | train_y_tmp = np.zeros((train_X.shape[0], 10), dtype=theano.config.floatX) 37 | test_y_tmp = np.zeros((test_X.shape[0], 10), dtype=theano.config.floatX) 38 | 39 | for i in xrange(train_X.shape[0]): 40 | train_y_tmp[i, train_y[i]] = 1 41 | 42 | for i in xrange(test_X.shape[0]): 43 | test_y_tmp[i, test_y[i]] = 1 44 | 45 | train_y = train_y_tmp 46 | test_y = test_y_tmp 47 | 48 | if binary: 49 | X = (X >= 0.5).astype(int) 50 | y = np.concatenate((train_y, test_y), axis=0) 51 | 52 | super(Mnist, self).__init__(X=X, y=y, **kwargs) 53 | -------------------------------------------------------------------------------- /mozi/utils/check_memory.py: -------------------------------------------------------------------------------- 1 
| import os 2 | _proc_status = '/proc/%d/status' % os.getpid() 3 | 4 | _scale = {'kB': 1024.0, 'mB': 1024.0*1024.0, 5 | 'KB': 1024.0, 'MB': 1024.0*1024.0} 6 | 7 | def _VmB(VmKey): 8 | '''Private. 9 | ''' 10 | global _proc_status, _scale 11 | # read the pseudo file /proc/<pid>/status 12 | try: 13 | t = open(_proc_status) 14 | v = t.read() 15 | t.close() 16 | except IOError: 17 | return 0.0 # non-Linux? 18 | # get VmKey line e.g. 'VmRSS: 9999 kB\n ...' 19 | i = v.index(VmKey) 20 | v = v[i:].split(None, 3) # whitespace 21 | if len(v) < 3: 22 | return 0.0 # invalid format? 23 | # convert Vm value to bytes 24 | return float(v[1]) * _scale[v[2]] 25 | 26 | 27 | def memory(since=0.0): 28 | '''Return virtual memory usage in bytes. 29 | ''' 30 | return _VmB('VmSize:') - since 31 | 32 | def peak_Vm(since=0.0): 33 | '''Return the peak virtual memory usage in bytes. 34 | ''' 35 | return _VmB('VmPeak:') - since 36 | 37 | def resident(since=0.0): 38 | '''Return resident memory usage in bytes. 39 | ''' 40 | return _VmB('VmRSS:') - since 41 | 42 | def peak_resident(since=0.0): 43 | '''Return the peak resident memory usage in bytes. 44 | ''' 45 | return _VmB('VmHWM:') - since 46 | 47 | def stacksize(since=0.0): 48 | '''Return stack size in bytes. 49 | ''' 50 | return _VmB('VmStk:') - since 51 | 52 | 53 | def get_mem_usage(): 54 | denom = 1024 * 1024 * 1024 55 | unit = 'GB' 56 | rstr = 'virtual memory: ' + str(memory()/denom) + ' {}\n'.format(unit) 57 | rstr += 'peak virtual memory: ' + str(peak_Vm()/denom) + ' {}\n'.format(unit) 58 | rstr += 'resident memory: ' + str(resident()/denom) + ' {}\n'.format(unit) 59 | rstr += 'peak resident memory: ' + str(peak_resident()/denom) + ' {}\n'.format(unit) 60 | rstr += 'stacksize: ' + str(stacksize()/denom) + ' {}\n'.format(unit) 61 | 62 | return rstr 63 | -------------------------------------------------------------------------------- /mozi/cost.py: -------------------------------------------------------------------------------- 1 | 2 | import theano.tensor as T 3 | import theano 4 | from mozi.utils.utils import theano_unique 5 | from mozi.utils.theano_utils import asfloatX 6 | 7 | floatX = theano.config.floatX 8 | 9 | if floatX == 'float64': 10 | epsilon = 1.0e-8 11 | else: 12 | epsilon = 1.0e-6 13 | 14 | def accuracy(y, y_pred): 15 | L = T.eq(y_pred.argmax(axis=1), y.argmax(axis=1)) 16 | return T.mean(L) 17 | # L = T.eq(y_pred.argmax(axis=1), y.argmax(axis=1)) 18 | # return T.sum(L) / y.shape[0].astype(floatX) 19 | 20 | def mse(y, y_pred): 21 | return T.mean(T.sqr(y-y_pred)) 22 | 23 | def entropy(y, y_pred): 24 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 25 | L = -(y * T.log(y_pred) + (1-y) * T.log(1-y_pred)) 26 | return T.mean(L) 27 | # L = - T.sum(y * T.log(y_pred) + (1-y) * T.log(1-y_pred), axis=1) 28 | # return T.mean(L) 29 | 30 | def error(y, y_pred): 31 | L = T.neq(y_pred.argmax(axis=1), y.argmax(axis=1)) 32 | return T.mean(L) 33 | 34 | def recall(y, y_pred): 35 | L = T.eq(y_pred.argmax(axis=1), y.argmax(axis=1)) 36 | return T.sum(L) / y.shape[0].astype(floatX) 37 | 38 | def precision(y, y_pred): 39 | L = T.eq(y_pred.argmax(axis=1), y.argmax(axis=1)) 40 | return T.sum(L) / y_pred.shape[0].astype(floatX) 41 | 42 | def f1(y, y_pred): 43 | r = recall(y, y_pred) 44 | p = precision(y, y_pred) 45 | return 2 * p * r / (p + r) 46 | 47 | def hingeloss(y, y_pred): 48 | y_pred = T.clip(y_pred, 0., 1.0) 49 | L = T.maximum(0., 1 - y * y_pred) 50 | return T.mean(L) 51 | 52 | def abs(y, y_pred): 53 | return T.mean(T.abs_(y-y_pred)) 54 | 55 | def SGVB_bin(y, y_pred): 56 | ''' 57 | This cost
function is for variational autoencoder with binary inputs 58 | ''' 59 | ypred, miu_e, logsig_e = y_pred 60 | ypred = T.clip(ypred, epsilon, 1.0 - epsilon) 61 | logpxz = -T.nnet.binary_crossentropy(ypred, y).sum(axis=1) 62 | L = logpxz + 0.5 * (1 + 2*logsig_e - miu_e**2 - T.exp(2*logsig_e)).sum(axis=1) 63 | return L.mean() 64 | -------------------------------------------------------------------------------- /mozi/layers/template.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Template(object): 4 | """ 5 | DESCRIPTION: 6 | The interface to be implemented by any layer. 7 | """ 8 | def __init__(self): 9 | ''' 10 | FIELDS: 11 | self.params: any params from the layer that needs to be updated 12 | by backpropagation can be put inside self.params 13 | self.updates: use for updating any shared variables 14 | ''' 15 | self.params = [] 16 | self.updates = [] 17 | 18 | def _test_fprop(self, state_below): 19 | ''' 20 | DESCRIPTION: 21 | This is called during validating/testing of the model. 22 | PARAM: 23 | state_below: the input to layer 24 | ''' 25 | return self._train_fprop(state_below) 26 | 27 | def _train_fprop(self, state_below): 28 | ''' 29 | DESCRIPTION: 30 | This is called during every training batch whereby the output from the 31 | model will be used to update the parameters during error backpropagation. 32 | PARAM: 33 | state_below: the input to layer 34 | ''' 35 | raise NotImplementedError() 36 | 37 | 38 | def _layer_stats(self, state_below, layer_output): 39 | """ 40 | DESCRIPTION: 41 | Layer stats is used for debugging the layer by allowing user to peek 42 | at the weight values or the layer output or any parameters of interest 43 | during training. By computing the values of parameter of interest, 44 | for example T.max(self.W) and put in the return list, the training will 45 | print the maximum of the weight in the layer after every epoch. 
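For example, a subclass that returns [('max_W', T.max(self.W))] will have max_W printed after every epoch.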
46 | PARAM: 47 | state_below: the input to layer 48 | layer_output: the output from the layer 49 | RETURN: 50 | A list of tuples of [('name_a', var_a), ('name_b', var_b)] whereby var is scalar 51 | """ 52 | return [] 53 | -------------------------------------------------------------------------------- /example/mnist_vae.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | import theano 5 | import theano.tensor as T 6 | import numpy as np 7 | 8 | from mozi.datasets.mnist import Mnist 9 | from mozi.model import Sequential 10 | from mozi.layers.vae import VariationalAutoencoder 11 | from mozi.log import Log 12 | from mozi.train_object import TrainObject 13 | from mozi.cost import SGVB_bin 14 | from mozi.learning_method import * 15 | from mozi.weight_init import * 16 | from mozi.env import setenv 17 | 18 | 19 | def train(): 20 | """ 21 | This examples implements the variational autoencoder from the paper 22 | Auto-Encoding Variational Bayes by Diederik P Kingma, Max Welling, arXiv:1312.6114 23 | """ 24 | 25 | # build dataset 26 | data = Mnist(batch_size=100, binary=False, train_valid_test_ratio=[5,1,1]) 27 | # for autoencoder, the output will be equal to input 28 | data.set_train(X=data.get_train().X, y=data.get_train().X) 29 | data.set_valid(X=data.get_valid().X, y=data.get_valid().X) 30 | 31 | # build model 32 | model = Sequential(input_var=T.matrix(), output_var=T.matrix()) 33 | model.add(VariationalAutoencoder(input_dim=28*28, bottlenet_dim=200, z_dim=20)) 34 | 35 | # build learning method 36 | learning_method = SGD(learning_rate=0.0001, momentum=0.9, 37 | lr_decay_factor=0.9, decay_batch=10000) 38 | 39 | # put everything into the train object 40 | train_object = TrainObject(model = model, 41 | log = None, 42 | dataset = data, 43 | train_cost = SGVB_bin, 44 | valid_cost = SGVB_bin, 45 | learning_method = learning_method, 46 | stop_criteria = {'max_epoch' : 10, 47 | 'epoch_look_back' : 5, 48 | 'percent_decrease' : 0.01} 49 | ) 50 | # finally run the code 51 | train_object.setup() 52 | train_object.run() 53 | 54 | 55 | if __name__ == '__main__': 56 | setenv() 57 | train() 58 | -------------------------------------------------------------------------------- /mozi/layers/misc.py: -------------------------------------------------------------------------------- 1 | 2 | import theano 3 | import theano.tensor as T 4 | from mozi.layers.template import Template 5 | import numpy as np 6 | 7 | class Flatten(Template): 8 | 9 | def _train_fprop(self, state_below): 10 | size = T.prod(state_below.shape) / state_below.shape[0] 11 | nshape = (state_below.shape[0], size) 12 | return T.reshape(state_below, nshape) 13 | 14 | 15 | class Reshape(Template): 16 | 17 | def __init__(self, dims): 18 | self.params = [] 19 | self.dims = dims 20 | 21 | def _train_fprop(self, state_below): 22 | nshape = (state_below.shape[0],) + self.dims 23 | return T.reshape(state_below, nshape) 24 | 25 | 26 | class Transform(Template): 27 | 28 | def __init__(self, dims): 29 | ''' 30 | Reshaping the data such that the first dim alters when the rest of the 31 | dim is altered. If X of shape (a, b, c, d) and input dims of shape (d, e), 32 | then return shape will be (a*b*c*d/(d*e), d, e). Useful for tranforming 33 | data in RNN/LSTM with mlp layers before recurrent layers. 
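For example, X of shape (64, 10, 28) with dims=(28,) is reshaped to (640, 28).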
34 | ''' 35 | self.params = [] 36 | self.dims = dims 37 | 38 | def _train_fprop(self, state_below): 39 | first_dim = T.prod(state_below.shape) / np.prod(self.dims) 40 | return T.reshape(state_below, (first_dim,)+self.dims) 41 | 42 | 43 | class Crop(Template): 44 | def __init__(self, border): 45 | self.border = border 46 | self.params = [] 47 | assert len(self.border) == 2 48 | 49 | def _train_fprop(self, state_below): 50 | w, h = self.border 51 | return state_below[:,:,h:-h,w:-w] 52 | 53 | 54 | class Parallel(Template): 55 | 56 | def __init__(self, *models): 57 | self.models = models 58 | self.params = [] 59 | for model in self.models: 60 | for layer in model.layers: 61 | self.params += layer.params 62 | 63 | def _train_fprop(self, state_below): 64 | rstate = [] 65 | for model, state in zip(self.models, state_below): 66 | out, _ = model.train_fprop(state) 67 | rstate.append(out) 68 | return rstate 69 | -------------------------------------------------------------------------------- /mozi/layers/vae.py: -------------------------------------------------------------------------------- 1 | 2 | import theano 3 | import theano.tensor as T 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams 5 | 6 | from mozi.layers.template import Template 7 | from mozi.weight_init import GaussianWeight 8 | from mozi.utils.theano_utils import shared_zeros 9 | 10 | floatX = theano.config.floatX 11 | theano_rand = MRG_RandomStreams() 12 | 13 | class VariationalAutoencoder(Template): 14 | 15 | def __init__(self, input_dim, bottlenet_dim, z_dim, weight_init=GaussianWeight(mean=0, std=0.01)): 16 | 17 | self.input_dim = input_dim 18 | self.bottlenet_dim = bottlenet_dim 19 | 20 | # encoder 21 | self.W_e = weight_init((input_dim, bottlenet_dim), name='W_e') 22 | self.b_e = shared_zeros(shape=bottlenet_dim, name='b_e') 23 | self.W_miu = weight_init((bottlenet_dim, z_dim), name='W_miu') 24 | self.b_miu = shared_zeros(shape=z_dim, name='b_miu') 25 | self.W_sig = weight_init((bottlenet_dim, z_dim), name='W_sig') 26 | self.b_sig = shared_zeros(shape=z_dim, name='b_sig') 27 | # decoder 28 | self.W1_d = weight_init((z_dim, bottlenet_dim), name='W1_d') 29 | self.b1_d = shared_zeros(shape=bottlenet_dim, name='b1_d') 30 | self.W2_d = weight_init((bottlenet_dim, input_dim), name='W2_d') 31 | self.b2_d = shared_zeros(shape=input_dim, name='b2_d') 32 | 33 | self.params = [self.W_e, self.b_e, self.W_miu, self.b_miu, self.W_sig, self.b_sig, 34 | self.W1_d, self.b1_d, self.W2_d, self.b2_d] 35 | 36 | 37 | def _train_fprop(self, state_below): 38 | h_e = T.tanh(T.dot(state_below, self.W_e) + self.b_e) 39 | miu_e = T.dot(h_e, self.W_miu) + self.b_miu 40 | logsig_e = 0.5 * (T.dot(h_e, self.W_sig) + self.b_sig) 41 | eps = theano_rand.normal(avg=0, std=1, size=logsig_e.shape, dtype=floatX) 42 | z = miu_e + T.exp(logsig_e) * eps 43 | h_d = T.tanh(T.dot(z, self.W1_d) + self.b1_d) 44 | y = T.nnet.sigmoid(T.dot(h_d, self.W2_d) + self.b2_d) 45 | return y, miu_e, logsig_e 46 | 47 | 48 | def _layer_stats(self, state_below, layer_output): 49 | y, miu, logsig = layer_output 50 | return [('W_miu', self.W_miu.mean()), 51 | ('W_e', self.W_e.mean()), 52 | ('logsig', logsig.mean()), 53 | ('ymean,', y.mean()), 54 | ('miu', miu.mean())] 55 | -------------------------------------------------------------------------------- /mozi/datasets/dataset_noise.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | Functionality : Define the noise that is to be added to the dataset 5 | """ 6 | 7 | import numpy as np 
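# The Noise subclasses below corrupt input arrays in numpy space before
# training, e.g. to produce the corrupted inputs of a denoising autoencoder.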
8 | 9 | class Noise(object): 10 | """ 11 | This is an abstract class for applying noise to dataset 12 | """ 13 | 14 | def apply(self, X): 15 | """ 16 | DESCRIPTION: 17 | This method applies noise to X and return a noisy X 18 | PARAM: 19 | X : 2d numpy array of dimension number of examples by number of dimensions 20 | """ 21 | raise NotImplementedError(str(type(self))+" does not implement an apply method.") 22 | 23 | def invert(self, X): 24 | """ 25 | DESCRIPTION: 26 | Remove the noise from X 27 | PARAM: 28 | X : 2d numpy array of dimension number of examples by number of dimensions 29 | """ 30 | raise NotImplementedError(str(type(self))+" does not implement an invert method.") 31 | 32 | 33 | 34 | 35 | class MaskOut(Noise): 36 | 37 | """ 38 | This noise masked out a portion of the dimension from each example 39 | """ 40 | 41 | def __init__(self, ratio=0.5): 42 | """ 43 | PARAM: 44 | ratio : float 45 | The portion of the inputs that is masked out 46 | """ 47 | self.ratio = ratio 48 | 49 | def apply(self, X): 50 | self.noise = np.random.binomial(size=X.shape, n=1, p=(1-self.ratio)) 51 | return X * self.noise 52 | 53 | def invert(self, X): 54 | return X / self.noise 55 | 56 | 57 | class Gaussian(Noise): 58 | """ 59 | Applies gaussian noise to each value of X 60 | """ 61 | 62 | def __init__(self, std=0.01, mean=0): 63 | self.std = std 64 | self.mean = mean 65 | 66 | def apply(self, X): 67 | return X + np.random.normal(loc=self.mean, scale=self.std, size=X.shape) 68 | 69 | 70 | 71 | class BlackOut(Noise): 72 | """ 73 | This noise masked out a random example in a dataset, 74 | adding noise in the time dimension 75 | """ 76 | 77 | def __init__(self, ratio=0.5): 78 | """ 79 | PARAM: 80 | ratio : float 81 | The portion of the examples that is masked out 82 | """ 83 | self.ratio = ratio 84 | 85 | def apply(self, X): 86 | return X * np.random.binomial(size=(X.shape[0],1), n=1, p=(1-self.ratio)) 87 | -------------------------------------------------------------------------------- /mozi/datasets/imdb.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | logger = logging.getLogger(__name__) 4 | import os 5 | import cPickle 6 | import numpy as np 7 | import theano 8 | floatX = theano.config.floatX 9 | 10 | from mozi.utils.utils import get_file, make_one_hot, pad_sequences 11 | from mozi.datasets.dataset import SingleBlock 12 | 13 | class IMDB(SingleBlock): 14 | 15 | def __init__(self, nb_words=None, skip_top=0, maxlen=None, seed=113, 16 | pad_zero=False, start_char=1, oov_char=2, index_from=3, **kwargs): 17 | ''' 18 | adapted from keras 19 | ''' 20 | im_dir = os.environ['MOZI_DATA_PATH'] + '/imdb/' 21 | path = "https://s3.amazonaws.com/text-datasets/imdb.pkl" 22 | im_dir = get_file(fpath="{}/imdb.pkl".format(im_dir), origin=path, untar=False) 23 | with open('{}/imdb.pkl'.format(im_dir)) as fin: 24 | X, labels = np.load(fin) 25 | np.random.seed(seed) 26 | np.random.shuffle(X) 27 | np.random.seed(seed) 28 | np.random.shuffle(labels) 29 | 30 | if start_char is not None: 31 | X = [[start_char] + [w + index_from for w in x] for x in X] 32 | elif index_from: 33 | X = [[w + index_from for w in x] for x in X] 34 | 35 | if maxlen: 36 | new_X = [] 37 | new_labels = [] 38 | for x, y in zip(X, labels): 39 | if len(x) < maxlen: 40 | new_X.append(x) 41 | new_labels.append(y) 42 | X = new_X 43 | labels = new_labels 44 | 45 | if not nb_words: 46 | nb_words = max([max(x) for x in X]) 47 | 48 | # by convention, use 2 as OOV word 49 | # reserve 'index_from' (=3 by 
default) characters: 0 (padding), 1 (start), 2 (OOV) 50 | if oov_char is not None: 51 | X = [[oov_char if (w >= nb_words or w < skip_top) else w for w in x] for x in X] 52 | else: 53 | nX = [] 54 | for x in X: 55 | nx = [] 56 | for w in x: 57 | if (w >= nb_words or w < skip_top): 58 | nx.append(w) 59 | nX.append(nx) 60 | X = nX 61 | 62 | if pad_zero and maxlen: 63 | X = pad_sequences(X, maxlen=maxlen) 64 | super(IMDB, self).__init__(X=np.asarray(X), y=np.asarray(labels).reshape((len(labels),1)), **kwargs) 65 | -------------------------------------------------------------------------------- /doc/dae.md: -------------------------------------------------------------------------------- 1 | 2 | Denoising Autoencoder 3 | ===== 4 | You can try the Denoising Autoencoder [Example](../example/mnist_dae.py) running on Mnist. Here we build two hidden layer encoding and decoding layers 5 | ```python 6 | from mozi.model import Sequential 7 | from mozi.layers.linear import Linear 8 | from mozi.layers.activation import * 9 | from mozi.layers.noise import Gaussian 10 | import theano.tensor as T 11 | 12 | # build model 13 | model = Sequential(input_var=T.matrix(), output_var=T.matrix()) 14 | # build encoder 15 | model.add(Gaussian()) 16 | encode_layer1 = Linear(prev_dim=28*28, this_dim=200) 17 | model.add(encode_layer1) 18 | model.add(RELU()) 19 | encode_layer2 = Linear(prev_dim=200, this_dim=50) 20 | model.add(encode_layer2) 21 | model.add(Tanh()) 22 | 23 | # build decoder 24 | decode_layer1 = Linear(prev_dim=50, this_dim=200, W=encode_layer2.W.T) 25 | model.add(decode_layer1) 26 | model.add(RELU()) 27 | decode_layer2 = Linear(prev_dim=200, this_dim=28*28, W=encode_layer1.W.T) 28 | model.add(decode_layer2) 29 | model.add(Sigmoid()) 30 | ``` 31 | Next we prepare the mnist dataset such that the input X and output y is the same 32 | ```python 33 | from mozi.datasets.mnist import Mnist 34 | 35 | # build dataset 36 | data = Mnist(batch_size=64, train_valid_test_ratio=[5,1,1]) 37 | # for autoencoder, the output will be equal to input 38 | data.set_train(X=data.get_train().X, y=data.get_train().X) 39 | data.set_valid(X=data.get_valid().X, y=data.get_valid().X) 40 | 41 | ``` 42 | 43 | 44 | Finally build learning method and put everything in train object and run 45 | ```python 46 | from mozi.train_object import TrainObject 47 | from mozi.cost import entropy 48 | from mozi.learning_method import AdaGrad 49 | 50 | learning_method = AdaGrad(learning_rate=0.01, momentum=0.9, 51 | lr_decay_factor=0.9, decay_batch=10000) 52 | 53 | train_object = TrainObject(model = model, 54 | log = None, 55 | dataset = data, 56 | train_cost = entropy, 57 | valid_cost = entropy, 58 | learning_method = learning_method, 59 | stop_criteria = {'max_epoch' : 10, 60 | 'epoch_look_back' : 5, 61 | 'percent_decrease' : 0.01} 62 | ) 63 | # finally run the code 64 | train_object.setup() 65 | train_object.run() 66 | ``` 67 | -------------------------------------------------------------------------------- /doc/cnn.md: -------------------------------------------------------------------------------- 1 | 2 | Convolution Neural Network 3 | ===== 4 | You can try the Convolution Neural Network [Example](../example/cifar10_cnn.py) running on Cifar10. 
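To see where the `64*8*8` input of the first fully-connected layer below comes from, track the feature-map sizes with the formulas in `mozi/utils/cnn_utils.py` (`full`: x+k-1, `valid`: (x-k)/stride+1): the 32x32 images grow to 34, shrink back to 32 and pool to 16 in the first block, then 18 -> 16 -> 8 in the second, leaving 64 maps of 8x8.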
Here we build four convolution layers with two fully-connected layers 5 | ```python 6 | from mozi.model import Sequential 7 | from mozi.layers.linear import Linear 8 | from mozi.layers.noise import Dropout 9 | from mozi.layers.activation import * 10 | from mozi.layers.convolution import * 11 | from mozi.layers.misc import Flatten 12 | import theano.tensor as T 13 | 14 | model = Sequential(input_var=T.tensor4(), output_var=T.matrix()) 15 | model.add(Convolution2D(input_channels=3, filters=32, kernel_size=(3,3), stride=(1,1), border_mode='full')) 16 | model.add(RELU()) 17 | model.add(Convolution2D(input_channels=32, filters=32, kernel_size=(3,3), stride=(1,1))) 18 | model.add(RELU()) 19 | model.add(Pooling2D(poolsize=(2, 2), mode='max')) 20 | model.add(Dropout(0.25)) 21 | 22 | model.add(Convolution2D(input_channels=32, filters=64, kernel_size=(3,3), stride=(1,1), border_mode='full')) 23 | model.add(RELU()) 24 | model.add(Convolution2D(input_channels=64, filters=64, kernel_size=(3,3), stride=(1,1),)) 25 | model.add(RELU()) 26 | model.add(Pooling2D(poolsize=(2, 2), mode='max')) 27 | model.add(Dropout(0.25)) 28 | 29 | model.add(Flatten()) 30 | model.add(Linear(64*8*8, 512)) 31 | model.add(RELU()) 32 | model.add(Dropout(0.5)) 33 | 34 | model.add(Linear(512, 10)) 35 | model.add(Softmax()) 36 | ``` 37 | Next build the `Cifar10` dataset, `LearningMethod` and put everything in `TrainObject` and run 38 | ```python 39 | from mozi.datasets.cifar10 import Cifar10 40 | from mozi.train_object import TrainObject 41 | from mozi.cost import error, entropy 42 | from mozi.learning_method import SGD 43 | 44 | data = Cifar10(batch_size=64, train_valid_test_ratio=[5,1,1]) 45 | learning_method = SGD(learning_rate=0.01, momentum=0.9, 46 | lr_decay_factor=0.9, decay_batch=5000) 47 | train_object = TrainObject(model = model, 48 | log = None, 49 | dataset = data, 50 | train_cost = entropy, 51 | valid_cost = error, 52 | learning_method = learning_method, 53 | stop_criteria = {'max_epoch' : 10, 54 | 'epoch_look_back' : 5, 55 | 'percent_decrease' : 0.01} 56 | ) 57 | train_object.setup() 58 | train_object.run() 59 | ``` 60 | -------------------------------------------------------------------------------- /example/mnist_dae.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import theano 4 | import theano.tensor as T 5 | import numpy as np 6 | 7 | from mozi.datasets.mnist import Mnist 8 | from mozi.model import Sequential 9 | from mozi.layers.linear import Linear 10 | from mozi.layers.activation import * 11 | from mozi.layers.noise import Dropout, Gaussian 12 | from mozi.log import Log 13 | from mozi.train_object import TrainObject 14 | from mozi.cost import mse, error, entropy 15 | from mozi.learning_method import * 16 | from mozi.weight_init import * 17 | from mozi.env import setenv 18 | 19 | from sklearn.metrics import accuracy_score 20 | 21 | 22 | def train(): 23 | 24 | # build dataset 25 | data = Mnist(batch_size=64, train_valid_test_ratio=[5,1,1]) 26 | # for autoencoder, the output will be equal to input 27 | data.set_train(X=data.get_train().X, y=data.get_train().X) 28 | data.set_valid(X=data.get_valid().X, y=data.get_valid().X) 29 | 30 | # build model 31 | model = Sequential(input_var=T.matrix(), output_var=T.matrix()) 32 | # build encoder 33 | model.add(Gaussian()) 34 | encode_layer1 = Linear(prev_dim=28*28, this_dim=200) 35 | model.add(encode_layer1) 36 | model.add(RELU()) 37 | encode_layer2 = Linear(prev_dim=200, this_dim=50) 38 | model.add(encode_layer2) 
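# note: the decoder below ties its weights to the encoder by reusing the
# transposed encoder weight matrices (W=encode_layer2.W.T and W=encode_layer1.W.T)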
39 | model.add(Tanh()) 40 | 41 | # build decoder 42 | decode_layer1 = Linear(prev_dim=50, this_dim=200, W=encode_layer2.W.T) 43 | model.add(decode_layer1) 44 | model.add(RELU()) 45 | decode_layer2 = Linear(prev_dim=200, this_dim=28*28, W=encode_layer1.W.T) 46 | model.add(decode_layer2) 47 | model.add(Sigmoid()) 48 | 49 | # build learning method 50 | learning_method = AdaGrad(learning_rate=0.01, momentum=0.9, 51 | lr_decay_factor=0.9, decay_batch=10000) 52 | 53 | # put everything into the train object 54 | train_object = TrainObject(model = model, 55 | log = None, 56 | dataset = data, 57 | train_cost = entropy, 58 | valid_cost = entropy, 59 | learning_method = learning_method, 60 | stop_criteria = {'max_epoch' : 10, 61 | 'epoch_look_back' : 5, 62 | 'percent_decrease' : 0.01} 63 | ) 64 | # finally run the code 65 | train_object.setup() 66 | train_object.run() 67 | 68 | 69 | 70 | if __name__ == '__main__': 71 | setenv() 72 | train() 73 | -------------------------------------------------------------------------------- /mozi/weight_init.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import theano 4 | from mozi.utils.theano_utils import sharedX 5 | 6 | 7 | def get_fans(shape): 8 | '''From keras''' 9 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 10 | fan_out = shape[1] if len(shape) == 2 else shape[0] 11 | return fan_in, fan_out 12 | 13 | 14 | class WeightInitialization(object): 15 | 16 | def __call__(self, dim, name='W'): 17 | raise NotImplementedError(str(type(self))+" does not implement __call__.") 18 | 19 | 20 | class GaussianWeight(WeightInitialization): 21 | def __init__(self, mean=0, std=0.1): 22 | self.mean = mean 23 | self.std = std 24 | 25 | def __call__(self, dim, name='W', **kwargs): 26 | W_values = np.random.normal(loc=self.mean, scale=self.std, size=dim) 27 | return sharedX(name=name, value=W_values, borrow=True, **kwargs) 28 | 29 | 30 | class XavierUniformWeight(WeightInitialization): 31 | def __call__(self, dim, name='W', **kwargs): 32 | fan_in, fan_out = get_fans(dim) 33 | W_values = np.random.uniform(low = -4 * np.sqrt(6. / (fan_in + fan_out)), 34 | high = 4 * np.sqrt(6. 
/ (fan_in + fan_out)), 35 | size = dim) 36 | return sharedX(name=name, value=W_values, borrow=True, **kwargs) 37 | 38 | 39 | class UniformWeight(WeightInitialization): 40 | def __init__(self, scale=0.05): 41 | self.scale = scale 42 | 43 | def __call__(self, dim, name='W', **kwargs): 44 | W_values = np.random.uniform(low=-self.scale, high=self.scale, size=dim) 45 | return sharedX(name=name, value=W_values, borrow=True, **kwargs) 46 | 47 | 48 | class OrthogonalWeight(WeightInitialization): 49 | def __init__(self, scale=1.1): 50 | self.scale = scale 51 | 52 | def __call__(self, dim, name='W', **kwargs): 53 | ''' From Lasagne 54 | ''' 55 | flat_shape = (dim[0], np.prod(dim[1:])) 56 | a = np.random.normal(0.0, 1.0, flat_shape) 57 | u, _, v = np.linalg.svd(a, full_matrices=False) 58 | # pick the one with the correct shape 59 | q = u if u.shape == flat_shape else v 60 | q = q.reshape(dim) 61 | return sharedX(name=name, value=self.scale * q[:dim[0],:dim[1]], borrow=True, **kwargs) 62 | 63 | 64 | class Identity(WeightInitialization): 65 | def __init__(self, scale=1): 66 | self.scale = scale 67 | 68 | def __call__(self, dim, name='W', **kwargs): 69 | if len(dim) != 2 or dim[0] != dim[1]: 70 | raise Exception("Identity matrix initialization can only be used for 2D square matrices") 71 | else: 72 | return sharedX(self.scale * np.identity(dim[0]), **kwargs) 73 | -------------------------------------------------------------------------------- /mozi/datasets/voc.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from mozi.utils.utils import get_file, make_one_hot 4 | from mozi.datasets.dataset import SingleBlock 5 | import xml.etree.ElementTree as ET 6 | import os 7 | import glob 8 | from skimage.transform import resize 9 | from skimage.io import imread 10 | import cPickle 11 | import marshal 12 | import numpy as np 13 | 14 | class VOC(SingleBlock): 15 | 16 | def __init__(self, resized_shape=(222,222,3), **kwargs): 17 | ''' 18 | using only voc 2012 for actions classification, total 2154 images 19 | resized_shape is of (height, width, channel) 20 | ''' 21 | im_dir = os.environ['MOZI_DATA_PATH'] + '/voc' 22 | path = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar' 23 | im_dir = get_file(fpath="{}/VOCtrainval_11-May-2012.tar".format(im_dir), origin=path, untar=True) 24 | actls = ['jumping', 'phoning', 'playinginstrument', 'reading', 'ridingbike', 25 | 'ridinghorse', 'running', 'takingphoto', 'usingcomputer', 'walking', 26 | 'other'] 27 | X_path = os.environ['MOZI_DATA_PATH'] + '/voc/X.npy' 28 | y_path = os.environ['MOZI_DATA_PATH'] + '/voc/y.npy' 29 | if not os.path.exists(X_path) or not os.path.exists(y_path): 30 | print X_path + ' does not exists, generating..' 
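# walk the 2012 annotation XMLs: resize each JPEG to resized_shape and keep
# the first action flagged '1' as the single label for that image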
31 | annote = im_dir + '/VOC2012/Annotations' 32 | images = im_dir + '/VOC2012/JPEGImages' 33 | files = glob.glob(annote + '/2012*xml') 34 | labels = [] 35 | rimage = [] 36 | for f in files: 37 | bname = os.path.basename(f).rstrip('.xml') 38 | image = imread('{}/{}.jpg'.format(images, bname)) 39 | rimage.append(resize(image, resized_shape)) 40 | tree = ET.parse(f) 41 | root = tree.getroot() 42 | actions = root.find('object').find('actions') 43 | 44 | for act in actions: 45 | if act.text == '1': 46 | labels.append(actls.index(act.tag)) 47 | # only restrict to one action per photo 48 | break 49 | 50 | print 'saving data' 51 | with open(X_path, 'wb') as Xout, open(y_path, 'wb') as yout: 52 | X = np.asarray(rimage) 53 | y = np.asarray(labels) 54 | np.save(Xout, X) 55 | np.save(yout, y) 56 | 57 | else: 58 | print X_path + ' exists, loading..' 59 | with open(X_path, 'rb') as Xin, open(y_path, 'rb') as yin: 60 | X = np.load(Xin) 61 | y = np.load(yin) 62 | 63 | super(VOC, self).__init__(X=np.rollaxis(X,3,1), y=make_one_hot(y,len(actls)), **kwargs) 64 | 65 | # x = VOC() 66 | -------------------------------------------------------------------------------- /example/mnist_mlp.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import theano 4 | import theano.tensor as T 5 | import numpy as np 6 | 7 | from mozi.datasets.mnist import Mnist 8 | from mozi.datasets.preprocessor import * 9 | from mozi.model import Sequential 10 | from mozi.layers.linear import * 11 | from mozi.layers.activation import * 12 | from mozi.layers.noise import Dropout 13 | from mozi.log import Log 14 | from mozi.train_object import TrainObject 15 | from mozi.cost import mse, error 16 | from mozi.learning_method import * 17 | from mozi.weight_init import * 18 | from mozi.env import setenv 19 | 20 | 21 | def train(): 22 | 23 | # build dataset 24 | batch_size = 64 25 | data = Mnist(batch_size=batch_size, train_valid_test_ratio=[5,1,1]) 26 | 27 | # build model 28 | model = Sequential(input_var=T.matrix(), output_var=T.matrix()) 29 | model.add(Linear(prev_dim=28*28, this_dim=200)) 30 | model.add(RELU()) 31 | model.add(Linear(prev_dim=200, this_dim=100)) 32 | model.add(RELU()) 33 | model.add(Dropout(0.5)) 34 | model.add(Linear(prev_dim=100, this_dim=10)) 35 | model.add(Softmax()) 36 | 37 | # build learning method 38 | decay_batch = int(data.train.X.shape[0] * 2 / batch_size) 39 | learning_method = SGD(learning_rate=0.1, momentum=0.9, 40 | lr_decay_factor=0.9, decay_batch=decay_batch) 41 | 42 | # Build Logger 43 | log = Log(experiment_name = 'MLP', 44 | description = 'This is a tutorial', 45 | save_outputs = True, # log all the outputs from the screen 46 | save_model = True, # save the best model 47 | save_epoch_error = True, # log error at every epoch 48 | save_to_database = {'name': 'Example.sqlite3', 49 | 'records': {'Batch_Size': batch_size, 50 | 'Learning_Rate': learning_method.learning_rate, 51 | 'Momentum': learning_method.momentum}} 52 | ) # end log 53 | 54 | # put everything into the train object 55 | train_object = TrainObject(model = model, 56 | log = log, 57 | dataset = data, 58 | train_cost = mse, 59 | valid_cost = error, 60 | learning_method = learning_method, 61 | stop_criteria = {'max_epoch' : 100, 62 | 'epoch_look_back' : 5, 63 | 'percent_decrease' : 0.01} 64 | ) 65 | # finally run the code 66 | train_object.setup() 67 | train_object.run() 68 | 69 | ypred = model.fprop(data.get_test().X) 70 | ypred = np.argmax(ypred, axis=1) 71 | y = np.argmax(data.get_test().y, 
axis=1) 72 | accuracy = np.equal(ypred, y).astype('f4').sum() / len(y) 73 | print 'test accuracy:', accuracy 74 | 75 | 76 | if __name__ == '__main__': 77 | setenv() 78 | train() 79 | -------------------------------------------------------------------------------- /mozi/utils/train_object_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import matplotlib 5 | import theano 6 | import theano.tensor as T 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from theano.compile.ops import as_op 10 | from mozi.utils.progbar import Progbar 11 | 12 | import tarfile, inspect, os 13 | from six.moves.urllib.request import urlretrieve 14 | 15 | floatX = theano.config.floatX 16 | 17 | def split_list(tuple_list): 18 | """ 19 | DESCRIPTION: 20 | split a list of tuples into two lists whereby one list contains the first elements 21 | of the tuples and the other list contains the second elements. 22 | PARAM: 23 | tuple_list: a list of tuples, example tuple_list = [('a', 1), ('b', 2)] 24 | RETURN: 25 | two lists, example from above tuple_list will be split into ['a', 'b'] and [1, 2] 26 | """ 27 | ls_A = [] 28 | ls_B = [] 29 | 30 | for tuple in tuple_list: 31 | ls_A.append(tuple[0]) 32 | ls_B.append(tuple[1]) 33 | 34 | return ls_A, ls_B 35 | 36 | 37 | def generate_shared_list(ls): 38 | """ 39 | DESCRIPTION: 40 | generate a list of shared variables that matched the length of ls 41 | PARAM: 42 | ls: the list used for generating the shared variables 43 | RETURN: 44 | a list of shared variables initialized to 0 of len(ls) 45 | """ 46 | rlist = [] 47 | 48 | for i in xrange(len(ls)): 49 | rlist.append(theano.shared(np.array(0., dtype=theano.config.floatX))) 50 | 51 | return rlist 52 | 53 | 54 | def merge_lists(ls_A, ls_B): 55 | """ 56 | DESCRIPTION: 57 | merge two lists of equal length into into a list of tuples 58 | PARAM: 59 | ls_A: first list 60 | ls_B: second list 61 | RETURN: 62 | a list of tuples 63 | """ 64 | 65 | assert len(ls_A) == len(ls_B), 'two lists of different length' 66 | 67 | rlist = [] 68 | for a, b in zip(ls_A, ls_B): 69 | rlist.append((a,b)) 70 | 71 | return rlist 72 | 73 | 74 | def get_shared_values(shared_ls): 75 | """ 76 | DESCRIPTION: 77 | get a list of values from a list of shared variables 78 | PARAM: 79 | shared_ls: list of shared variables 80 | RETURN: 81 | numpy array of the list of values 82 | """ 83 | 84 | val_ls = [] 85 | for var in shared_ls: 86 | val_ls.append(var.get_value()) 87 | 88 | return np.asarray(val_ls, dtype=theano.config.floatX) 89 | 90 | 91 | def is_shared_var(var): 92 | return var.__class__.__name__ == 'TensorSharedVariable' or \ 93 | var.__class__.__name__ == 'CudaNdarraySharedVariable' 94 | 95 | 96 | def merge_var(*vars): 97 | def absortvar(v): 98 | rvar = [] 99 | if isinstance(v, (list, tuple)): 100 | rvar += v 101 | else: 102 | rvar.append(v) 103 | return rvar 104 | 105 | rvars = [] 106 | for var in vars: 107 | rvars += absortvar(var) 108 | return rvars 109 | -------------------------------------------------------------------------------- /mozi/model.py: -------------------------------------------------------------------------------- 1 | 2 | import theano 3 | 4 | 5 | class Model(object): 6 | 7 | def test_fprop(self, input_state): 8 | pass 9 | 10 | def train_fprop(self, input_state): 11 | pass 12 | 13 | class Sequential(Model): 14 | 15 | def __init__(self, input_var, output_var, verbose=True): 16 | """ 17 | PARAM: 18 | 
input_var (T.vector() | T.matrix() | T.tensor3() | T.tensor4()): 19 | The tensor variable input to the model that corresponds to 20 | the number of dimensions of the input X of dataset 21 | input_var (T.vector() | T.matrix() | T.tensor3() | T.tensor4()): 22 | The tensor variable output from the model that corresponds to 23 | the number of dimensions of the output y of dataset 24 | verbose (bool): 25 | print out the layer stats from each layer if True 26 | 27 | """ 28 | self.input_var = input_var 29 | self.output_var = output_var 30 | self.layers = [] 31 | self.verbose = verbose 32 | 33 | def add(self, layer): 34 | self.layers.append(layer) 35 | 36 | def pop(self, index): 37 | return self.layers.pop(index) 38 | 39 | def test_fprop(self, input_state, layers=None): 40 | test_layers_stats = [] 41 | if layers is None: 42 | layers = xrange(len(self.layers)) 43 | for i in layers: 44 | layer_output = self.layers[i]._test_fprop(input_state) 45 | stats = [] 46 | if self.verbose: 47 | stats = self.layers[i]._layer_stats(input_state, layer_output) 48 | input_state = layer_output 49 | class_name = self.layers[i].__class__.__name__ 50 | stats = [(str(i)+'_'+class_name+'_'+a, b) for (a,b) in stats] 51 | test_layers_stats += stats 52 | 53 | return input_state, test_layers_stats 54 | 55 | 56 | def train_fprop(self, input_state, layers=None): 57 | train_layers_stats = [] 58 | if layers is None: 59 | layers = xrange(len(self.layers)) 60 | for i in layers: 61 | layer_output = self.layers[i]._train_fprop(input_state) 62 | stats = [] 63 | if self.verbose: 64 | stats = self.layers[i]._layer_stats(input_state, layer_output) 65 | input_state = layer_output 66 | class_name = self.layers[i].__class__.__name__ 67 | stats = [(str(i)+'_'+class_name+'_'+a, b) for (a,b) in stats] 68 | train_layers_stats += stats 69 | 70 | return input_state, train_layers_stats 71 | 72 | 73 | def fprop(self, input_values): 74 | return self.fprop_layers(input_values) 75 | 76 | 77 | def fprop_layers(self, input_values, layers=None): 78 | output, stats = self.test_fprop(self.input_var, layers) 79 | if isinstance(self.input_var, (list, tuple)): 80 | f = theano.function(self.input_var, output, on_unused_input='warn', allow_input_downcast=True) 81 | else: 82 | f = theano.function([self.input_var], output, on_unused_input='warn', allow_input_downcast=True) 83 | 84 | if isinstance(input_values, tuple): 85 | return f(*input_values) 86 | else: 87 | return f(input_values) 88 | 89 | 90 | def get_layers(self): 91 | return self.layers 92 | -------------------------------------------------------------------------------- /mozi/utils/progbar.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import time 4 | import sys 5 | 6 | ''' 7 | from keras 8 | ''' 9 | 10 | class Progbar(object): 11 | def __init__(self, target, width=30, verbose=1): 12 | ''' 13 | @param target: total number of steps expected 14 | ''' 15 | self.width = width 16 | self.target = target 17 | self.sum_values = {} 18 | self.unique_values = [] 19 | self.start = time.time() 20 | self.total_width = 0 21 | self.seen_so_far = 0 22 | self.verbose = verbose 23 | 24 | def update(self, current, values=[]): 25 | ''' 26 | @param current: index of current step 27 | @param values: list of tuples (name, value_for_last_step). 28 | The progress bar will display averages for these values. 
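Example: bar = Progbar(target=100); bar.update(10, values=[('loss', 0.25)])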
29 | ''' 30 | for k, v in values: 31 | if k not in self.sum_values: 32 | self.sum_values[k] = [v * (current-self.seen_so_far), current-self.seen_so_far] 33 | self.unique_values.append(k) 34 | else: 35 | self.sum_values[k][0] += v * (current-self.seen_so_far) 36 | self.sum_values[k][1] += (current-self.seen_so_far) 37 | self.seen_so_far = current 38 | 39 | now = time.time() 40 | if self.verbose == 1: 41 | prev_total_width = self.total_width 42 | sys.stdout.write("\b" * prev_total_width) 43 | sys.stdout.write("\r") 44 | 45 | numdigits = int(np.floor(np.log10(self.target))) + 1 46 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 47 | bar = barstr % (current, self.target) 48 | prog = float(current)/self.target 49 | if current > self.target: 50 | prog = 1 51 | prog_width = int(self.width*prog) 52 | if prog_width > 0: 53 | bar += ('='*(prog_width-1)) 54 | if current < self.target: 55 | bar += '>' 56 | bar += ('.'*(self.width-prog_width)) 57 | bar += ']' 58 | sys.stdout.write(bar) 59 | self.total_width = len(bar) 60 | 61 | if current: 62 | time_per_unit = (now - self.start) / current 63 | else: 64 | time_per_unit = 0 65 | eta = time_per_unit*(self.target - current) 66 | info = '' 67 | if current < self.target: 68 | info += ' - ETA: %ds' % eta 69 | else: 70 | info += ' - %ds' % (now - self.start) 71 | for k in self.unique_values: 72 | info += ' - %s: %.4f' % (k, self.sum_values[k][0]/ max(1, self.sum_values[k][1])) 73 | 74 | self.total_width += len(info) 75 | if prev_total_width > self.total_width: 76 | info += ((prev_total_width-self.total_width) * " ") 77 | 78 | sys.stdout.write(info) 79 | sys.stdout.flush() 80 | 81 | if self.verbose == 2: 82 | if current >= self.target: 83 | info = '%ds' % (now - self.start) 84 | for k in self.unique_values: 85 | info += ' - %s: %.4f' % (k, self.sum_values[k][0]/ max(1, self.sum_values[k][1])) 86 | sys.stdout.write(info + "\n") 87 | 88 | 89 | def add(self, n, values=[]): 90 | self.update(self.seen_so_far+n, values) 91 | -------------------------------------------------------------------------------- /mozi/layers/linear.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams 6 | 7 | from mozi.utils.theano_utils import shared_zeros 8 | from mozi.weight_init import GaussianWeight 9 | from mozi.layers.template import Template 10 | 11 | floatX = theano.config.floatX 12 | theano_rand = MRG_RandomStreams() 13 | 14 | 15 | class Linear(Template): 16 | 17 | def __init__(self, prev_dim=None, this_dim=None, W=None, b=None, 18 | weight_init=GaussianWeight(mean=0, std=0.1)): 19 | """ 20 | DESCRIPTION: 21 | This is a fully connected layer 22 | PARAM: 23 | prev_dim(int): dimension of previous layer 24 | this_dim(int): dimension of this layer 25 | name(string): name of the layer 26 | W(tensor variable): Weight of 2D tensor matrix 27 | b(tensor variable): bias of 2D tensor matrix 28 | params(list): a list of params in layer that can be updated 29 | """ 30 | 31 | self.prev_dim = prev_dim 32 | self.this_dim = this_dim 33 | 34 | self.W = W 35 | if self.W is None: 36 | self.W = weight_init((prev_dim, this_dim), name='W') 37 | 38 | self.b = b 39 | if self.b is None: 40 | self.b = shared_zeros(shape=this_dim, name='b') 41 | 42 | self.params = [self.W, self.b] 43 | 44 | def _train_fprop(self, state_below): 45 | """ 46 | DESCRIPTION: 47 | performs linear transform y = dot(W, state_below) + b 48 | PARAM: 49 | state_below: 1d 
array of inputs from layer below 50 | """ 51 | return T.dot(state_below, self.W) + self.b 52 | 53 | 54 | def _layer_stats(self, state_below, layer_output): 55 | """ 56 | DESCRIPTION: 57 | This method is called every batch whereby the examples from test or valid set 58 | is pass through, the final result will be the mean of all the results from all 59 | the batches in an epoch from the test set or valid set. 60 | PARAM: 61 | layer_output: the output from the layer 62 | RETURN: 63 | A list of tuples of [('name_a', var_a), ('name_b', var_b)] whereby var is scalar 64 | """ 65 | w_len = T.sqrt((self.W ** 2).sum(axis=0)) 66 | max_length = T.max(w_len) 67 | mean_length = T.mean(w_len) 68 | min_length = T.min(w_len) 69 | max_output = T.max(layer_output) 70 | mean_output = T.mean(T.abs_(layer_output)) 71 | min_output = T.min(layer_output) 72 | max_state = T.max(state_below) 73 | mean_state = T.mean(T.abs_(state_below)) 74 | min_state = T.min(state_below) 75 | 76 | return [('max_W', T.max(self.W)), 77 | ('mean_W', T.mean(self.W)), 78 | ('min_W', T.min(self.W)), 79 | ('max_b', T.max(self.b)), 80 | ('mean_b', T.mean(self.b)), 81 | ('min_b', T.min(self.b)), 82 | ('max_layer_output', max_output), 83 | ('mean_layer_output', mean_output), 84 | ('min_layer_output', min_output), 85 | ('max_col_length', max_length), 86 | ('mean_col_length', mean_length), 87 | ('min_col_length', min_length), 88 | ('max_state_below', max_state), 89 | ('mean_state_below', mean_state), 90 | ('min_state_below', min_state)] 91 | -------------------------------------------------------------------------------- /mozi/layers/noise.py: -------------------------------------------------------------------------------- 1 | 2 | import theano 3 | import theano.tensor as T 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams 5 | 6 | from mozi.layers.template import Template 7 | 8 | floatX = theano.config.floatX 9 | theano_rand = MRG_RandomStreams() 10 | 11 | class Dropout(Template): 12 | 13 | def __init__(self, dropout_below=0.5): 14 | ''' 15 | PARAMS: 16 | dropout_below(float): probability of the inputs from the layer below been masked out 17 | ''' 18 | self.dropout_below = dropout_below 19 | self.params = [] 20 | 21 | 22 | def _test_fprop(self, state_below): 23 | """ 24 | DESCRIPTION: 25 | resize the weight during testing for models trained with dropout. 
26 |             The weight will be resized to W' = (1 - self.dropout_below) * W
27 |         """
28 |         return state_below * (1 - self.dropout_below)
29 | 
30 | 
31 |     def _train_fprop(self, state_below):
32 |         """
33 |         DESCRIPTION:
34 |             Applies dropout to the layer during training
35 |         """
36 |         return theano_rand.binomial(size=state_below.shape, n=1,
37 |                                     p=(1-self.dropout_below),
38 |                                     dtype=floatX) * state_below
39 | 
40 | 
41 | class MaskOut(Template):
42 | 
43 |     """
44 |     This noise masks out a portion of the dimensions of each example
45 |     """
46 | 
47 |     def __init__(self, ratio=0.5):
48 |         """
49 |         PARAM:
50 |             ratio : float
51 |                 The portion of the inputs that is masked out
52 |         """
53 |         self.ratio = ratio
54 |         self.params = []
55 | 
56 |     def _train_fprop(self, state_below):
57 |         return state_below * theano_rand.binomial(size=state_below.shape, n=1, p=(1-self.ratio), dtype=floatX)
58 | 
59 | 
60 | class Gaussian(Template):
61 |     """
62 |     Applies gaussian noise to each value of X
63 |     """
64 | 
65 |     def __init__(self, std=0.1, mean=0):
66 |         self.std = std
67 |         self.mean = mean
68 |         self.params = []
69 | 
70 |     def _train_fprop(self, state_below):
71 |         return state_below + theano_rand.normal(avg=self.mean, std=self.std, size=state_below.shape, dtype=floatX)
72 | 
73 | 
74 | class BlackOut(Template):
75 |     """
76 |     This noise masks out a random example in a dataset,
77 |     adding noise in the time dimension
78 |     """
79 | 
80 |     def __init__(self, ratio=0.5):
81 |         """
82 |         PARAM:
83 |             ratio : float
84 |                 The portion of the examples that is masked out
85 |         """
86 |         self.ratio = ratio
87 |         self.params = []
88 | 
89 |     def _train_fprop(self, state_below):
90 |         rd = theano_rand.binomial(size=(state_below.shape[0],), n=1, p=(1-self.ratio), dtype=floatX)
91 |         return state_below * T.shape_padright(rd)
92 | 
93 | 
94 | class BatchOut(Template):
95 |     """
96 |     This noise masks out a random batch in an epoch,
97 |     adding noise in the time dimension
98 |     """
99 | 
100 |     def __init__(self, ratio=0.5):
101 |         """
102 |         PARAM:
103 |             ratio : float
104 |                 The portion of the batch that is masked out
105 |         """
106 |         self.ratio = ratio
107 |         self.params = []
108 | 
109 |     def _train_fprop(self, state_below):
110 |         rd = theano_rand.binomial(size=(1,1), n=1, p=(1-self.ratio), dtype=floatX)
111 |         return state_below * T.patternbroadcast(rd, broadcastable=(True, True))
112 | 
--------------------------------------------------------------------------------
/example/cifar10_cnn.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | 
4 | import theano.tensor as T
5 | 
6 | from mozi.datasets.cifar10 import Cifar10
7 | from mozi.model import Sequential
8 | from mozi.layers.linear import *
9 | from mozi.layers.noise import Dropout
10 | from mozi.layers.activation import *
11 | from mozi.layers.convolution import *
12 | from mozi.layers.misc import Flatten
13 | from mozi.layers.normalization import *
14 | from mozi.log import Log
15 | from mozi.train_object import TrainObject
16 | from mozi.cost import error, entropy
17 | from mozi.learning_method import *
18 | from mozi.env import setenv
19 | from mozi.utils.cnn_utils import valid, full
20 | 
21 | def train():
22 |     batch_size = 256
23 |     short_memory = 0.9
24 |     learning_rate = 0.005
25 |     data = Cifar10(batch_size=batch_size, train_valid_test_ratio=[4,1,1])
26 |     _, c, h, w = data.train.X.shape
27 | 
28 |     model = Sequential(input_var=T.tensor4(), output_var=T.matrix())
29 |     model.add(Convolution2D(input_channels=c, filters=8, kernel_size=(3,3), stride=(1,1), border_mode='full'))
30 |     h, w = full(h, w, kernel=3, stride=1)
31 |     model.add(BatchNormalization(dim=8, layer_type='conv', short_memory=short_memory))
32 |     model.add(RELU())
33 |     model.add(Convolution2D(input_channels=8, filters=16, kernel_size=(3,3), stride=(1,1), border_mode='valid'))
34 |     h, w = valid(h, w, kernel=3, stride=1)
35 |     model.add(BatchNormalization(dim=16, layer_type='conv', short_memory=short_memory))
36 |     model.add(RELU())
37 |     model.add(Pooling2D(poolsize=(4, 4), stride=(4,4), mode='max'))
38 |     h, w = valid(h, w, kernel=4, stride=4)
39 |     model.add(Flatten())
40 |     model.add(Linear(16*h*w, 512))
41 |     model.add(BatchNormalization(dim=512, layer_type='fc', short_memory=short_memory))
42 |     model.add(RELU())
43 | 
44 |     model.add(Linear(512, 10))
45 |     model.add(Softmax())
46 | 
47 |     # learning_method = RMSprop(learning_rate=learning_rate)
48 |     learning_method = Adam(learning_rate=learning_rate)
49 |     # learning_method = SGD(learning_rate=0.001)
50 | 
51 |     # Build Logger
52 |     log = Log(experiment_name = 'cifar10_cnn_tutorial',
53 |               description = 'This is a tutorial',
54 |               save_outputs = True, # log all the outputs from the screen
55 |               save_model = True, # save the best model
56 |               save_epoch_error = True, # log error at every epoch
57 |               save_to_database = {'name': 'hyperparam.sqlite3',
58 |                                   'records': {'Batch_Size': batch_size,
59 |                                               'Learning_Rate': learning_method.learning_rate}}
60 |              ) # end log
61 | 
62 |     # put everything into the train object
63 |     train_object = TrainObject(model = model,
64 |                                log = log,
65 |                                dataset = data,
66 |                                train_cost = entropy,
67 |                                valid_cost = error,
68 |                                learning_method = learning_method,
69 |                                stop_criteria = {'max_epoch' : 100,
70 |                                                 'epoch_look_back' : 10,
71 |                                                 'percent_decrease' : 0.01}
72 |                                )
73 |     # finally run the code
74 |     train_object.setup()
75 |     train_object.run()
76 | 
77 |     # test the model on test set
78 |     ypred = model.fprop(data.get_test().X)
79 |     ypred = np.argmax(ypred, axis=1)
80 |     y = np.argmax(data.get_test().y, axis=1)
81 |     accuracy = np.equal(ypred, y).astype('f4').sum() / len(y)
82 |     print 'test accuracy:', accuracy
83 | 
84 | 
85 | if __name__ == '__main__':
86 |     setenv()
87 |     train()
88 | 
--------------------------------------------------------------------------------
/mozi/layers/alexnet.py:
--------------------------------------------------------------------------------
1 | 
2 | from mozi.layers.activation import RELU, Softmax
3 | from mozi.layers.normalization import LRN
4 | from mozi.layers.convolution import Convolution2D, Pooling2D
5 | from mozi.layers.linear import Linear
6 | from mozi.layers.noise import Dropout
7 | from mozi.layers.misc import Flatten
8 | from mozi.layers.template import Template
9 | 
10 | 
11 | class Alexnet(Template):
12 | 
13 |     def __init__(self, input_shape, output_dim):
14 |         '''
15 |         FIELDS:
16 |             self.params: any params from the layer that needs to be updated
17 |                          by backpropagation can be put inside self.params
18 |         PARAMS:
19 |             input_shape: tuple
20 |                 shape of the input image with format (channel, height, width)
21 |             output_dim: int
22 |                 the output dimension of the model
23 |         '''
24 |         assert len(input_shape) == 3, 'input_shape must be a tuple or list of dim (channel, height, width)'
25 |         c, h, w = input_shape
26 | 
27 |         valid = lambda x, y, kernel, stride : ((x-kernel)/stride + 1, (y-kernel)/stride + 1)
28 |         full = lambda x, y, kernel, stride : ((x+kernel)/stride - 1, (y+kernel)/stride - 1)
29 | 
30 |         self.layers = []
31 |         self.layers.append(Convolution2D(input_channels=c, filters=96, kernel_size=(11,11),
32
| stride=(4,4), border_mode='valid')) 33 | nh, nw = valid(h, w, 11, 4) 34 | self.layers.append(RELU()) 35 | self.layers.append(LRN()) 36 | self.layers.append(Pooling2D(poolsize=(3,3), stride=(2,2), mode='max')) 37 | nh, nw = valid(nh, nw, 3, 2) 38 | self.layers.append(Convolution2D(input_channels=96, filters=256, kernel_size=(5,5), 39 | stride=(1,1), border_mode='full')) 40 | nh, nw = full(nh, nw, 5, 1) 41 | self.layers.append(RELU()) 42 | self.layers.append(LRN()) 43 | self.layers.append(Pooling2D(poolsize=(3,3), stride=(2,2), mode='max')) 44 | nh, nw = valid(nh, nw, 3, 2) 45 | self.layers.append(Convolution2D(input_channels=256, filters=384, kernel_size=(3,3), 46 | stride=(1,1), border_mode='full')) 47 | nh, nw = full(nh, nw, 3, 1) 48 | self.layers.append(RELU()) 49 | self.layers.append(Convolution2D(input_channels=384, filters=384, kernel_size=(3,3), 50 | stride=(1,1), border_mode='full')) 51 | nh, nw = full(nh, nw, 3, 1) 52 | self.layers.append(RELU()) 53 | self.layers.append(Convolution2D(input_channels=384, filters=256, kernel_size=(3,3), 54 | stride=(1,1), border_mode='full')) 55 | nh, nw = full(nh, nw, 3, 1) 56 | self.layers.append(RELU()) 57 | self.layers.append(Pooling2D(poolsize=(3,3), stride=(2,2), mode='max')) 58 | nh, nw = valid(nh, nw, 3, 2) 59 | 60 | self.layers.append(Flatten()) 61 | self.layers.append(Linear(256*nh*nw,4096)) 62 | self.layers.append(RELU()) 63 | self.layers.append(Dropout(0.5)) 64 | self.layers.append(Linear(4096,4096)) 65 | self.layers.append(RELU()) 66 | self.layers.append(Dropout(0.5)) 67 | self.layers.append(Linear(4096,output_dim)) 68 | self.layers.append(Softmax()) 69 | 70 | self.params = [] 71 | for layer in self.layers: 72 | self.params += layer.params 73 | 74 | def _test_fprop(self, state_below): 75 | for layer in self.layers: 76 | state_below = layer._test_fprop(state_below) 77 | return state_below 78 | 79 | def _train_fprop(self, state_below): 80 | for layer in self.layers: 81 | state_below = layer._train_fprop(state_below) 82 | return state_below 83 | -------------------------------------------------------------------------------- /example/datablocks_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import theano.tensor as T 4 | 5 | from mozi.datasets.cifar10 import Cifar10 6 | from mozi.model import Sequential 7 | from mozi.layers.linear import * 8 | from mozi.layers.noise import Dropout 9 | from mozi.layers.activation import * 10 | from mozi.layers.convolution import * 11 | from mozi.layers.misc import Flatten 12 | from mozi.log import Log 13 | from mozi.train_object import TrainObject 14 | from mozi.cost import error, entropy 15 | from mozi.learning_method import SGD 16 | from mozi.datasets.dataset import DataBlocks 17 | 18 | def setenv(): 19 | NNdir = os.path.dirname(os.path.realpath(__file__)) 20 | NNdir = os.path.dirname(NNdir) 21 | 22 | # directory to save all the dataset 23 | if not os.getenv('MOZI_DATA_PATH'): 24 | os.environ['MOZI_DATA_PATH'] = NNdir + '/data' 25 | 26 | # directory for saving the database that is used for logging the results 27 | if not os.getenv('MOZI_DATABASE_PATH'): 28 | os.environ['MOZI_DATABASE_PATH'] = NNdir + '/database' 29 | 30 | # directory to save all the trained models and outputs 31 | if not os.getenv('MOZI_SAVE_PATH'): 32 | os.environ['MOZI_SAVE_PATH'] = NNdir + '/save' 33 | 34 | print('MOZI_DATA_PATH = ' + os.environ['MOZI_DATA_PATH']) 35 | print('MOZI_SAVE_PATH = ' + os.environ['MOZI_SAVE_PATH']) 36 | 
print('MOZI_DATABASE_PATH = ' + os.environ['MOZI_DATABASE_PATH']) 37 | 38 | 39 | def train(): 40 | # create a fake dataset 41 | X1 = np.random.rand(100000, 1000) 42 | y1 = np.random.rand(100000, 10) 43 | with open('X1.npy', 'wb') as xin, open('y1.npy', 'wb') as yin: 44 | np.save(xin, X1) 45 | np.save(yin, y1) 46 | 47 | X2 = np.random.rand(100000, 1000) 48 | y2 = np.random.rand(100000, 10) 49 | with open('X2.npy', 'wb') as xin, open('y2.npy', 'wb') as yin: 50 | np.save(xin, X2) 51 | np.save(yin, y2) 52 | 53 | X3 = np.random.rand(100000, 1000) 54 | y3 = np.random.rand(100000, 10) 55 | with open('X3.npy', 'wb') as xin, open('y3.npy', 'wb') as yin: 56 | np.save(xin, X3) 57 | np.save(yin, y3) 58 | 59 | # now we can create the data by putting the paths 60 | # ('X1.npy', 'y1.npy') and ('X2.npy', 'y2.npy') into DataBlocks 61 | data = DataBlocks(data_paths=[('X1.npy', 'y1.npy'), ('X2.npy', 'y2.npy'), ('X3.npy', 'y3.npy')], 62 | batch_size=100, train_valid_test_ratio=[3,2,0], allow_preload=False) 63 | 64 | 65 | model = Sequential(input_var=T.matrix(), output_var=T.matrix()) 66 | model.add(Linear(prev_dim=1000, this_dim=200)) 67 | model.add(RELU()) 68 | model.add(Linear(prev_dim=200, this_dim=100)) 69 | model.add(RELU()) 70 | model.add(Dropout(0.5)) 71 | model.add(Linear(prev_dim=100, this_dim=10)) 72 | model.add(Softmax()) 73 | 74 | # build learning method 75 | learning_method = SGD(learning_rate=0.01, momentum=0.9, 76 | lr_decay_factor=0.9, decay_batch=5000) 77 | 78 | # put everything into the train object 79 | train_object = TrainObject(model = model, 80 | log = None, 81 | dataset = data, 82 | train_cost = entropy, 83 | valid_cost = error, 84 | learning_method = learning_method, 85 | stop_criteria = {'max_epoch' : 10, 86 | 'epoch_look_back' : 5, 87 | 'percent_decrease' : 0.01} 88 | ) 89 | # finally run the code 90 | train_object.setup() 91 | train_object.run() 92 | 93 | for X_path, y_path in [('X1.npy', 'y1.npy'), ('X2.npy', 'y2.npy')]: 94 | with open(X_path) as Xin, open(y_path) as yin: 95 | # test the model on test set 96 | ypred = model.fprop(np.load(Xin)) 97 | ypred = np.argmax(ypred, axis=1) 98 | y = np.argmax(np.load(yin), axis=1) 99 | accuracy = np.equal(ypred, y).astype('f4').sum() / len(y) 100 | print 'combined accuracy for blk %s:'%X_path, accuracy 101 | 102 | 103 | if __name__ == '__main__': 104 | setenv() 105 | train() 106 | -------------------------------------------------------------------------------- /mozi/layers/activation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams 5 | from mozi.layers.template import Template 6 | from mozi.utils.theano_utils import sharedX 7 | 8 | floatX = theano.config.floatX 9 | theano_rand = MRG_RandomStreams() 10 | 11 | 12 | class Sigmoid(Template): 13 | def _train_fprop(self, state_below): 14 | return T.nnet.sigmoid(state_below) 15 | 16 | 17 | class RELU(Template): 18 | def _train_fprop(self, state_below): 19 | return state_below * (state_below > 0.) 
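
# --- Editor's sketch (not part of the original library). A quick check of
# the RELU forward pass defined just above, assuming only the module-level
# imports already present here (numpy as np, theano, T, floatX) and that
# Template routes _test_fprop to _train_fprop; `_demo_relu` is a
# hypothetical helper name.
def _demo_relu():
    x = T.matrix('x')
    f = theano.function([x], RELU()._train_fprop(x))
    return f(np.asarray([[-1., 2.]], dtype=floatX))  # -> [[ 0.,  2.]]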
20 | 21 | 22 | class PRELU(Template): 23 | def __init__(self, dim, alpha=0.2): 24 | ''' 25 | y = wx + b 26 | if y > 0 then z = y else z = alpha * y 27 | return z 28 | alpha: the gradient of the slope which is updated by backpropagation 29 | ''' 30 | self.alpha = sharedX(np.ones(dim) * alpha, name='PRELU_gradient') 31 | self.params = [self.alpha] 32 | 33 | def _test_fprop(self, state_below): 34 | return self._train_fprop(state_below) 35 | 36 | def _train_fprop(self, state_below): 37 | return state_below * (state_below >= 0) \ 38 | + self.alpha * state_below * (state_below < 0) 39 | 40 | 41 | class LeakyRELU(Template): 42 | def __init__(self, alpha=0.01): 43 | self.alpha = sharedX(alpha) 44 | self.params = [] 45 | 46 | def _train_fprop(self, state_below): 47 | return state_below * (state_below >= 0) \ 48 | + self.alpha * state_below * (state_below < 0) 49 | 50 | 51 | class Noisy_RELU(Template): 52 | def __init__(self, sparsity_factor=0.1, threshold_lr=0.01, alpha=0.01, std=0.1, num_batch=10000, **kwargs): 53 | ''' 54 | sparsityFactor: the micro sparsity of signals through each neuron 55 | threshold_lr: the learning rate of learning the optimum threshold for each neuron 56 | so that the activeness of the neuron approaches sparsityFactor 57 | alpha_range: {start_weight, num_batches, final_weight} for setting the weight on the 58 | contemporary sparsity when calculating the mean sparsity over many batches. 59 | For the start, it will place more weight on the contemporary, but as more 60 | epoch goes through, the weight on contemporary batch should decrease, so 61 | that mean_sparsity will be more stable. 62 | std: the standard deviation of the noise 63 | ''' 64 | super(Noisy_RELU, self).__init__(**kwargs) 65 | self.sparsity_factor = sparsity_factor 66 | self.threshold_lr = threshold_lr 67 | self.alpha = alpha 68 | self.std = std 69 | self.num_batch = num_batch 70 | self.threshold = 0. 71 | self.activity = 0. 
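        # (editor's note, added) threshold and activity start out as Python
        # floats and are re-assigned symbolically in _train_fprop below: the
        # threshold is nudged up whenever a unit fires more often than
        # sparsity_factor, and nudged down when it fires less often.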
72 |         self.batch_count = 0
73 | 
74 | 
75 |     def _test_fprop(self, state_below):
76 |         return state_below * (state_below > self.threshold)
77 | 
78 |     def _train_fprop(self, state_below):
79 |         if self.batch_count > self.num_batch:
80 |             return state_below * (state_below > self.threshold)
81 | 
82 |         else:
83 |             self.batch_count += 1
84 |             state_below = state_below + theano_rand.normal(size=state_below.shape, std=self.std, dtype=floatX)
85 |             state_below = state_below * (state_below > self.threshold)
86 |             activity = T.mean(state_below > 0, axis=0)
87 |             self.activity = self.alpha * activity + (1-self.alpha) * self.activity
88 |             self.threshold += self.threshold_lr * (self.activity - self.sparsity_factor)
89 |             return state_below * (state_below > self.threshold)
90 | 
91 | 
92 | class Softmax(Template):
93 |     def _train_fprop(self, state_below):
94 |         return T.nnet.softmax(state_below)
95 | 
96 | 
97 | class Tanh(Template):
98 |     def _train_fprop(self, state_below):
99 |         return T.tanh(state_below)
100 | 
101 | 
102 | class Softplus(Template):
103 |     def _train_fprop(self, state_below):
104 |         return T.nnet.softplus(state_below)
105 | 
106 | 
107 | class ELU(Template):
108 |     def __init__(self, alpha=1.0):
109 |         self.alpha = alpha
110 |         self.params = []
111 |     def _train_fprop(self, state_below):
112 |         return self.alpha*(T.exp(state_below)-1)*(state_below<0) + state_below*(state_below>=0)
113 | 
--------------------------------------------------------------------------------
/mozi/layers/normalization.py:
--------------------------------------------------------------------------------
1 | 
2 | from mozi.layers.template import Template
3 | from mozi.utils.theano_utils import shared_zeros, sharedX, shared_ones
4 | from mozi.weight_init import UniformWeight
5 | import theano.tensor as T
6 | import theano
7 | 
8 | class BatchNormalization(Template):
9 | 
10 |     def __init__(self, dim, layer_type, gamma_init=UniformWeight(), short_memory=0.01):
11 |         '''
12 |         REFERENCE:
13 |             Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
14 |         PARAMS:
15 |             short_memory: short term memory
16 |                 y_t is the latest value, the moving average x_tp1 is calculated as
17 |                 x_tp1 = memory * y_t + (1-memory) * x_t, the larger the short term
18 |                 memory, the more weight is put on contemporary.
19 |             layer_type: fc or conv
20 |             epsilon:
21 |                 denominator min value for preventing division by zero in computing std
22 |             dim: for fc layers, shape is the layer dimension, for conv layers,
23 |                 shape is the number of feature maps
24 |         '''
25 | 
26 |         assert layer_type in ['fc', 'conv']
27 |         self.layer_type = layer_type
28 |         self.epsilon = 1e-6
29 |         self.dim = dim
30 |         self.mem = short_memory
31 | 
32 |         if self.layer_type == 'fc':
33 |             input_shape = (1, dim)
34 |             self.broadcastable = (True, False)
35 |         elif self.layer_type == 'conv':
36 |             input_shape = (1, dim, 1, 1)
37 |             self.broadcastable = (True, False, True, True)
38 | 
39 |         self.gamma = gamma_init(input_shape, name='gamma')
40 |         self.beta = shared_zeros(input_shape, name='beta')
41 |         self.params = [self.gamma, self.beta]
42 |         self.moving_mean = 0
43 |         self.moving_var = 1
44 | 
45 |     def _train_fprop(self, state_below):
46 |         if self.layer_type == 'fc':
47 |             miu = state_below.mean(axis=0)
48 |             var = T.mean((state_below - miu)**2, axis=0)
49 |         elif self.layer_type == 'conv':
50 |             miu = state_below.mean(axis=(0,2,3), keepdims=True)
51 |             var = T.mean((state_below - miu)**2, axis=(0,2,3), keepdims=True)
52 |         self.moving_mean = self.mem * miu + (1-self.mem) * self.moving_mean
53 |         self.moving_var = self.mem * var + (1-self.mem) * self.moving_var
54 | 
55 |         Z = (state_below - self.moving_mean) / T.sqrt(self.moving_var + self.epsilon)
56 |         gamma = T.patternbroadcast(self.gamma, self.broadcastable)
57 |         beta = T.patternbroadcast(self.beta, self.broadcastable)
58 |         return gamma * Z + beta
59 | 
60 | 
61 |     def _test_fprop(self, state_below):
62 |         Z = (state_below - self.moving_mean) / T.sqrt(self.moving_var + self.epsilon)
63 |         gamma = T.patternbroadcast(self.gamma, self.broadcastable)
64 |         beta = T.patternbroadcast(self.beta, self.broadcastable)
65 |         return gamma * Z + beta
66 | 
67 | 
68 |     def _layer_stats(self, state_below, layer_output):
69 |         return [('moving_mean', T.mean(self.moving_mean)),
70 |                 ('moving_std', T.mean(self.moving_var)),
71 |                 ('gamma_mean', T.mean(self.gamma)),
72 |                 ('beta_mean', T.mean(self.beta)),
73 |                 ('gamma_max', T.max(self.gamma))]
74 | 
75 | 
76 | class LRN(Template):
77 |     """
78 |     Adapted from pylearn2
79 |     Local Response Normalization
80 |     """
81 | 
82 |     def __init__(self, n=5, alpha=0.0001, beta=0.75, k=2):
83 |         super(LRN, self).__init__()
84 |         self.n = n
85 |         self.alpha = alpha
86 |         self.beta = beta
87 |         self.k = k
88 |         assert self.n % 2 == 1, 'only odd n is supported'
89 | 
90 |     def _train_fprop(self, state_below):
91 |         half = self.n / 2
92 |         sq = T.sqr(state_below)
93 |         b, ch, r, c = state_below.shape
94 |         extra_channels = T.alloc(0., b, ch + 2*half, r, c)
95 |         sq = T.set_subtensor(extra_channels[:,half:half+ch,:,:], sq)
96 |         scale = self.k
97 | 
98 |         for i in xrange(self.n):
99 |             scale += self.alpha * sq[:,i:i+ch,:,:]
100 | 
101 |         scale = scale ** self.beta
102 |         return state_below / scale
103 | 
104 |     def _test_fprop(self, state_below):
105 |         return self._train_fprop(state_below)
106 | 
--------------------------------------------------------------------------------
/example/imdb_bilstm.py:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | 
4 | from mozi.datasets.imdb import IMDB
5 | from mozi.model import Sequential
6 | from mozi.layers.linear import Linear
7 | from mozi.layers.noise import Dropout
8 | from mozi.layers.activation import RELU, Sigmoid
9 | from mozi.layers.normalization import BatchNormalization
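# (Editor's sketch, hypothetical usage, not part of the original script.)
# The BatchNormalization layer imported above keeps its own moving
# mean/variance; a minimal fully-connected use on a (batch, dim) input:
#   bn = BatchNormalization(dim=100, layer_type='fc')
#   y_train = bn._train_fprop(T.matrix())  # folds batch stats into the moving stats
#   y_test = bn._test_fprop(T.matrix())    # normalizes with the moving stats only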
10 | from mozi.layers.embedding import Embedding 11 | from mozi.env import setenv 12 | from mozi.layers.recurrent import BiLSTM, LSTM 13 | from mozi.layers.misc import Transform, Flatten, Reshape 14 | from mozi.learning_method import SGD 15 | from mozi.log import Log 16 | from mozi.train_object import TrainObject 17 | from mozi.cost import mse, error 18 | import theano.tensor as T 19 | 20 | import cPickle 21 | import sys 22 | 23 | ''' 24 | Train a BiDirectionLSTM LSTM on the IMDB sentiment classification task. 25 | The dataset is actually too small for LSTM to be of any advantage 26 | compared to simpler, much faster methods such as TF-IDF+LogReg. 27 | Notes: 28 | - RNNs are tricky. Choice of batch size is important, 29 | choice of loss and optimizer is critical, etc. 30 | Most configurations won't converge. 31 | - LSTM loss decrease during training can be quite different 32 | from what you see with CNNs/MLPs/etc. It's more or less a sigmoid 33 | instead of an inverse exponential. 34 | GPU command: 35 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_lstm.py 36 | 250s/epoch on GPU (GT 650M), vs. 400s/epoch on CPU (2.4Ghz Core i7). 37 | ''' 38 | 39 | def train(): 40 | max_features=20000 41 | maxseqlen = 100 # cut texts after this number of words (among top max_features most common words) 42 | batch_size = 16 43 | word_vec_len = 256 44 | iter_class = 'SequentialRecurrentIterator' 45 | seq_len = 10 46 | 47 | data = IMDB(pad_zero=True, maxlen=100, nb_words=max_features, batch_size=batch_size, 48 | train_valid_test_ratio=[8,2,0], iter_class=iter_class, seq_len=seq_len) 49 | 50 | print('Build model...') 51 | model = Sequential(input_var=T.matrix(), output_var=T.matrix()) 52 | model.add(Embedding(max_features, word_vec_len)) 53 | 54 | # MLP layers 55 | model.add(Transform((word_vec_len,))) # transform from 3d dimensional input to 2d input for mlp 56 | model.add(Linear(word_vec_len, 100)) 57 | model.add(RELU()) 58 | model.add(BatchNormalization(dim=100, layer_type='fc')) 59 | model.add(Linear(100,100)) 60 | model.add(RELU()) 61 | model.add(BatchNormalization(dim=100, layer_type='fc')) 62 | model.add(Linear(100, word_vec_len)) 63 | model.add(RELU()) 64 | model.add(Transform((maxseqlen, word_vec_len))) # transform back from 2d to 3d for recurrent input 65 | 66 | # Stacked up BiLSTM layers 67 | model.add(BiLSTM(word_vec_len, 50, output_mode='concat', return_sequences=True)) 68 | model.add(BiLSTM(100, 24, output_mode='sum', return_sequences=True)) 69 | model.add(LSTM(24, 24, return_sequences=True)) 70 | 71 | # MLP layers 72 | model.add(Reshape((24 * maxseqlen,))) 73 | model.add(BatchNormalization(dim=24 * maxseqlen, layer_type='fc')) 74 | model.add(Linear(24 * maxseqlen, 50)) 75 | model.add(RELU()) 76 | model.add(Dropout(0.2)) 77 | model.add(Linear(50, 1)) 78 | model.add(Sigmoid()) 79 | 80 | # build learning method 81 | decay_batch = int(data.train.X.shape[0] * 5 / batch_size) 82 | learning_method = SGD(learning_rate=0.1, momentum=0.9, 83 | lr_decay_factor=1.0, decay_batch=decay_batch) 84 | 85 | # Build Logger 86 | log = Log(experiment_name = 'MLP', 87 | description = 'This is a tutorial', 88 | save_outputs = True, # log all the outputs from the screen 89 | save_model = True, # save the best model 90 | save_epoch_error = True, # log error at every epoch 91 | save_to_database = {'name': 'Example.sqlite3', 92 | 'records': {'Batch_Size': batch_size, 93 | 'Learning_Rate': learning_method.learning_rate, 94 | 'Momentum': learning_method.momentum}} 95 | ) # end log 96 | 97 | # put everything 
into the train object 98 | train_object = TrainObject(model = model, 99 | log = log, 100 | dataset = data, 101 | train_cost = mse, 102 | valid_cost = error, 103 | learning_method = learning_method, 104 | stop_criteria = {'max_epoch' : 100, 105 | 'epoch_look_back' : 5, 106 | 'percent_decrease' : 0.01} 107 | ) 108 | # finally run the code 109 | train_object.setup() 110 | train_object.run() 111 | 112 | 113 | if __name__ == '__main__': 114 | setenv() 115 | train() 116 | -------------------------------------------------------------------------------- /mozi/utils/mnist_utils.py: -------------------------------------------------------------------------------- 1 | """Low-level utilities for reading in raw MNIST files.""" 2 | 3 | __author__ = "David Warde-Farley" 4 | __copyright__ = "Copyright 2012, Universite de Montreal" 5 | __credits__ = ["David Warde-Farley"] 6 | __license__ = "3-clause BSD" 7 | __email__ = "wardefar@iro" 8 | __maintainer__ = "David Warde-Farley" 9 | 10 | 11 | import struct 12 | import numpy 13 | import gzip 14 | 15 | import tarfile, inspect, os 16 | from six.moves.urllib.request import urlretrieve 17 | from mozi.utils.progbar import Progbar 18 | 19 | MNIST_IMAGE_MAGIC = 2051 20 | MNIST_LABEL_MAGIC = 2049 21 | 22 | def get_mnist_file(fpath, origin): 23 | datadir = os.path.dirname(fpath) 24 | if not os.path.exists(datadir): 25 | os.makedirs(datadir) 26 | 27 | try: 28 | f = open(fpath) 29 | except: 30 | print('Downloading data from', origin) 31 | 32 | global progbar 33 | progbar = None 34 | def dl_progress(count, block_size, total_size): 35 | global progbar 36 | if progbar is None: 37 | progbar = Progbar(total_size) 38 | else: 39 | progbar.update(count*block_size) 40 | 41 | urlretrieve(origin, fpath + '.gz', dl_progress) 42 | progbar = None 43 | 44 | fin = gzip.open(fpath + '.gz', 'rb') 45 | fout = open(fpath, 'wb') 46 | fout.write(fin.read()) 47 | fin.close() 48 | fout.close() 49 | 50 | return fpath 51 | 52 | 53 | class open_if_filename(object): 54 | def __init__(self, f, mode='r', buffering=-1): 55 | self._f = f 56 | self._mode = mode 57 | self._buffering = buffering 58 | self._handle = None 59 | 60 | def __enter__(self): 61 | if isinstance(self._f, basestring): 62 | self._handle = open(self._f, self._mode, self._buffering) 63 | else: 64 | self._handle = self._f 65 | return self._handle 66 | 67 | def __exit__(self, exc_type, exc_value, traceback): 68 | if self._handle is not self._f: 69 | self._handle.close() 70 | 71 | 72 | def read_mnist_images(fn, dtype=None): 73 | """ 74 | Read MNIST images from the original ubyte file format. 75 | 76 | Parameters 77 | ---------- 78 | fn : str or object 79 | Filename/path from which to read labels, or an open file 80 | object for the same (will not be closed for you). 81 | 82 | dtype : str or object, optional 83 | A NumPy dtype or string that can be converted to one. 84 | If unspecified, images will be returned in their original 85 | unsigned byte format. 86 | 87 | Returns 88 | ------- 89 | images : ndarray, shape (n_images, n_rows, n_cols) 90 | An image array, with individual examples indexed along the 91 | first axis and the image dimensions along the second and 92 | third axis. 93 | 94 | Notes 95 | ----- 96 | If the dtype provided was boolean, the resulting array will 97 | be boolean with `True` if the corresponding pixel had a value 98 | greater than or equal to 128, `False otherwise. 
99 | 
100 |     If the dtype provided was a float or complex dtype, the values
101 |     will be mapped to the unit interval [0, 1], with pixel values
102 |     that were 255 in the original unsigned byte representation
103 |     equal to 1.0.
104 |     """
105 |     with open_if_filename(fn, 'rb') as f:
106 |         magic, number, rows, cols = struct.unpack('>iiii', f.read(16))
107 |         if magic != MNIST_IMAGE_MAGIC:
108 |             raise ValueError('wrong magic number reading MNIST image file: ' +
109 |                              fn)
110 |         array = numpy.fromfile(f, dtype='uint8').reshape((number, rows, cols))
111 |         if dtype:
112 |             dtype = numpy.dtype(dtype)
113 |             # If the user wants booleans, threshold at half the range.
114 |             if dtype.kind == 'b':
115 |                 array = array >= 128
116 |             else:
117 |                 # Otherwise, just convert.
118 |                 array = array.astype(dtype)
119 |                 # I don't know why you'd ever turn MNIST into complex,
120 |                 # but just in case, check for float *or* complex dtypes.
121 |                 # Either way, map to the unit interval.
122 |                 if dtype.kind in ('f', 'c'):
123 |                     array /= 255.
124 |         return array
125 | 
126 | 
127 | def read_mnist_labels(fn):
128 |     """
129 |     Read MNIST labels from the original ubyte file format.
130 | 
131 |     Parameters
132 |     ----------
133 |     fn : str or object
134 |         Filename/path from which to read labels, or an open file
135 |         object for the same (will not be closed for you).
136 | 
137 |     Returns
138 |     -------
139 |     labels : ndarray, shape (nlabels,)
140 |         A one-dimensional unsigned byte array containing the
141 |         labels as integers.
142 |     """
143 |     with open_if_filename(fn, 'rb') as f:
144 |         magic, number = struct.unpack('>ii', f.read(8))
145 |         if magic != MNIST_LABEL_MAGIC:
146 |             raise ValueError('wrong magic number reading MNIST label file: ' +
147 |                              fn)
148 |         array = numpy.fromfile(f, dtype='uint8')
149 |         return array
150 | 
--------------------------------------------------------------------------------
/mozi/utils/image.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | 
3 | '''
4 | Adapted from http://deeplearning.net/tutorial/code/utils.py
5 | '''
6 | 
7 | 
8 | def scale_to_unit_interval(ndar, eps=1e-8):
9 |     """ Scales all values in the ndarray ndar to be between 0 and 1 """
10 |     ndar = ndar.copy()
11 |     ndar -= ndar.min()
12 |     ndar *= 1.0 / (ndar.max() + eps)
13 |     return ndar
14 | 
15 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
16 |                        scale_rows_to_unit_interval=True,
17 |                        output_pixel_vals=True):
18 |     """
19 |     Transform an array with one flattened image per row, into an array in
20 |     which images are reshaped and laid out like tiles on a floor.
21 | 
22 |     This function is useful for visualizing datasets whose rows are images,
23 |     and also columns of matrices for transforming those rows
24 |     (such as the first layer of a neural net).
25 | 
26 |     :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
27 |     be 2-D ndarrays or None;
28 |     :param X: a 2-D array in which every row is a flattened image.
29 | 
30 |     :type img_shape: tuple; (height, width)
31 |     :param img_shape: the original shape of each image
32 | 
33 |     :type tile_shape: tuple; (rows, cols)
34 |     :param tile_shape: the number of images to tile (rows, cols)
35 | 
36 |     :param output_pixel_vals: if output should be pixel values (i.e. int8
37 |     values) or floats
38 | 
39 |     :param scale_rows_to_unit_interval: if the values need to be scaled before
40 |     being plotted to [0,1] or not
41 | 
42 | 
43 |     :returns: array suitable for viewing as an image.
44 |     (See:`PIL.Image.fromarray`.)
45 | :rtype: a 2-d array with same dtype as X. 46 | 47 | """ 48 | 49 | assert len(img_shape) == 2 50 | assert len(tile_shape) == 2 51 | assert len(tile_spacing) == 2 52 | 53 | # The expression below can be re-written in a more C style as 54 | # follows : 55 | # 56 | # out_shape = [0,0] 57 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 58 | # tile_spacing[0] 59 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 60 | # tile_spacing[1] 61 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp 62 | in zip(img_shape, tile_shape, tile_spacing)] 63 | 64 | if isinstance(X, (list, tuple)): 65 | assert len(X) == 4 66 | # Create an output numpy ndarray to store the image 67 | if output_pixel_vals: 68 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 69 | dtype='uint8') 70 | else: 71 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 72 | dtype=X.dtype) 73 | 74 | #colors default to 0, alpha defaults to 1 (opaque) 75 | if output_pixel_vals: 76 | channel_defaults = [0, 0, 0, 255] 77 | else: 78 | channel_defaults = [0., 0., 0., 1.] 79 | 80 | for i in xrange(4): 81 | if X[i] is None: 82 | # if channel is None, fill it with zeros of the correct 83 | # dtype 84 | dt = out_array.dtype 85 | if output_pixel_vals: 86 | dt = 'uint8' 87 | out_array[:, :, i] = numpy.zeros(out_shape, 88 | dtype=dt) + channel_defaults[i] 89 | else: 90 | # use a recurrent call to compute the channel and store it 91 | # in the output 92 | out_array[:, :, i] = tile_raster_images( 93 | X[i], img_shape, tile_shape, tile_spacing, 94 | scale_rows_to_unit_interval, output_pixel_vals) 95 | return out_array 96 | 97 | else: 98 | # if we are dealing with only one channel 99 | H, W = img_shape 100 | Hs, Ws = tile_spacing 101 | 102 | # generate a matrix to store the output 103 | dt = X.dtype 104 | if output_pixel_vals: 105 | dt = 'uint8' 106 | out_array = numpy.zeros(out_shape, dtype=dt) 107 | 108 | for tile_row in xrange(tile_shape[0]): 109 | for tile_col in xrange(tile_shape[1]): 110 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 111 | this_x = X[tile_row * tile_shape[1] + tile_col] 112 | if scale_rows_to_unit_interval: 113 | # if we should scale values to be between 0 and 1 114 | # do this by calling the `scale_to_unit_interval` 115 | # function 116 | this_img = scale_to_unit_interval( 117 | this_x.reshape(img_shape)) 118 | else: 119 | this_img = this_x.reshape(img_shape) 120 | # add the slice to the corresponding position in the 121 | # output array 122 | c = 1 123 | if output_pixel_vals: 124 | c = 255 125 | out_array[ 126 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 127 | tile_col * (W + Ws): tile_col * (W + Ws) + W 128 | ] = this_img * c 129 | return out_array 130 | -------------------------------------------------------------------------------- /mozi/learning_method.py: -------------------------------------------------------------------------------- 1 | 2 | import theano 3 | import theano.tensor as T 4 | from theano.ifelse import ifelse 5 | floatX = theano.config.floatX 6 | 7 | import numpy as np 8 | from mozi.utils.theano_utils import sharedX, asfloatX 9 | 10 | class LearningMethod(object): 11 | 12 | def update(self, deltas, params, gparams): 13 | """ 14 | Return a list of tuples 15 | """ 16 | raise NotImplementedError(str(type(self))+" does not implement delta.") 17 | 18 | @property 19 | def learning_rate(self): 20 | return float(self.lr.get_value()) 21 | 22 | @property 23 | def momentum(self): 24 | return float(self.mom.get_value()) 25 | 26 | 27 | class 
DecayLearning(LearningMethod): 28 | 29 | def __init__(self, lr_decay_factor=1.0, decay_batch=10000): 30 | self.batch = sharedX(0) 31 | self.decay_batch = sharedX(decay_batch) 32 | self.lr_decay_factor = asfloatX(lr_decay_factor) 33 | 34 | def decay(self): 35 | updates = [] 36 | new_batch = ifelse(T.gt(self.batch, self.decay_batch), sharedX(0), self.batch+1) 37 | new_lr = ifelse(T.gt(self.batch, self.decay_batch), self.lr*self.lr_decay_factor, self.lr) 38 | updates.append((self.batch, new_batch)) 39 | updates.append((self.lr, new_lr)) 40 | return updates 41 | 42 | 43 | class SGD(DecayLearning): 44 | 45 | def __init__(self, learning_rate=0.01, momentum=0.9, **kwargs): 46 | super(SGD, self).__init__(**kwargs) 47 | self.lr = sharedX(learning_rate) 48 | self.mom = sharedX(momentum) 49 | 50 | def update(self, deltas, params, gparams): 51 | updates = [] 52 | for delta, param, gparam in zip(deltas, params, gparams): 53 | updates.append((delta, self.mom * delta - self.lr * gparam)) 54 | updates.append((param, param+delta)) 55 | 56 | updates += self.decay() 57 | return updates 58 | 59 | 60 | class AdaGrad(DecayLearning): 61 | 62 | def __init__(self, learning_rate=0.9, momentum=0., k=1.0, **kwargs): 63 | """ 64 | dx = -learning_rate / sqrt(k + sum(gparam^2)) * gparam 65 | ref : Chris Dyer : Notes on AdaGrad 66 | """ 67 | super(AdaGrad, self).__init__(**kwargs) 68 | self.lr = sharedX(learning_rate) 69 | self.mom = sharedX(momentum) 70 | self.k = sharedX(k) 71 | 72 | def update(self, deltas, params, gparams): 73 | updates = [] 74 | for delta, param, gparam in zip(deltas, params, gparams): 75 | eps = theano.shared(self.k.get_value() * np.ones_like(delta.get_value(borrow=True, return_internal_type=True))) 76 | updates.append((eps, eps + gparam ** 2)) 77 | updates.append((delta, self.mom * delta - self.lr * gparam / T.sqrt(eps))) 78 | updates.append((param, param+delta)) 79 | updates += self.decay() 80 | return updates 81 | 82 | 83 | class AdaDelta(LearningMethod): 84 | 85 | def __init__(self, eps=1e-6, rho=0.95): 86 | """ 87 | dx_t = -rms(dx_{t-1}) / rms(gparam_t) * gparam_t 88 | rms(dx) = sqrt(E_t(dx^2) + eps) 89 | E_t(dx^s) = rho E_{t-1}(dx^2) + (1-rho) dx^2 90 | ref : Matthew D. 
Zeiler: ADADELTA: AN ADAPTIVE LEARNING RATE METHOD 91 | """ 92 | self.eps = sharedX(eps) 93 | self.rho = sharedX(rho) 94 | 95 | def update(self, deltas, params, gparams): 96 | updates = [] 97 | for delta, param, gparam in zip(deltas, params, gparams): 98 | gparam_mean = theano.shared(np.zeros_like(delta.get_value(borrow=True, return_internal_type=True))) 99 | updates.append((gparam_mean, self.rho * gparam_mean + (1-self.rho) * gparam**2)) 100 | delta_mean = theano.shared(np.zeros_like(delta.get_value(borrow=True, return_internal_type=True))) 101 | updates.append((delta_mean, self.rho * delta_mean + (1-self.rho) * delta**2)) 102 | updates.append((delta, -T.sqrt(delta_mean+self.eps) / T.sqrt(gparam_mean+self.eps) * gparam)) 103 | updates.append((param, param+delta)) 104 | return updates 105 | 106 | 107 | class RMSprop(DecayLearning): 108 | 109 | def __init__(self, learning_rate=0.01, eps=1e-6, rho=0.9, **kwargs): 110 | super(RMSprop, self).__init__(**kwargs) 111 | self.lr = sharedX(learning_rate) 112 | self.eps = sharedX(eps) 113 | self.rho = sharedX(rho) 114 | 115 | def update(self, deltas, params, gparams): 116 | updates = [] 117 | for delta, param, gparam in zip(deltas, params, gparams): 118 | new_delta = self.rho * delta + (1-self.rho) * gparam**2 119 | new_param = param - self.lr * gparam / T.sqrt(new_delta + self.eps) 120 | updates.append((delta, new_delta)) 121 | updates.append((param, new_param)) 122 | updates += self.decay() 123 | return updates 124 | 125 | 126 | class Adam(DecayLearning): 127 | 128 | def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, **kwargs): 129 | super(Adam, self).__init__(**kwargs) 130 | self.lr = sharedX(learning_rate) 131 | self.iter = sharedX(0) 132 | self.beta_1 = sharedX(beta_1) 133 | self.beta_2 = sharedX(beta_2) 134 | self.eps = sharedX(eps) 135 | 136 | def update(self, deltas, params, gparams): 137 | t = self.iter + 1 138 | lr_t = self.lr * T.sqrt(1-self.beta_2**t)/(1-self.beta_1**t) 139 | updates = [] 140 | for delta, param, gparam in zip(deltas, params, gparams): 141 | m = sharedX(param.get_value() * 0.) 142 | v = sharedX(param.get_value() * 0.) 
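            # (editor's note, added) m and v hold the running first- and
            # second-moment estimates from Kingma & Ba's Adam, and lr_t
            # above is the bias-corrected step size. As written, self.iter
            # is never advanced in the returned updates, so t stays at 1
            # unless the caller updates it separately.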
143 | m_t = (self.beta_1 * m) + (1 - self.beta_1) * gparam 144 | v_t = (self.beta_2 * v) + (1 - self.beta_2) * gparam**2 145 | param_t = param - lr_t * m_t / (T.sqrt(v_t) + self.eps) 146 | updates.append((m, m_t)) 147 | updates.append((v, v_t)) 148 | updates.append((param, param_t)) 149 | updates += self.decay() 150 | return updates 151 | -------------------------------------------------------------------------------- /mozi/log.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | import os 4 | import sys 5 | import logging 6 | import cPickle 7 | import sqlite3 8 | import operator 9 | import copy 10 | import numpy as np 11 | 12 | import theano 13 | from theano.sandbox.cuda.var import CudaNdarraySharedVariable 14 | 15 | floatX = theano.config.floatX 16 | 17 | class Log: 18 | 19 | def __init__(self, experiment_name="experiment", description=None, 20 | save_outputs=False, save_model=False, 21 | save_epoch_error=False, save_to_database=None, logger=None): 22 | 23 | self.experiment_name = experiment_name 24 | self.description = description 25 | self.save_outputs = save_outputs 26 | self.save_model = save_model 27 | self.save_epoch_error = save_epoch_error 28 | self.save_to_database = save_to_database 29 | 30 | dt = datetime.now() 31 | dt = dt.strftime('%Y%m%d_%H%M_%S%f') 32 | 33 | self.exp_id = experiment_name + '_' + dt 34 | 35 | if save_outputs or save_model: 36 | save_dir = os.environ['MOZI_SAVE_PATH'] 37 | if not os.path.exists(save_dir): 38 | os.mkdir(save_dir) 39 | 40 | self.exp_dir = save_dir + '/' + self.exp_id 41 | if not os.path.exists(self.exp_dir): 42 | os.mkdir(self.exp_dir) 43 | 44 | self.logger = logger 45 | if self.logger is None: 46 | self.logger = logging.getLogger(__name__) 47 | self.logger.setLevel(logging.DEBUG) 48 | 49 | self.logger.info('exp_id: ' + experiment_name) 50 | 51 | if save_outputs: 52 | ch = logging.FileHandler(filename=self.exp_dir+'/outputs.log') 53 | ch.setLevel(logging.INFO) 54 | formatter = logging.Formatter('%(message)s') 55 | ch.setFormatter(formatter) 56 | self.logger.addHandler(ch) 57 | 58 | if save_epoch_error: 59 | self.epoch_error_path = self.exp_dir+'/epoch_error.csv' 60 | with open(self.epoch_error_path, 'wb') as epoch_file: 61 | epoch_file.write('Epoch,Train_Cost,Valid_Cost,Valid_Error\n') 62 | 63 | if description is not None: 64 | self.logger.info('Description: ' + self.description) 65 | 66 | if save_to_database: 67 | self.first_time_record = True 68 | if not os.path.exists(os.environ['MOZI_DATABASE_PATH']): 69 | os.mkdir(os.environ['MOZI_DATABASE_PATH']) 70 | 71 | def info(self, msg): 72 | self.logger.info(msg) 73 | 74 | def print_records(self): 75 | sorted_ls = sorted(self.save_to_database['records'].iteritems(), 76 | key=operator.itemgetter(0)) 77 | for key, value in sorted_ls: 78 | self.info(key + ': ' + str(value)) 79 | 80 | def _log_outputs(self, outputs): 81 | dt = datetime.now() 82 | dt = dt.strftime('%Y-%m-%d %H:%M') 83 | self.logger.info('Time: ' + dt) 84 | 85 | for (name, val) in outputs: 86 | self.logger.info(name + ': ' + str(val)) 87 | 88 | if self.save_outputs: 89 | self.logger.info('[ outputs saved to: %s ]\n' %self.exp_id) 90 | 91 | def _save_model(self, model): 92 | with open(self.exp_dir+'/model.pkl', 'wb') as pkl_file: 93 | cPickle.dump(model, pkl_file) 94 | 95 | def _save_epoch_error(self, epoch, train_cost, valid_cost, valid_error): 96 | with open(self.epoch_error_path, 'ab') as epoch_file: 97 | epoch_file.write('{},{},{},{}\n'.format(epoch, 
train_cost, valid_cost, valid_error)) 98 | 99 | def _save_to_database(self, epoch, train_cost, valid_error, best_valid_error): 100 | conn = sqlite3.connect(os.environ['MOZI_DATABASE_PATH'] + '/' + self.save_to_database['name']) 101 | cur = conn.cursor() 102 | 103 | if self.first_time_record: 104 | query = 'CREATE TABLE IF NOT EXISTS ' + self.experiment_name + \ 105 | '(exp_id TEXT PRIMARY KEY NOT NULL,' 106 | 107 | for k,v in self.save_to_database['records'].items(): 108 | if type(v) is str: 109 | query += k + ' TEXT,' 110 | elif type(v) is int: 111 | query += k + ' INT,' 112 | elif type(v) is float: 113 | query += k + ' REAL,' 114 | else: 115 | try: 116 | self.save_to_database['records'][k] = str(v) 117 | query += str(k) + ' TEXT,' 118 | except: 119 | raise Exception("Error: The input types for records '{}' of {}".format(k, type(v)) 120 | + " is not primitive types (str, int, float) and not castable as str.") 121 | 122 | query += 'epoch INT, train_cost REAL, valid_error REAL, best_valid_error REAL);' 123 | 124 | cur.execute(query) 125 | 126 | try: 127 | query = 'INSERT INTO ' + self.experiment_name + ' VALUES(' 128 | ls = [self.exp_id] 129 | for k, v in self.save_to_database['records'].items(): 130 | query += '?,' 131 | ls.append(v) 132 | query += '?,?,?,?,?);' 133 | ls.extend([epoch, train_cost, valid_error, best_valid_error]) 134 | cur.execute(query, ls) 135 | self.first_time_record = False 136 | 137 | except sqlite3.OperationalError as err: 138 | self.logger.error('sqlite3.OperationalError: ' + err.message) 139 | self.logger.error('Solution: Change the experiment_name in Log() to a new name, ' 140 | + 'or drop the same table name from the database. ' 141 | + 'experiment_name is used as the table name.') 142 | raise 143 | 144 | else: 145 | cur.execute('UPDATE ' + self.experiment_name + ' SET ' + 146 | 'epoch = ?, ' + 147 | 'train_cost = ?,' + 148 | 'valid_error = ?,' + 149 | 'best_valid_error = ?' 
+
150 |                     " WHERE exp_id='%s'"%self.exp_id,
151 |                     [epoch,
152 |                      train_cost,
153 |                      valid_error,
154 |                      best_valid_error])
155 |         conn.commit()
156 |         conn.close()
157 | 
--------------------------------------------------------------------------------
/mozi/utils/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import print_function
3 | 
4 | import matplotlib
5 | import theano
6 | import theano.tensor as T
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 | from theano.compile.ops import as_op
10 | from mozi.utils.progbar import Progbar
11 | from mozi.utils.train_object_utils import is_shared_var
12 | 
13 | import tarfile, inspect, os
14 | from six.moves.urllib.request import urlretrieve
15 | 
16 | floatX = theano.config.floatX
17 | 
18 | def duplicate_param(name, tensor_list):
19 | 
20 |     for param in tensor_list:
21 |         if param.name == name:
22 |             return True
23 | 
24 |     return False
25 | 
26 | 
27 | def tile_raster_graphs(dct_reconstruct, orig, ae_reconstruct, tile_shape, tile_spacing=(0.1,0.1),
28 |                        slice=(0,-1), axis=None, legend=True):
29 |     """
30 |     DESCRIPTION:
31 |         compare the original and the reconstructed examples by plotting them on the same graph
32 |     PARAM:
33 |         orig / ae_reconstruct / dct_reconstruct : 2d numpy array of axis label [example, feature]
34 |         tile_shape : tuple
35 |         tile_spacing : tuple
36 |         slice : index [start:end]
37 |             gives the range of values in the example to plot
38 |         axis : list [x_min, x_max, y_min, y_max]
39 |             sets the bounds of the x and y axis
40 |     RETURN:
41 |         matplotlib.plot object
42 |     """
43 | 
44 |     assert orig.shape == ae_reconstruct.shape, 'orig ' + str(orig.shape) + ' and reconstruct ' + \
45 |            str(ae_reconstruct.shape) + ' shapes are different'
46 | 
47 |     # make a little extra space between the subplots
48 |     plt.subplots_adjust(wspace=tile_spacing[0], hspace=tile_spacing[1])
49 | 
50 |     num_examples = orig.shape[0]
51 |     if num_examples > tile_shape[0] * tile_shape[1]:
52 |         num_examples = tile_shape[0] * tile_shape[1]
53 | 
54 |     for i in xrange(0, num_examples):
55 |         plt.subplot(tile_shape[0], tile_shape[1], i+1)
56 |         plt.plot(orig[i][slice[0]:slice[1]], 'b-', label='orig')
57 |         plt.plot(ae_reconstruct[i][slice[0]:slice[1]], 'g-', label='AE reconstruct')
58 |         plt.plot(dct_reconstruct[i][slice[0]:slice[1]], 'r-', label='DCT reconstruct')
59 |         if legend:
60 |             plt.legend(loc='best')
61 |         if axis is None:
62 |             plt.axis('tight')
63 |         else:
64 |             plt.axis(axis)
65 |     return plt
66 | 
67 | def make_one_hot(X, onehot_size):
68 |     """
69 |     DESCRIPTION:
70 |         Make a one-hot version of X
71 |     PARAM:
72 |         X: 1d numpy with each value in X representing the class of X
73 |         onehot_size: length of the one hot vector
74 |     RETURN:
75 |         2d numpy tensor, with each row being the onehot vector
76 |     """
77 | 
78 |     rX = np.zeros((len(X), onehot_size), dtype=theano.config.floatX)
79 |     for i in xrange(len(X)):
80 |         rX[i, X[i]] = 1
81 | 
82 |     return rX
83 | 
84 | def get_file(fpath, origin, untar=False):
85 |     datadir = os.path.dirname(fpath)
86 |     if not os.path.exists(datadir):
87 |         os.makedirs(datadir)
88 | 
89 |     if not os.path.exists(fpath):
90 |         print('Downloading data from', origin)
91 | 
92 |         global progbar
93 |         progbar = None
94 |         def dl_progress(count, block_size, total_size):
95 |             global progbar
96 |             if progbar is None:
97 |                 progbar = Progbar(total_size)
98 |             else:
99 |                 progbar.update(count*block_size)
100 | 
101 |         urlretrieve(origin, fpath, dl_progress)
102 |         progbar = None
103 | 
104 |
dirname = ""
105 |     if untar:
106 |         tfile = tarfile.open(fpath, 'r:*')
107 |         names = tfile.getnames()
108 |         dirname = names[0]
109 |         not_exists = [int(not os.path.exists("{}/{}".format(datadir, fname))) for fname in names]
110 |         if sum(not_exists) > 0:
111 |             print('Untarring file...')
112 |             tfile.extractall(path=datadir)
113 |         else:
114 |             print('Files already downloaded and untarred')
115 |         tfile.close()
116 | 
117 |     return "{}/{}".format(datadir, dirname)
118 | 
119 | 
120 | @as_op(itypes=[theano.tensor.fmatrix],
121 |        otypes=[theano.tensor.fmatrix])
122 | def theano_unique(a):
123 |     return np.unique(a)
124 | 
125 | 
126 | def get_from_module(identifier, module_params, module_name, instantiate=False):
127 |     if type(identifier) is str:
128 |         res = module_params.get(identifier)
129 |         if not res:
130 |             raise Exception('Invalid ' + str(module_name) + ': ' + str(identifier))
131 |         if instantiate:
132 |             return res()
133 |         else:
134 |             return res
135 |     return identifier
136 | 
137 | 
138 | def make_tuple(*args):
139 |     return args
140 | 
141 | 
142 | def gpu_to_cpu_model(model):
143 |     for layer in model.layers:
144 |         for member, value in layer.__dict__.items():
145 |             if is_shared_var(value):
146 |                 layer.__dict__[member] = T._shared(np.array(value.get_value(), floatX),
147 |                                                    name=value.name, borrow=False)
148 |         for i in xrange(len(layer.params)):
149 |             if is_shared_var(layer.params[i]):
150 |                 layer.params[i] = T._shared(np.array(layer.params[i].get_value(), floatX),
151 |                                             name=layer.params[i].name, borrow=False)
152 |     return model
153 | 
154 | 
155 | def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
156 |     """
157 |     Pad each sequence to the same length:
158 |     the length of the longest sequence.
159 | 
160 |     If maxlen is provided, any sequence longer
161 |     than maxlen is truncated to maxlen. Truncation happens off either the beginning (default) or
162 |     the end of the sequence.
163 | 
164 |     Supports post-padding and pre-padding (default).
165 | 
166 |     """
167 |     lengths = [len(s) for s in sequences]
168 | 
169 |     nb_samples = len(sequences)
170 |     if maxlen is None:
171 |         maxlen = np.max(lengths)
172 | 
173 |     x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
174 |     for idx, s in enumerate(sequences):
175 |         if truncating == 'pre':
176 |             trunc = s[-maxlen:]
177 |         elif truncating == 'post':
178 |             trunc = s[:maxlen]
179 |         else:
180 |             raise ValueError("Truncating type '%s' not understood" % truncating)
181 | 
182 |         if padding == 'post':
183 |             x[idx, :len(trunc)] = trunc
184 |         elif padding == 'pre':
185 |             x[idx, -len(trunc):] = trunc
186 |         else:
187 |             raise ValueError("Padding type '%s' not understood" % padding)
188 |     return x
189 | 
--------------------------------------------------------------------------------
/mozi/layers/convolution.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | 
4 | import numpy as np
5 | 
6 | import theano
7 | from theano.sandbox.cuda.fftconv import conv2d_fft
8 | import theano.tensor as T
9 | # from theano.tensor.signal import downsample
10 | from theano.tensor.signal.pool import pool_2d
11 | from mozi.weight_init import XavierUniformWeight, GaussianWeight
12 | from mozi.layers.template import Template
13 | from mozi.utils.theano_utils import shared_zeros
14 | # from theano.tensor.nnet.conv import conv2d
15 | from theano.tensor.nnet import conv2d
16 | # from theano.tensor.signal.conv import conv2d
17 | 
18 | floatX = theano.config.floatX
19 | 
20 | class Convolution2D(Template):
21 |     def __init__(self, input_channels, filters, kernel_size=(3,3), stride=(1,1),
22 |                  W=None, b=None, weight_init=GaussianWeight(mean=0, std=0.1), border_mode='valid'):
23 |         '''
24 |         PARAM:
25 |             border_mode: (from theano)
26 |                 valid: only apply filter to complete patches of the image.
Generates 27 | output of shape: image_shape - filter_shape + 1 28 | full: zero-pads image to multiple of filter shape to generate output 29 | of shape: image_shape + filter_shape - 1 30 | ''' 31 | self.input_channels = input_channels 32 | self.filters = filters 33 | self.kernel_size = kernel_size 34 | self.stride = stride 35 | self.border_mode = border_mode 36 | 37 | self.W_shape = (self.filters, self.input_channels) + self.kernel_size 38 | self.W = W 39 | if self.W is None: 40 | self.W = weight_init(self.W_shape, name='W') 41 | 42 | self.b = b 43 | if self.b is None: 44 | self.b = shared_zeros(shape=(self.filters,), name='b') 45 | 46 | self.params = [self.W, self.b] 47 | 48 | 49 | def _train_fprop(self, state_below): 50 | conv_out = conv2d(state_below, self.W, 51 | border_mode=self.border_mode, 52 | subsample=self.stride) 53 | return conv_out + self.b.dimshuffle('x', 0, 'x', 'x') 54 | 55 | 56 | def _layer_stats(self, state_below, layer_output): 57 | w_max = self.W.max() 58 | w_min = self.W.min() 59 | w_mean = self.W.mean() 60 | w_std = self.W.std() 61 | return[('filter_max', w_max), 62 | ('filter_min', w_min), 63 | ('filter_mean', w_mean), 64 | ('filter_std', w_std)] 65 | 66 | 67 | 68 | class Pooling2D(Template): 69 | def __init__(self, poolsize=(2, 2), stride=None, padding=(0,0), 70 | ignore_border=True, mode='max'): 71 | ''' 72 | DESCRIPTION: 73 | pooling layer 74 | PARAM: 75 | stride: two-dimensional tuple (a, b), the separation horizontally a 76 | or vertically b between two pools 77 | padding: pad zeros to the border of the feature map 78 | mode: max | sum | average_inc_pad | average_exc_pad 79 | ignore_border: 80 | ''' 81 | 82 | self.poolsize = poolsize 83 | self.stride = stride 84 | self.padding = padding 85 | self.ignore_border = ignore_border 86 | self.mode = mode 87 | 88 | self.params = [] 89 | 90 | 91 | def _train_fprop(self, state_below): 92 | return pool_2d(state_below, ds=self.poolsize, st=self.stride, 93 | padding=self.padding, ignore_border=self.ignore_border, 94 | mode=self.mode) 95 | 96 | 97 | class ConvFFT2D(Template): 98 | def __init__(self, input_channels, filters, stride, kernel_size=(3,3), 99 | W=None, b=None, weight_init=GaussianWeight(mean=0, std=0.1), 100 | image_shape=None, border_mode='valid', pad_last_dim=False): 101 | ''' 102 | PARAM: 103 | border_mode: (from theano) 104 | valid: only apply filter to complete patches of the image. 
Generates
104 |                 output of shape: image_shape - filter_shape + 1
105 |                 full: zero-pads image to multiple of filter shape to generate output
106 |                 of shape: image_shape + filter_shape - 1
107 |         '''
108 |         self.input_channels = input_channels
109 |         self.filters = filters
110 |         self.kernel_size = kernel_size
111 |         self.border_mode = border_mode
112 |         self.image_shape = image_shape
113 |         self.pad_last_dim = pad_last_dim
114 | 
115 |         self.W_shape = (self.filters, self.input_channels) + self.kernel_size
116 |         self.W = W
117 |         if self.W is None:
118 |             self.W = weight_init(self.W_shape, name='W')
119 | 
120 |         self.b = b
121 |         if self.b is None:
122 |             self.b = shared_zeros(shape=(self.filters,), name='b')
123 | 
124 |         self.params = [self.W, self.b]
125 | 
126 | 
127 |     def _train_fprop(self, state_below):
128 |         conv_out = conv2d_fft(state_below, self.W,
129 |                               border_mode=self.border_mode,
130 |                               image_shape=self.image_shape,
131 |                               pad_last_dim=self.pad_last_dim)
132 |         return conv_out + self.b.dimshuffle('x', 0, 'x', 'x')
133 | 
134 | 
135 | class SpatialPyramidPooling(Template):
136 |     def __init__(self, levels=[1,2,3], padding=None, ignore_border=False, mode='max'):
137 |         """
138 |         DESCRIPTION:
139 |             This pooling layer describes the method in the SPPnet paper
140 |             Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
141 |             by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun
142 |         PARAM:
143 |             levels (list):
144 |                 levels in SPP, example [1,2,3]
145 |             ignore_border (bool):
146 |                 if x = 5, stride = 2
147 |                 True: return 3 = ceil(x / stride)
148 |                 False: return 2 = floor(x / stride)
149 |         """
150 | 
151 |         self.levels = levels
152 |         self.padding = padding
153 |         self.ignore_border = ignore_border
154 |         self.mode = mode
155 | 
156 |         self.params = []
157 | 
158 | 
159 |     def _train_fprop(self, state_below):
160 |         b, c, h, w = state_below.shape
161 |         layer_out = []
162 |         for i in self.levels:
163 |             out = pool_2d(state_below, ds=(h/i, w/i),
164 |                           st=(h/i, w/i),
165 |                           padding=self.padding,
166 |                           ignore_border=self.ignore_border,
167 |                           mode=self.mode)
168 | 
169 |             # theano.scan(pool_2d, sequences=[self.levels])
170 |             layer_out.append(out.reshape((b, T.prod(out.shape)/b)))
171 | 
172 |         return T.concatenate(layer_out, axis=1)
173 | 
--------------------------------------------------------------------------------
/mozi/datasets/iterator.py:
--------------------------------------------------------------------------------
1 | 
2 | # from __future__ import division
3 | import warnings
4 | import numpy
5 | np = numpy
6 | from theano import config
7 | 
8 | 
9 | class SubsetIterator(object):
10 |     def __init__(self, dataset_size, batch_size=64, num_batches=None, rng=None):
11 |         """
12 |         rng: either a seed value for a numpy RandomState or
13 |              numpy RandomState workalike
14 |         """
15 |         self.dataset_size = dataset_size
16 |         self.batch_size = batch_size
17 |         self.num_batches = num_batches
18 |         self.rng = rng
19 |         self.idx = 0
20 | 
21 |     def next(self):
22 |         raise NotImplementedError()
23 | 
24 |     def __iter__(self):
25 |         self.idx = 0
26 |         return self
27 | 
28 |     # Class-level attributes that might hint the behaviour of
29 |     # FiniteDatasetIterator.
30 | 
31 |     # Does this return subsets that need fancy indexing? (i.e. lists
32 |     # of indices)
33 |     fancy = False
34 | 
35 |     # Does this class make use of random number generators?
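    # (stochastic subclasses, e.g. ShuffledSequentialSubsetIterator below, override this to True)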
36 |     stochastic = False
37 | 
38 |     @property
39 |     def num_examples(self):
40 |         return self.batch_size * self.num_batches
41 | 
42 |     @property
43 |     def uneven(self):
44 |         return False
45 | 
46 | 
47 | class SequentialSubsetIterator(SubsetIterator):
48 |     def __init__(self, dataset_size, batch_size, num_batches=None, rng=None):
49 |         if rng is not None:
50 |             raise ValueError("non-None rng argument not supported for "
51 |                              "sequential batch iteration")
52 |         assert num_batches is None or num_batches >= 0
53 |         if batch_size is None:
54 |             if num_batches is not None:
55 |                 batch_size = int(numpy.ceil(dataset_size / float(num_batches)))
56 |             else:
57 |                 raise ValueError("need one of batch_size, num_batches "
58 |                                  "for sequential batch iteration")
59 |         elif batch_size is not None:
60 |             if num_batches is not None:
61 |                 max_num_batches = numpy.ceil(dataset_size / float(batch_size))
62 |                 if num_batches > max_num_batches:
63 |                     raise ValueError("dataset of %d examples can only provide "
64 |                                      "%d batches with batch_size %d, but %d "
65 |                                      "batches were requested" %
66 |                                      (dataset_size, max_num_batches,
67 |                                       batch_size, num_batches))
68 |             else:
69 |                 num_batches = numpy.ceil(dataset_size / float(batch_size))
70 |         self.next_batch_no = 0
71 |         self.batch = 0
72 |         super(SequentialSubsetIterator, self).__init__(dataset_size, batch_size, num_batches)
73 |         self.idx = 0
74 |         self.indices = np.arange(self.dataset_size)
75 | 
76 | 
77 |     def next(self):
78 |         if self.batch >= self.num_batches or self.idx >= self.dataset_size:
79 |             raise StopIteration()
80 | 
81 |         # this fixes the problem where dataset_size % batch_size != 0
82 |         elif (self.idx + self.batch_size) > self.dataset_size:
83 |             self.last = self.indices[self.idx : self.dataset_size]
84 |             self.idx = self.dataset_size
85 |             return self.last
86 | 
87 |         else:
88 |             self.last = self.indices[self.idx : self.idx + self.batch_size]
89 |             self.idx += self.batch_size
90 |             self.batch += 1
91 |             return self.last
92 | 
93 |     fancy = False
94 |     stochastic = False
95 | 
96 |     @property
97 |     def num_examples(self):
98 |         product = self.batch_size * self.num_batches
99 |         return min(product, self.dataset_size)
100 | 
101 |     @property
102 |     def uneven(self):
103 |         return self.batch_size * self.num_batches > self.dataset_size
104 | 
105 | 
106 | class ShuffledSequentialSubsetIterator(SequentialSubsetIterator):
107 | 
108 |     stochastic = True
109 |     fancy = True
110 | 
111 |     def __init__(self, dataset_size, batch_size, num_batches=None, rng=None):
112 |         super(ShuffledSequentialSubsetIterator, self).__init__(
113 |             dataset_size,
114 |             batch_size,
115 |             num_batches,
116 |             None
117 |         )
118 |         self.idx = 0
119 |         self.indices = np.arange(self.dataset_size)
120 |         if rng is not None and hasattr(rng, 'random_integers'):
121 |             self.rng = rng
122 |         else:
123 |             self.rng = numpy.random.RandomState(rng)
124 |         self.shuffled = numpy.arange(self.dataset_size)
125 |         self.rng.shuffle(self.shuffled)
126 | 
127 |     def next(self):
128 |         if self.batch >= self.num_batches or self.idx >= self.dataset_size:
129 |             raise StopIteration()
130 | 
131 |         # this fixes the problem where dataset_size % batch_size != 0
132 |         elif (self.idx + self.batch_size) > self.dataset_size:
133 |             rval = self.shuffled[self.idx: self.dataset_size]
134 |             self.idx = self.dataset_size
135 |             return rval
136 |         else:
137 |             rval = self.shuffled[self.idx: self.idx + self.batch_size]
138 |             self.idx += self.batch_size
139 |             self.batch += 1
140 |             return rval
141 | 
142 | 
143 | class SequentialContinuousIterator(SubsetIterator):
144 | 
145 |     def __init__(self, dataset_size, batch_size,
step_size=1):
146 |         '''
147 |         This is for a continuous sequence with a fixed step size at a time.
148 |         '''
149 |         super(SequentialContinuousIterator, self).__init__(dataset_size, batch_size)
150 |         self.idx = 0
151 |         self.indices = np.arange(self.dataset_size)
152 |         self.step_size = step_size
153 |         assert self.step_size > 0
154 | 
155 |     def next(self):
156 |         if self.idx + self.batch_size > self.dataset_size:
157 |             raise StopIteration()
158 | 
159 |         rval = self.indices[self.idx:self.idx+self.batch_size]
160 |         self.idx += self.step_size
161 |         return rval
162 | 
163 | class SequentialRecurrentIterator(SubsetIterator):
164 | 
165 |     def __init__(self, dataset_size, batch_size, seq_len):
166 |         '''
167 |         This is for generating sequences of equal length (seq_len), with (batch_size)
168 |         sequences per batch. For example, seq_len 3 and batch_size 4 will generate
169 |         [0, 1, 2; 1, 2, 3; 2, 3, 4; 3, 4, 5]
170 |         '''
171 |         super(SequentialRecurrentIterator, self).__init__(dataset_size, batch_size)
172 |         assert dataset_size >= seq_len, 'size of dataset has to be at least as large as the sequence length'
173 |         self.seq_len = seq_len
174 |         self.ridx = np.concatenate([np.arange(self.seq_len) + i for i in range(batch_size)])
175 | 
176 |     def __iter__(self):
177 |         self.ridx = np.concatenate([np.arange(self.seq_len) + i for i in range(self.batch_size)])
178 |         return self
179 |     def next(self):
180 |         if self.ridx[-1] >= self.dataset_size:
181 |             last = self.ridx[-1] - self.dataset_size + 1
182 |             if len(self.ridx[:-last*self.seq_len]) == 0:
183 |                 raise StopIteration()
184 |             ridx = np.copy(self.ridx)
185 |             self.ridx += self.batch_size
186 |             return ridx[:-last*self.seq_len]
187 |         else:
188 |             ridx = np.copy(self.ridx)
189 |             self.ridx += self.batch_size
190 |             return ridx
191 | 
--------------------------------------------------------------------------------
/mozi/layers/recurrent.py:
--------------------------------------------------------------------------------
1 | 
2 | from mozi.utils.theano_utils import shared_zeros, alloc_zeros_matrix, shared_ones
3 | from mozi.layers.template import Template
4 | from mozi.weight_init import OrthogonalWeight, GaussianWeight, Identity
5 | import theano.tensor as T
6 | import theano
7 | 
8 | 
9 | class LSTM(Template):
10 | 
11 |     def __init__(self, input_dim, output_dim, truncate_gradient=-1, return_sequences=True,
12 |                  weight_init=OrthogonalWeight(), inner_init=GaussianWeight(mean=0, std=0.1)):
13 | 
14 |         self.input_dim = input_dim
15 |         self.output_dim = output_dim
16 |         self.truncate_gradient = truncate_gradient
17 |         self.return_sequences = return_sequences
18 | 
19 |         self.W_i = weight_init((self.input_dim, self.output_dim))
20 |         self.U_i = inner_init((self.output_dim, self.output_dim))
21 |         self.b_i = shared_zeros((self.output_dim), name='b_i')
22 | 
23 |         self.W_f = weight_init((self.input_dim, self.output_dim))
24 |         self.U_f = inner_init((self.output_dim, self.output_dim))
25 |         self.b_f = shared_ones((self.output_dim), name='b_f')
26 | 
27 |         self.W_c = weight_init((self.input_dim, self.output_dim))
28 |         self.U_c = inner_init((self.output_dim, self.output_dim))
29 |         self.b_c = shared_zeros((self.output_dim), name='b_c')
30 | 
31 |         self.W_o = weight_init((self.input_dim, self.output_dim))
32 |         self.U_o = inner_init((self.output_dim, self.output_dim))
33 |         self.b_o = shared_zeros((self.output_dim), name='b_o')
34 | 
35 |         self.params = [
36 |             self.W_i, self.U_i, self.b_i,
37 |             self.W_c, self.U_c, self.b_c,
38 |             self.W_f, self.U_f, self.b_f,
39 |             self.W_o, self.U_o, self.b_o,
40 |         ]
41 | 
42 | 
43 |     def _step(self, xi_t, xf_t, xo_t, xc_t,
44
| h_tm1, c_tm1, u_i, u_f, u_o, u_c): 45 | i_t = T.nnet.sigmoid(xi_t + T.dot(h_tm1, u_i)) 46 | f_t = T.nnet.sigmoid(xf_t + T.dot(h_tm1, u_f)) 47 | o_t = T.nnet.sigmoid(xo_t + T.dot(h_tm1, u_o)) 48 | g_t = T.tanh(xc_t + T.dot(h_tm1, u_c)) 49 | c_t = f_t * c_tm1 + i_t * g_t 50 | 51 | h_t = o_t * T.tanh(c_t) 52 | return h_t, c_t 53 | 54 | 55 | def _train_fprop(self, state_below): 56 | X = state_below.dimshuffle((1, 0, 2)) 57 | 58 | xi = T.dot(X, self.W_i) + self.b_i 59 | xf = T.dot(X, self.W_f) + self.b_f 60 | xc = T.dot(X, self.W_c) + self.b_c 61 | xo = T.dot(X, self.W_o) + self.b_o 62 | 63 | [outputs, memories], updates = theano.scan( 64 | self._step, 65 | sequences=[xi, xf, xo, xc], 66 | outputs_info=[ 67 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1), 68 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1) 69 | ], 70 | non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c], 71 | truncate_gradient=self.truncate_gradient) 72 | 73 | if self.return_sequences: 74 | return outputs.dimshuffle((1, 0, 2)) 75 | return outputs[-1] 76 | 77 | 78 | class BiLSTM(Template): 79 | ''' 80 | Bidirection LSTM 81 | ''' 82 | 83 | def __init__(self, input_dim, output_dim, weight_init=OrthogonalWeight(), 84 | inner_init=GaussianWeight(mean=0, std=0.1), truncate_gradient=-1, 85 | output_mode='concat', return_sequences=False, return_idx=-1): 86 | 87 | self.input_dim = input_dim 88 | self.output_dim = output_dim 89 | self.truncate_gradient = truncate_gradient 90 | self.output_mode = output_mode # output_mode is either sum or concatenate 91 | self.return_sequences = return_sequences 92 | self.return_idx = return_idx 93 | # forward weights 94 | self.W_i = weight_init((self.input_dim, self.output_dim)) 95 | self.U_i = inner_init((self.output_dim, self.output_dim)) 96 | self.b_i = shared_zeros((self.output_dim), name='b_i') 97 | 98 | self.W_f = weight_init((self.input_dim, self.output_dim)) 99 | self.U_f = inner_init((self.output_dim, self.output_dim)) 100 | self.b_f = shared_ones((self.output_dim), name='b_f') 101 | 102 | self.W_c = weight_init((self.input_dim, self.output_dim)) 103 | self.U_c = inner_init((self.output_dim, self.output_dim)) 104 | self.b_c = shared_zeros((self.output_dim), name='b_c') 105 | 106 | self.W_o = weight_init((self.input_dim, self.output_dim)) 107 | self.U_o = inner_init((self.output_dim, self.output_dim)) 108 | self.b_o = shared_zeros((self.output_dim), name='b_o') 109 | 110 | # backward weights 111 | self.Wb_i = weight_init((self.input_dim, self.output_dim)) 112 | self.Ub_i = inner_init((self.output_dim, self.output_dim)) 113 | self.bb_i = shared_zeros((self.output_dim), name='bb_i') 114 | 115 | self.Wb_f = weight_init((self.input_dim, self.output_dim)) 116 | self.Ub_f = inner_init((self.output_dim, self.output_dim)) 117 | self.bb_f = shared_ones((self.output_dim), name='bb_f') 118 | 119 | self.Wb_c = weight_init((self.input_dim, self.output_dim)) 120 | self.Ub_c = inner_init((self.output_dim, self.output_dim)) 121 | self.bb_c = shared_zeros((self.output_dim), name='bb_c') 122 | 123 | self.Wb_o = weight_init((self.input_dim, self.output_dim)) 124 | self.Ub_o = inner_init((self.output_dim, self.output_dim)) 125 | self.bb_o = shared_zeros((self.output_dim), name='bb_o') 126 | 127 | self.params = [ 128 | self.W_i, self.U_i, self.b_i, 129 | self.W_c, self.U_c, self.b_c, 130 | self.W_f, self.U_f, self.b_f, 131 | self.W_o, self.U_o, self.b_o, 132 | 133 | self.Wb_i, self.Ub_i, self.bb_i, 134 | self.Wb_c, self.Ub_c, self.bb_c, 135 | self.Wb_f, self.Ub_f, 
self.bb_f, 136 | self.Wb_o, self.Ub_o, self.bb_o, 137 | ] 138 | 139 | 140 | def _forward_step(self, 141 | xi_t, xf_t, xo_t, xc_t, 142 | h_tm1, c_tm1, 143 | u_i, u_f, u_o, u_c): 144 | i_t = T.nnet.sigmoid(xi_t + T.dot(h_tm1, u_i)) 145 | f_t = T.nnet.sigmoid(xf_t + T.dot(h_tm1, u_f)) 146 | o_t = T.nnet.sigmoid(xo_t + T.dot(h_tm1, u_o)) 147 | g_t = T.tanh(xc_t + T.dot(h_tm1, u_c)) 148 | c_t = f_t * c_tm1 + i_t * g_t 149 | h_t = o_t * T.tanh(c_t) 150 | return h_t, c_t 151 | 152 | 153 | def get_forward_output(self, state_below): 154 | X = state_below.dimshuffle((1,0,2)) 155 | 156 | xi = T.dot(X, self.W_i) + self.b_i 157 | xf = T.dot(X, self.W_f) + self.b_f 158 | xc = T.dot(X, self.W_c) + self.b_c 159 | xo = T.dot(X, self.W_o) + self.b_o 160 | 161 | [outputs, memories], updates = theano.scan( 162 | self._forward_step, 163 | sequences=[xi, xf, xo, xc], 164 | outputs_info=[ 165 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1), 166 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1) 167 | ], 168 | non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c], 169 | truncate_gradient=self.truncate_gradient 170 | ) 171 | return outputs.dimshuffle((1,0,2)) 172 | 173 | 174 | def get_backward_output(self, state_below): 175 | X = state_below.dimshuffle((1,0,2)) 176 | 177 | xi = T.dot(X, self.Wb_i) + self.bb_i 178 | xf = T.dot(X, self.Wb_f) + self.bb_f 179 | xc = T.dot(X, self.Wb_c) + self.bb_c 180 | xo = T.dot(X, self.Wb_o) + self.bb_o 181 | 182 | [outputs, memories], updates = theano.scan( 183 | self._forward_step, 184 | sequences=[xi, xf, xo, xc], 185 | outputs_info=[ 186 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1), 187 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1) 188 | ], 189 | non_sequences=[self.Ub_i, self.Ub_f, self.Ub_o, self.Ub_c], 190 | go_backwards = True, 191 | truncate_gradient=self.truncate_gradient 192 | ) 193 | return outputs.dimshuffle((1,0,2)) 194 | 195 | 196 | def _train_fprop(self, state_below): 197 | forward = self.get_forward_output(state_below) 198 | backward = self.get_backward_output(state_below) 199 | if self.output_mode == 'sum': 200 | output = forward + backward 201 | elif self.output_mode == 'concat': 202 | output = T.concatenate([forward, backward], axis=2) 203 | else: 204 | raise Exception('output mode is not sum or concat') 205 | if self.return_sequences==False: 206 | return output[:,self.return_idx,:] 207 | elif self.return_sequences==True: 208 | return output 209 | else: 210 | raise Exception('Unexpected output shape for return_sequences') 211 | -------------------------------------------------------------------------------- /mozi/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import mozi.datasets.iterator as iterators 4 | import numpy as np 5 | import theano 6 | from multiprocessing import Process, Queue 7 | import time 8 | floatX = theano.config.floatX 9 | 10 | import logging 11 | internal_logger = logging.getLogger(__name__) 12 | logging.basicConfig(level=logging.DEBUG) 13 | 14 | from mozi.log import Log 15 | 16 | class IterMatrix(object): 17 | 18 | def __init__(self, X, y, iter_class='SequentialSubsetIterator', 19 | batch_size=100, **kwargs): 20 | self.X = X 21 | self.y = y 22 | self.batch_size = batch_size 23 | self.iter_class = iter_class 24 | self.kwargs = kwargs 25 | self.iterator = getattr(iterators, self.iter_class) 26 | 27 | def __iter__(self): 28 | return self.iterator(dataset_size=self.dataset_size, 29 | 
batch_size=self.batch_size, **self.kwargs) 30 | 31 | def set_iterator(self, iterator): 32 | self.iterator = iterator 33 | 34 | def __getitem__(self, key): 35 | return self.X[key], self.y[key] 36 | 37 | @property 38 | def dataset_size(self): 39 | return self.X.shape[0] if self.X is not None else -1 40 | 41 | 42 | class IterDatasets(IterMatrix): 43 | 44 | def __init__(self, X, y, iter_class='SequentialSubsetIterator', 45 | batch_size=100, **kwargs): 46 | self.X = X 47 | self.y = y 48 | self.batch_size = batch_size 49 | self.iter_class = iter_class 50 | self.kwargs = kwargs 51 | self.iterator = getattr(iterators, self.iter_class) 52 | 53 | def __getitem__(self, key): 54 | Xslice = [] 55 | yslice = [] 56 | for dataset in self.X: 57 | Xslice.append(dataset[key]) 58 | for label in self.y: 59 | yslice.append(label[key]) 60 | return Xslice + yslice 61 | 62 | def __len__(self): 63 | return self.dataset_size 64 | 65 | @property 66 | def dataset_size(self): 67 | if isinstance(self.X, tuple): 68 | if len(self.X) == 0: 69 | dsize = -1 70 | else: 71 | dsize = len(self.y[0]) 72 | elif self.X is None: 73 | dsize = -1 74 | else: 75 | dsize = len(self.X) 76 | return dsize 77 | 78 | 79 | class Dataset(object): 80 | 81 | def __init__(self, log): 82 | self.log = log 83 | if self.log is None: 84 | # use default Log setting, using the internal logger 85 | self.log = Log(logger=internal_logger) 86 | 87 | 88 | def __iter__(self): 89 | raise NotImplementedError(str(type(self))+" does not implement the __iter__ method.") 90 | 91 | def next(self): 92 | raise NotImplementedError(str(type(self))+" does not implement the next method.") 93 | 94 | @property 95 | def nblocks(self): 96 | raise NotImplementedError(str(type(self))+" does not implement the nblocks method.") 97 | 98 | 99 | class SingleBlock(Dataset): 100 | 101 | def __init__(self, X=None, y=None, train_valid_test_ratio=[8,1,1], log=None, **kwargs): 102 | ''' 103 | All the data is loaded into memory for one go training 104 | ''' 105 | super(SingleBlock, self).__init__(log=log) 106 | self.ratio = train_valid_test_ratio 107 | self.train = IterMatrix(X=None, y=None, **kwargs) 108 | self.valid = IterMatrix(X=None, y=None, **kwargs) 109 | self.test = IterMatrix(X=None, y=None, **kwargs) 110 | 111 | assert len(self.ratio) == 3, 'the size of list is not 3' 112 | 113 | if X is not None and y is not None: 114 | self.set_Xy(X, y) 115 | 116 | def __iter__(self): 117 | self.iter = True 118 | return self 119 | 120 | def next(self): 121 | if self.iter: 122 | # only one iteration since there is only one data block 123 | self.iter = False 124 | return self 125 | else: 126 | raise StopIteration 127 | 128 | @property 129 | def nblocks(self): 130 | return 1 131 | 132 | def set_Xy(self, X, y): 133 | num_examples = len(X) 134 | total_ratio = sum(self.ratio) 135 | num_train = int(self.ratio[0] * 1.0 * num_examples / total_ratio) 136 | num_valid = int(self.ratio[1] * 1.0 * num_examples / total_ratio) 137 | 138 | train_X = X[:num_train] 139 | train_y = y[:num_train] 140 | 141 | valid_X = X[num_train:num_train+num_valid] 142 | valid_y = y[num_train:num_train+num_valid] 143 | 144 | test_X = X[num_train+num_valid:] 145 | test_y = y[num_train+num_valid:] 146 | 147 | self.train.X = train_X 148 | self.train.y = train_y 149 | 150 | if self.ratio[1] == 0: 151 | self.log.info('Valid set is empty! It is needed for early stopping and saving best model') 152 | 153 | self.valid.X = valid_X 154 | self.valid.y = valid_y 155 | 156 | if self.ratio[2] == 0: 157 | self.log.info('Test set is empty! 
It is needed for testing the best model')
158 | 
159 |         self.test.X = test_X
160 |         self.test.y = test_y
161 | 
162 | 
163 |     def get_train(self):
164 |         return self.train
165 | 
166 |     def get_valid(self):
167 |         return self.valid
168 | 
169 |     def get_test(self):
170 |         return self.test
171 | 
172 |     def set_train(self, X, y):
173 |         self.train.X = X
174 |         self.train.y = y
175 | 
176 |     def set_valid(self, X, y):
177 |         self.valid.X = X
178 |         self.valid.y = y
179 | 
180 |     def set_test(self, X, y):
181 |         self.test.X = X
182 |         self.test.y = y
183 | 
184 | 
185 | class DataBlocks(Dataset):
186 | 
187 |     def __init__(self, data_paths, train_valid_test_ratio=[8,1,1], log=None, allow_preload=False, **kwargs):
188 | 
189 |         """
190 |         DESCRIPTION:
191 |             This is the class for processing blocks of data, whereby the dataset is
192 |             loaded and unloaded into memory one block at a time.
193 |         PARAM:
194 |             data_paths(list): contains the paths to the numpy data files. It's a
195 |                               list of tuples whereby the first element of the tuple
196 |                               is the X path, and the second is the y path.
197 |                               example [(X_path1, y_path1),(X_path2, y_path2)]
198 |             allow_preload(bool): if True, the next data block is preloaded while
199 |                                  training on the current block; this reduces loading
200 |                                  time at the cost of more memory.
201 | 
202 |         """
203 |         super(DataBlocks, self).__init__(log=log)
204 |         assert isinstance(data_paths, (list,tuple)), "data_paths is not a list"
205 |         self.data_paths = data_paths
206 |         self.single_block = SingleBlock(None, None, train_valid_test_ratio, log, **kwargs)
207 |         self.allow_preload = allow_preload
208 |         self.q = Queue()
209 | 
210 |     def __iter__(self):
211 |         self.files = iter(self.data_paths)
212 |         if self.allow_preload:
213 |             self.lastblock = False
214 |             bufile = next(self.files)
215 |             self.load_Xy(bufile, self.q)
216 |         return self
217 | 
218 |     def next(self):
219 |         if self.allow_preload:
220 |             if self.lastblock:
221 |                 raise StopIteration
222 | 
223 |             try:
224 |                 X, y = self.q.get(block=True, timeout=None)
225 |                 self.single_block.set_Xy(X,y)
226 |                 bufile = next(self.files)
227 |                 p = Process(target=self.load_Xy, args=(bufile, self.q))
228 |                 p.start()
229 |             except StopIteration:
230 |                 self.lastblock = True
231 |         else:
232 |             fpaths = next(self.files)
233 |             X,y = self.openfile(fpaths)
234 |             self.single_block.set_Xy(X=X, y=y)
235 | 
236 |         return self.single_block
237 | 
238 |     @staticmethod
239 |     def openfile(paths):
240 |         assert isinstance(paths, (list,tuple)), str(type(paths)) + " is not a tuple or list"
241 |         with open(paths[0], 'rb') as X_fin, open(paths[1], 'rb') as y_fin:
242 |             X = np.load(X_fin)
243 |             y = np.load(y_fin)
244 |         return X,y
245 | 
246 |     def load_Xy(self, paths, q):
247 |         self.log.info('..loading: ' + str(paths))
248 |         X,y = self.openfile(paths)
249 |         q.put((X,y))
250 | 
251 |     @property
252 |     def nblocks(self):
253 |         return len(self.data_paths)
254 | 
255 | 
256 | class MultiInputsData(SingleBlock):
257 | 
258 |     def __init__(self, X=None, y=None, train_valid_test_ratio=[8,1,1], log=None, **kwargs):
259 | 
260 |         """
261 |         DESCRIPTION:
262 |             This class is used for multitask learning where we have multiple data
263 |             inputs and multiple data outputs.
264 |         PARAM:
265 |             X (tuple of arrays or just one array of X): If our input is X1 and X2, both
266 |                 with the same number of rows, then X = (X1, X2)
267 |             y (tuple of arrays or just one array of y): labels with the same number of rows
268 |                 as the input data X
269 |         """
270 |         super(MultiInputsData, self).__init__(train_valid_test_ratio=train_valid_test_ratio,
271 |                                               log=log, **kwargs)
272 | 
273 |         self.train = IterDatasets(None, None, **kwargs)
274 |         self.valid = IterDatasets(None, None, **kwargs)
275 |         self.test = IterDatasets(None, None, **kwargs)
276 |         self.set(X, y)
277 | 
278 | 
279 |     def set(self, X, y):
280 |         if isinstance(X, tuple):
281 |             self.num_examples = len(X[0])
282 |             for dataset in X:
283 |                 assert len(dataset) == self.num_examples, 'number of rows for different datasets is not the same'
284 |         elif X is None:
285 |             self.num_examples = 0
286 |             X = ()
287 |         else:
288 |             self.num_examples = len(X)
289 |             X = (X,)
290 | 
291 |         if isinstance(y, tuple):
292 |             for label in y:
293 |                 assert len(label) == self.num_examples, 'number of rows for different y is not the same'
294 |         elif y is None:
295 |             y = ()
296 |             assert self.num_examples == 0
297 |         else:
298 |             assert len(y) == self.num_examples, 'number of rows for y is not the same as input features'
299 |             y = (y,)
300 | 
301 |         total_ratio = sum(self.ratio)
302 |         num_train = int(float(self.ratio[0]) * self.num_examples / total_ratio)
303 |         num_valid = int(float(self.ratio[1]) * self.num_examples / total_ratio)
304 | 
305 |         trainset = ()
306 |         validset = ()
307 |         testset = ()
308 |         for dataset in X:
309 |             trainset += (dataset[:num_train],)
310 |             validset += (dataset[num_train:num_train+num_valid],)
311 |             testset += (dataset[num_train+num_valid:],)
312 | 
313 |         trainlbl = ()
314 |         validlbl = ()
315 |         testlbl = ()
316 |         for label in y:
317 |             trainlbl += (label[:num_train],)
318 |             validlbl += (label[num_train:num_train+num_valid],)
319 |             testlbl += (label[num_train+num_valid:],)
320 | 
321 |         self.train.X = trainset
322 |         self.train.y = trainlbl
323 | 
324 |         if self.ratio[1] == 0:
325 |             self.log.info('Valid set is empty! It is needed for early stopping and saving best model')
326 |         self.valid.X = validset
327 |         self.valid.y = validlbl
328 | 
329 |         if self.ratio[2] == 0:
330 |             self.log.info('Test set is empty! It is needed for testing the best model')
331 |         self.test.X = testset
332 |         self.test.y = testlbl
333 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Mozi
2 | =====
3 | 
4 | Mozi is based on Theano with a clean and sharp design. The **design philosophy** of Mozi:
5 | 
6 | 1. **Fast and Simple**: The main engine of the package is only 200 lines of code. There is only one fully compiled graph for training, which ensures all the data manipulation happens in one pass through the pipeline and, as a result, makes it a super fast package.
7 | 2. **Highly Modular**: Building a model in Mozi is like building a house with Lego: you can design whatever layers you can imagine and stack them together easily.
8 | 3. **Model Abstracted from Training**: In order to facilitate deployment of a trained model for real use, the model is abstracted away from the training module and kept as minimalist as possible. The objective is to allow realtime deployment and easy model exchange.
9 | 4. **Logging System**: Mozi provides a full logging feature that allows users to log the training results and the hyperparameters to a database for a panoramic overview.
It also allows automatic saving of the best model and logging of all training outputs for easy post-mortem analysis.
10 | 
11 | ---
12 | ### Install
13 | 
14 | First, you need to install [Theano](https://github.com/Theano/Theano).
15 | 
16 | You can simply install Mozi via pip for the latest stable version
17 | ```
18 | sudo pip install mozi
19 | ```
20 | or via pip for the bleeding edge version
21 | ```bash
22 | sudo pip install git+https://github.com/hycis/Mozi.git@master
23 | ```
24 | or simply clone and add to `PYTHONPATH`.
25 | ```bash
26 | git clone https://github.com/hycis/Mozi.git
27 | export PYTHONPATH=/path/to/Mozi:$PYTHONPATH
28 | ```
29 | To make the `PYTHONPATH` export persist, add `export PYTHONPATH=/path/to/Mozi:$PYTHONPATH` to your `.bashrc` for Linux or
30 | `.bash_profile` for Mac. While this method works, you will have to ensure that
31 | all the dependencies in [setup.py](setup.py) are installed.
32 | 
33 | ---
34 | ### Set Environment
35 | In Mozi, we need to set three environment paths
36 | * *MOZI_DATA_PATH*
37 | * *MOZI_SAVE_PATH*
38 | * *MOZI_DATABASE_PATH*
39 | 
40 | `MOZI_DATA_PATH` is the directory for saving and loading the datasets.
41 | `MOZI_SAVE_PATH` is the directory for saving all the models, the output log and the epoch errors.
42 | `MOZI_DATABASE_PATH` is the directory for saving the database that contains tables recording the hyperparameters and test errors for each training job.
43 | 
44 | ---
45 | ### Let's Have Fun!
46 | Building a model in Mozi is as simple as
47 | 
48 | ```python
49 | import theano.tensor as T
50 | from mozi.model import Sequential
51 | model = Sequential(input_var=T.matrix(), output_var=T.matrix())
52 | ```
53 | The `input_var` is the input tensor variable whose number of dimensions corresponds to that of the input dataset. The `output_var` is the tensor variable that corresponds to the target of the dataset: use `T.matrix` for `2d data`, `T.tensor3` for `3d data` and `T.tensor4` for `4d data`.
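For instance, if the input were 4d image data, the same pattern would use a `T.tensor4()` input instead (an illustrative sketch; `cnn_model` is a hypothetical name):
```python
import theano.tensor as T
from mozi.model import Sequential

# 4d image input (batch, channel, height, width), 2d one-hot target
cnn_model = Sequential(input_var=T.tensor4(), output_var=T.matrix())
```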
Next, add the layers
54 | 
55 | ```python
56 | from mozi.layers.linear import Linear
57 | from mozi.layers.activation import RELU, Softmax
58 | from mozi.layers.noise import Dropout
59 | 
60 | model.add(Linear(prev_dim=28*28, this_dim=200))
61 | model.add(RELU())
62 | model.add(Linear(prev_dim=200, this_dim=100))
63 | model.add(RELU())
64 | model.add(Dropout(0.5))
65 | model.add(Linear(prev_dim=100, this_dim=10))
66 | model.add(Softmax())
67 | ```
68 | To train the model, first build a dataset and a learning method; here we use the Mnist dataset and SGD
69 | ```python
70 | from mozi.datasets.mnist import Mnist
71 | from mozi.learning_method import SGD
72 | 
73 | data = Mnist(batch_size=64, train_valid_test_ratio=[5,1,1])
74 | learning_method = SGD(learning_rate=0.1, momentum=0.9, lr_decay_factor=0.9, decay_batch=10000)
75 | ```
76 | Finally, build a training object and put everything together to train the model
77 | ```python
78 | from mozi.train_object import TrainObject
79 | from mozi.cost import mse, error
80 | 
81 | train_object = TrainObject(model = model,
82 |                            log = None,
83 |                            dataset = data,
84 |                            train_cost = mse,
85 |                            valid_cost = error,
86 |                            learning_method = learning_method,
87 |                            stop_criteria = {'max_epoch' : 10,
88 |                                             'epoch_look_back' : 5,
89 |                                             'percent_decrease' : 0.01}
90 |                            )
91 | train_object.setup()
92 | train_object.run()
93 | ```
94 | #### Stopping Criteria
95 | ```python
96 | stop_criteria = {'max_epoch' : 10,
97 |                  'epoch_look_back' : 5,
98 |                  'percent_decrease' : 0.01}
99 | ```
100 | The stopping criteria here mean that training will stop once it has reached the `max_epoch` of 10, or once the validation error has not decreased by at least 1% over the past 5 epochs.
101 | #### Test Model
102 | And that's it! Once the training is done, to test the model it's as simple as calling the forward propagation `fprop(X)` on the model
103 | ```python
104 | import numpy as np
105 | 
106 | ypred = model.fprop(data.get_test().X)
107 | ypred = np.argmax(ypred, axis=1)
108 | y = np.argmax(data.get_test().y, axis=1)
109 | accuracy = np.equal(ypred, y).astype('f4').sum() / len(y)
110 | print 'test accuracy:', accuracy
111 | ```
112 | ---
113 | ### More Examples
114 | Mozi can be used to build effectively any kind of architecture. Below are a few more examples
115 | * [**Convolutional Neural Network**](doc/cnn.md)
116 | * [**Denoising Autoencoder**](doc/dae.md)
117 | * [**Variational Autoencoder**](doc/vae.md)
118 | * [**Alexnet**](example/voc_alexnet.py)
119 | 
120 | ---
121 | ### Layer Template
122 | To build a layer for Mozi, the layer has to implement the template
123 | ```python
124 | class Template(object):
125 |     """
126 |     DESCRIPTION:
127 |         The interface to be implemented by any layer.
128 |     """
129 |     def __init__(self):
130 |         self.params = [] # all params that need to be updated by training go into this list
131 | 
132 |     def _test_fprop(self, state_below):
133 |         # the testing track, where no params update is performed when data flows through this track
134 |         raise NotImplementedError()
135 | 
136 |     def _train_fprop(self, state_below):
137 |         # the training track, where params are updated every time data flows through this track
138 |         raise NotImplementedError()
139 | 
140 |     def _layer_stats(self, state_below, layer_output):
141 |         # calculate everything you want to know about the layer, its input, its output and
142 |         # its weights, and put it in the return list in the format [('W max', T.max(W)), ('W min', T.min(W))].
143 |         # This method provides a peek into the layer and is useful for debugging.
144 |         return []
145 | ```
146 | Each layer provides two tracks: a training track and a testing track. During training, the model will call `_train_fprop` in every layer, and the output from the model will be used to update the params in `self.params` in each layer. During testing, `_test_fprop` is called in every layer, and the output is used to evaluate the model and to judge if the model should stop training based on the stopping criteria set in the `TrainObject`. We can also peek into each layer by putting whatever we want to know about the layer into `_layer_stats`. For example, if we want to know the maximum weight in a layer, we can compute `T.max(W)` and return `[('W max', T.max(W))]` from `_layer_stats`, so that after every epoch, `'W max'` will be calculated for that layer and output to screen.
147 | 
148 | ---
149 | ### Data Interface
150 | Mozi provides two data interfaces: one for datasets small enough to fit entirely into memory [(SingleBlock)](mozi/datasets/dataset.py#L99), and another for large datasets which cannot fit into memory in one go and have to be broken up into blocks and loaded into training one block at a time [(DataBlocks)](mozi/datasets/dataset.py#L185).
151 | Check out the [Mnist](mozi/datasets/mnist.py) or [Cifar10](mozi/datasets/cifar10.py) examples on how to build a dataset.
152 | 
153 | #### Using SingleBlock Directly
154 | Besides subclassing `SingleBlock` or `DataBlocks` to create a dataset like the Mnist and Cifar10 examples, we can use `SingleBlock` or `DataBlocks` directly to build a dataset, as simply as
155 | ```python
156 | from mozi.datasets.dataset import SingleBlock
157 | import numpy as np
158 | X = np.random.rand(1000, 5)
159 | y = np.random.rand(1000, 3)
160 | data = SingleBlock(X=X, y=y, batch_size=100, train_valid_test_ratio=[3,1,1])
161 | train_object = TrainObject(dataset = data)
162 | ```
163 | 
164 | #### For Large Dataset Trained in Blocks
165 | For a large dataset that cannot fit into memory, you can use `DataBlocks` to train block by block. Below is an example for demonstration. You can also run the [example](example/datablocks_example.py)
166 | ```python
167 | from mozi.datasets.dataset import DataBlocks
168 | import numpy as np
169 | 
170 | # we have two blocks of 1000 images each saved
171 | # as ('X1.npy', 'y1.npy') and ('X2.npy', 'y2.npy')
172 | X1 = np.random.rand(1000, 3, 32, 32)
173 | y1 = np.random.rand(1000, 10)
174 | with open('X1.npy', 'wb') as xout, open('y1.npy', 'wb') as yout:
175 |     np.save(xout, X1)
176 |     np.save(yout, y1)
177 | 
178 | X2 = np.random.rand(1000, 3, 32, 32)
179 | y2 = np.random.rand(1000, 10)
180 | with open('X2.npy', 'wb') as xout, open('y2.npy', 'wb') as yout:
181 |     np.save(xout, X2)
182 |     np.save(yout, y2)
183 | 
184 | # now we can create the data by putting the paths
185 | # ('X1.npy', 'y1.npy') and ('X2.npy', 'y2.npy') into DataBlocks
186 | # during training, the TrainObject will load and unload one block at a time
187 | data = DataBlocks(data_paths=[('X1.npy', 'y1.npy'), ('X2.npy', 'y2.npy')],
188 |                   batch_size=100, train_valid_test_ratio=[3,1,1])
189 | train_object = TrainObject(dataset = data)
190 | ```
191 | 
192 | ---
193 | ### Logging
194 | Mozi provides a logging module for automatic saving of the best model and logging of the errors at each epoch.
195 | 
196 | 
197 | ```python
198 | from mozi.log import Log
199 | 
200 | log = Log(experiment_name = 'MLP',
201 |           description = 'This is a tutorial',
202 |           save_outputs = True, # log all the outputs from the screen
203 |           save_model = True, # save the best model
204 |           save_epoch_error = True, # log error at every epoch
205 |           save_to_database = {'name': 'Example.db',
206 |                               'records': {'Batch_Size': data.batch_size,
207 |                                           'Learning_Rate': learning_method.learning_rate,
208 |                                           'Momentum': learning_method.momentum}}
209 |           ) # end log
210 | ```
211 | The log module allows logging of screen outputs, saving of the best model and of the epoch errors. It also allows recording of hyperparameters to the database via the `save_to_database` argument, which takes in a dictionary with two fields, `'name'` and `'records'`. `'name'` indicates the name of the database in which to save the recording table; the name of the recording table will follow the experiment name under the argument `experiment_name`. The `'records'` field takes in a dictionary with any number of hyperparameters that we want to record, and only accepts primitive data types (str, int, float).
212 | Once the log object is built, it can be passed into `TrainObject` as
213 | ```python
214 | TrainObject(log = log)
215 | ```
216 | 
217 | 
218 | 
219 | 
220 | ---
221 | ### Load Saved Model
222 | When we set `save_model` in `Log` to be true, the best model is automatically saved to `MOZI_SAVE_PATH`. The model is serialized in pickle format, so to load the saved model we can use cPickle.
223 | ```python
224 | import cPickle
225 | with open('model.pkl', 'rb') as fin:
226 |     model = cPickle.load(fin)
227 | y = model.fprop(X)
228 | ```
229 | ---
230 | 
231 | ### Why Mozi?
232 | [Mozi](https://en.wikiquote.org/wiki/Mozi) (墨子) (470 B.C - 391 B.C) was a Chinese philosopher during the Warring States period (春秋戰國); his philosophy advocates peace, simplicity, universal love and pragmatism.
233 | 234 | --- 235 | ### Licence 236 | MIT Licence 237 | -------------------------------------------------------------------------------- /mozi/train_object.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import theano 4 | import theano.tensor as T 5 | floatX = theano.config.floatX 6 | 7 | import numpy as np 8 | 9 | import time, datetime 10 | import sys 11 | 12 | import logging 13 | internal_logger = logging.getLogger(__name__) 14 | logging.basicConfig(level=logging.DEBUG) 15 | 16 | from mozi.log import Log 17 | from mozi.utils.theano_utils import shared_zeros 18 | from mozi.utils.train_object_utils import split_list, generate_shared_list, merge_lists, \ 19 | get_shared_values, is_shared_var, merge_var 20 | 21 | from mozi.utils.check_memory import get_mem_usage 22 | from mozi.utils.progbar import Progbar 23 | 24 | 25 | class TrainObject(): 26 | 27 | def __init__(self, model, dataset, train_cost, valid_cost, learning_method, stop_criteria, log=None, verbose=True): 28 | self.model = model 29 | self.dataset = dataset 30 | self.train_cost = train_cost 31 | self.valid_cost = valid_cost 32 | self.learning_method = learning_method 33 | self.stop_criteria = stop_criteria 34 | self.log = log 35 | self.verbose = verbose 36 | 37 | 38 | if self.log is None: 39 | # use default Log setting 40 | self.log = Log(logger=internal_logger) 41 | 42 | elif self.log.save_to_database: 43 | self.log.print_records() 44 | self.log.info('\n') 45 | 46 | 47 | def setup(self): 48 | 49 | self.log.info( '..begin setting up train object') 50 | 51 | #===================[ build params and deltas list ]==================# 52 | 53 | params = [] 54 | deltas = [] 55 | 56 | for i, layer in enumerate(self.model.layers): 57 | layer_name = "{}_{}".format(layer.__class__.__name__, i) 58 | if hasattr(layer, 'params'): 59 | for param in layer.params: 60 | # checked that the param to be updated is shared variable 61 | if is_shared_var(param): 62 | param.name = str(i) + '_' + str(param.name) 63 | param.name += '_' + layer.__class__.__name__ 64 | params += [param] 65 | deltas += [shared_zeros(shape=param.shape.eval())] 66 | 67 | #=====================[ training params updates ]=====================# 68 | 69 | self.log.info("..update params: " + str(params)) 70 | train_y_pred, train_layers_stats = self.model.train_fprop(self.model.input_var) 71 | train_cost = self.train_cost(self.model.output_var, train_y_pred).astype(floatX) 72 | gparams = T.grad(train_cost, params) 73 | train_updates = self.learning_method.update(deltas, params, gparams) 74 | 75 | #=================[ append updates from each layer ]==================# 76 | 77 | for i, layer in enumerate(self.model.layers): 78 | layer_name = "{}_{}".format(layer.__class__.__name__, i) 79 | if hasattr(layer, 'updates') and len(layer.updates) > 0: 80 | self.log.info("..{}: has shared variable updates".format(layer_name)) 81 | train_updates += layer.updates 82 | 83 | #----[ append updates of stats from each layer to train updates ]-----# 84 | 85 | self.train_stats_names, train_stats_vars = split_list(train_layers_stats) 86 | train_stats_vars = [var.astype(floatX) for var in train_stats_vars] 87 | self.train_stats_shared = generate_shared_list(train_stats_vars) 88 | train_stats_updates = merge_lists(self.train_stats_shared, train_stats_vars) 89 | if self.verbose: 90 | train_updates += train_stats_updates 91 | 92 | #-------------------------[ train functions ]-------------------------# 93 | 94 | self.log.info('..begin compiling functions') 95 | 
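# note: the forward pass, the cost and all the updates (params and per-layer stats) are
# compiled into one training function below, matching the one-compiled-graph design in the README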
self.training = theano.function(inputs=merge_var(self.model.input_var, self.model.output_var), 96 | outputs=train_cost, 97 | updates=train_updates, 98 | on_unused_input='warn', 99 | allow_input_downcast=True) 100 | 101 | self.log.info('..training function compiled') 102 | 103 | #=============================[ testing ]=============================# 104 | 105 | test_y_pred, test_layers_stats = self.model.test_fprop(self.model.input_var) 106 | 107 | #-----[ append updates of stats from each layer to test updates ]-----# 108 | 109 | self.test_stats_names, test_stats_vars = split_list(test_layers_stats) 110 | test_stats_vars = [var.astype(floatX) for var in test_stats_vars] 111 | self.test_stats_shared = generate_shared_list(test_stats_vars) 112 | test_stats_updates = [] 113 | if self.verbose: 114 | test_stats_updates = merge_lists(self.test_stats_shared, test_stats_vars) 115 | 116 | #-------------------------[ test functions ]--------------------------# 117 | 118 | test_stopping_error = self.valid_cost(self.model.output_var, test_y_pred).astype(floatX) 119 | test_cost = self.train_cost(self.model.output_var, test_y_pred).astype(floatX) 120 | 121 | self.testing = theano.function(inputs=merge_var(self.model.input_var, self.model.output_var), 122 | outputs=(test_stopping_error, test_cost), 123 | updates=test_stats_updates, 124 | on_unused_input='warn', 125 | allow_input_downcast=True) 126 | 127 | self.log.info('..testing function compiled') 128 | 129 | 130 | def run(self): 131 | 132 | best_valid_error = float(sys.maxint) 133 | valid_error = float(sys.maxint) 134 | 135 | train_cost = float(sys.maxint) 136 | valid_cost = float(sys.maxint) 137 | 138 | train_stats_values = [] 139 | valid_stats_values = [] 140 | 141 | epoch = 0 142 | error_dcr = 0 143 | self.best_epoch_last_update = 0 144 | self.best_valid_last_update = float(sys.maxint) 145 | 146 | train_stats_names = ['train_' + name for name in self.train_stats_names] 147 | valid_stats_names = ['valid_' + name for name in self.test_stats_names] 148 | 149 | job_start = time.time() 150 | 151 | while (self.continue_learning(epoch, error_dcr, best_valid_error)): 152 | 153 | if epoch > 0: 154 | self.log.info("best_epoch_last_update: %d"%self.best_epoch_last_update) 155 | self.log.info("valid_error_decrease: %f"%error_dcr) 156 | self.log.info("best_valid_last_update: %f"%self.best_valid_last_update) 157 | self.log.info("========[ End of Epoch ]========\n\n") 158 | 159 | epoch += 1 160 | 161 | start_time = time.time() 162 | 163 | num_train_examples = 0 164 | total_train_cost = 0. 165 | train_stats_values = np.zeros(len(train_stats_names), dtype=floatX) 166 | 167 | num_valid_examples = 0 168 | total_valid_cost = 0. 169 | total_valid_stopping_cost = 0. 
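# per-epoch accumulators: costs and layer stats are summed weighted by the number of
# examples in each batch and normalized at the end of the epoch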
170 | valid_stats_values = np.zeros(len(valid_stats_names), dtype=floatX) 171 | 172 | blk = 0 173 | 174 | for block in self.dataset: 175 | block_time = time.time() 176 | blk += 1 177 | 178 | train_set = block.get_train() 179 | valid_set = block.get_valid() 180 | 181 | #====================[ Training Progress ]====================# 182 | if train_set.dataset_size > 0: 183 | self.log.info('..training '+ self.dataset.__class__.__name__ 184 | + ' block %s/%s'%(blk, self.dataset.nblocks)) 185 | 186 | progbar = Progbar(target=train_set.dataset_size) 187 | blk_sz = 0 188 | for idx in train_set: 189 | cost = self.training(*train_set[idx]) 190 | total_train_cost += cost * len(idx) 191 | num_train_examples += len(idx) 192 | train_stats_values += len(idx) * get_shared_values(self.train_stats_shared) 193 | blk_sz += len(idx) 194 | progbar.update(blk_sz) 195 | print 196 | 197 | #===================[ Validating Progress ]===================# 198 | if valid_set.dataset_size > 0: 199 | 200 | self.log.info('..validating ' + self.dataset.__class__.__name__ 201 | + ' block %s/%s'%(blk, self.dataset.nblocks)) 202 | 203 | progbar = Progbar(target=valid_set.dataset_size) 204 | blk_sz = 0 205 | for idx in valid_set: 206 | stopping_cost, cost = self.testing(*valid_set[idx]) 207 | total_valid_cost += cost * len(idx) 208 | total_valid_stopping_cost += stopping_cost * len(idx) 209 | num_valid_examples += len(idx) 210 | valid_stats_values += len(idx) * get_shared_values(self.test_stats_shared) 211 | blk_sz += len(idx) 212 | progbar.update(blk_sz) 213 | print 214 | 215 | self.log.info('block time: %0.2fs'%(time.time()-block_time)) 216 | self.log.info(get_mem_usage()) 217 | 218 | #-------[ Update train best cost and error values ]-------# 219 | if num_train_examples > 0: 220 | train_cost = total_train_cost / num_train_examples 221 | train_stats_values /= num_train_examples 222 | 223 | #-------[ Update valid best cost and error values ]-------# 224 | if num_valid_examples > 0: 225 | valid_error = total_valid_stopping_cost / num_valid_examples 226 | valid_cost = total_valid_cost / num_valid_examples 227 | valid_stats_values /= num_valid_examples 228 | 229 | if valid_error < best_valid_error: 230 | best_valid_error = valid_error 231 | self.log.info('..best validation error so far') 232 | if self.log.save_model: 233 | self.log._save_model(self.model) 234 | self.log.info('..model saved') 235 | 236 | if valid_error < self.best_valid_last_update: 237 | error_dcr = self.best_valid_last_update - valid_error 238 | else: 239 | error_dcr = 0 240 | 241 | #==============[ save to database, save epoch error]==============# 242 | if self.log.save_to_database: 243 | self.log._save_to_database(epoch, train_cost, valid_error, best_valid_error) 244 | self.log.info('..sent to database: %s:%s' % (self.log.save_to_database['name'], 245 | self.log.experiment_name)) 246 | 247 | if self.log.save_epoch_error: 248 | self.log._save_epoch_error(epoch, train_cost, valid_cost, valid_error) 249 | self.log.info('..epoch error saved') 250 | 251 | end_time = time.time() 252 | 253 | #=====================[ log outputs to file ]=====================# 254 | merged_train = merge_lists(train_stats_names, train_stats_values) 255 | merged_valid = merge_lists(valid_stats_names, valid_stats_values) 256 | 257 | outputs = [('epoch', epoch), 258 | ('runtime(s)', int(end_time-start_time)), 259 | ('train_' + self.train_cost.func_name, train_cost), 260 | ('valid_' + self.train_cost.func_name, valid_cost), 261 | ('valid_' + self.valid_cost.func_name, valid_error), 262 | 
('best_valid_' + self.valid_cost.func_name, best_valid_error)] 263 | 264 | outputs += merged_train + merged_valid 265 | self.log._log_outputs(outputs) 266 | 267 | 268 | job_end = time.time() 269 | self.log.info('Job Completed on %s'%time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime(job_end))) 270 | ttl_time = int(job_end - job_start) 271 | dt = datetime.timedelta(seconds=ttl_time) 272 | self.log.info('Total Time Taken: %s'%str(dt)) 273 | self.log.info("========[ End of Job ]========\n\n") 274 | 275 | 276 | def continue_learning(self, epoch, error_dcr, best_valid_error): 277 | 278 | if epoch > self.stop_criteria['max_epoch']: 279 | return False 280 | 281 | elif self.stop_criteria['percent_decrease'] is None or \ 282 | self.stop_criteria['epoch_look_back'] is None: 283 | return True 284 | 285 | elif np.abs(float(error_dcr) / self.best_valid_last_update) \ 286 | >= self.stop_criteria['percent_decrease']: 287 | self.best_valid_last_update = best_valid_error 288 | self.best_epoch_last_update = epoch 289 | return True 290 | 291 | elif epoch - self.best_epoch_last_update > \ 292 | self.stop_criteria['epoch_look_back']: 293 | return False 294 | 295 | else: 296 | return True 297 | -------------------------------------------------------------------------------- /mozi/datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Functionality for preprocessing Datasets. With Preprocessor, GCN, Standardize adapted from pylearn2 4 | """ 5 | 6 | import sys 7 | import copy 8 | import logging 9 | import time 10 | import warnings 11 | import numpy as np 12 | import sklearn.preprocessing as preproc 13 | try: 14 | from scipy import linalg 15 | except ImportError: 16 | warnings.warn("Could not import scipy.linalg") 17 | from theano import function 18 | import theano.tensor as T 19 | import theano 20 | import scipy 21 | 22 | log = logging.getLogger(__name__) 23 | 24 | class Preprocessor(object): 25 | """ 26 | Adapted from pylearn2 27 | 28 | Abstract class. 29 | 30 | An object that can preprocess a dataset. 31 | 32 | Preprocessing a dataset implies changing the data that 33 | a dataset actually stores. This can be useful to save 34 | memory--if you know you are always going to access only 35 | the same processed version of the dataset, it is better 36 | to process it once and discard the original. 37 | 38 | Preprocessors are capable of modifying many aspects of 39 | a dataset. For example, they can change the way that it 40 | converts between different formats of data. They can 41 | change the number of examples that a dataset stores. 42 | In other words, preprocessors can do a lot more than 43 | just example-wise transformations of the examples stored 44 | in the dataset. 45 | """ 46 | 47 | def apply(self, X): 48 | """ 49 | dataset: The dataset to act on. 50 | can_fit: If True, the Preprocessor can adapt internal parameters 51 | based on the contents of dataset. Otherwise it must not 52 | fit any parameters, or must re-use old ones. 53 | 54 | Typical usage: 55 | # Learn PCA preprocessing and apply it to the training set 56 | my_pca_preprocessor.apply(training_set, can_fit = True) 57 | # Now apply the same transformation to the test set 58 | my_pca_preprocessor.apply(test_set, can_fit = False) 59 | 60 | Note: this method must take a dataset, rather than a numpy ndarray, 61 | for a variety of reasons: 62 | 1) Preprocessors should work on any dataset, and not all 63 | datasets will store their data as ndarrays. 
64 | 2) Preprocessors often need to change a dataset's metadata. 65 | For example, suppose you have a DenseDesignMatrix dataset 66 | of images. If you implement a fovea Preprocessor that 67 | reduces the dimensionality of images by sampling them finely 68 | near the center and coarsely with blurring at the edges, 69 | then your preprocessor will need to change the way that the 70 | dataset converts example vectors to images for visualization. 71 | """ 72 | 73 | raise NotImplementedError(str(type(self))+" does not implement an apply method.") 74 | 75 | def invert(self, X): 76 | """ 77 | Do any necessary prep work to be able to support the "inverse" method 78 | later. Default implementation is no-op. 79 | """ 80 | raise NotImplementedError(str(type(self))+" does not implement an invert method.") 81 | 82 | class ExamplewisePreprocessor(Preprocessor): 83 | """ 84 | Abstract class. 85 | 86 | A Preprocessor that restricts the actions it can do in its 87 | apply method so that it could be implemented as a Block's 88 | perform method. 89 | 90 | In other words, this Preprocessor can't modify the Dataset's 91 | metadata, etc. 92 | 93 | TODO: can these things fit themselves in their apply method? 94 | That seems like a difference from Block. 95 | """ 96 | 97 | def as_block(self): 98 | raise NotImplementedError(str(type(self))+" does not implement as_block.") 99 | 100 | class Standardize(ExamplewisePreprocessor): 101 | """ 102 | Adapted from pylearn2 103 | Subtracts the mean and divides by the standard deviation. 104 | """ 105 | def __init__(self, global_mean=None, global_std=None, std_eps=1e-4): 106 | """ 107 | Initialize a Standardize preprocessor. 108 | 109 | Parameters 110 | ---------- 111 | global_mean : bool 112 | If `True`, subtract the (scalar) mean over every element 113 | in the design matrix. If `False`, subtract the mean from 114 | each column (feature) separately. Default is `False`. 115 | global_std : bool 116 | If `True`, after centering, divide by the (scalar) standard 117 | deviation of every element in the design matrix. If `False`, 118 | divide by the column-wise (per-feature) standard deviation. 119 | Default is `False`. 120 | std_eps : float 121 | Stabilization factor added to the standard deviations before 122 | dividing, to prevent standard deviations very close to zero 123 | from causing the feature values to blow up too much. 124 | Default is `1e-4`. 125 | """ 126 | self._std_eps = std_eps 127 | self._mean = global_mean 128 | self._std = global_std 129 | 130 | def apply(self, X): 131 | if self._mean is None: 132 | self._mean = X.mean(axis=0) 133 | if self._std is None: 134 | self._std = X.std(axis=0) 135 | X = (X - self._mean) / (self._std_eps + self._std) 136 | return X 137 | 138 | def invert(self, X): 139 | return X * (self._std_eps + self._std) + self._mean 140 | 141 | 142 | 143 | 144 | 145 | 146 | class GCN(Preprocessor): 147 | 148 | """ 149 | Adapted from pylearn2 150 | Global contrast normalizes by (optionally) subtracting the mean 151 | across features and then normalizes by either the vector norm 152 | or the standard deviation (across features, for each example). 153 | 154 | Parameters 155 | ---------- 156 | X : ndarray, 2-dimensional 157 | Design matrix with examples indexed on the first axis and 158 | features indexed on the second. 159 | 160 | scale : float, optional 161 | Multiply features by this const. 162 | 163 | subtract_mean : bool, optional 164 | Remove the mean across features/pixels before normalizing. 165 | Defaults to `False`. 
class GCN_IMG(GCN):

    def apply(self, X):
        assert X.ndim == 4, 'img dimension should be 4 of (b, c, h, w)'
        b, c, h, w = X.shape
        # apply GCN to each channel of each image independently
        newX = super(GCN_IMG, self).apply(X.reshape((b*c, h*w)))
        return newX.reshape((b, c, h, w))

    def invert(self, X):
        assert X.ndim == 4, 'img dimension should be 4 of (b, c, h, w)'
        b, c, h, w = X.shape
        newX = super(GCN_IMG, self).invert(X.reshape((b*c, h*w)))
        return newX.reshape((b, c, h, w))


class LogGCN(GCN):

    def __init__(self, positive_values=True, **kwarg):
        '''
        positive_values: bool
            if True, the input is shifted by one before taking the log, so
            that zero-valued inputs stay finite; invert() undoes the shift
        '''
        self.positive_values = positive_values
        super(LogGCN, self).__init__(**kwarg)

    def apply(self, X):
        if self.positive_values:
            X = X + 1
        return super(LogGCN, self).apply(np.log(X))

    def invert(self, X):
        X = super(LogGCN, self).invert(X)
        if self.positive_values:
            return np.exp(X) - 1
        else:
            return np.exp(X)

class Log(Preprocessor):

    def __init__(self, positive_values=False, **kwarg):
        '''
        positive_values: bool
            if True, the input is shifted by one before taking the log, so
            that zero-valued inputs stay finite; invert() undoes the shift
        '''
        self.positive_values = positive_values

    def apply(self, X):
        if self.positive_values:
            X = X + 1
        return np.log(X)

    def invert(self, X):
        if self.positive_values:
            return np.exp(X) - 1
        else:
            return np.exp(X)

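
# Usage sketch (illustrative; not part of the original file). With
# positive_values=True the +1 shift keeps log() finite for zero-valued
# inputs, and invert() undoes the shift; `counts` is a hypothetical
# non-negative float array.
#
#     log_proc = Log(positive_values=True)
#     transformed = log_proc.apply(counts)       # log(counts + 1)
#     recovered = log_proc.invert(transformed)   # exp(.) - 1
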
class Scale(Preprocessor):

    """
    Scale the input into a range

    Parameters
    ----------
    X : ndarray, 2-dimensional
        numpy matrix with examples indexed on the first axis and
        features indexed on the second.

    global_max : real
        the maximum value of the whole dataset. If not provided, global_max
        is set to X.max()

    global_min : real
        the minimum value of the whole dataset. If not provided, global_min
        is set to X.min()

    scale_range : size 2 list
        the lower and upper bound of the scaled output

    buffer : float
        margin kept inside the scale range, so that values are mapped into
        [lower + buffer, upper - buffer]
    """


    def __init__(self, global_max=None, global_min=None, scale_range=[-1, 1], buffer=0.1):

        self.scale_range = scale_range
        self.buffer = buffer
        self.max = global_max
        self.min = global_min
        assert scale_range[0] + buffer < scale_range[1] - buffer, \
            'the effective lower bound is not below the effective upper bound'

    def apply(self, X):
        # use `is None` so that a legitimate global max/min of 0 is not
        # mistaken for "not provided"
        self.max = self.max if self.max is not None else X.max()
        self.min = self.min if self.min is not None else X.min()
        width = self.max - self.min
        assert width > 0, 'the max is not bigger than the min'
        scale = (self.scale_range[1] - self.scale_range[0] - 2 * self.buffer) / width
        X = scale * (X - self.min)
        X = X + self.scale_range[0] + self.buffer

        return X

    def invert(self, X):
        if self.max is None or self.min is None:
            raise ValueError('to use invert, either global_max and global_min are provided or '
                             'apply(X) is used before')
        width = self.max - self.min
        assert width > 0, 'the max is not bigger than the min'
        scale = width / (self.scale_range[1] - self.scale_range[0] - 2 * self.buffer)
        X = scale * (X - self.scale_range[0] - self.buffer)
        X = X + self.min

        return X

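
# Usage sketch (illustrative; not part of the original file). Maps pixel
# values in [0, 255] into [-0.9, 0.9] (scale_range [-1, 1] with a 0.1
# buffer); `images` is a hypothetical float array.
#
#     scaler = Scale(global_max=255., global_min=0., scale_range=[-1, 1], buffer=0.1)
#     scaled = scaler.apply(images)
#     recovered = scaler.invert(scaled)
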
class Pipeline(Preprocessor):

    def __init__(self, preprocessors, inplace=False):
        self.preprocessors = preprocessors
        self.inplace = inplace

    def apply(self, X):
        newX = X
        if not self.inplace:
            newX = X.copy()
        for proc in self.preprocessors:
            newX = proc.apply(newX)
        return newX

    def invert(self, X):
        newX = X.copy()
        # the inverse of a composition undoes the preprocessors in
        # reverse order
        for proc in reversed(self.preprocessors):
            newX = proc.invert(newX)
        return newX

class Normalize(Preprocessor):

    def __init__(self, norm='l2', axis=1, channelwise=False):
        """
        normalize each data vector to unit length

        Parameters
        ----------
        norm : 'l1', 'l2' or 'max'
            the norm to normalize by
        axis : int
            the axis along which to normalize
        channelwise : bool
            for 4-dimensional image input, apply the preprocessing to each
            channel of each example separately
        """
        self.norm = norm
        self.axis = axis
        self.channelwise = channelwise


    def apply(self, X):
        if X.ndim == 4 and self.channelwise:
            shape = X.shape
            flat_X = np.reshape(X, (shape[0]*shape[1], shape[2]*shape[3]))
            flat_X = preproc.normalize(flat_X, norm=self.norm, axis=1, copy=True)
            return flat_X.reshape(shape)

        if X.ndim > 2:
            shape = X.shape
            flat_X = np.reshape(X, (shape[0], np.prod(shape[1:])))
            flat_X = preproc.normalize(flat_X, norm=self.norm, axis=self.axis, copy=True)
            return flat_X.reshape(shape)

        return preproc.normalize(X, norm=self.norm, axis=self.axis, copy=True)


class Sigmoid(Preprocessor):

    def apply(self, X):
        return 1 / (1 + np.exp(-X))

    def invert(self, X):
        # logit, with a small epsilon for numerical stability at X == 1
        return np.log(X / (1 - X + 1e-9))

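
# Usage sketch (illustrative; not part of the original file): chaining
# preprocessors with Pipeline. apply() runs them left to right and invert()
# undoes them in reverse; `X` is a hypothetical non-negative float array.
#
#     pipeline = Pipeline([Log(positive_values=True), Standardize()])
#     Xp = pipeline.apply(X)    # log-transform, then standardize
#     Xr = pipeline.invert(Xp)  # de-standardize, then exp back
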
class ZCA(Preprocessor):

    """
    from pylearn2
    Performs ZCA whitening.
    .. TODO::
        WRITEME properly
        add reference
    Parameters
    ----------
    n_components : integer, optional
        Keeps the n_components biggest eigenvalues and corresponding
        eigenvectors of the covariance matrix.
    n_drop_components : integer, optional
        Drops the n_drop_components smallest eigenvalues and corresponding
        eigenvectors of the covariance matrix. Will only drop components
        when n_components is not set, i.e. n_components has preference over
        n_drop_components.
    filter_bias : float, optional
        TODO: verify that default of 0.1 is what was used in the
        Coates and Ng paper, add reference
    store_inverse : bool, optional
        When self.apply(dataset, can_fit=True) is called, store not just
        the preprocessing matrix, but also its inverse. This is necessary
        when using this preprocessor to instantiate a ZCA_Dataset.
    """

    def __init__(self, n_components=None, n_drop_components=None,
                 filter_bias=0.1, store_inverse=True):
        warnings.warn("This ZCA preprocessor class is known to yield very "
                      "different results on different platforms. If you plan "
                      "to conduct experiments with this preprocessing on "
                      "multiple machines, it is probably a good idea to do "
                      "the preprocessing on a single machine and copy the "
                      "preprocessed datasets to the others, rather than "
                      "preprocessing the data independently in each "
                      "location.")
        # TODO: test to see if differences across platforms
        # e.g., preprocessing STL-10 patches in LISA lab versus on
        # Ian's Ubuntu 11.04 machine
        # are due to the problem having a bad condition number or due to
        # different version numbers of scipy or something
        self.n_components = n_components
        self.n_drop_components = n_drop_components
        self.copy = True
        self.filter_bias = np.cast[theano.config.floatX](filter_bias)
        self.has_fit_ = False
        self.store_inverse = store_inverse
        self.P_ = None  # set by fit()
        self.inv_P_ = None  # set by fit(), if self.store_inverse is True

        # Analogous to DenseDesignMatrix.design_loc. If not None, the
        # matrices P_ and inv_P_ will be saved together in
        # matrices_save_path (or matrices_save_path + '.npz', if the
        # suffix is omitted).
        self.matrices_save_path = None

    @staticmethod
    def _gpu_matrix_dot(matrix_a, matrix_b, matrix_c=None):
        """
        Performs matrix multiplication.
        Attempts to use the GPU if it's available. If the matrix multiplication
        is too big to fit on the GPU, this falls back to the CPU after throwing
        a warning.
        Parameters
        ----------
        matrix_a : WRITEME
        matrix_b : WRITEME
        matrix_c : WRITEME
        """
        # compile the theano dot product once and cache it on the function
        if not hasattr(ZCA._gpu_matrix_dot, 'theano_func'):
            ma, mb = T.matrices('A', 'B')
            mc = T.dot(ma, mb)
            ZCA._gpu_matrix_dot.theano_func = \
                theano.function([ma, mb], mc, allow_input_downcast=True)

        theano_func = ZCA._gpu_matrix_dot.theano_func

        try:
            if matrix_c is None:
                return theano_func(matrix_a, matrix_b)
            else:
                matrix_c[...] = theano_func(matrix_a, matrix_b)
                return matrix_c
        except MemoryError:
            warnings.warn('Matrix multiplication too big to fit on GPU. '
                          'Re-doing with CPU. Consider using '
                          'THEANO_FLAGS="device=cpu" for your next '
                          'preprocessor run')
            return np.dot(matrix_a, matrix_b, matrix_c)

    @staticmethod
    def _gpu_mdmt(mat, diags):
        """
        Performs the matrix multiplication M * D * M^T.
        First tries to do this on the GPU. If this throws a MemoryError, it
        falls back to the CPU, with a warning message.
        Parameters
        ----------
        mat : WRITEME
        diags : WRITEME
        """

        floatX = theano.config.floatX

        # compile theano function
        if not hasattr(ZCA._gpu_mdmt, 'theano_func'):
            t_mat = T.matrix('M')
            t_diags = T.vector('D')
            result = T.dot(t_mat * t_diags, t_mat.T)
            ZCA._gpu_mdmt.theano_func = theano.function(
                [t_mat, t_diags],
                result,
                allow_input_downcast=True)

        try:
            # the theano function call below will downcast the data; emit
            # warnings so the caller knows.
            if str(mat.dtype) != floatX:
                warnings.warn('Implicitly converting mat from dtype=%s to '
                              '%s for gpu' % (mat.dtype, floatX))
            if str(diags.dtype) != floatX:
                warnings.warn('Implicitly converting diag from dtype=%s to '
                              '%s for gpu' % (diags.dtype, floatX))

            return ZCA._gpu_mdmt.theano_func(mat, diags)

        except MemoryError:
            # fall back to cpu
            warnings.warn('M * D * M^T was too big to fit on GPU. '
                          'Re-doing with CPU. Consider using '
                          'THEANO_FLAGS="device=cpu" for your next '
                          'preprocessor run')
            return np.dot(mat * diags, mat.T)

    def set_matrices_save_path(self, matrices_save_path):
        """
        Analogous to DenseDesignMatrix.use_design_loc().
        If a matrices_save_path is set, when this ZCA is pickled, the internal
        parameter matrices will be saved separately to `matrices_save_path`, as
        a numpy .npz archive. This uses half the memory that a normal pickling
        does.
        Parameters
        ----------
        matrices_save_path : WRITEME
        """
        if matrices_save_path is not None:
            assert isinstance(matrices_save_path, str)
            matrices_save_path = os.path.abspath(matrices_save_path)

            if os.path.isdir(matrices_save_path):
                raise IOError('Matrix save path "%s" must not be an existing '
                              'directory.' % matrices_save_path)

            assert matrices_save_path[-1] not in ('/', '\\')
            if not os.path.isdir(os.path.split(matrices_save_path)[0]):
                raise IOError('Couldn\'t find parent directory:\n'
                              '\t"%s"\n'
                              '\tof matrix path\n'
                              '\t"%s"' % (os.path.split(matrices_save_path)[0],
                                          matrices_save_path))

        self.matrices_save_path = matrices_save_path

    def __getstate__(self):
        """
        Used by pickle. Returns a dictionary to pickle in place of
        self.__dict__.
        If self.matrices_save_path is set, this saves the matrices P_ and
        inv_P_ separately in matrices_save_path as a .npz archive, which uses
        much less space & memory than letting pickle handle them.
        """
        result = copy.copy(self.__dict__)  # shallow copy
        if self.matrices_save_path is not None:
            matrices = {'P_': self.P_}
            if self.inv_P_ is not None:
                matrices['inv_P_'] = self.inv_P_

            np.savez(self.matrices_save_path, **matrices)

            # Removes the matrices from the dictionary to be pickled.
            for key, matrix in matrices.items():
                del result[key]

        return result

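    # Usage sketch for out-of-pickle matrix storage (illustrative; not part
    # of the original file). Assumes `import pickle`; the file paths are
    # hypothetical.
    #
    #     zca = ZCA()
    #     zca.set_matrices_save_path('/tmp/my_zca_matrices.npz')
    #     with open('/tmp/my_zca.pkl', 'wb') as f:
    #         pickle.dump(zca, f)   # P_/inv_P_ go to the .npz, not the pickle
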
    def __setstate__(self, state):
        """
        Used to unpickle.
        Parameters
        ----------
        state : dict
            The dictionary created by __getstate__(), presumably unpickled
            from disk.
        """

        # Patch old pickle files
        if 'matrices_save_path' not in state:
            state['matrices_save_path'] = None

        if state['matrices_save_path'] is not None:
            matrices = np.load(state['matrices_save_path'])

            # puts matrices' items into state, overriding any colliding keys
            # in state.
            state = dict(state)
            state.update(matrices)
            del matrices

        self.__dict__.update(state)

        if not hasattr(self, "inv_P_"):
            self.inv_P_ = None

    def fit(self, X):
        """
        Fits this `ZCA` instance to a design matrix `X`.
        Parameters
        ----------
        X : ndarray
            A matrix where each row is a datum.
        Notes
        -----
        Implementation details:
        Stores result as `self.P_`.
        If self.store_inverse is true, this also computes `self.inv_P_`.
        """

        assert X.dtype in ['float32', 'float64']
        assert not np.any(np.isnan(X))
        assert len(X.shape) == 2
        if self.copy:
            X = X.copy()
        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        log.info('computing zca of a {0} matrix'.format(X.shape))
        t1 = time.time()

        bias = self.filter_bias * scipy.sparse.identity(X.shape[1],
                                                        theano.config.floatX)

        covariance = ZCA._gpu_matrix_dot(X.T, X) / X.shape[0] + bias
        t2 = time.time()
        log.info("cov estimate took {0} seconds".format(t2 - t1))

        t1 = time.time()
        eigs, eigv = linalg.eigh(covariance)
        t2 = time.time()

        log.info("eigh() took {0} seconds".format(t2 - t1))
        assert not np.any(np.isnan(eigs))
        assert not np.any(np.isnan(eigv))
        assert eigs.min() > 0

        if self.n_components and self.n_drop_components:
            raise ValueError('Only one of n_components and n_drop_components '
                             'should be specified')

        if self.n_components:
            eigs = eigs[-self.n_components:]
            eigv = eigv[:, -self.n_components:]

        if self.n_drop_components:
            eigs = eigs[self.n_drop_components:]
            eigv = eigv[:, self.n_drop_components:]

        t1 = time.time()

        # whitening matrix P = V * diag(1 / sqrt(eigs)) * V^T
        sqrt_eigs = np.sqrt(eigs)
        try:
            self.P_ = ZCA._gpu_mdmt(eigv, 1.0 / sqrt_eigs)
        except MemoryError:
            warnings.warn('M * D * M^T was too big to fit on GPU. '
                          'Re-doing with CPU.')
            self.P_ = np.dot(eigv * (1.0 / sqrt_eigs), eigv.T)

        t2 = time.time()
        assert not np.any(np.isnan(self.P_))
        self.has_fit_ = True

        if self.store_inverse:
            self.inv_P_ = ZCA._gpu_mdmt(eigv, sqrt_eigs)
        else:
            self.inv_P_ = None

    def apply(self, X, can_fit=True):
        """
        .. todo::
            WRITEME
        """
        assert X.dtype in ['float32', 'float64']
        if not self.has_fit_:
            assert can_fit
            self.fit(X)

        # evaluates dot(X - mean, P), using the GPU when available
        new_X = ZCA._gpu_matrix_dot(X - self.mean_, self.P_)
        return new_X

    def invert(self, X):
        """
        .. todo::
            WRITEME
        """
        assert X.ndim == 2

        if self.inv_P_ is None:
            warnings.warn("inv_P_ was None. Computing "
                          "inverse of P_ now. This will take "
                          "some time. For efficiency, it is recommended that "
                          "in the future you compute the inverse in ZCA.fit() "
                          "instead, by passing it store_inverse=True.")
            log.info('inverting...')
            self.inv_P_ = np.linalg.inv(self.P_)
            log.info('...done inverting')

        return self._gpu_matrix_dot(X, self.inv_P_) + self.mean_
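
# Usage sketch (illustrative; not part of the original file). ZCA whitening
# of a design matrix; `X` is a hypothetical float32 array of shape
# (n_examples, n_features).
#
#     zca = ZCA(filter_bias=0.1, store_inverse=True)
#     white = zca.apply(X)           # fits on first call (can_fit=True)
#     recovered = zca.invert(white)  # uses the stored inverse transform
--------------------------------------------------------------------------------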