├── LICENSE
├── README.md
├── exp
│   ├── cmuarctic.py
│   ├── fruitspeech.py
│   ├── librispeech.py
│   ├── mnist.py
│   ├── net.py
│   └── test_fruitspeech_model.py
├── net.py
├── tests
│   ├── bi_multilayer_softmax.py
│   ├── bi_softmax.py
│   ├── multilayer_softmax.py
│   ├── net.py
│   ├── predict.py
│   └── softmax.py
└── util
    ├── continue_model_mnist.py
    └── load_model.py
/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Kyle Kastner 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of gomorrah nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | net 2 | === 3 | 4 | "A library for testing implementations of neural networks and training 5 | algorithms" 6 | -------------------------------------------------------------------------------- /exp/cmuarctic.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork, load_cmuarctic, labels_to_chars 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_cmuarctic() 6 | clf = RecurrentNetwork(learning_alg="sfg", 7 | hidden_layer_sizes=[500], 8 | max_col_norm=1.9635, 9 | max_iter=1000, cost="ctc", bidirectional=True, 10 | learning_rate=0.0001, momentum=0.9, 11 | recurrent_activation="lstm", 12 | random_seed=1999) 13 | 14 | tx = train_x[2] 15 | tx = (tx - tx.mean()) / tx.std() 16 | clf.fit(train_x[2], train_y[2]) 17 | y = labels_to_chars(train_y[2]) 18 | print(y) 19 | -------------------------------------------------------------------------------- /exp/fruitspeech.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork, load_fruitspeech, labels_to_chars 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_fruitspeech() 6 | clf = RecurrentNetwork(learning_alg="rmsprop", 7 | hidden_layer_sizes=[500], 8 | max_iter=100, cost="encdec", bidirectional=True, 9 | learning_rate=0.00002, momentum=0.9, 10 | recurrent_activation="lstm", 11 | random_seed=1999) 12 | 13 | all_frames = np.vstack(train_x) 14 | means = np.mean(all_frames, axis=0) 15 | std = np.std(all_frames, axis=0) 16 | for n, t in enumerate(train_x): 17 | train_x[n] = (t - means) / std 18 | 19 | for n, v in enumerate(valid_x): 20 | valid_x[n] = (v - means) / std 21 | 22 | from IPython import embed; embed() 23 | 24 | 25 | clf.fit(train_x, train_y, valid_x, valid_y) 26 | y_hat = labels_to_chars(clf.predict(valid_x[0])[0]) 27 | y = labels_to_chars(valid_y[0]) 28 | print(y_hat) 29 | print(y) 30 | -------------------------------------------------------------------------------- /exp/librispeech.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork, load_librispeech, labels_to_chars 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_librispeech() 6 | clf = RecurrentNetwork(learning_alg="rmsprop", hidden_layer_sizes=[500,], 7 | max_iter=100, cost="ctc", bidirectional=True, 8 | learning_rate=0.0001, momentum=0.9, 9 | recurrent_activation="lstm", 10 | random_seed=1999) 11 | print(labels_to_chars(train_y[2])) 12 | means = np.mean(train_x[2], axis=0) 13 | std = np.std(train_x[2], axis=0) 14 | tx = (train_x[2] - means) / std 15 | clf.fit(tx, train_y[2]) 16 | from IPython import embed; embed() 17 | -------------------------------------------------------------------------------- /exp/mnist.py: -------------------------------------------------------------------------------- 1 | from net import load_mnist, FeedforwardClassifier 2 | 3 | datasets = load_mnist() 4 | 5 | train_set_x, train_set_y = datasets[0] 6 | valid_set_x, valid_set_y = datasets[1] 7 | test_set_x, test_set_y = datasets[2] 8 | 9 | print('... 
building the model') 10 | # construct the MLP class 11 | classifier = FeedforwardClassifier(hidden_layer_sizes=[500], 12 | random_seed=1999) 13 | 14 | print('... training') 15 | classifier.fit(train_set_x, train_set_y, valid_set_x, valid_set_y) 16 | -------------------------------------------------------------------------------- /exp/net.py: -------------------------------------------------------------------------------- 1 | ../net.py -------------------------------------------------------------------------------- /exp/test_fruitspeech_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | try: 3 | import cPickle 4 | except ImportError: 5 | import pickle as cPickle 6 | import matplotlib.pyplot as plt 7 | from net import load_fruitspeech, labels_to_chars 8 | import numpy as np 9 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_fruitspeech() 10 | 11 | f = open(sys.argv[1]) 12 | clf = cPickle.load(f) 13 | 14 | all_frames = np.vstack(train_x) 15 | means = np.mean(all_frames, axis=0) 16 | std = np.std(all_frames, axis=0) 17 | for n, t in enumerate(train_x): 18 | train_x[n] = (t - means) / std 19 | 20 | for n, v in enumerate(valid_x): 21 | valid_x[n] = (v - means) / std 22 | 23 | for n, v in enumerate(valid_y): 24 | y = labels_to_chars(v) 25 | y_hat = labels_to_chars(clf.predict(valid_x[n])[0]) 26 | print("Expected: %s, predicted: %s" % (y, y_hat)) 27 | -------------------------------------------------------------------------------- /net.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf 8 -*- 2 | from __future__ import division 3 | try: 4 | import cPickle 5 | except ImportError: 6 | import pickle as cPickle 7 | import gzip 8 | import tarfile 9 | import tempfile 10 | import os 11 | import numpy as np 12 | from scipy import linalg 13 | from scipy.io import wavfile 14 | import tables 15 | import numbers 16 | import glob 17 | import random 18 | import theano 19 | import string 20 | import theano.tensor as T 21 | from theano.compat.python2x import OrderedDict 22 | import matplotlib 23 | matplotlib.use('Agg') 24 | import matplotlib.pyplot as plt 25 | # Sandbox? 26 | import fnmatch 27 | from theano.tensor.shared_randomstreams import RandomStreams 28 | 29 | 30 | def concatenate(tensor_list, axis=0): 31 | """ 32 | Alternative implementation of `theano.tensor.concatenate`. 33 | This function does exactly the same thing, but contrary to Theano's own 34 | implementation, the gradient is implemented on the GPU. 
35 | Stolen from Lasagne 36 | """ 37 | if axis < 0: 38 | axis += tensor_list[0].ndim 39 | 40 | concat_size = sum(tensor.shape[axis] for tensor in tensor_list) 41 | 42 | output_shape = () 43 | for k in range(axis): 44 | output_shape += (tensor_list[0].shape[k],) 45 | output_shape += (concat_size,) 46 | for k in range(axis + 1, tensor_list[0].ndim): 47 | output_shape += (tensor_list[0].shape[k],) 48 | 49 | out = T.zeros(output_shape) 50 | offset = 0 51 | for tensor in tensor_list: 52 | indices = () 53 | for k in range(axis): 54 | indices += (slice(None),) 55 | indices += (slice(offset, offset + tensor.shape[axis]),) 56 | for k in range(axis + 1, tensor_list[0].ndim): 57 | indices += (slice(None),) 58 | 59 | out = T.set_subtensor(out[indices], tensor) 60 | offset += tensor.shape[axis] 61 | 62 | return out 63 | 64 | 65 | def minibatch_indices(X, minibatch_size): 66 | minibatch_indices = np.arange(0, len(X), minibatch_size) 67 | minibatch_indices = np.asarray(list(minibatch_indices) + [len(X)]) 68 | start_indices = minibatch_indices[:-1] 69 | end_indices = minibatch_indices[1:] 70 | return zip(start_indices, end_indices) 71 | 72 | 73 | def make_minibatch(X, y, one_hot_size): 74 | minibatch_size = len(X) 75 | is_one_hot = True 76 | X_max_sizes = np.max([xi.shape for xi in X], axis=0) 77 | X_max_sizes = np.asarray([minibatch_size] + list(X_max_sizes)) 78 | # Order into time, samples, feature 79 | X_max_sizes = np.array([X_max_sizes[1], X_max_sizes[0], 80 | X_max_sizes[2]]) 81 | y_max_sizes = np.max([yi.shape for yi in y], axis=0) 82 | y_max_sizes = np.array([minibatch_size] + list(y_max_sizes)) 83 | # Order into time, samples, label 84 | # dim is 1 for output label? This may need adjustment for regression 85 | if len(y_max_sizes) == 3: 86 | y_max_sizes = np.array([y_max_sizes[1], y_max_sizes[0], y_max_sizes[2]]) 87 | elif len(y_max_sizes) < 3: 88 | y_max_sizes = np.array([y_max_sizes[1], y_max_sizes[0], one_hot_size]) 89 | is_one_hot = False 90 | else: 91 | raise ValueError("y must be 2 or 3 dimensional!") 92 | 93 | for y_t in y: 94 | if not np.all(np.in1d([0, 1], np.unique(y_t.ravel()))): 95 | is_one_hot = False 96 | X_n = np.zeros(X_max_sizes, dtype=X[0].dtype) 97 | y_n = np.zeros(y_max_sizes).astype(theano.config.floatX) 98 | X_mask = np.zeros((X_max_sizes[0], X_max_sizes[1])).astype( 99 | theano.config.floatX) 100 | y_mask = np.zeros((y_max_sizes[0], y_max_sizes[1])).astype( 101 | theano.config.floatX) 102 | for n, t in enumerate(X): 103 | xshp = X[n].shape 104 | X_n[:xshp[0], n, :xshp[1]] = X[n] 105 | X_mask[:xshp[0], n] = 1. 106 | 107 | for n, t in enumerate(y): 108 | yshp = y[n].shape 109 | if not is_one_hot: 110 | for i, v in enumerate(y[n]): 111 | y_n[i, n, v] = 1. 112 | else: 113 | y_n[:yshp[0], n, :yshp[1]] = y[n] 114 | y_mask[:yshp[0], n] = 1. 115 | return X_n, y_n, X_mask, y_mask 116 | 117 | 118 | def labels_to_chars(labels): 119 | return "".join([chr(l + 97) for l in labels]) 120 | 121 | 122 | def _make_ctc_labels(y): 123 | # Assume that class values are sequential! and start from 0 124 | highest_class = np.max([np.max(d) for d in y]) 125 | # Need to insert blanks at start, end, and between each label 126 | # See A. Graves "Supervised Sequence Labelling with Recurrent Neural 127 | # Networks" figure 7.2 (pg. 
58) 128 | # (http://www.cs.toronto.edu/~graves/preprint.pdf) 129 | blank = highest_class + 1 130 | y_fixed = [blank * np.ones(2 * yi.shape[0] + 1).astype('int32') 131 | for yi in y] 132 | for i, yi in enumerate(y): 133 | y_fixed[i][1:-1:2] = yi 134 | return y_fixed 135 | 136 | 137 | def relu(x): 138 | return x * (x > 1e-6) 139 | 140 | 141 | def clip_relu(x, clip_lim=20): 142 | return x * (T.lt(x, 1e-6) and T.gt(x, clip_lim)) 143 | 144 | 145 | def dropout(random_state, X, keep_prob=0.5): 146 | if keep_prob > 0. and keep_prob < 1.: 147 | seed = random_state.randint(2 ** 30) 148 | srng = RandomStreams(seed) 149 | mask = srng.binomial(n=1, p=keep_prob, size=X.shape, 150 | dtype=theano.config.floatX) 151 | return X * mask 152 | return X 153 | 154 | 155 | def fast_dropout(random_state, X): 156 | seed = random_state.randint(2 ** 30) 157 | srng = RandomStreams(seed) 158 | mask = srng.normal(size=X.shape, avg=1., dtype=theano.config.floatX) 159 | return X * mask 160 | 161 | 162 | def shared_zeros(shape): 163 | """ Builds a theano shared variable filled with a zeros numpy array """ 164 | return theano.shared(value=np.zeros(*shape).astype(theano.config.floatX), 165 | borrow=True) 166 | 167 | 168 | def shared_rand(shape, rng): 169 | """ Builds a theano shared variable filled with random values """ 170 | return theano.shared(value=(0.01 * (rng.rand(*shape) - 0.5)).astype( 171 | theano.config.floatX), borrow=True) 172 | 173 | 174 | def np_rand(shape, rng): 175 | return (0.01 * (rng.rand(*shape) - 0.5)).astype(theano.config.floatX) 176 | 177 | 178 | def np_randn(shape, rng, name=None): 179 | """ Builds a numpy variable filled with random normal values """ 180 | return (0.01 * rng.randn(*shape)).astype(theano.config.floatX) 181 | 182 | 183 | def np_ortho(shape, rng, name=None): 184 | """ Builds a theano variable filled with orthonormal random values """ 185 | g = rng.randn(*shape) 186 | o_g = linalg.svd(g)[0] 187 | return o_g.astype(theano.config.floatX) 188 | 189 | 190 | def shared_ortho(shape, rng, name=None): 191 | """ Builds a theano shared variable filled with random values """ 192 | g = rng.randn(*shape) 193 | o_g = linalg.svd(g)[0] 194 | return theano.shared(value=o_g.astype(theano.config.floatX), borrow=True) 195 | 196 | 197 | def load_mnist(): 198 | # Check if dataset is in the data directory. 199 | data_path = os.path.join(os.path.split(__file__)[0], "data") 200 | if not os.path.exists(data_path): 201 | os.makedirs(data_path) 202 | 203 | dataset = 'mnist.pkl.gz' 204 | data_file = os.path.join(data_path, dataset) 205 | if os.path.isfile(data_file): 206 | dataset = data_file 207 | 208 | if (not os.path.isfile(data_file)): 209 | try: 210 | import urllib 211 | urllib.urlretrieve('http://google.com') 212 | except AttributeError: 213 | import urllib.request as urllib 214 | url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 215 | print('Downloading data from %s' % url) 216 | urllib.urlretrieve(url, data_file) 217 | 218 | print('... 
loading data') 219 | # Load the dataset 220 | f = gzip.open(data_file, 'rb') 221 | try: 222 | train_set, valid_set, test_set = cPickle.load(f, encoding="latin1") 223 | except TypeError: 224 | train_set, valid_set, test_set = cPickle.load(f) 225 | f.close() 226 | 227 | test_x, test_y = test_set 228 | test_x = test_x.astype('float32') 229 | test_y = test_y.astype('int32') 230 | valid_x, valid_y = valid_set 231 | valid_x = valid_x.astype('float32') 232 | valid_y = valid_y.astype('int32') 233 | train_x, train_y = train_set 234 | train_x = train_x.astype('float32') 235 | train_y = train_y.astype('int32') 236 | 237 | rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] 238 | return rval 239 | 240 | 241 | def load_cifar10(): 242 | # Check if dataset is in the data directory. 243 | data_path = os.path.join(os.path.split(__file__)[0], "data") 244 | if not os.path.exists(data_path): 245 | os.makedirs(data_path) 246 | 247 | dataset = 'cifar-10-python.tar.gz' 248 | data_file = os.path.join(data_path, dataset) 249 | if os.path.isfile(data_file): 250 | dataset = data_file 251 | 252 | if (not os.path.isfile(data_file)): 253 | try: 254 | import urllib 255 | urllib.urlretrieve('http://google.com') 256 | except AttributeError: 257 | import urllib.request as urllib 258 | url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' 259 | print('Downloading data from %s' % url) 260 | urllib.urlretrieve(url, data_file) 261 | 262 | print('... loading data') 263 | tar = tarfile.open(data_file) 264 | os.chdir(data_path) 265 | tar.extractall() 266 | tar.close() 267 | 268 | data_path = os.path.join(data_path, "cifar-10-batches-py") 269 | batch_files = glob.glob(os.path.join(data_path, "*batch*")) 270 | train_data = [] 271 | train_labels = [] 272 | test_data = [] 273 | test_labels = [] 274 | for f in batch_files: 275 | batch_file = open(f, 'rb') 276 | d = cPickle.load(batch_file) 277 | batch_file.close() 278 | fname = f.split(os.path.sep)[-1] 279 | if "data" in fname: 280 | data = d['data'] 281 | labels = d['labels'] 282 | train_data.append(data) 283 | train_labels.append(labels) 284 | elif "test" in fname: 285 | data = d['data'] 286 | labels = d['labels'] 287 | test_data.append(data) 288 | test_labels.append(labels) 289 | 290 | # Split into 40000 train 10000 valid 10000 test 291 | train_x = np.asarray(train_data) 292 | train_y = np.asarray(train_labels) 293 | test_x = np.asarray(test_data) 294 | test_y = np.asarray(test_labels) 295 | valid_x = train_x[-10000:] 296 | valid_y = train_y[-10000:] 297 | train_x = train_x[:-10000] 298 | train_y = train_y[:-10000] 299 | 300 | test_x = test_x.astype('float32') 301 | test_y = test_y.astype('int32') 302 | valid_x = valid_x.astype('float32') 303 | valid_y = valid_y.astype('int32') 304 | train_x = train_x.astype('float32') 305 | train_y = train_y.astype('int32') 306 | 307 | rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] 308 | return rval 309 | 310 | 311 | def load_scribe(): 312 | # Check if dataset is in the data directory. 
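# (The pickle is cached under the "data" directory next to net.py; a different
# URL is chosen under Python 2 vs. Python 3, presumably because scribe2.pkl and
# scribe3.pkl were written with different pickle protocols.)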
313 | data_path = os.path.join(os.path.split(__file__)[0], "data") 314 | if not os.path.exists(data_path): 315 | os.makedirs(data_path) 316 | 317 | dataset = 'scribe.pkl' 318 | data_file = os.path.join(data_path, dataset) 319 | if os.path.isfile(data_file): 320 | dataset = data_file 321 | 322 | if (not os.path.isfile(data_file)): 323 | try: 324 | import urllib 325 | urllib.urlretrieve('http://google.com') 326 | url = 'https://dl.dropboxusercontent.com/u/15378192/scribe2.pkl' 327 | except AttributeError: 328 | import urllib.request as urllib 329 | url = 'https://dl.dropboxusercontent.com/u/15378192/scribe3.pkl' 330 | print('Downloading data from %s' % url) 331 | urllib.urlretrieve(url, data_file) 332 | 333 | print('... loading data') 334 | with open(data_file, 'rb') as pkl_file: 335 | data = cPickle.load(pkl_file) 336 | 337 | data_x, data_y = [], [] 338 | for x, y in zip(data['x'], data['y']): 339 | data_y.append(np.asarray(y, dtype=np.int32)) 340 | data_x.append(np.asarray(x, dtype=theano.config.floatX).T) 341 | 342 | train_x = data_x[:750] 343 | train_y = data_y[:750] 344 | valid_x = data_x[750:900] 345 | valid_y = data_y[750:900] 346 | test_x = data_x[900:] 347 | test_y = data_y[900:] 348 | rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] 349 | return rval 350 | 351 | 352 | # A tricky trick for monkeypatching an instancemethod that is 353 | # CPython :( there must be a better way 354 | class _cVLArray(tables.VLArray): 355 | pass 356 | 357 | 358 | def load_fruitspeech(): 359 | # Check if dataset is in the data directory. 360 | data_path = os.path.join(os.path.split(__file__)[0], "data") 361 | if not os.path.exists(data_path): 362 | os.makedirs(data_path) 363 | 364 | dataset = 'audio.tar.gz' 365 | data_file = os.path.join(data_path, dataset) 366 | if os.path.isfile(data_file): 367 | dataset = data_file 368 | 369 | if not os.path.isfile(data_file): 370 | try: 371 | import urllib 372 | urllib.urlretrieve('http://google.com') 373 | url = 'https://dl.dropboxusercontent.com/u/15378192/audio.tar.gz' 374 | except AttributeError: 375 | import urllib.request as urllib 376 | url = 'https://dl.dropboxusercontent.com/u/15378192/audio.tar.gz' 377 | print('Downloading data from %s' % url) 378 | urllib.urlretrieve(url, data_file) 379 | 380 | print('... 
loading data') 381 | if not os.path.exists(os.path.join(data_path, "audio")): 382 | tar = tarfile.open(data_file) 383 | os.chdir(data_path) 384 | tar.extractall() 385 | tar.close() 386 | 387 | h5_file_path = os.path.join(data_path, "saved_fruit.h5") 388 | if not os.path.exists(h5_file_path): 389 | data_path = os.path.join(data_path, "audio") 390 | 391 | audio_matches = [] 392 | for root, dirnames, filenames in os.walk(data_path): 393 | for filename in fnmatch.filter(filenames, '*.wav'): 394 | audio_matches.append(os.path.join(root, filename)) 395 | 396 | random.seed(1999) 397 | random.shuffle(audio_matches) 398 | 399 | # http://mail.scipy.org/pipermail/numpy-discussion/2011-March/055219.html 400 | h5_file = tables.openFile(h5_file_path, mode='w') 401 | data_x = h5_file.createVLArray(h5_file.root, 'data_x', 402 | tables.Float32Atom(shape=()), 403 | filters=tables.Filters(1)) 404 | data_x_shapes = h5_file.createVLArray(h5_file.root, 'data_x_shapes', 405 | tables.Int32Atom(shape=()), 406 | filters=tables.Filters(1)) 407 | data_y = h5_file.createVLArray(h5_file.root, 'data_y', 408 | tables.Int32Atom(shape=()), 409 | filters=tables.Filters(1)) 410 | for wav_path in audio_matches: 411 | # Convert chars to int classes 412 | word = wav_path.split(os.sep)[-1][:-6] 413 | chars = [ord(c) - 97 for c in word] 414 | data_y.append(np.array(chars, dtype='int32')) 415 | fs, d = wavfile.read(wav_path) 416 | # Preprocessing from A. Graves "Towards End-to-End Speech 417 | # Recognition" 418 | Pxx, _, _, _ = plt.specgram(d, NFFT=256, noverlap=128) 419 | data_x_shapes.append(np.array(Pxx.T.shape, dtype='int32')) 420 | data_x.append(Pxx.T.astype('float32').flatten()) 421 | h5_file.close() 422 | 423 | h5_file = tables.openFile(h5_file_path, mode='r') 424 | data_x = h5_file.root.data_x 425 | data_x_shapes = h5_file.root.data_x_shapes 426 | data_y = h5_file.root.data_y 427 | # A dirty hack to only monkeypatch data_x 428 | data_x.__class__ = _cVLArray 429 | 430 | # override getter so that it gets reshaped to 2D when fetched 431 | old_getter = data_x.__getitem__ 432 | 433 | def getter(self, key): 434 | if isinstance(key, numbers.Integral) or isinstance(key, np.integer): 435 | return old_getter(key).reshape(data_x_shapes[key]).astype( 436 | theano.config.floatX) 437 | elif isinstance(key, slice): 438 | start, stop, step = self._processRange(key.start, key.stop, 439 | key.step) 440 | return [o.reshape(s) for o, s in zip( 441 | self.read(start, stop, step), data_x_shapes[slice( 442 | start, stop, step)])] 443 | 444 | # Patch __getitem__ in custom subclass, applying to all instances of it 445 | _cVLArray.__getitem__ = getter 446 | 447 | train_x = data_x[:80] 448 | train_y = data_y[:80] 449 | valid_x = data_x[80:90] 450 | valid_y = data_y[80:90] 451 | test_x = data_x[90:] 452 | test_y = data_y[90:] 453 | rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] 454 | return rval 455 | 456 | 457 | def load_cmuarctic(): 458 | # Check if dataset is in the data directory. 
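# (All seven CMU ARCTIC speaker archives are downloaded on first use and their
# spectrograms are cached into a single saved_cmu.h5 file, so the first run is
# slow but later runs only read the HDF5 file.)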
459 | data_path = os.path.join(os.path.split(__file__)[0], "data") 460 | if not os.path.exists(data_path): 461 | os.makedirs(data_path) 462 | 463 | urls = ['http://www.speech.cs.cmu.edu/cmu_arctic/packed/cmu_us_awb_arctic-0.95-release.tar.bz2', 464 | 'http://www.speech.cs.cmu.edu/cmu_arctic/packed/cmu_us_bdl_arctic-0.95-release.tar.bz2', 465 | 'http://www.speech.cs.cmu.edu/cmu_arctic/packed/cmu_us_clb_arctic-0.95-release.tar.bz2', 466 | 'http://www.speech.cs.cmu.edu/cmu_arctic/packed/cmu_us_jmk_arctic-0.95-release.tar.bz2', 467 | 'http://www.speech.cs.cmu.edu/cmu_arctic/packed/cmu_us_ksp_arctic-0.95-release.tar.bz2', 468 | 'http://www.speech.cs.cmu.edu/cmu_arctic/packed/cmu_us_rms_arctic-0.95-release.tar.bz2', 469 | 'http://www.speech.cs.cmu.edu/cmu_arctic/packed/cmu_us_slt_arctic-0.95-release.tar.bz2', 470 | ] 471 | 472 | data_files = [] 473 | 474 | for url in urls: 475 | dataset = url.split('/')[-1] 476 | data_file = os.path.join(data_path, dataset) 477 | data_files.append(data_file) 478 | if os.path.isfile(data_file): 479 | dataset = data_file 480 | if not os.path.isfile(data_file): 481 | try: 482 | import urllib 483 | urllib.urlretrieve('http://google.com') 484 | except AttributeError: 485 | import urllib.request as urllib 486 | print('Downloading data from %s' % url) 487 | urllib.urlretrieve(url, data_file) 488 | 489 | print('... loading data') 490 | 491 | folder_paths = [] 492 | for data_file in data_files: 493 | folder_name = data_file.split(os.sep)[-1].split("-")[0] 494 | folder_path = os.path.join(data_path, folder_name) 495 | folder_paths.append(folder_path) 496 | if not os.path.exists(folder_path): 497 | tar = tarfile.open(data_file) 498 | os.chdir(data_path) 499 | tar.extractall() 500 | tar.close() 501 | 502 | h5_file_path = os.path.join(data_path, "saved_cmu.h5") 503 | if not os.path.exists(h5_file_path): 504 | # http://mail.scipy.org/pipermail/numpy-discussion/2011-March/055219.html 505 | h5_file = tables.openFile(h5_file_path, mode='w') 506 | data_x = h5_file.createVLArray(h5_file.root, 'data_x', 507 | tables.Float32Atom(shape=()), 508 | filters=tables.Filters(1)) 509 | data_x_shapes = h5_file.createVLArray(h5_file.root, 'data_x_shapes', 510 | tables.Int32Atom(shape=()), 511 | filters=tables.Filters(1)) 512 | data_y = h5_file.createVLArray(h5_file.root, 'data_y', 513 | tables.Int32Atom(shape=()), 514 | filters=tables.Filters(1)) 515 | data_meta = h5_file.createVLArray(h5_file.root, 'data_meta', 516 | tables.StringAtom(200), 517 | filters=tables.Filters(1)) 518 | for folder_path in folder_paths: 519 | audio_matches = [] 520 | for root, dirnames, filenames in os.walk(folder_path): 521 | for filename in fnmatch.filter(filenames, '*.wav'): 522 | audio_matches.append(os.path.join(root, filename)) 523 | 524 | f = open(os.path.join(folder_path, "etc", "txt.done.data")) 525 | read_raw_text = f.readlines() 526 | f.close() 527 | # Remove all punctuations 528 | list_text = [t.strip().lower().translate( 529 | string.maketrans("", ""), string.punctuation).split(" ")[1:-1] 530 | for t in read_raw_text] 531 | # Get rid of numbers, even though it will probably hurt 532 | # recognition on certain examples 533 | cleaned_lookup = {lt[0]: " ".join(lt[1:]).translate( 534 | None, string.digits).strip() for lt in list_text} 535 | data_meta.append(folder_path.split(os.sep)[-1]) 536 | 537 | for wav_path in audio_matches: 538 | lookup_key = wav_path.split(os.sep)[-1][:-4] 539 | # Some files aren't consistent! 
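# (Prompt keys in txt.done.data sometimes contain an underscore and sometimes
# do not, so the wav filename is adjusted below - inserting an underscore after
# the first six characters, or stripping underscores - until it matches this
# speaker's key format.)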
540 | if "_" in cleaned_lookup.keys()[0] and "_" not in lookup_key: 541 | # Needs an _ to match text format... sometimes! 542 | lookup_key = lookup_key[:6] + "_" + lookup_key[6:] 543 | elif "_" not in cleaned_lookup.keys()[0]: 544 | lookup_key = lookup_key.translate(None, "_") 545 | try: 546 | words = cleaned_lookup[lookup_key] 547 | # Convert chars to int classes 548 | chars = [ord(c) - 97 for c in words] 549 | # Make spaces last class 550 | chars = [c if c >= 0 else 26 for c in chars] 551 | data_y.append(np.array(chars, dtype='int32')) 552 | # Convert chars to int classes 553 | fs, d = wavfile.read(wav_path) 554 | # Preprocessing from A. Graves "Towards End-to-End Speech 555 | # Recognition" 556 | Pxx, _, _, _ = plt.specgram(d, NFFT=256, noverlap=128) 557 | data_x_shapes.append(np.array(Pxx.T.shape, dtype='int32')) 558 | data_x.append(Pxx.T.astype('float32').flatten()) 559 | except KeyError: 560 | # Necessary because some labels are missing in some folders 561 | print("Skipping %s due to missing key" % wav_path) 562 | 563 | h5_file.close() 564 | 565 | h5_file = tables.openFile(h5_file_path, mode='r') 566 | data_x = h5_file.root.data_x 567 | data_x_shapes = h5_file.root.data_x_shapes 568 | data_y = h5_file.root.data_y 569 | # A dirty hack to only monkeypatch data_x 570 | data_x.__class__ = _cVLArray 571 | 572 | # override getter so that it gets reshaped to 2D when fetched 573 | old_getter = data_x.__getitem__ 574 | 575 | def getter(self, key): 576 | if isinstance(key, numbers.Integral) or isinstance(key, np.integer): 577 | return old_getter(key).reshape(data_x_shapes[key]).astype( 578 | theano.config.floatX) 579 | elif isinstance(key, slice): 580 | start, stop, step = self._processRange(key.start, key.stop, 581 | key.step) 582 | return [o.reshape(s) for o, s in zip( 583 | self.read(start, stop, step), data_x_shapes[slice( 584 | start, stop, step)])] 585 | 586 | # Patch __getitem__ in custom subclass, applying to all instances of it 587 | _cVLArray.__getitem__ = getter 588 | 589 | train_x = data_x[:6000] 590 | train_y = data_y[:6000] 591 | valid_x = data_x[6000:7500] 592 | valid_y = data_y[6000:7500] 593 | test_x = data_x[7500:] 594 | test_y = data_y[7500:] 595 | rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] 596 | return rval 597 | 598 | 599 | def load_librispeech(): 600 | # Check if dataset is in the data directory. 601 | data_path = os.path.join(os.path.split(__file__)[0], "data") 602 | if not os.path.exists(data_path): 603 | os.makedirs(data_path) 604 | 605 | dataset = 'dev-clean.tar.gz' 606 | data_file = os.path.join(data_path, dataset) 607 | if os.path.isfile(data_file): 608 | dataset = data_file 609 | 610 | if not os.path.isfile(data_file): 611 | try: 612 | import urllib 613 | urllib.urlretrieve('http://google.com') 614 | url = 'http://www.openslr.org/resources/12/dev-clean.tar.gz' 615 | except AttributeError: 616 | import urllib.request as urllib 617 | url = 'http://www.openslr.org/resources/12/dev-clean.tar.gz' 618 | print('Downloading data from %s' % url) 619 | urllib.urlretrieve(url, data_file) 620 | 621 | print('... 
loading data') 622 | if not os.path.exists(os.path.join(data_path, "LibriSpeech", "dev-clean")): 623 | tar = tarfile.open(data_file) 624 | os.chdir(data_path) 625 | tar.extractall() 626 | tar.close() 627 | 628 | h5_file_path = os.path.join(data_path, "saved_libri.h5") 629 | if not os.path.exists(h5_file_path): 630 | data_path = os.path.join(data_path, "LibriSpeech", "dev-clean") 631 | 632 | audio_matches = [] 633 | for root, dirnames, filenames in os.walk(data_path): 634 | for filename in fnmatch.filter(filenames, '*.flac'): 635 | audio_matches.append(os.path.join(root, filename)) 636 | 637 | text_matches = [] 638 | for root, dirnames, filenames in os.walk(data_path): 639 | for filename in fnmatch.filter(filenames, '*.txt'): 640 | text_matches.append(os.path.join(root, filename)) 641 | 642 | # http://mail.scipy.org/pipermail/numpy-discussion/2011-March/055219.html 643 | h5_file = tables.openFile(h5_file_path, mode='w') 644 | data_x = h5_file.createVLArray(h5_file.root, 'data_x', 645 | tables.Float32Atom(shape=()), 646 | filters=tables.Filters(1)) 647 | data_x_shapes = h5_file.createVLArray(h5_file.root, 'data_x_shapes', 648 | tables.Int32Atom(shape=()), 649 | filters=tables.Filters(1)) 650 | data_y = h5_file.createVLArray(h5_file.root, 'data_y', 651 | tables.Int32Atom(shape=()), 652 | filters=tables.Filters(1)) 653 | for full_t in text_matches: 654 | f = open(full_t, 'r') 655 | for line in f.readlines(): 656 | word_splits = line.strip().split(" ") 657 | file_tag = word_splits[0] 658 | words = word_splits[1:] 659 | # Convert chars to int classes 660 | chars = [ord(c) - 97 for c in (" ").join(words).lower()] 661 | # Make spaces last class 662 | chars = [c if c >= 0 else 26 for c in chars] 663 | data_y.append(np.array(chars, dtype='int32')) 664 | audio_path = [a for a in audio_matches if file_tag in a] 665 | if len(audio_path) != 1: 666 | raise ValueError("More than one match for" 667 | "tag %s!" % file_tag) 668 | if not os.path.exists(audio_path[0][:-5] + ".wav"): 669 | r = os.system("ffmpeg -i %s %s.wav" % (audio_path[0], 670 | audio_path[0][:-5])) 671 | if r: 672 | raise ValueError("A problem occured converting flac to" 673 | "wav, make sure ffmpeg is installed") 674 | wav_path = audio_path[0][:-5] + '.wav' 675 | fs, d = wavfile.read(wav_path) 676 | # Preprocessing from A. 
Graves "Towards End-to-End Speech 677 | # Recognition" 678 | Pxx, _, _, _ = plt.specgram(d, NFFT=256, noverlap=128) 679 | data_x_shapes.append(np.array(Pxx.T.shape, dtype='int32')) 680 | data_x.append(Pxx.T.astype('float32').flatten()) 681 | f.close() 682 | h5_file.close() 683 | 684 | h5_file_path = os.path.join(data_path, "saved_libri.h5") 685 | h5_file = tables.openFile(h5_file_path, mode='r') 686 | data_x = h5_file.root.data_x 687 | data_x_shapes = h5_file.root.data_x_shapes 688 | data_y = h5_file.root.data_y 689 | # A dirty hack to only monkeypatch data_x 690 | data_x.__class__ = _cVLArray 691 | 692 | # override getter so that it gets reshaped to 2D when fetched 693 | old_getter = data_x.__getitem__ 694 | 695 | def getter(self, key): 696 | if isinstance(key, numbers.Integral) or isinstance(key, np.integer): 697 | return old_getter(key).reshape(data_x_shapes[key]).astype( 698 | theano.config.floatX) 699 | elif isinstance(key, slice): 700 | start, stop, step = self._processRange(key.start, key.stop, 701 | key.step) 702 | return [o.reshape(s) for o, s in zip( 703 | self.read(start, stop, step), data_x_shapes[slice( 704 | start, stop, step)])] 705 | 706 | # Patch __getitem__ in custom subclass, applying to all instances of it 707 | _cVLArray.__getitem__ = getter 708 | 709 | train_x = data_x[:2000] 710 | train_y = data_y[:2000] 711 | valid_x = data_x[2000:2500] 712 | valid_y = data_y[2000:2500] 713 | test_x = data_x[2500:] 714 | test_y = data_y[2500:] 715 | rval = [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] 716 | return rval 717 | 718 | 719 | class BaseNet(object): 720 | def __getstate__(self): 721 | if not hasattr(self, '_pickle_skip_list'): 722 | self._pickle_skip_list = [] 723 | for k, v in self.__dict__.items(): 724 | try: 725 | f = tempfile.TemporaryFile() 726 | cPickle.dump(v, f) 727 | except: 728 | self._pickle_skip_list.append(k) 729 | state = OrderedDict() 730 | for k, v in self.__dict__.items(): 731 | if k not in self._pickle_skip_list: 732 | state[k] = v 733 | return state 734 | 735 | def __setstate__(self, state): 736 | self.__dict__ = state 737 | 738 | 739 | class TrainingMixin(object): 740 | def get_sgd_updates(self, X_sym, y_sym, params, cost, learning_rate, 741 | momentum): 742 | gparams = T.grad(cost, params) 743 | updates = OrderedDict() 744 | 745 | if not hasattr(self, "momentum_velocity_"): 746 | self.momentum_velocity_ = [0.] * len(gparams) 747 | 748 | for n, (param, gparam) in enumerate(zip(params, gparams)): 749 | velocity = self.momentum_velocity_[n] 750 | update_step = momentum * velocity - learning_rate * gparam 751 | self.momentum_velocity_[n] = update_step 752 | updates[param] = param + update_step 753 | 754 | return updates 755 | 756 | def _norm_constraint(self, param, update_step, max_col_norm): 757 | stepped_param = param + update_step 758 | if param.get_value(borrow=True).ndim == 2: 759 | col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) 760 | desired_norms = T.clip(col_norms, 0, max_col_norm) 761 | scale = desired_norms / (1e-7 + col_norms) 762 | new_param = param * scale 763 | new_update_step = update_step * scale 764 | else: 765 | new_param = param 766 | new_update_step = update_step 767 | return new_param, new_update_step 768 | 769 | def get_clip_sgd_updates(self, X_sym, y_sym, params, cost, learning_rate, 770 | momentum, rescale=5.): 771 | gparams = T.grad(cost, params) 772 | updates = OrderedDict() 773 | 774 | if not hasattr(self, "momentum_velocity_"): 775 | self.momentum_velocity_ = [0.] 
* len(gparams) 776 | 777 | # Gradient clipping 778 | grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams))) 779 | not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm)) 780 | grad_norm = T.sqrt(grad_norm) 781 | scaling_num = rescale 782 | scaling_den = T.maximum(rescale, grad_norm) 783 | for n, (param, gparam) in enumerate(zip(params, gparams)): 784 | # clip gradient directly, not momentum etc. 785 | gparam = T.switch(not_finite, 0.1 * param, 786 | gparam * (scaling_num / scaling_den)) 787 | velocity = self.momentum_velocity_[n] 788 | update_step = momentum * velocity - learning_rate * gparam 789 | self.momentum_velocity_[n] = update_step 790 | updates[param] = param + update_step 791 | return updates 792 | 793 | def get_clip_rmsprop_updates(self, X_sym, y_sym, params, cost, 794 | learning_rate, momentum, rescale=5.): 795 | gparams = T.grad(cost, params) 796 | updates = OrderedDict() 797 | 798 | if not hasattr(self, "running_average_"): 799 | self.running_square_ = [0.] * len(gparams) 800 | self.running_avg_ = [0.] * len(gparams) 801 | self.updates_storage_ = [0.] * len(gparams) 802 | 803 | if not hasattr(self, "momentum_velocity_"): 804 | self.momentum_velocity_ = [0.] * len(gparams) 805 | 806 | # Gradient clipping 807 | grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams))) 808 | not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm)) 809 | grad_norm = T.sqrt(grad_norm) 810 | scaling_num = rescale 811 | scaling_den = T.maximum(rescale, grad_norm) 812 | for n, (param, gparam) in enumerate(zip(params, gparams)): 813 | gparam = T.switch(not_finite, 0.1 * param, 814 | gparam * (scaling_num / scaling_den)) 815 | combination_coeff = 0.9 816 | minimum_grad = 1e-4 817 | old_square = self.running_square_[n] 818 | new_square = combination_coeff * old_square + ( 819 | 1. - combination_coeff) * T.sqr(gparam) 820 | old_avg = self.running_avg_[n] 821 | new_avg = combination_coeff * old_avg + ( 822 | 1. - combination_coeff) * gparam 823 | rms_grad = T.sqrt(new_square - new_avg ** 2) 824 | rms_grad = T.maximum(rms_grad, minimum_grad) 825 | velocity = self.momentum_velocity_[n] 826 | update_step = momentum * velocity - learning_rate * ( 827 | gparam / rms_grad) 828 | self.running_square_[n] = new_square 829 | self.running_avg_[n] = new_avg 830 | self.updates_storage_[n] = update_step 831 | self.momentum_velocity_[n] = update_step 832 | updates[param] = param + update_step 833 | 834 | return updates 835 | 836 | def get_sfg_updates(self, X_sym, y_sym, params, cost, 837 | learning_rate, momentum): 838 | gparams = T.grad(cost, params) 839 | updates = OrderedDict() 840 | from sfg import SFG 841 | if not hasattr(self, "sfg_"): 842 | self.count_ = theano.shared(0) 843 | self.slow_freq_ = 20 844 | self.sfg_ = SFG(params, gparams) 845 | 846 | slow_updates, fast_updates = self.sfg_.updates(self.learning_rate, 847 | self.momentum, 848 | epsilon=0.0001, 849 | momentum_clipping=None) 850 | for param in slow_updates.keys(): 851 | updates[param] = theano.ifelse.ifelse(T.eq(self.count_, 852 | self.slow_freq_ - 1), 853 | slow_updates[param], 854 | fast_updates[param]) 855 | updates[self.count_] = T.mod(self.count_ + 1, self.slow_freq_) 856 | return updates 857 | 858 | 859 | def init_linear_layer(input_size, output_size, random_state): 860 | W_values = np.asarray(random_state.uniform( 861 | low=-np.sqrt(6. / (input_size + output_size)), 862 | high=np.sqrt(6. 
/ (input_size + output_size)), 863 | size=(input_size, output_size)), dtype=theano.config.floatX) 864 | W = theano.shared(value=W_values, name='W', borrow=True) 865 | b_values = np.zeros((output_size,), dtype=theano.config.floatX) 866 | b = theano.shared(value=b_values, name='b', borrow=True) 867 | params = [W, b] 868 | return params 869 | 870 | 871 | def build_linear_layer_from_params(params, input_variable): 872 | W, b = params 873 | output_variable = T.dot(input_variable, W) + b 874 | return output_variable, params 875 | 876 | 877 | def build_linear_layer(input_size, output_size, input_variable, random_state): 878 | params = init_linear_layer(input_size, output_size, random_state) 879 | return build_linear_layer_from_params(params, input_variable) 880 | 881 | 882 | def init_tanh_layer(input_size, output_size, random_state): 883 | W_values = np.asarray(random_state.uniform( 884 | low=-np.sqrt(6. / (input_size + output_size)), 885 | high=np.sqrt(6. / (input_size + output_size)), 886 | size=(input_size, output_size)), dtype=theano.config.floatX) 887 | W = theano.shared(value=W_values, name='W', borrow=True) 888 | b_values = np.zeros((output_size,), dtype=theano.config.floatX) 889 | b = theano.shared(value=b_values, name='b', borrow=True) 890 | params = [W, b] 891 | return params 892 | 893 | 894 | def build_tanh_layer_from_params(params, input_variable): 895 | W, b = params 896 | output_variable = T.tanh(T.dot(input_variable, W) + b) 897 | return output_variable, params 898 | 899 | 900 | def build_tanh_layer(input_size, output_size, input_variable, random_state): 901 | params = init_tanh_layer(input_size, output_size, random_state) 902 | return build_tanh_layer_from_params(params, input_variable) 903 | 904 | 905 | def build_relu_layer(input_size, output_size, input_variable, random_state): 906 | W_values = np.asarray(random_state.uniform( 907 | low=-np.sqrt(6. / (input_size + output_size)), 908 | high=np.sqrt(6. / (input_size + output_size)), 909 | size=(input_size, output_size)), dtype=theano.config.floatX) 910 | W = theano.shared(value=W_values, name='W', borrow=True) 911 | b_values = np.zeros((output_size,), dtype=theano.config.floatX) 912 | b = theano.shared(value=b_values, name='b', borrow=True) 913 | output_variable = relu(T.dot(input_variable, W) + b) 914 | params = [W, b] 915 | return output_variable, params 916 | 917 | 918 | def build_sigmoid_layer(input_size, output_size, input_variable, random_state): 919 | W_values = np.asarray(random_state.uniform( 920 | low=-np.sqrt(6. / (input_size + output_size)), 921 | high=np.sqrt(6. 
/ (input_size + output_size)), 922 | size=(input_size, output_size)), dtype=theano.config.floatX) 923 | W = theano.shared(value=4 * W_values, name='W', borrow=True) 924 | b_values = np.zeros((output_size,), dtype=theano.config.floatX) 925 | b = theano.shared(value=b_values, name='b', borrow=True) 926 | output_variable = T.nnet.sigmoid(T.dot(input_variable, W) + b) 927 | params = [W, b] 928 | return output_variable, params 929 | 930 | 931 | def softmax_cost(y_hat_sym, y_sym): 932 | return -T.mean(T.log(y_hat_sym)[T.arange(y_sym.shape[0]), y_sym]) 933 | 934 | """ 935 | class FeedforwardNetwork(BaseNet, TrainingMixin): 936 | def __init__(self, hidden_layer_sizes=[500], batch_size=100, max_iter=1E3, 937 | learning_rate=0.01, momentum=0., learning_alg="sgd", 938 | activation="tanh", model_save_name="saved_model", 939 | save_frequency=100, random_seed=None): 940 | 941 | if random_seed is None or type(random_seed) is int: 942 | self.random_state = np.random.RandomState(random_seed) 943 | self.max_iter = int(max_iter) 944 | self.hidden_layer_sizes = hidden_layer_sizes 945 | self.batch_size = batch_size 946 | self.save_frequency = save_frequency 947 | self.model_save_name = model_save_name 948 | 949 | self.learning_rate = learning_rate 950 | self.momentum = momentum 951 | self.learning_alg = learning_alg 952 | if activation == "relu": 953 | self.feedforward_function = build_relu_layer 954 | elif activation == "tanh": 955 | self.feedforward_function = build_tanh_layer 956 | elif activation == "sigmoid": 957 | self.feedforward_function = build_sigmoid_layer 958 | else: 959 | raise ValueError("Value %s not understood for activation" 960 | % activation) 961 | 962 | def _setup_functions(self, X_sym, y_sym, layer_sizes): 963 | input_variable = X_sym 964 | params = [] 965 | for i, (input_size, output_size) in enumerate(zip(layer_sizes[:-1], 966 | layer_sizes[1:-1])): 967 | output_variable, layer_params = self.feedforward_function( 968 | input_size, output_size, input_variable, self.random_state) 969 | params.extend(layer_params) 970 | input_variable = output_variable 971 | 972 | output_variable, layer_params = build_linear_layer( 973 | layer_sizes[-2], layer_sizes[-1], input_variable, self.random_state) 974 | params.extend(layer_params) 975 | y_hat_sym = T.nnet.softmax(output_variable) 976 | cost = softmax_cost(y_hat_sym, y_sym) 977 | 978 | self.params_ = params 979 | 980 | if self.learning_alg == "sgd": 981 | updates = self.get_sgd_updats(X_sym, y_sym, params, cost, 982 | self.learning_rate, 983 | self.momentum) 984 | else: 985 | raise ValueError("Algorithm %s is not " 986 | "a valid argument for learning_alg!" 
987 | % self.learning_alg) 988 | self.fit_function = theano.function( 989 | inputs=[X_sym, y_sym], outputs=cost, updates=updates) 990 | self.loss_function = theano.function( 991 | inputs=[X_sym, y_sym], outputs=cost) 992 | 993 | self.predict_function = theano.function( 994 | inputs=[X_sym], 995 | outputs=[y_hat_sym],) 996 | 997 | def partial_fit(self, X, y): 998 | return self.fit_function(X, y.astype('int32')) 999 | 1000 | def fit(self, X, y, valid_X=None, valid_y=None): 1001 | input_size = X.shape[1] 1002 | output_size = len(np.unique(y)) 1003 | X_sym = T.matrix('x') 1004 | y_sym = T.ivector('y') 1005 | self.layers_ = [] 1006 | self.layer_sizes_ = [input_size] 1007 | self.layer_sizes_.extend(self.hidden_layer_sizes) 1008 | self.layer_sizes_.append(output_size) 1009 | self.training_loss_ = [] 1010 | self.validation_loss_ = [] 1011 | 1012 | if not hasattr(self, 'fit_function'): 1013 | self._setup_functions(X_sym, y_sym, 1014 | self.layer_sizes_) 1015 | 1016 | batch_indices = list(range(0, X.shape[0], self.batch_size)) 1017 | if X.shape[0] != batch_indices[-1]: 1018 | batch_indices.append(X.shape[0]) 1019 | 1020 | best_valid_loss = np.inf 1021 | for itr in range(self.max_iter): 1022 | print("Starting pass %d through the dataset" % itr) 1023 | batch_bounds = list(zip(batch_indices[:-1], batch_indices[1:])) 1024 | # Random minibatches 1025 | self.random_state.shuffle(batch_bounds) 1026 | for start, end in batch_bounds: 1027 | self.partial_fit(X[start:end], y[start:end]) 1028 | current_train_loss = self.loss_function(X, y) 1029 | self.training_loss_.append(current_train_loss) 1030 | 1031 | if (itr % self.save_frequency) == 0 or (itr == self.max_iter): 1032 | f = open(self.model_save_name + "_snapshot.pkl", 'wb') 1033 | cPickle.dump(self, f, protocol=2) 1034 | f.close() 1035 | 1036 | if valid_X is not None: 1037 | current_valid_loss = self.loss_function(valid_X, valid_y) 1038 | self.validation_loss_.append(current_valid_loss) 1039 | print("Validation loss %f" % current_valid_loss) 1040 | # if we got the best validation score until now, save 1041 | if current_valid_loss < best_valid_loss: 1042 | best_valid_loss = current_valid_loss 1043 | f = open(self.model_save_name + "_best.pkl", 'wb') 1044 | cPickle.dump(self, f, protocol=2) 1045 | f.close() 1046 | return self 1047 | 1048 | def predict(self, X): 1049 | return np.argmax(self.predict_function(X), axis=1) 1050 | """ 1051 | 1052 | 1053 | def init_recurrent_conditional_lstm_layer(input_size, hidden_size, output_size, 1054 | random_state): 1055 | # input to LSTM 1056 | W_ = np.concatenate( 1057 | [np_rand((input_size, hidden_size), random_state), 1058 | np_rand((input_size, hidden_size), random_state), 1059 | np_rand((input_size, hidden_size), random_state), 1060 | np_rand((input_size, hidden_size), random_state)], 1061 | axis=1) 1062 | 1063 | W = theano.shared(W_, borrow=True) 1064 | 1065 | # LSTM to LSTM 1066 | U_ = np.concatenate( 1067 | [np_ortho((hidden_size, hidden_size), random_state), 1068 | np_ortho((hidden_size, hidden_size), random_state), 1069 | np_ortho((hidden_size, hidden_size), random_state), 1070 | np_ortho((hidden_size, hidden_size), random_state)], 1071 | axis=1) 1072 | 1073 | U = theano.shared(U_, borrow=True) 1074 | 1075 | # bias to LSTM 1076 | # TODO: Ilya init for biases... 
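# ("Ilya init" presumably refers to Ilya Sutskever's recommendation of starting
# the LSTM forget-gate bias at a positive value such as 1 so the cell retains
# its memory early in training; for now every gate bias starts at zero.)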
1077 | b = shared_zeros((4 * hidden_size,)) 1078 | 1079 | # Context to LSTM 1080 | Wc = shared_rand((output_size, 4 * hidden_size), random_state) 1081 | 1082 | # attention: context to hidden 1083 | Wc_att = shared_ortho((output_size, output_size), random_state) 1084 | 1085 | # attention: LSTM to hidden 1086 | Wd_att = shared_rand((hidden_size, output_size), random_state) 1087 | 1088 | # attention: hidden bias 1089 | b_att = shared_zeros((output_size,)) 1090 | 1091 | # attention 1092 | U_att = shared_rand((output_size, 1), random_state) 1093 | c_att = shared_zeros((1,)) 1094 | 1095 | params = [W, U, b, Wc, Wc_att, Wd_att, b_att, U_att, c_att] 1096 | 1097 | return params 1098 | 1099 | 1100 | def build_recurrent_conditional_lstm_layer(input_size, hidden_size, output_size, 1101 | input_variable, mask, context, 1102 | context_mask, init_state, 1103 | init_memory, random_state, 1104 | one_step=False): 1105 | params = init_recurrent_conditional_lstm_layer(input_size, hidden_size, 1106 | output_size, random_state) 1107 | 1108 | return build_recurrent_conditional_lstm_layer_from_params(params, 1109 | input_variable, 1110 | mask, context, 1111 | context_mask, 1112 | init_state, 1113 | init_memory, 1114 | random_state, 1115 | one_step=one_step) 1116 | 1117 | 1118 | def build_recurrent_conditional_lstm_layer_from_params(params, input_variable, 1119 | mask, context, 1120 | context_mask, init_state, 1121 | init_memory, 1122 | random_state, 1123 | one_step=False): 1124 | [W, U, b, Wc, Wc_att, Wd_att, b_att, U_att, c_att] = params 1125 | 1126 | n_steps = input_variable.shape[0] 1127 | n_samples = input_variable.shape[1] 1128 | n_features = input_variable.shape[2] 1129 | 1130 | hidden_size = U.shape[0] 1131 | 1132 | # projected context 1133 | projected_context = T.dot(context, Wc_att) + b_att 1134 | 1135 | # projected input 1136 | x = T.dot(input_variable, W) + b 1137 | 1138 | def _slice(X, n, hidden_size): 1139 | # Function is needed because tensor size changes across calls to step? 1140 | if X.ndim == 3: 1141 | return X[:, :, n * hidden_size:(n + 1) * hidden_size] 1142 | return X[:, n * hidden_size:(n + 1) * hidden_size] 1143 | 1144 | def step(x_t, m, h_tm1, c_tm1, ctx_t, att, pctx_): 1145 | projected_state = T.dot(h_tm1, Wd_att) 1146 | pctx_ = T.tanh(pctx_ + projected_state[None, :, :]) 1147 | new_att = T.dot(pctx_, U_att) + c_att 1148 | new_att = new_att.reshape([new_att.shape[0], new_att.shape[1]]) 1149 | new_att = T.exp(new_att) * context_mask 1150 | new_att = new_att / new_att.sum(axis=0, keepdims=True) 1151 | # Current context 1152 | ctx_t = (context * new_att[:, :, None]).sum(axis=0) 1153 | 1154 | preactivation = T.dot(h_tm1, U) 1155 | preactivation += x_t 1156 | preactivation += T.dot(ctx_t, Wc) 1157 | 1158 | i_t = T.nnet.sigmoid(_slice(preactivation, 0, hidden_size)) 1159 | f_t = T.nnet.sigmoid(_slice(preactivation, 1, hidden_size)) 1160 | o_t = T.nnet.sigmoid(_slice(preactivation, 2, hidden_size)) 1161 | c_t = T.tanh(_slice(preactivation, 3, hidden_size)) 1162 | 1163 | c_t = f_t * c_tm1 + i_t * c_t 1164 | c_t = m[:, None] * c_t + (1. - m)[:, None] * c_tm1 1165 | h_t = o_t * T.tanh(c_t) 1166 | h_t = m[:, None] * h_t + (1. - m)[:, None] * h_tm1 1167 | return (h_t, c_t, ctx_t, new_att.T, projected_state, 1168 | i_t, f_t, o_t, preactivation) 1169 | 1170 | init_context = T.zeros((n_samples, context.shape[2]), 1171 | dtype=theano.config.floatX) 1172 | init_att = T.zeros((n_samples, context.shape[0]), 1173 | dtype=theano.config.floatX) 1174 | # Scan cannot handle batch sizes of 1? 
1175 | # Unbroadcast can fix it... but still weird 1176 | #https://github.com/Theano/Theano/issues/1772 1177 | #init_context = T.unbroadcast(init_context, 0) 1178 | #init_att = T.unbroadcast(init_att, 0) 1179 | 1180 | if one_step: 1181 | rval = step(x, mask, init_state, init_memory, None, None, 1182 | projected_context) 1183 | else: 1184 | rval, _ = theano.scan(step, 1185 | sequences=[x, mask], 1186 | outputs_info=[init_state, init_memory, 1187 | init_context, init_att, 1188 | None, None, None, None, None], 1189 | non_sequences=[projected_context,], 1190 | n_steps=n_steps) 1191 | 1192 | #hidden = rval[0] 1193 | #state = rval[1] 1194 | #final_context = rval[2] 1195 | #final_att = rval[3] 1196 | return rval[:4], params 1197 | 1198 | 1199 | def init_recurrent_lstm_layer(input_size, hidden_size, output_size, 1200 | random_state): 1201 | # input to LSTM 1202 | W_ = np.concatenate( 1203 | [np_rand((input_size, hidden_size), random_state), 1204 | np_rand((input_size, hidden_size), random_state), 1205 | np_rand((input_size, hidden_size), random_state), 1206 | np_rand((input_size, hidden_size), random_state)], 1207 | axis=1) 1208 | 1209 | W = theano.shared(W_, borrow=True) 1210 | 1211 | # LSTM to LSTM 1212 | U_ = np.concatenate( 1213 | [np_ortho((hidden_size, hidden_size), random_state), 1214 | np_ortho((hidden_size, hidden_size), random_state), 1215 | np_ortho((hidden_size, hidden_size), random_state), 1216 | np_ortho((hidden_size, hidden_size), random_state)], 1217 | axis=1) 1218 | 1219 | U = theano.shared(U_, borrow=True) 1220 | 1221 | # bias to LSTM 1222 | b = shared_zeros((4 * hidden_size,)) 1223 | 1224 | params = [W, U, b] 1225 | return params 1226 | 1227 | 1228 | def build_recurrent_lstm_layer(input_size, hidden_size, output_size, 1229 | input_variable, mask, 1230 | random_state, one_step=False): 1231 | params = init_recurrent_lstm_layer(input_size, hidden_size, output_size, 1232 | random_state) 1233 | return build_recurrent_lstm_layer_from_params(params, input_variable, mask, 1234 | random_state, 1235 | one_step=one_step) 1236 | 1237 | 1238 | def build_recurrent_lstm_layer_from_params(params, input_variable, mask, 1239 | random_state, one_step=False): 1240 | [W, U, b] = params 1241 | 1242 | hidden_size = U.shape[0] 1243 | 1244 | n_steps = input_variable.shape[0] 1245 | n_samples = input_variable.shape[1] 1246 | n_features = input_variable.shape[2] 1247 | 1248 | def _slice(X, n, hidden_size): 1249 | # Function is needed because tensor size changes across calls to step? 1250 | if X.ndim == 3: 1251 | return X[:, :, n * hidden_size:(n + 1) * hidden_size] 1252 | return X[:, n * hidden_size:(n + 1) * hidden_size] 1253 | 1254 | def step(x_t, m, h_tm1, c_tm1): 1255 | preactivation = T.dot(h_tm1, U) 1256 | preactivation += x_t 1257 | preactivation += b 1258 | 1259 | i_t = T.nnet.sigmoid(_slice(preactivation, 0, hidden_size)) 1260 | f_t = T.nnet.sigmoid(_slice(preactivation, 1, hidden_size)) 1261 | o_t = T.nnet.sigmoid(_slice(preactivation, 2, hidden_size)) 1262 | c_t = T.tanh(_slice(preactivation, 3, hidden_size)) 1263 | 1264 | c_t = f_t * c_tm1 + i_t * c_t 1265 | c_t = m[:, None] * c_t + (1. - m)[:, None] * c_tm1 1266 | h_t = o_t * T.tanh(c_t) 1267 | h_t = m[:, None] * h_t + (1. - m)[:, None] * h_tm1 1268 | return h_t, c_t, i_t, f_t, o_t, preactivation 1269 | 1270 | # Scan cannot handle batch sizes of 1? 1271 | # Unbroadcast can fix it... 
but still weird 1272 | #https://github.com/Theano/Theano/issues/1772 1273 | init_hidden = T.zeros((n_samples, hidden_size)) 1274 | init_cell = T.zeros((n_samples, hidden_size)) 1275 | init_hidden = T.unbroadcast(init_hidden, 0) 1276 | init_cell = T.unbroadcast(init_cell, 0) 1277 | 1278 | x = T.dot(input_variable, W) + b 1279 | if one_step: 1280 | rval = step(x, mask, init_hidden, init_cell) 1281 | else: 1282 | rval, _ = theano.scan(step, 1283 | sequences=[x, mask], 1284 | outputs_info=[init_hidden, init_cell, 1285 | None, None, None, None], 1286 | n_steps=n_steps) 1287 | 1288 | hidden = rval[0] 1289 | return hidden, params 1290 | 1291 | 1292 | def recurrence_relation(size): 1293 | """ 1294 | Based on code from Shawn Tan 1295 | """ 1296 | 1297 | eye2 = T.eye(size + 2) 1298 | return T.eye(size) + eye2[2:, 1:-1] + eye2[2:, :-2] * (T.arange(size) % 2) 1299 | 1300 | 1301 | def path_probs(predict, y_sym): 1302 | """ 1303 | Based on code from Rakesh - blank is assumed to be highest class in y_sym 1304 | """ 1305 | pred_y = predict[:, y_sym] 1306 | rr = recurrence_relation(y_sym.shape[0]) 1307 | 1308 | def step(p_curr, p_prev): 1309 | return p_curr * T.dot(p_prev, rr) 1310 | 1311 | probabilities, _ = theano.scan( 1312 | step, 1313 | sequences=[pred_y], 1314 | outputs_info=[T.eye(y_sym.shape[0])[0]] 1315 | ) 1316 | return probabilities 1317 | 1318 | 1319 | def _epslog(X): 1320 | return T.cast(T.log(T.clip(X, 1E-12, 1E12)), theano.config.floatX) 1321 | 1322 | 1323 | def log_path_probs(y_hat_sym, y_sym): 1324 | """ 1325 | Based on code from Shawn Tan with calculations in log space 1326 | """ 1327 | pred_y = y_hat_sym[:, y_sym] 1328 | rr = recurrence_relation(y_sym.shape[0]) 1329 | 1330 | def step(logp_curr, logp_prev): 1331 | return logp_curr + _epslog(T.dot(T.exp(logp_prev), rr)) 1332 | 1333 | log_probs, _ = theano.scan( 1334 | step, 1335 | sequences=[_epslog(pred_y)], 1336 | outputs_info=[_epslog(T.eye(y_sym.shape[0])[0])] 1337 | ) 1338 | return log_probs 1339 | 1340 | 1341 | def ctc_cost(y_hat_sym, y_sym): 1342 | """ 1343 | Based on code from Shawn Tan 1344 | """ 1345 | forward_probs = path_probs(y_hat_sym, y_sym) 1346 | backward_probs = path_probs(y_hat_sym[::-1], y_sym[::-1])[::-1, ::-1] 1347 | probs = forward_probs * backward_probs / y_hat_sym[:, y_sym] 1348 | total_probs = T.sum(probs) 1349 | return -T.log(total_probs) 1350 | 1351 | 1352 | def log_ctc_cost(y_hat_sym, y_sym): 1353 | """ 1354 | Based on code from Shawn Tan with sum calculations in log space 1355 | """ 1356 | log_forward_probs = log_path_probs(y_hat_sym, y_sym) 1357 | log_backward_probs = log_path_probs( 1358 | y_hat_sym[::-1], y_sym[::-1])[::-1, ::-1] 1359 | log_probs = log_forward_probs + log_backward_probs - _epslog( 1360 | y_hat_sym[:, y_sym]) 1361 | log_probs = log_probs.flatten() 1362 | max_log = T.max(log_probs) 1363 | # Stable logsumexp 1364 | loss = max_log + T.log(T.sum(T.exp(log_probs - max_log))) 1365 | return -loss 1366 | 1367 | 1368 | def rnn_check_array(X, y=None): 1369 | if type(X) == np.ndarray and len(X.shape) == 2: 1370 | X = [X.astype(theano.config.floatX)] 1371 | elif type(X) == np.ndarray and len(X.shape) == 3: 1372 | X = X.astype(theano.config.floatX) 1373 | elif type(X) == list: 1374 | if type(X[0]) == np.ndarray and len(X[0].shape) == 2: 1375 | X = [x.astype(theano.config.floatX) for x in X] 1376 | else: 1377 | raise ValueError("X must be a 2D numpy array or an" 1378 | "iterable of 2D numpy arrays") 1379 | try: 1380 | X[0].shape[1] 1381 | except AttributeError: 1382 | raise ValueError("X must be a 2D 
numpy array or an" 1383 | "iterable of 2D numpy arrays") 1384 | 1385 | if y is not None: 1386 | if type(y) == np.ndarray and len(y.shape) == 1: 1387 | y = [y.astype('int32')] 1388 | elif type(y) == np.ndarray and len(y.shape) == 2: 1389 | y = y.astype('int32') 1390 | elif type(y) == list: 1391 | if type(y[0]) == np.ndarray and len(y[0].shape) == 1: 1392 | y = [yi.astype('int32') for yi in y] 1393 | elif type(y[0]) != np.ndarray: 1394 | y = [np.asarray(y).astype('int32')] 1395 | try: 1396 | y[0].shape[0] 1397 | except AttributeError: 1398 | raise ValueError("y must be an iterable of 1D numpy arrays") 1399 | return X, y 1400 | else: 1401 | # If y is not passed don't return it 1402 | return X 1403 | 1404 | 1405 | class RecurrentNetwork(BaseNet, TrainingMixin): 1406 | def __init__(self, hidden_layer_sizes=[100], max_iter=1E2, 1407 | learning_rate=0.01, momentum=0., learning_alg="sgd", 1408 | recurrent_activation="lstm", minibatch_size=1, 1409 | bidirectional=False, cost="softmax", save_frequency=10, 1410 | model_save_name="saved_model", random_seed=None, 1411 | input_checking=True): 1412 | if random_seed is None or type(random_seed) is int: 1413 | self.random_state = np.random.RandomState(random_seed) 1414 | self.learning_rate = learning_rate 1415 | self.learning_alg = learning_alg 1416 | self.momentum = momentum 1417 | self.bidirectional = bidirectional 1418 | self.cost = cost 1419 | self.hidden_layer_sizes = hidden_layer_sizes 1420 | self.max_iter = int(max_iter) 1421 | self.minibatch_size = minibatch_size 1422 | self.save_frequency = save_frequency 1423 | self.model_save_name = model_save_name 1424 | self.recurrent_activation = recurrent_activation 1425 | self.input_checking = input_checking 1426 | if recurrent_activation == "lstm": 1427 | self.recurrent_function = build_recurrent_lstm_layer 1428 | else: 1429 | raise ValueError("Value %s not understood for recurrent_activation" 1430 | % recurrent_activation) 1431 | 1432 | def _setup_functions(self, X_sym, y_sym, X_mask, y_mask, layer_sizes): 1433 | input_variable = X_sym 1434 | 1435 | # layer_sizes consists of input size, all hidden sizes, and output size 1436 | hidden_sizes = layer_sizes[1:-1] 1437 | # set these to stop pep8 vim plugin from complaining 1438 | input_size = None 1439 | output_size = None 1440 | for n in range(len(hidden_sizes)): 1441 | if (n - 1) < 0: 1442 | input_size = layer_sizes[0] 1443 | else: 1444 | if self.bidirectional: 1445 | # Accomodate for concatenated hiddens 1446 | input_size = 2 * output_size 1447 | else: 1448 | input_size = output_size 1449 | hidden_size = hidden_sizes[n] 1450 | if (n + 1) != len(hidden_sizes): 1451 | output_size = hidden_sizes[n + 1] 1452 | else: 1453 | output_size = layer_sizes[-1] 1454 | 1455 | forward_hidden, forward_params = self.recurrent_function( 1456 | input_size, hidden_size, output_size, input_variable, X_mask, 1457 | self.random_state) 1458 | 1459 | if self.bidirectional: 1460 | backward_hidden, backward_params = self.recurrent_function( 1461 | input_size, hidden_size, output_size, input_variable[::-1], 1462 | X_mask[::-1], self.random_state) 1463 | params = forward_params + backward_params 1464 | input_variable = concatenate( 1465 | [forward_hidden, backward_hidden[::-1]], 1466 | axis=forward_hidden.ndim - 1) 1467 | else: 1468 | params = forward_params 1469 | input_variable = forward_hidden 1470 | 1471 | if self.bidirectional: 1472 | # Accomodate for concatenated hiddens 1473 | sz = 2 * hidden_sizes[-1] 1474 | else: 1475 | sz = hidden_sizes[-1] 1476 | 1477 | if self.cost == 
"softmax": 1478 | # easy mode 1479 | output, output_params = build_linear_layer(sz, output_size, 1480 | input_variable, 1481 | self.random_state) 1482 | params = params + output_params 1483 | shp = output.shape 1484 | output = output.reshape([shp[0] * shp[1], shp[2]]) 1485 | y_hat_sym = T.nnet.softmax(output) 1486 | y_sym_reshaped = y_sym.reshape([shp[0] * shp[1], shp[2]]) 1487 | cost = -T.mean((y_sym_reshaped * T.log(y_hat_sym)).sum(axis=1)) 1488 | 1489 | elif self.cost == "encdec": 1490 | # hardmode 1491 | context = input_variable 1492 | context_mean = context[0] 1493 | 1494 | init_state, state_params = build_tanh_layer(sz, hidden_sizes[-1], 1495 | context_mean, 1496 | self.random_state) 1497 | init_memory, memory_params = build_tanh_layer(sz, hidden_sizes[-1], 1498 | context_mean, 1499 | self.random_state) 1500 | # partial sampler setup 1501 | self._encode = theano.function([X_sym, X_mask], 1502 | [init_state, init_memory, context]) 1503 | init_state_sampler = T.matrix() 1504 | init_memory_sampler = T.matrix() 1505 | y_sw_sampler = T.tensor3() 1506 | y_sw_mask = T.alloc(1., y_sw_sampler.shape[0], 1) 1507 | 1508 | # need this style of init to reuse params for sampler and actual 1509 | # training. This makes this part quite nasty - dictionary 1510 | # for initialization and params is making more and more sense. 1511 | # conditional params will be reused below 1512 | conditional_params = init_recurrent_conditional_lstm_layer( 1513 | output_size, hidden_sizes[-1], sz, self.random_state) 1514 | 1515 | rval, _p = build_recurrent_conditional_lstm_layer_from_params( 1516 | conditional_params, y_sw_sampler, y_sw_mask, context, X_mask, 1517 | init_state_sampler, init_memory_sampler, 1518 | self.random_state, one_step=True) 1519 | next_state, next_memory, sampler_contexts, _ = rval 1520 | #end sampler parts... 
for now 1521 | 1522 | params = params + state_params + memory_params 1523 | shifted_labels = T.zeros_like(y_sym) 1524 | shifted_labels = T.set_subtensor(shifted_labels[1:], y_sym[:-1]) 1525 | y_sym = shifted_labels 1526 | 1527 | rval, _p = build_recurrent_conditional_lstm_layer_from_params( 1528 | conditional_params, shifted_labels, y_mask, context, X_mask, 1529 | init_state, init_memory, self.random_state) 1530 | projected_hidden, _, contexts, attention = rval 1531 | 1532 | params = params + conditional_params 1533 | 1534 | # once again, need to use same params for sample gen 1535 | lh_params = init_linear_layer(hidden_sizes[-1], output_size, 1536 | self.random_state) 1537 | logit_hidden, _ = build_linear_layer_from_params(lh_params, 1538 | projected_hidden) 1539 | params = params + lh_params 1540 | 1541 | lo_params = init_linear_layer(output_size, output_size, 1542 | self.random_state) 1543 | logit_out, _ = build_linear_layer_from_params(lo_params, 1544 | y_sym) 1545 | params = params + lo_params 1546 | 1547 | 1548 | lc_params = init_linear_layer(sz, output_size, 1549 | self.random_state) 1550 | logit_contexts, _ = build_linear_layer_from_params(lc_params, 1551 | contexts) 1552 | params = params + lc_params 1553 | 1554 | logit = T.tanh(logit_hidden + logit_out + logit_contexts) 1555 | output_params = init_linear_layer(output_size, output_size, 1556 | self.random_state) 1557 | output, _ = build_linear_layer_from_params(output_params, 1558 | logit) 1559 | params = params + output_params 1560 | 1561 | shp = output.shape 1562 | output = output.reshape([shp[0] * shp[1], shp[2]]) 1563 | y_hat_sym = T.nnet.softmax(output) 1564 | 1565 | # Apply the mask so that padded steps do not contribute to the cost 1566 | y_sym_reshaped = (y_sym * y_mask.dimshuffle(0, 1, 'x')).reshape( 1567 | [shp[0] * shp[1], shp[2]]) 1568 | 1569 | cost = -T.mean((y_sym_reshaped * T.log(y_hat_sym)).sum(axis=1)) 1570 | 1571 | # Finish sampler 1572 | logit_sampler_hidden, _ = build_linear_layer_from_params(lh_params, 1573 | next_state) 1574 | logit_sampler_out, _ = build_linear_layer_from_params(lo_params, 1575 | y_sw_sampler) 1576 | logit_sampler_contexts, _ = build_linear_layer_from_params( 1577 | lc_params, sampler_contexts) 1578 | logit_sampler = T.tanh(logit_sampler_hidden + logit_sampler_out 1579 | + logit_sampler_contexts) 1580 | output_sampler, _ = build_linear_layer_from_params(output_params, 1581 | logit_sampler) 1582 | shp = output_sampler.shape 1583 | output_sampler = output_sampler.reshape([shp[0] * shp[1], shp[2]]) 1584 | y_hat_sampler = T.nnet.softmax(output_sampler) 1585 | self._sampler_step = theano.function( 1586 | [y_sw_sampler, context, X_mask, init_state_sampler, 1587 | init_memory_sampler], 1588 | [y_hat_sampler, next_state, next_memory]) 1589 | 1590 | else: 1591 | raise ValueError("Value of %s not a valid cost!" 1592 | % self.cost) 1593 | 1594 | self.params_ = params 1595 | 1596 | if self.learning_alg == "sgd": 1597 | updates = self.get_clip_sgd_updates( 1598 | X_sym, y_sym, params, cost, self.learning_rate, self.momentum) 1599 | elif self.learning_alg == "rmsprop": 1600 | updates = self.get_clip_rmsprop_updates( 1601 | X_sym, y_sym, params, cost, self.learning_rate, self.momentum) 1602 | elif self.learning_alg == "sfg": 1603 | updates = self.get_sfg_updates( 1604 | X_sym, y_sym, params, cost, self.learning_rate, self.momentum) 1605 | else: 1606 | raise ValueError("Value of %s not a valid learning_alg!" 
1607 | % self.learning_alg) 1608 | 1609 | if self.cost == "softmax": 1610 | self.fit_function = theano.function(inputs=[X_sym, y_sym, X_mask, 1611 | y_mask], 1612 | outputs=cost, 1613 | updates=updates, 1614 | on_unused_input="ignore") 1615 | 1616 | self.loss_function = theano.function(inputs=[X_sym, y_sym, X_mask, 1617 | y_mask], 1618 | outputs=cost, 1619 | on_unused_input="ignore") 1620 | 1621 | self.predict_function = theano.function( 1622 | inputs=[X_sym, X_mask], 1623 | outputs=y_hat_sym, 1624 | on_unused_input="ignore") 1625 | 1626 | else: 1627 | self.fit_function = theano.function(inputs=[X_sym, y_sym, X_mask, 1628 | y_mask], 1629 | outputs=cost, 1630 | updates=updates, 1631 | on_unused_input="warn") 1632 | 1633 | self.loss_function = theano.function(inputs=[X_sym, y_sym, X_mask, 1634 | y_mask], 1635 | outputs=cost, 1636 | on_unused_input="warn") 1637 | 1638 | self.predict_function = theano.function( 1639 | inputs=[X_sym, X_mask, y_sym, y_mask], 1640 | outputs=y_hat_sym) 1641 | 1642 | 1643 | def fit(self, X, y, valid_X=None, valid_y=None): 1644 | if self.input_checking: 1645 | X, y = rnn_check_array(X, y) 1646 | input_size = X[0].shape[1] 1647 | # Assume that class values are sequential! and start from 0 1648 | highest_class = np.max([np.max(d) for d in y]) 1649 | lowest_class = np.min([np.min(d) for d in y]) 1650 | if lowest_class != 0: 1651 | raise ValueError("Labels must start from 0!") 1652 | # Create a list of all classes, then get uniques 1653 | # sum(lists, []) is list concatenation 1654 | all_classes = np.unique(sum([list(np.unique(d)) for d in y], [])) 1655 | # +1 to include endpoint 1656 | output_size = len(np.arange(lowest_class, highest_class + 1)) 1657 | X_sym = T.tensor3('x') 1658 | y_sym = T.tensor3('y') 1659 | X_mask = T.matrix('x_mask') 1660 | y_mask = T.matrix('y_mask') 1661 | 1662 | self.layers_ = [] 1663 | self.layer_sizes_ = [input_size] 1664 | self.layer_sizes_.extend(self.hidden_layer_sizes) 1665 | self.layer_sizes_.append(output_size) 1666 | if not hasattr(self, 'fit_function'): 1667 | print("Building model!") 1668 | self._setup_functions(X_sym, y_sym, X_mask, y_mask, 1669 | self.layer_sizes_) 1670 | self.training_loss_ = [] 1671 | if valid_X is not None: 1672 | self.validation_loss_ = [] 1673 | if self.input_checking: 1674 | valid_X, valid_y = rnn_check_array(valid_X, valid_y) 1675 | for vy in valid_y: 1676 | if not np.in1d(np.unique(vy), all_classes).all(): 1677 | raise ValueError( 1678 | "Validation set contains classes not in training" 1679 | "set! 
Training set classes: %s\n, Validation set \ 1680 | classes: %s" % (all_classes, np.unique(vy))) 1681 | 1682 | best_valid_loss = np.inf 1683 | for itr in range(self.max_iter): 1684 | print("Starting pass %d through the dataset" % itr) 1685 | total_train_loss = 0 1686 | for i, j in minibatch_indices(X, self.minibatch_size): 1687 | X_n, y_n, X_mask, y_mask = make_minibatch(X[i:j], y[i:j], 1688 | output_size) 1689 | train_loss = self.fit_function(X_n, y_n, X_mask, y_mask) 1690 | total_train_loss += train_loss 1691 | current_train_loss = total_train_loss / len(X) 1692 | print("Training loss %f" % current_train_loss) 1693 | self.training_loss_.append(current_train_loss) 1694 | 1695 | if (itr % self.save_frequency) == 0 or (itr == self.max_iter): 1696 | f = open(self.model_save_name + "_snapshot.pkl", 'wb') 1697 | cPickle.dump(self, f, protocol=2) 1698 | f.close() 1699 | 1700 | if valid_X is not None: 1701 | total_valid_loss = 0 1702 | for i, j in minibatch_indices(valid_X, self.minibatch_size): 1703 | valid_X_n, valid_y_n, X_mask, y_mask = make_minibatch( 1704 | valid_X[i:j], valid_y[i:j], output_size) 1705 | valid_loss = self.loss_function(valid_X_n, valid_y_n, 1706 | X_mask, y_mask) 1707 | total_valid_loss += valid_loss 1708 | current_valid_loss = total_valid_loss / len(valid_X) 1709 | print("Validation loss %f" % current_valid_loss) 1710 | self.validation_loss_.append(current_valid_loss) 1711 | if current_valid_loss < best_valid_loss: 1712 | best_valid_loss = current_valid_loss 1713 | f = open(self.model_save_name + "_best.pkl", 'wb') 1714 | cPickle.dump(self, f, protocol=2) 1715 | f.close() 1716 | 1717 | 1718 | def predict(self, X): 1719 | raise ValueError("Not yet implemented!") 1720 | X = rnn_check_array(X) 1721 | predictions = [] 1722 | for n in range(len(X)): 1723 | X_n = X[n][None].transpose(1, 0, 2) 1724 | X_mask = np.ones((len(X_n), 1)).astype(theano.config.floatX) 1725 | pred = np.argmax(self.predict_function(X_n, X_mask)[0], axis=1) 1726 | predictions.append(pred) 1727 | return predictions 1728 | 1729 | def predict_proba(self, X): 1730 | raise ValueError("Not yet implemented!") 1731 | X = rnn_check_array(X) 1732 | predictions = [] 1733 | for n in range(len(X)): 1734 | X_n = X[n][None].transpose(1, 0, 2) 1735 | X_mask = np.ones((len(X_n), 1)).astype(theano.config.floatX) 1736 | pred = self.predict_function(X_n, X_mask)[0] 1737 | predictions.append(pred) 1738 | return predictions 1739 | -------------------------------------------------------------------------------- /tests/bi_multilayer_softmax.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Test adapted from Mohammad P 6 | # https://github.com/mohammadpz/Recurrent-Neural-Networks 7 | 8 | n_u = 2 9 | n_h = 6 10 | n_y = 3 11 | time_steps = 10 12 | n_seq = 100 13 | # n_y is equal to the number of calsses 14 | random_state = np.random.RandomState(1999) 15 | 16 | seq = random_state.randn(n_seq, time_steps, n_u) 17 | targets = np.zeros((n_seq, time_steps), dtype=np.int32) 18 | 19 | thresh = 0.5 20 | targets[:, 2:][seq[:, 1:-1, 1] > seq[:, :-2, 0] + thresh] = 1 21 | targets[:, 2:][seq[:, 1:-1, 1] < seq[:, :-2, 0] - thresh] = 2 22 | 23 | clf = RecurrentNetwork(learning_alg="sgd", hidden_layer_sizes=[n_h, n_h], 24 | max_iter=1E3, cost="softmax", learning_rate=0.1, 25 | momentum=0.99, recurrent_activation="lstm", 26 | bidirectional=True, random_seed=1999) 27 | 28 | clf.fit(seq, targets) 29 | 30 | 
plt.close('all') 31 | fig = plt.figure() 32 | plt.grid() 33 | ax1 = plt.subplot(211) 34 | 35 | plt.scatter(np.arange(time_steps), targets[1], marker='o', c='b') 36 | plt.grid() 37 | 38 | guess = clf.predict_proba(seq[1]) 39 | guessed_probs = plt.imshow(guess[0].T, interpolation='nearest', cmap='gray') 40 | ax1.set_title('blue points: true class, grayscale: model output (white mean class)') 41 | 42 | ax2 = plt.subplot(212) 43 | plt.plot(clf.training_loss_) 44 | plt.grid() 45 | ax2.set_title('Training loss') 46 | plt.show() 47 | -------------------------------------------------------------------------------- /tests/bi_softmax.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Test adapted from Mohammad P 6 | # https://github.com/mohammadpz/Recurrent-Neural-Networks 7 | 8 | n_u = 2 9 | n_h = 6 10 | n_y = 3 11 | time_steps = 10 12 | n_seq = 100 13 | # n_y is equal to the number of calsses 14 | random_state = np.random.RandomState(1999) 15 | 16 | seq = random_state.randn(n_seq, time_steps, n_u) 17 | targets = np.zeros((n_seq, time_steps), dtype=np.int32) 18 | 19 | thresh = 0.5 20 | targets[:, 2:][seq[:, 1:-1, 1] > seq[:, :-2, 0] + thresh] = 1 21 | targets[:, 2:][seq[:, 1:-1, 1] < seq[:, :-2, 0] - thresh] = 2 22 | 23 | clf = RecurrentNetwork(learning_alg="sgd", hidden_layer_sizes=[n_h], 24 | max_iter=1E3, cost="softmax", learning_rate=0.1, 25 | momentum=0.9, bidirectional=True, 26 | recurrent_activation="lstm", random_seed=1999) 27 | 28 | clf.fit(seq, targets) 29 | 30 | plt.close('all') 31 | fig = plt.figure() 32 | plt.grid() 33 | ax1 = plt.subplot(211) 34 | 35 | plt.scatter(np.arange(time_steps), targets[1], marker='o', c='b') 36 | plt.grid() 37 | 38 | guess = clf.predict_proba(seq[1]) 39 | guessed_probs = plt.imshow(guess[0].T, interpolation='nearest', cmap='gray') 40 | ax1.set_title('blue points: true class, grayscale: model output (white mean class)') 41 | 42 | ax2 = plt.subplot(212) 43 | plt.plot(clf.training_loss_) 44 | plt.grid() 45 | ax2.set_title('Training loss') 46 | plt.show() 47 | -------------------------------------------------------------------------------- /tests/multilayer_softmax.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Test adapted from Mohammad P 6 | # https://github.com/mohammadpz/Recurrent-Neural-Networks 7 | 8 | n_u = 2 9 | n_h = 6 10 | n_y = 3 11 | time_steps = 10 12 | n_seq = 100 13 | # n_y is equal to the number of calsses 14 | random_state = np.random.RandomState(1999) 15 | 16 | seq = random_state.randn(n_seq, time_steps, n_u) 17 | targets = np.zeros((n_seq, time_steps), dtype=np.int32) 18 | 19 | thresh = 0.5 20 | targets[:, 2:][seq[:, 1:-1, 1] > seq[:, :-2, 0] + thresh] = 1 21 | targets[:, 2:][seq[:, 1:-1, 1] < seq[:, :-2, 0] - thresh] = 2 22 | 23 | clf = RecurrentNetwork(learning_alg="sgd", hidden_layer_sizes=[n_h, n_h], 24 | max_iter=1E3, cost="softmax", learning_rate=0.1, 25 | momentum=0.99, recurrent_activation="lstm", 26 | random_seed=1999) 27 | 28 | clf.fit(seq, targets) 29 | 30 | plt.close('all') 31 | fig = plt.figure() 32 | plt.grid() 33 | ax1 = plt.subplot(211) 34 | 35 | plt.scatter(np.arange(time_steps), targets[1], marker='o', c='b') 36 | plt.grid() 37 | 38 | guess = clf.predict_proba(seq[1]) 39 | guessed_probs = plt.imshow(guess[0].T, interpolation='nearest', cmap='gray') 
40 | ax1.set_title('blue points: true class, grayscale: model output (white mean class)') 41 | 42 | ax2 = plt.subplot(212) 43 | plt.plot(clf.training_loss_) 44 | plt.grid() 45 | ax2.set_title('Training loss') 46 | plt.show() 47 | -------------------------------------------------------------------------------- /tests/net.py: -------------------------------------------------------------------------------- 1 | ../net.py -------------------------------------------------------------------------------- /tests/predict.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Test adapted from Mohammad P 6 | # https://github.com/mohammadpz/Recurrent-Neural-Networks 7 | 8 | n_u = 2 9 | n_h = 6 10 | n_y = 3 11 | time_steps = 10 12 | n_seq = 100 13 | # n_y is equal to the number of calsses 14 | random_state = np.random.RandomState(1999) 15 | 16 | seq = random_state.randn(n_seq, time_steps, n_u) 17 | targets = np.zeros((n_seq, time_steps), dtype=np.int32) 18 | 19 | thresh = 0.5 20 | targets[:, 2:][seq[:, 1:-1, 1] > seq[:, :-2, 0] + thresh] = 1 21 | targets[:, 2:][seq[:, 1:-1, 1] < seq[:, :-2, 0] - thresh] = 2 22 | 23 | clf = RecurrentNetwork(learning_alg="sgd", hidden_layer_sizes=[n_h], 24 | max_iter=10, cost="softmax", learning_rate=0.1, 25 | momentum=0.99, recurrent_activation="lstm", 26 | random_seed=1999) 27 | 28 | clf.fit(seq, targets) 29 | clf.predict(seq) 30 | 31 | plt.close('all') 32 | fig = plt.figure() 33 | plt.grid() 34 | ax1 = plt.subplot(211) 35 | 36 | plt.scatter(np.arange(time_steps), targets[1], marker='o', c='b') 37 | plt.grid() 38 | 39 | guess = clf.predict_proba(seq[1]) 40 | guessed_probs = plt.imshow(guess[0].T, interpolation='nearest', cmap='gray') 41 | ax1.set_title('blue points: true class, grayscale: model output (white mean class)') 42 | 43 | ax2 = plt.subplot(212) 44 | plt.plot(clf.training_loss_) 45 | plt.grid() 46 | ax2.set_title('Training loss') 47 | plt.show() 48 | -------------------------------------------------------------------------------- /tests/softmax.py: -------------------------------------------------------------------------------- 1 | from net import RecurrentNetwork 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Test adapted from Mohammad P 6 | # https://github.com/mohammadpz/Recurrent-Neural-Networks 7 | 8 | n_u = 2 9 | n_h = 6 10 | n_y = 3 11 | time_steps = 10 12 | n_seq = 100 13 | # n_y is equal to the number of classes 14 | random_state = np.random.RandomState(1999) 15 | 16 | seq = random_state.randn(n_seq, time_steps, n_u) 17 | targets = np.zeros((n_seq, time_steps), dtype=np.int32) 18 | 19 | thresh = 0.5 20 | targets[:, 2:][seq[:, 1:-1, 1] > seq[:, :-2, 0] + thresh] = 1 21 | targets[:, 2:][seq[:, 1:-1, 1] < seq[:, :-2, 0] - thresh] = 2 22 | 23 | clf = RecurrentNetwork(learning_alg="sgd", hidden_layer_sizes=[n_h], 24 | max_iter=1E3, cost="softmax", learning_rate=0.1, 25 | momentum=0.99, recurrent_activation="lstm", 26 | random_seed=1999) 27 | 28 | clf.fit(seq, targets) 29 | 30 | plt.close('all') 31 | fig = plt.figure() 32 | plt.grid() 33 | ax1 = plt.subplot(211) 34 | 35 | plt.scatter(np.arange(time_steps), targets[1], marker='o', c='b') 36 | plt.grid() 37 | 38 | guess = clf.predict_proba(seq[1]) 39 | guessed_probs = plt.imshow(guess[0].T, interpolation='nearest', cmap='gray') 40 | ax1.set_title('blue points: true class, grayscale: model output (white mean class)') 41 | 42 | ax2 = plt.subplot(212) 43 | 
plt.plot(clf.training_loss_) 44 | plt.grid() 45 | ax2.set_title('Training loss') 46 | plt.show() 47 | -------------------------------------------------------------------------------- /util/continue_model_mnist.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import time 3 | import os 4 | from net import load_data 5 | 6 | datasets = load_data('mnist.pkl.gz') 7 | train_set_x, train_set_y = datasets[0] 8 | valid_set_x, valid_set_y = datasets[1] 9 | test_set_x, test_set_y = datasets[2] 10 | 11 | f = open('model.save', 'rb') 12 | classifier = cPickle.load(f) 13 | print('... training') 14 | start_time = time.clock() 15 | classifier.fit(train_set_x, train_set_y, valid_set_x, valid_set_y) 16 | end_time = time.clock() 17 | print('The code for file ' + os.path.split(__file__)[1] + 18 | ' ran for %.2fm' % ((end_time - start_time) / 60.)) 19 | -------------------------------------------------------------------------------- /util/load_model.py: -------------------------------------------------------------------------------- 1 | try: 2 | import cPickle 3 | except ImportError: 4 | import pickle as cPickle 5 | import sys 6 | 7 | f = open(sys.argv[1], 'rb') 8 | clf = cPickle.load(f) 9 | 10 | from IPython import embed; embed() 11 | --------------------------------------------------------------------------------
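The CTC helpers in net.py (recurrence_relation, path_probs, log_path_probs) are the densest part of the listing above. The sketch below is a plain NumPy transcription of the same forward recursion, not part of the library: the label values, the blank index, the blank-interleaved padded target, and the fake per-frame softmax outputs are all illustrative assumptions, chosen only to show what recurrence_relation encodes and how path_probs accumulates path mass (the actual blank padding of y_sym happens elsewhere in the repository and is not shown here).

import numpy as np

def recurrence_relation(size):
    # Same construction as the Theano version: rr[i, j] == 1 when a CTC path
    # may move from padded-label position i to position j.
    eye2 = np.eye(size + 2)
    return (np.eye(size)                              # stay on position i
            + eye2[2:, 1:-1]                          # advance to i + 1
            + eye2[2:, :-2] * (np.arange(size) % 2))  # skip to i + 2, allowed only
                                                      # into odd (non-blank) positions
                                                      # of a blank-first interleaving

# Assumed toy setup: labels 'a' = 0 and 'b' = 1, blank = 2 (path_probs notes
# that blank is the highest class). The padded target interleaves blanks.
y_padded = np.array([2, 0, 2, 1, 2])                  # blank a blank b blank
rr = recurrence_relation(len(y_padded))

# Fake network output: 4 frames x 3 classes, each row a softmax distribution.
predict = np.array([[0.7, 0.1, 0.2],
                    [0.6, 0.2, 0.2],
                    [0.1, 0.7, 0.2],
                    [0.1, 0.2, 0.7]])
pred_y = predict[:, y_padded]     # per-frame probability of each padded symbol

# Forward recursion used in path_probs: alpha_t = p_t * (alpha_{t-1} @ rr),
# starting with all path mass on the first padded symbol.
alpha = np.eye(len(y_padded))[0]
for p_t in pred_y:
    alpha = p_t * alpha.dot(rr)

# Mass that finishes on the last label or the trailing blank, i.e. the
# probability of paths that spell out the toy labelling.
print(alpha[-2:].sum())

log_path_probs runs the same recursion in log space, with _epslog clipping the probabilities and the max-shifted logsumexp in log_ctc_cost keeping the final sum numerically stable for long sequences.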