├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── Tutorial.ipynb ├── setup.py └── theano_lstm ├── __init__.py ├── masked_loss.py └── shared_memory.py /.gitignore: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | # Compiled source # 3 | ################### 4 | *.com 5 | *.class 6 | *.dll 7 | *.exe 8 | *.o 9 | *.so 10 | 11 | # Packages # 12 | ############ 13 | # it's better to unpack these files and commit the raw source 14 | # git has its own built in compression methods 15 | *.7z 16 | *.dmg 17 | *.gz 18 | *.iso 19 | *.jar 20 | *.rar 21 | *.tar 22 | *.zip 23 | *.gem 24 | *.pem 25 | 26 | 27 | *.egg-info 28 | 29 | # Saves # 30 | ######### 31 | saves/* 32 | imported_saves/* 33 | pvdm_snapshots/* 34 | sentiment_data/* 35 | *.npy 36 | *.vocab 37 | *.svocab 38 | text8 39 | __pycache__/* 40 | *.pyc 41 | .ipynb_checkpoints 42 | __pycache__ 43 | 44 | build/ 45 | dist/ 46 | 47 | # Logs and databases # 48 | ###################### 49 | *.log 50 | *.sql 51 | *.sqlite 52 | 53 | # OS generated files # 54 | ###################### 55 | .DS_Store 56 | .DS_Store? 57 | ._* 58 | .Spotlight-V100 59 | .Trashes 60 | ehthumbs.db 61 | Thumbs.db 62 | ======= 63 | .DS_Store 64 | >>>>>>> 9ced40381de6f9e6c2b02fc8ba7bb993203c6d62 -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | ======= 3 | 4 | Copyright (c) 2014--2015, Jonathan Raiman 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 
12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | * Neither the name of Theano nor the names of its contributors may be 16 | used to endorse or promote products derived from this software without 17 | specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY 20 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Small Theano LSTM recurrent network module 2 | ------------------------------------------ 3 | 4 | @author: Jonathan Raiman 5 | @date: December 10th 2014 6 | 7 | Implements most of the great things that came out 8 | in 2014 concerning recurrent neural networks, and 9 | some good optimizers for these types of networks. 
10 | 11 | ### Key Features 12 | 13 | This module contains several Layer types that are useful 14 | for prediction and modeling from sequences: 15 | 16 | * A non-recurrent **Layer**, with a connection matrix W, and bias b 17 | * A recurrent **RNN Layer** that takes as input its previous hidden activation and has an initial hidden activation 18 | * A recurrent **LSTM Layer** that takes as input its previous hidden activation and memory cell values, and has initial values for both of those 19 | * An **Embedding** layer that contains an embedding matrix and takes integers as input and returns slices from its embedding matrix (e.g. word vectors) 20 | * A non-recurrent **GatedInput**, with a connection matrix W, and bias b, that multiplies a single scalar to each input (gating jointly multiple inputs) 21 | * Deals with exploding and vanishing gradients with a subgradient optimizer (Adadelta) and element-wise gradient clipping (à la Alex Graves) 22 | 23 | This module also contains the **SGD**, **AdaGrad**, and **AdaDelta** gradient descent methods that are constructed using an objective function and a set of theano variables, and returns an `updates` dictionary to pass to a theano function. 24 | 25 | 26 | ### Quick Tutorial 27 | 28 | See [a short tutorial for sequence forecasting here](http://nbviewer.ipython.org/github/JonathanRaiman/theano_lstm/blob/master/Tutorial.ipynb). 29 | Or read on for some usage examples. 30 | 31 | ### Usage 32 | 33 | Here is an example of usage with stacked LSTM units, using 34 | Adadelta to optimize, and using a scan operation from Theano (a symbolic loop for backpropagation through time). 
35 | 36 | dropout = 0.0 37 | 38 | model = StackedCells(4, layers=[20, 20], activation=T.tanh, celltype=LSTM) 39 | model.layers[0].in_gate2.activation = lambda x: x 40 | model.layers.append(Layer(20, 2, lambda x: T.nnet.softmax(x)[0])) 41 | 42 | # in this example dynamics is a random function that takes our 43 | # output along with the current state and produces an observation 44 | # for t + 1 45 | 46 | def step(x, *prev_hiddens): 47 | new_states = stacked_rnn.forward(x, prev_hiddens, dropout) 48 | return [dynamics(x, new_states[-1])] + new_states[:-1] 49 | 50 | initial_obs = T.vector() 51 | timesteps = T.iscalar() 52 | 53 | result, updates = theano.scan(step, 54 | n_steps=timesteps, 55 | outputs_info=[dict(initial=initial_obs, taps=[-1])] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]) 56 | 57 | target = T.vector() 58 | 59 | cost = (result[0][:,[0,2]] - target[[0,2]]).norm(L=2) / timesteps 60 | 61 | updates, gsums, xsums, lr, max_norm = \ 62 | create_optimization_updates(cost, model.params, method='adadelta') 63 | 64 | update_fun = theano.function([initial_obs, target, timesteps], cost, updates = updates, allow_input_downcast=True) 65 | predict_fun = theano.function([initial_obs, timesteps], result[0], allow_input_downcast=True) 66 | 67 | for example, label in training_set: 68 | c = update_fun(example, label, 10) 69 | 70 | ### Minibatch usage 71 | 72 | Suppose you now have many sequences (of equal length -- we'll generalize this later). 
Then training can be done in batches: 73 | 74 | model = StackedCells(4, layers=[20, 20], activation=T.tanh, celltype=LSTM) 75 | model.layers[0].in_gate2.activation = lambda x: x 76 | model.layers.append(Layer(20, 2, lambda x: T.nnet.softmax(x)[0])) 77 | 78 | # in this example dynamics is a function that simulates the behavior of a double 79 | # pendulum and takes our current state and produces an observation 80 | # for t + 1 81 | def dynamics(x, u): 82 | dydx = T.alloc(0.0, 4) 83 | dydx = T.set_subtensor(dydx[0], x[1]) 84 | del_ = x[2]-x[0] 85 | den1 = (M1+M2)*L1 - M2*L1*T.cos(del_)*T.cos(del_) 86 | dydx = T.set_subtensor(dydx[1],\n", 87 | ( M2*L1 * x[1] * x[1] * T.sin(del_) * T.cos(del_) 88 | + M2*G * T.sin(x[2]) * T.cos(del_) + 89 | M2*L2 * x[3] * x[3] * T.sin(del_) 90 | - (M1+M2)*G * T.sin(x[0]))/den1 ) 91 | dydx = T.set_subtensor(dydx[2], x[3]) 92 | 93 | den2 = (L2/L1)*den1 94 | dydx = T.set_subtensor(dydx[3], (-M2*L2 * x[3]*x[3]*T.sin(del_) * T.cos(del_) 95 | + (M1+M2)*G * T.sin(x[0])*T.cos(del_) 96 | - (M1+M2)*L1 * x[1]*x[1]*T.sin(del_) 97 | - (M1+M2)*G * T.sin(x[2]))/den2 + u ) 98 | return x + dydx * dt 99 | 100 | def step(x, *prev_hiddens): 101 | new_states = stacked_rnn.forward(x, prev_hiddens, dropout) 102 | return [dynamics(x, new_states[-1])] + new_states[:-1] 103 | 104 | # switch to a matrix of observations: 105 | initial_obs = T.imatrix() 106 | timesteps = T.iscalar() 107 | 108 | result, updates = theano.scan(step, 109 | n_steps=timesteps, 110 | outputs_info=[dict(initial=initial_obs, taps=[-1])] + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in model.layers if hasattr(layer, 'initial_hidden_state')]) 111 | 112 | target = T.ivector() 113 | 114 | cost = (result[0][:,:,[0,2]] - target[:,[0,2]]).norm(L=2) / timesteps 115 | 116 | updates, gsums, xsums, lr, max_norm = \ 117 | create_optimization_updates(cost, model.params, method='adadelta') 118 | 119 | update_fun = theano.function([initial_obs, target, timesteps], cost, updates = 
updates, allow_input_downcast=True) 120 | predict_fun = theano.function([initial_obs, timesteps], result[0], allow_input_downcast=True) 121 | 122 | for minibatch, labels in minibatches: 123 | c = update_fun(minibatch, label, 10) 124 | 125 | ### Minibatch usage with different sizes 126 | 127 | Generalization can be made to sequences of different lengths if we accept the minor cost of forward-propagating parts of our graph we don't care about. To do this we make all sequences the same length by padding the end of the shorter ones with some symbol. Then use a binary matrix of the same size as your minibatch of padded sequences. The matrix has a 1 in areas where the error should be calculated, and zero otherwise. Elementwise multiply this mask with your output, and then apply your objective function to this masked output. The error will be obtained everywhere, but will be zero in areas that were masked, yielding the correct error function. 128 | While there is some wasted computation, the parallelization can offset this cost and make the overall computation faster. 129 | 130 | #### MaskedLoss usage 131 | 132 | To use different length sequences, consider the following approach: 133 | 134 | * you have sequences *y_1, y_2, ..., y_n*, and labels *l_1, l_2, ..., l_n*. 135 | * pad all the sequences to the longest sequence *y_k*, and form a matrix **Y** of all padded sequences 136 | * similarly form the labels at each timestep for each padded sequence (with zeros, or some other symbol for labels in padded areas) 137 | * then record the length of the true labels (codelengths) needed before padding *c_1, c_2, ..., c_n*, and the length of the sequences before padding *l_1, l_2, ..., l_n* 138 | * pass the lengths, targets, and predictions to the masked loss as follows: 139 | 140 | predictions, updates = theano.scan(prediction_step, etc...)
141 | 142 | error = masked_loss( 143 | predictions, 144 | padded_labels, 145 | codelengths, 146 | label_starts).mean() 147 | 148 | Visually this goes something like this, for the case with three inputs, three outputs, but a single label for 149 | the final output: 150 | 151 | inputs [ x_1 x_2 x_3 ] 152 | 153 | outputs [ p_1 p_2 p_3 ] 154 | 155 | labels [ ... ... l_1 ] 156 | 157 | then we would have a matrix *x* with *x_1, x_2, x_3*, and `predictions` in the code above would contain *p_1, p_2, p_3*. 158 | We would then pass to `masked_loss` the codelength [ 1 ], since there is only "l_1" to predict, and the `label_starts` [ 2 ], 159 | indicating that errors should be computed at the third prediction (index 2 when counting from zero). 160 | 161 | #### Dropout Usage in Theano Scan 162 | 163 | To get dropout to work and be dynamically modifiable without recompiling let's consider the following usage example. 164 | 165 | First we define a variable with the likelihood that a neuron will be dropped (randomly set to 0): 166 | 167 | dropout = theano.shared(np.float64(0.3).astype(theano.config.floatX)) 168 | deterministic = False # for now 169 | 170 | Create some model: 171 | 172 | model = theano_lstm.StackedCells(50, layers=[100], celltype=theano_lstm.LSTM, activation=T.tanh) 173 | 174 | Now we want to introduce dropout noise between the input and the LSTM. To use Dropout outside of a Theano `scan` loop you could simply multiply elementwise by a binomial random variable ([see examples here](https://gist.github.com/SnippyHolloW/8a0f820261926e2f41cc)), but if you plan on using recurrent networks with a Theano `scan` you need to generate your random numbers outside of the loop. 175 | 176 | In order to keep track of these dropout activations we'll generate *masks*. *Masks* are a list with all the realizations of binomials.
We generate this list with `MultiDropout`, a special function in the `theano_lstm` module that takes different hidden layer sizes and returns a list of matrices with binomial random variable realizations inside: 177 | 178 | if dropout.get_value() > 0: 179 | if deterministic: 180 | # just multiply by the likelihood of being kept: 181 | masks = [np.float32(1.) - self.dropout for i in range(2)] 182 | else: 183 | shapes = [50, 100] 184 | masks = theano_lstm.MultiDropout( [(x.shape[0], shape) for shape in shapes] if x.ndim > 1 else shapes, 185 | self.dropout) 186 | else: 187 | masks = [] 188 | 189 | Now our loop forward function is as follows: 190 | 191 | def step(obs, hidden_state, *masks): 192 | new_state = model.forward(obs, [hidden_state], list(masks)) 193 | return new_state[1] 194 | 195 | We pass it to Theano's scan: 196 | 197 | result, _ = theano.scan(step, 198 | sequences = seq, 199 | non_sequences = masks, 200 | outputs_info = [dict(initial=model.layers[0].initial_hidden_state, taps=[-1])] 201 | ) 202 | 203 | And We're done. 204 | 205 | **Note:** To not use *Masks* pass an empty list `[]` instead. 
206 | -------------------------------------------------------------------------------- /Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%load_ext autoreload\n", 12 | "%autoreload 2\n", 13 | "import theano, theano.tensor as T\n", 14 | "import numpy as np\n", 15 | "import theano_lstm\n", 16 | "import random" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## A Nonsensical Language Model using Theano LSTM\n", 24 | "\n", 25 | "Today we will train a **nonsensical** language model !\n", 26 | "\n", 27 | "We will first collect some language data, convert it to numbers, and then feed it to a recurrent neural network and ask it to predict upcoming words. When we are done we will have a machine that can generate sentences from our made-up language ad-infinitum !\n", 28 | "\n", 29 | "### Collect Language Data\n", 30 | "\n", 31 | "The first step here is to get some data. Since we are basing our language on nonsense, we need to generate good nonsense using a sampler.\n", 32 | "\n", 33 | "Our sampler will take a probability table as input, e.g. a language where people are equally likely to say \"a\" or \"b\" would be written as follows:\n", 34 | "\n", 35 | " nonsense = Sampler({\"a\": 0.5, \"b\": 0.5})\n", 36 | " \n", 37 | "We get samples from this language like this:\n", 38 | "\n", 39 | " word = nonsense()\n", 40 | " \n", 41 | "We overloaded the `__call__` method and got this syntactic sugar." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 192, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "## Fake dataset:\n", 53 | "\n", 54 | "class Sampler:\n", 55 | " def __init__(self, prob_table):\n", 56 | " total_prob = 0.0\n", 57 | " if type(prob_table) is dict:\n", 58 | " for key, value in prob_table.items():\n", 59 | " total_prob += value\n", 60 | " elif type(prob_table) is list:\n", 61 | " prob_table_gen = {}\n", 62 | " for key in prob_table:\n", 63 | " prob_table_gen[key] = 1.0 / (float(len(prob_table)))\n", 64 | " total_prob = 1.0\n", 65 | " prob_table = prob_table_gen\n", 66 | " else:\n", 67 | " raise ArgumentError(\"__init__ takes either a dict or a list as its first argument\")\n", 68 | " if total_prob <= 0.0:\n", 69 | " raise ValueError(\"Probability is not strictly positive.\")\n", 70 | " self._keys = []\n", 71 | " self._probs = []\n", 72 | " for key in prob_table:\n", 73 | " self._keys.append(key)\n", 74 | " self._probs.append(prob_table[key] / total_prob)\n", 75 | " \n", 76 | " def __call__(self):\n", 77 | " sample = random.random()\n", 78 | " seen_prob = 0.0\n", 79 | " for key, prob in zip(self._keys, self._probs):\n", 80 | " if (seen_prob + prob) >= sample:\n", 81 | " return key\n", 82 | " else:\n", 83 | " seen_prob += prob\n", 84 | " return key" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "### Parts of Speech\n", 92 | "\n", 93 | "Now that we have a `Sampler` we can create a couple different word groups that our language uses to distinguish between different probability distributions easily:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "samplers = {\n", 105 | " \"punctuation\": Sampler({\".\": 0.49, \",\": 0.5, \";\": 0.03, \"?\": 0.05, \"!\": 0.05}),\n", 106 | " \"stop\": Sampler({\"the\": 10, 
\"from\": 5, \"a\": 9, \"they\": 3, \"he\": 3, \"it\" : 2.5, \"she\": 2.7, \"in\": 4.5}),\n", 107 | " \"noun\": Sampler([\"cat\", \"broom\", \"boat\", \"dog\", \"car\", \"wrangler\", \"mexico\", \"lantern\", \"book\", \"paper\", \"joke\",\"calendar\", \"ship\", \"event\"]),\n", 108 | " \"verb\": Sampler([\"ran\", \"stole\", \"carried\", \"could\", \"would\", \"do\", \"can\", \"carry\", \"catapult\", \"jump\", \"duck\"]),\n", 109 | " \"adverb\": Sampler([\"rapidly\", \"calmly\", \"cooly\", \"in jest\", \"fantastically\", \"angrily\", \"dazily\"])\n", 110 | " }" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Simple Grammar\n", 118 | "\n", 119 | "To create sentences from our language we create a simple recursion that goes as follows:\n", 120 | "\n", 121 | "1. If the sentence we have ends with a full stop, a question mark, or an exclamation point then end at once!\n", 122 | "2. Else our sentence should have:\n", 123 | " * A stop word\n", 124 | " * A noun\n", 125 | " * An adverb (with prob 0.3), or 2 adverbs (with prob 0.3*0.3=0.09)\n", 126 | " * A verb\n", 127 | " * Another noun (with prob 0.2), or 2 more nouns connected by a dash (with prob 0.2*0.1=0.02)\n", 128 | "3. If our sentence is now over 500 characters, add a full stop and end at once!\n", 129 | "4. 
Else add some punctuation and go back to (1)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 193, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "def generate_nonsense(word = \"\"):\n", 141 | " if word.endswith(\".\"):\n", 142 | " return word\n", 143 | " else:\n", 144 | " if len(word) > 0:\n", 145 | " word += \" \"\n", 146 | " word += samplers[\"stop\"]()\n", 147 | " word += \" \" + samplers[\"noun\"]()\n", 148 | " if random.random() > 0.7:\n", 149 | " word += \" \" + samplers[\"adverb\"]()\n", 150 | " if random.random() > 0.7:\n", 151 | " word += \" \" + samplers[\"adverb\"]()\n", 152 | " word += \" \" + samplers[\"verb\"]()\n", 153 | " if random.random() > 0.8:\n", 154 | " word += \" \" + samplers[\"noun\"]()\n", 155 | " if random.random() > 0.9:\n", 156 | " word += \"-\" + samplers[\"noun\"]()\n", 157 | " if len(word) > 500:\n", 158 | " word += \".\"\n", 159 | " else:\n", 160 | " word += \" \" + samplers[\"punctuation\"]()\n", 161 | " return generate_nonsense(word)\n", 162 | "\n", 163 | "def generate_dataset(total_size, ):\n", 164 | " sentences = []\n", 165 | " for i in range(total_size):\n", 166 | " sentences.append(generate_nonsense())\n", 167 | " return sentences\n", 168 | "\n", 169 | "# generate dataset \n", 170 | "lines = generate_dataset(100)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "### Utilities\n", 178 | "\n", 179 | "Now that we have our training corpus for our language model (optionally you could gather an actual corpus from the web :), we can now create our first utility, `Vocab`, that will hold the mapping from words to an index, and perfom the conversions from words to indices and vice-versa:" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "### Utilities:\n", 191 | "class 
Vocab:\n", 192 | " __slots__ = [\"word2index\", \"index2word\", \"unknown\"]\n", 193 | " \n", 194 | " def __init__(self, index2word = None):\n", 195 | " self.word2index = {}\n", 196 | " self.index2word = []\n", 197 | " \n", 198 | " # add unknown word:\n", 199 | " self.add_words([\"**UNKNOWN**\"])\n", 200 | " self.unknown = 0\n", 201 | " \n", 202 | " if index2word is not None:\n", 203 | " self.add_words(index2word)\n", 204 | " \n", 205 | " def add_words(self, words):\n", 206 | " for word in words:\n", 207 | " if word not in self.word2index:\n", 208 | " self.word2index[word] = len(self.word2index)\n", 209 | " self.index2word.append(word)\n", 210 | " \n", 211 | " def __call__(self, line):\n", 212 | " \"\"\"\n", 213 | " Convert from numerical representation to words\n", 214 | " and vice-versa.\n", 215 | " \"\"\"\n", 216 | " if type(line) is np.ndarray:\n", 217 | " return \" \".join([self.index2word[word] for word in line])\n", 218 | " if type(line) is list:\n", 219 | " if len(line) > 0:\n", 220 | " if line[0] is int:\n", 221 | " return \" \".join([self.index2word[word] for word in line])\n", 222 | " indices = np.zeros(len(line), dtype=np.int32)\n", 223 | " else:\n", 224 | " line = line.split(\" \")\n", 225 | " indices = np.zeros(len(line), dtype=np.int32)\n", 226 | " \n", 227 | " for i, word in enumerate(line):\n", 228 | " indices[i] = self.word2index.get(word, self.unknown)\n", 229 | " \n", 230 | " return indices\n", 231 | " \n", 232 | " @property\n", 233 | " def size(self):\n", 234 | " return len(self.index2word)\n", 235 | " \n", 236 | " def __len__(self):\n", 237 | " return len(self.index2word)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Create a Mapping from numbers to words\n", 245 | "\n", 246 | "Now we can use the `Vocab` class to gather all the words and store an Index:" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": true 254 
| }, 255 | "outputs": [], 256 | "source": [ 257 | "vocab = Vocab()\n", 258 | "for line in lines:\n", 259 | " vocab.add_words(line.split(\" \"))" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "To send our sentences in one big chunk to our neural network we transform each sentence into a row vector and place each of these rows into a bigger matrix that holds all these rows. Not all sentences have the same length, so we will pad those that are too short with 0s in `pad_into_matrix`:" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 168, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "def pad_into_matrix(rows, padding = 0):\n", 278 | " if len(rows) == 0:\n", 279 | " return np.array([0, 0], dtype=np.int32)\n", 280 | " lengths = map(len, rows)\n", 281 | " width = max(lengths)\n", 282 | " height = len(rows)\n", 283 | " mat = np.empty([height, width], dtype=rows[0].dtype)\n", 284 | " mat.fill(padding)\n", 285 | " for i, row in enumerate(rows):\n", 286 | " mat[i, 0:len(row)] = row\n", 287 | " return mat, list(lengths)\n", 288 | "\n", 289 | "# transform into big numerical matrix of sentences:\n", 290 | "numerical_lines = []\n", 291 | "for line in lines:\n", 292 | " numerical_lines.append(vocab(line))\n", 293 | "numerical_lines, numerical_lengths = pad_into_matrix(numerical_lines)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Build a Recurrent Neural Network\n", 301 | "\n", 302 | "Now the real work is upon us! Thank goodness we have our language data ready. We now create a recurrent neural network by connecting an Embedding $E$ for each word in our corpus, and stacking some special cells together to form a prediction function. 
Mathematically we want:\n", 303 | "\n", 304 | "$$\\mathrm{argmax_{E, \\Phi}} {\\bf P}(w_{k+1}| w_{k}, \\dots, w_{0}; E, \\Phi) = f(x, h)$$\n", 305 | "\n", 306 | "with $f(\\cdot, \\cdot)$ the function our recurrent neural network performs at each timestep that takes as inputs:\n", 307 | "\n", 308 | "* an observation $x$, and\n", 309 | "* a previous state $h$,\n", 310 | "\n", 311 | "and outputs a probability distribution $\\hat{p}$ over the next word.\n", 312 | "\n", 313 | "We have $x = E[ w_{k}]$ our observation at time $k$, and $h$ the internal state of our neural network, and $\\Phi$ is the set of parameters used by our classifier, and recurrent neural network, and $E$ is the embedding for our words.\n", 314 | "\n", 315 | "In practice we will obtain $E$ and $\\Phi$ iteratively using gradient descent on the error our network is making in its prediction. To do this we define our error as the [Kullback-Leibler divergence](http://en.wikipedia.org/wiki/Kullback–Leibler_divergence) (a measure of dissimilarity between probability distributions) between our estimate of $\\hat{p} = {\\bf P}(w_{k+1}| w_{k}, \\dots, w_{0}; E, \\Phi)$ and the actual value of ${\\bf P}(w_{k+1}| w_{k}, \\dots, w_{0})$ from the data (e.g. a probability distribution that is 1 for the observed next word $w_{k+1}$ and 0 elsewhere).\n", 316 | "\n", 317 | "\n", 318 | "#### Theano LSTM StackedCells function\n", 319 | "\n", 320 | "To build this predictive model we make use of [theano_lstm](https://github.com/JonathanRaiman/theano_lstm), a Python module for building recurrent neural networks using Theano. The first step we take is to declare what kind of cells we want to use by declaring a celltype. There are many different celltypes we can use, but the most common these days (and incidentally most effective) are `RNN` and `LSTM`.
For a more in-depth discussion of how these work I suggest checking out [Arxiv](http://arxiv.org/find/all/1/all:+lstm/0/1/0/all/0/1), or [Alex Graves' website](http://www.cs.toronto.edu/~graves/), or [Wikipedia](http://en.wikipedia.org/wiki/Long_short_term_memory). Here we use `celltype = LSTM`.\n", 321 | "\n", 322 | " self.model = StackedCells(input_size, celltype=celltype, layers =[hidden_size] * stack_size)\n", 323 | " \n", 324 | "Once we've declared what kind of cells we want to use, we can now choose to add an Embedding to map integers (indices) to vectors (and in our case map words to their indices, then indices to word vectors we wish to train). Intuitively this lets the network separate and recognize what it is \"seeing\" or \"receiving\" at each timestep. To add an Embedding we create `Embedding(vocabulary_size, size_of_embedding_vectors)` and insert it at the beginning of the `StackedCells`'s layers list (thereby telling `StackedCells` that this Embedding layer needs to be activated before the other ones):\n", 325 | " \n", 326 | " # add an embedding\n", 327 | " self.model.layers.insert(0, Embedding(vocab_size, input_size))\n", 328 | " \n", 329 | "The final output of our network needs to be a probability distribution over the next words (but in different application areas this could be a sentiment classification, a decision, a topic, etc...) so we add another layer that maps the internal state of the LSTMs to a probability distribution over all the words in our language.
To ensure that our prediction is indeed a probability distribution we \"activate\" our layer with a Softmax, meaning that we will exponentiate every value of the output, $q_i = e^{x_i}$, so that all values are positive, and then we will divide the output by its sum so that the output sums to 1:\n", 330 | "\n", 331 | "$$p_i = \\frac{q_i}{\\sum_j q_j}\\text{, and }\\sum_i p_i = 1.$$\n", 332 | " \n", 333 | " # add a classifier:\n", 334 | " self.model.layers.append(Layer(hidden_size, vocab_size, activation = softmax))\n", 335 | " \n", 336 | "For convenience we wrap this all in one class below.\n", 337 | "\n", 338 | "#### Prediction\n", 339 | "\n", 340 | "We have now defined our network. At each timestep we can produce a probability distribution for each input index:\n", 341 | "\n", 342 | " def create_prediction(self, greedy=False):\n", 343 | " def step(idx, *states):\n", 344 | " # new hiddens are the states we need to pass to LSTMs\n", 345 | " # from past. Because the StackedCells also include\n", 346 | " # the embeddings, and those have no state, we pass\n", 347 | " # a \"None\" instead:\n", 348 | " new_hiddens = [None] + list(states)\n", 349 | "\n", 350 | " new_states = self.model.forward(idx, prev_hiddens = new_hiddens)\n", 351 | " return new_states[1:]\n", 352 | " ...\n", 353 | " \n", 354 | "Our inputs are an integer matrix Theano symbolic variable:\n", 355 | " \n", 356 | " ...\n", 357 | " # in sequence forecasting scenario we take everything\n", 358 | " # up to the before last step, and predict subsequent\n", 359 | " # steps ergo, 0 ... n - 1, hence:\n", 360 | " inputs = self.input_mat[:, 0:-1]\n", 361 | " num_examples = inputs.shape[0]\n", 362 | " # pass this to Theano's recurrence relation function:\n", 363 | " ....\n", 364 | "\n", 365 | "Scan receives our recurrence relation `step` from above, and also needs to know what will be outputted at each step in `outputs_info`. 
We give `outputs_info` a set of variables corresponding to the hidden states of our StackedCells. Some of the layers have no hidden state, and thus we should simply pass a `None` to Theano, while others do require some initial state. In those cases we wrap their initial state inside a dictionary:\n", 366 | "\n", 367 | " def has_hidden(layer):\n", 368 | " \"\"\"\n", 369 | " Whether a layer has a trainable\n", 370 | " initial hidden state.\n", 371 | " \"\"\"\n", 372 | " return hasattr(layer, 'initial_hidden_state')\n", 373 | "\n", 374 | " def matrixify(vector, n):\n", 375 | " return T.repeat(T.shape_padleft(vector), n, axis=0)\n", 376 | "\n", 377 | " def initial_state(layer, dimensions = None):\n", 378 | " \"\"\"\n", 379 | " Initalizes the recurrence relation with an initial hidden state\n", 380 | " if needed, else replaces with a \"None\" to tell Theano that\n", 381 | " the network **will** return something, but it does not need\n", 382 | " to send it to the next step of the recurrence\n", 383 | " \"\"\"\n", 384 | " if dimensions is None:\n", 385 | " return layer.initial_hidden_state if has_hidden(layer) else None\n", 386 | " else:\n", 387 | " return matrixify(layer.initial_hidden_state, dimensions) if has_hidden(layer) else None\n", 388 | "\n", 389 | " def initial_state_with_taps(layer, dimensions = None):\n", 390 | " \"\"\"Optionally wrap tensor variable into a dict with taps=[-1]\"\"\"\n", 391 | " state = initial_state(layer, dimensions)\n", 392 | " if state is not None:\n", 393 | " return dict(initial=state, taps=[-1])\n", 394 | " else:\n", 395 | " return None\n", 396 | " \n", 397 | "Let's now create these initial states (note how we skip the first layer, the embeddings, by doing `self.model.layers[1:]` in the iteration; this is because there is no point in passing these embeddings around in our recurrence, since word vectors are only seen at the timestep they are received in this network):\n", 398 | "\n", 399 | " # choose what gets outputted at each timestep:\n", 400
| " outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]\n", 401 | " result, _ = theano.scan(fn=step,\n", 402 | " sequences=[inputs.T],\n", 403 | " outputs_info=outputs_info)\n", 404 | "\n", 405 | " if greedy:\n", 406 | " return result[0]\n", 407 | " # softmaxes are the last layer of our network,\n", 408 | " # and are at the end of our results list:\n", 409 | " return result[-1].transpose((2,0,1))\n", 410 | " # we reorder the predictions to be:\n", 411 | " # 1. what row / example\n", 412 | " # 2. what timestep\n", 413 | " # 3. softmax dimension\n", 414 | "\n", 415 | "#### Error Function:\n", 416 | "\n", 417 | "Our error function uses `theano_lstm`'s `masked_loss` method. This method allows us to define ranges over which a probability distribution should obey a particular target distribution. We control this method by setting start and end points for these ranges. In doing so we mask the areas where we do not care what the network predicted.\n", 418 | "\n", 419 | "In our case our network predicts words we care about during the sentence, but when we pad our short sentences with 0s to fill our matrix, we do not care what the network does there, because this is happening outside the sentence we collected:\n", 420 | "\n", 421 | " def create_cost_fun (self):\n", 422 | " # create a cost function that\n", 423 | " # takes each prediction at every timestep\n", 424 | " # and guesses next timestep's value:\n", 425 | " what_to_predict = self.input_mat[:, 1:]\n", 426 | " # because some sentences are shorter, we\n", 427 | " # place masks where the sentences end:\n", 428 | " # (for how long is zero indexed, e.g. 
an example going from `[2,3)`)\n", 429 | " # has this value set 0 (here we substract by 1):\n", 430 | " for_how_long = self.for_how_long - 1\n", 431 | " # all sentences start at T=0:\n", 432 | " starting_when = T.zeros_like(self.for_how_long)\n", 433 | " \n", 434 | " self.cost = masked_loss(self.predictions,\n", 435 | " what_to_predict,\n", 436 | " for_how_long,\n", 437 | " starting_when).sum()\n", 438 | " \n", 439 | "#### Training Function\n", 440 | "\n", 441 | "We now have a cost function. To perform gradient descent we now need to tell Theano how each parameter must be updated at every training epoch. We use `theano_lstm`'s `create_optimization_updates` method to generate a dictionary of updates and to apply special gradient descent rules that accelerate and facilitate training (for instance scaling the gradients when they are too large or too small, and preventing gradients from becoming too big and making our model numerically unstable -- in this example we use [Adadelta](http://arxiv.org/abs/1212.5701)):\n", 442 | "\n", 443 | " def create_training_function(self):\n", 444 | " updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method=\"adadelta\")\n", 445 | " self.update_fun = theano.function(\n", 446 | " inputs=[self.input_mat, self.for_how_long],\n", 447 | " outputs=self.cost,\n", 448 | " updates=updates,\n", 449 | " allow_input_downcast=True)\n", 450 | "\n", 451 | "PS: our parameters are obtained by calling `self.model.params`:\n", 452 | "\n", 453 | " @property\n", 454 | " def params(self):\n", 455 | " return self.model.params\n", 456 | " \n", 457 | "### Final Code" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 189, 463 | "metadata": { 464 | "collapsed": false 465 | }, 466 | "outputs": [], 467 | "source": [ 468 | "from theano_lstm import Embedding, LSTM, RNN, StackedCells, Layer, create_optimization_updates, masked_loss\n", 469 | "\n", 470 | "def softmax(x):\n", 471 | " \"\"\"\n", 472 | " Wrapper for softmax, 
helps with\n", 473 | " pickling, and removing one extra\n", 474 | " dimension that Theano adds during\n", 475 | " its exponential normalization.\n", 476 | " \"\"\"\n", 477 | " return T.nnet.softmax(x.T)\n", 478 | "\n", 479 | "def has_hidden(layer):\n", 480 | " \"\"\"\n", 481 | " Whether a layer has a trainable\n", 482 | " initial hidden state.\n", 483 | " \"\"\"\n", 484 | " return hasattr(layer, 'initial_hidden_state')\n", 485 | "\n", 486 | "def matrixify(vector, n):\n", 487 | " return T.repeat(T.shape_padleft(vector), n, axis=0)\n", 488 | "\n", 489 | "def initial_state(layer, dimensions = None):\n", 490 | " \"\"\"\n", 491 | " Initalizes the recurrence relation with an initial hidden state\n", 492 | " if needed, else replaces with a \"None\" to tell Theano that\n", 493 | " the network **will** return something, but it does not need\n", 494 | " to send it to the next step of the recurrence\n", 495 | " \"\"\"\n", 496 | " if dimensions is None:\n", 497 | " return layer.initial_hidden_state if has_hidden(layer) else None\n", 498 | " else:\n", 499 | " return matrixify(layer.initial_hidden_state, dimensions) if has_hidden(layer) else None\n", 500 | " \n", 501 | "def initial_state_with_taps(layer, dimensions = None):\n", 502 | " \"\"\"Optionally wrap tensor variable into a dict with taps=[-1]\"\"\"\n", 503 | " state = initial_state(layer, dimensions)\n", 504 | " if state is not None:\n", 505 | " return dict(initial=state, taps=[-1])\n", 506 | " else:\n", 507 | " return None\n", 508 | "\n", 509 | "class Model:\n", 510 | " \"\"\"\n", 511 | " Simple predictive model for forecasting words from\n", 512 | " sequence using LSTMs. 
Choose how many LSTMs to stack\n", 513 | " what size their memory should be, and how many\n", 514 | " words can be predicted.\n", 515 | " \"\"\"\n", 516 | " def __init__(self, hidden_size, input_size, vocab_size, stack_size=1, celltype=LSTM):\n", 517 | " # declare model\n", 518 | " self.model = StackedCells(input_size, celltype=celltype, layers =[hidden_size] * stack_size)\n", 519 | " # add an embedding\n", 520 | " self.model.layers.insert(0, Embedding(vocab_size, input_size))\n", 521 | " # add a classifier:\n", 522 | " self.model.layers.append(Layer(hidden_size, vocab_size, activation = softmax))\n", 523 | " # inputs are matrices of indices,\n", 524 | " # each row is a sentence, each column a timestep\n", 525 | " self._stop_word = theano.shared(np.int32(999999999), name=\"stop word\")\n", 526 | " self.for_how_long = T.ivector()\n", 527 | " self.input_mat = T.imatrix()\n", 528 | " self.priming_word = T.iscalar()\n", 529 | " self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))\n", 530 | " # create symbolic variables for prediction:\n", 531 | " self.predictions = self.create_prediction()\n", 532 | " # create symbolic variable for greedy search:\n", 533 | " self.greedy_predictions = self.create_prediction(greedy=True)\n", 534 | " # create gradient training functions:\n", 535 | " self.create_cost_fun()\n", 536 | " self.create_training_function()\n", 537 | " self.create_predict_function()\n", 538 | " \n", 539 | " def stop_on(self, idx):\n", 540 | " self._stop_word.set_value(idx)\n", 541 | " \n", 542 | " @property\n", 543 | " def params(self):\n", 544 | " return self.model.params\n", 545 | " \n", 546 | " def create_prediction(self, greedy=False):\n", 547 | " def step(idx, *states):\n", 548 | " # new hiddens are the states we need to pass to LSTMs\n", 549 | " # from past. 
Because the StackedCells also include\n", 550 | " # the embeddings, and those have no state, we pass\n", 551 | " # a \"None\" instead:\n", 552 | " new_hiddens = [None] + list(states)\n", 553 | " \n", 554 | " new_states = self.model.forward(idx, prev_hiddens = new_hiddens)\n", 555 | " if greedy:\n", 556 | " new_idxes = new_states[-1]\n", 557 | " new_idx = new_idxes.argmax()\n", 558 | " # provide a stopping condition for greedy search:\n", 559 | " return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), theano.scan_module.until(T.eq(new_idx,self._stop_word))\n", 560 | " else:\n", 561 | " return new_states[1:]\n", 562 | " # in sequence forecasting scenario we take everything\n", 563 | " # up to the before last step, and predict subsequent\n", 564 | " # steps ergo, 0 ... n - 1, hence:\n", 565 | " inputs = self.input_mat[:, 0:-1]\n", 566 | " num_examples = inputs.shape[0]\n", 567 | " # pass this to Theano's recurrence relation function:\n", 568 | " \n", 569 | " # choose what gets outputted at each timestep:\n", 570 | " if greedy:\n", 571 | " outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]\n", 572 | " result, _ = theano.scan(fn=step,\n", 573 | " n_steps=200,\n", 574 | " outputs_info=outputs_info)\n", 575 | " else:\n", 576 | " outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]\n", 577 | " result, _ = theano.scan(fn=step,\n", 578 | " sequences=[inputs.T],\n", 579 | " outputs_info=outputs_info)\n", 580 | " \n", 581 | " if greedy:\n", 582 | " return result[0]\n", 583 | " # softmaxes are the last layer of our network,\n", 584 | " # and are at the end of our results list:\n", 585 | " return result[-1].transpose((2,0,1))\n", 586 | " # we reorder the predictions to be:\n", 587 | " # 1. what row / example\n", 588 | " # 2. what timestep\n", 589 | " # 3. 
softmax dimension\n", 590 | " \n", 591 | " def create_cost_fun (self):\n", 592 | " # create a cost function that\n", 593 | " # takes each prediction at every timestep\n", 594 | " # and guesses next timestep's value:\n", 595 | " what_to_predict = self.input_mat[:, 1:]\n", 596 | " # because some sentences are shorter, we\n", 597 | " # place masks where the sentences end:\n", 598 | " # (for how long is zero indexed, e.g. an example going from `[2,3)`)\n", 599 | " # has this value set 0 (here we substract by 1):\n", 600 | " for_how_long = self.for_how_long - 1\n", 601 | " # all sentences start at T=0:\n", 602 | " starting_when = T.zeros_like(self.for_how_long)\n", 603 | " \n", 604 | " self.cost = masked_loss(self.predictions,\n", 605 | " what_to_predict,\n", 606 | " for_how_long,\n", 607 | " starting_when).sum()\n", 608 | " \n", 609 | " def create_predict_function(self):\n", 610 | " self.pred_fun = theano.function(\n", 611 | " inputs=[self.input_mat],\n", 612 | " outputs =self.predictions,\n", 613 | " allow_input_downcast=True\n", 614 | " )\n", 615 | " \n", 616 | " self.greedy_fun = theano.function(\n", 617 | " inputs=[self.priming_word],\n", 618 | " outputs=T.concatenate([T.shape_padleft(self.priming_word), self.greedy_predictions]),\n", 619 | " allow_input_downcast=True\n", 620 | " )\n", 621 | " \n", 622 | " def create_training_function(self):\n", 623 | " updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method=\"adadelta\")\n", 624 | " self.update_fun = theano.function(\n", 625 | " inputs=[self.input_mat, self.for_how_long],\n", 626 | " outputs=self.cost,\n", 627 | " updates=updates,\n", 628 | " allow_input_downcast=True)\n", 629 | " \n", 630 | " def __call__(self, x):\n", 631 | " return self.pred_fun(x)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "### Construct model\n", 639 | "\n", 640 | "We now declare the model and parametrize it to use an RNN, and make predictions in the range 
provided by our vocabulary. We also tell the greedy reconstruction search that it can consider a sentence as being over when the symbol corresponding to a period appears:\n" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": { 647 | "collapsed": false 648 | }, 649 | "outputs": [], 650 | "source": [ 651 | "# construct model & theano functions:\n", 652 | "model = Model(\n", 653 | " input_size=10,\n", 654 | " hidden_size=10,\n", 655 | " vocab_size=len(vocab),\n", 656 | " stack_size=1, # make this bigger, but makes compilation slow\n", 657 | " celltype=RNN # use RNN or LSTM\n", 658 | ")\n", 659 | "model.stop_on(vocab.word2index[\".\"])" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "### Train Model\n", 667 | "\n", 668 | "We run 10,000 times through our data and every 500 epochs of training we output what the model considers to be a natural continuation to the sentence \"the\":\n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 191, 674 | "metadata": { 675 | "collapsed": false, 676 | "scrolled": false 677 | }, 678 | "outputs": [ 679 | { 680 | "name": "stdout", 681 | "output_type": "stream", 682 | "text": [ 683 | "epoch 0, error=3877.55\n", 684 | "the .\n", 685 | "epoch 100, error=3873.32\n", 686 | "epoch 200, error=3868.80\n", 687 | "epoch 300, error=3863.65\n", 688 | "epoch 400, error=3857.58\n", 689 | "epoch 500, error=3850.15\n", 690 | "the .\n", 691 | "epoch 600, error=3840.67\n", 692 | "epoch 700, error=3828.21\n", 693 | "epoch 800, error=3811.36\n", 694 | "epoch 900, error=3787.88\n", 695 | "epoch 1000, error=3754.51\n", 696 | "the .\n", 697 | "epoch 1100, error=3707.27\n", 698 | "epoch 1200, error=3652.82\n", 699 | "epoch 1300, error=3794.47\n", 700 | "epoch 1400, error=3633.05\n", 701 | "epoch 1500, error=3749.59\n", 702 | "the .\n", 703 | "epoch 1600, error=3622.81\n", 704 | "epoch 1700, error=3728.75\n", 705 | "epoch 1800, 
error=3615.40\n", 706 | "epoch 1900, error=3711.92\n", 707 | "epoch 2000, error=3608.67\n", 708 | "the .\n", 709 | "epoch 2100, error=3697.46\n", 710 | "epoch 2200, error=3602.14\n", 711 | "epoch 2300, error=3684.72\n", 712 | "epoch 2400, error=3595.66\n", 713 | "epoch 2500, error=3673.21\n", 714 | "the .\n", 715 | "epoch 2600, error=3589.14\n", 716 | "epoch 2700, error=3662.57\n", 717 | "epoch 2800, error=3582.49\n", 718 | "epoch 2900, error=3652.51\n", 719 | "epoch 3000, error=3575.61\n", 720 | "the .\n", 721 | "epoch 3100, error=3642.76\n", 722 | "epoch 3200, error=3568.39\n", 723 | "epoch 3300, error=3633.05\n", 724 | "epoch 3400, error=3560.71\n", 725 | "epoch 3500, error=3623.09\n", 726 | "the event .\n", 727 | "epoch 3600, error=3552.42\n", 728 | "epoch 3700, error=3612.54\n", 729 | "epoch 3800, error=3543.32\n", 730 | "epoch 3900, error=3601.00\n", 731 | "epoch 4000, error=3533.19\n", 732 | "the event .\n", 733 | "epoch 4100, error=3588.00\n", 734 | "epoch 4200, error=3521.72\n", 735 | "epoch 4300, error=3572.95\n", 736 | "epoch 4400, error=3508.52\n", 737 | "epoch 4500, error=3555.13\n", 738 | "the event .\n", 739 | "epoch 4600, error=3493.12\n", 740 | "epoch 4700, error=3533.71\n", 741 | "epoch 4800, error=3474.91\n", 742 | "epoch 4900, error=3507.69\n", 743 | "epoch 5000, error=3453.10\n", 744 | "the event .\n", 745 | "epoch 5100, error=3476.03\n", 746 | "epoch 5200, error=3426.79\n", 747 | "epoch 5300, error=3437.64\n", 748 | "epoch 5400, error=3394.89\n", 749 | "epoch 5500, error=3391.61\n", 750 | "the event .\n", 751 | "epoch 5600, error=3356.28\n", 752 | "epoch 5700, error=3337.37\n", 753 | "epoch 5800, error=3309.92\n", 754 | "epoch 5900, error=3274.99\n", 755 | "epoch 6000, error=3255.30\n", 756 | "the event .\n", 757 | "epoch 6100, error=3205.48\n", 758 | "epoch 6200, error=3192.82\n", 759 | "epoch 6300, error=3130.87\n", 760 | "epoch 6400, error=3124.29\n", 761 | "epoch 6500, error=3053.95\n", 762 | "the event stole , .\n", 763 | "epoch 6600, 
error=3052.72\n", 764 | "epoch 6700, error=2977.69\n", 765 | "epoch 6800, error=2981.38\n", 766 | "epoch 6900, error=2904.48\n", 767 | "epoch 7000, error=2912.80\n", 768 | "the event carried , .\n", 769 | "epoch 7100, error=2836.22\n", 770 | "epoch 7200, error=2848.95\n", 771 | "epoch 7300, error=2774.26\n", 772 | "epoch 7400, error=2790.40\n", 773 | "epoch 7500, error=2719.00\n", 774 | "the event carried , the wrangler ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the calendar ran , the\n", 775 | "epoch 7600, error=2737.30\n", 776 | "epoch 7700, error=2670.22\n", 777 | "epoch 7800, error=2689.21\n", 778 | "epoch 7900, error=2627.33\n", 779 | "epoch 8000, error=2645.85\n", 780 | "the event carried , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a 
cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a cat carry , a\n", 781 | "epoch 8100, error=2589.56\n", 782 | "epoch 8200, error=2607.03\n", 783 | "epoch 8300, error=2556.31\n", 784 | "epoch 8400, error=2572.67\n", 785 | "epoch 8500, error=2527.13\n", 786 | "the event carried , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the\n", 787 | "epoch 8600, error=2542.56\n", 788 | "epoch 8700, error=2501.63\n", 789 | "epoch 8800, error=2516.40\n", 790 | "epoch 8900, error=2479.40\n", 791 | "epoch 9000, error=2493.71\n", 792 | "the event carried , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a 
cat jump , a cat jump , a cat jump , a cat jump , a cat jump , a\n", 793 | "epoch 9100, error=2459.98\n", 794 | "epoch 9200, error=2473.99\n", 795 | "epoch 9300, error=2442.94\n", 796 | "epoch 9400, error=2456.79\n", 797 | "epoch 9500, error=2427.89\n", 798 | "the event carried , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the ship ran , the\n", 799 | "epoch 9600, error=2441.67\n", 800 | "epoch 9700, error=2414.49\n", 801 | "epoch 9800, error=2428.29\n", 802 | "epoch 9900, error=2402.47\n" 803 | ] 804 | } 805 | ], 806 | "source": [ 807 | "# train:\n", 808 | "for i in range(10000):\n", 809 | " error = model.update_fun(numerical_lines, numerical_lengths)\n", 810 | " if i % 100 == 0:\n", 811 | " print(\"epoch %(epoch)d, error=%(error).2f\" % ({\"epoch\": i, \"error\": error}))\n", 812 | " if i % 500 == 0:\n", 813 | " print(vocab(model.greedy_fun(vocab.word2index[\"the\"])))" 814 | ] 815 | } 816 | ], 817 | "metadata": { 818 | "kernelspec": { 819 | "display_name": "Python 3", 820 | "language": "python", 821 | "name": "python3" 822 | }, 823 | "language_info": { 824 | "codemirror_mode": { 825 | "name": "ipython", 826 | "version": 3 827 | }, 828 | "file_extension": ".py", 829 | "mimetype": "text/x-python", 830 | "name": "python", 831 | "nbconvert_exporter": "python", 832 | "pygments_lexer": "ipython3", 833 | 
"version": "3.4.3" 834 | } 835 | }, 836 | "nbformat": 4, 837 | "nbformat_minor": 0 838 | } 839 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | def readfile(fname): 5 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 6 | 7 | setup( 8 | name='theano-lstm', 9 | version='0.0.14', 10 | description='Nano size theano lstm module', 11 | long_description=readfile('README.md'), 12 | ext_modules=[], 13 | packages=find_packages(), 14 | py_modules = [], 15 | author='Jonathan Raiman', 16 | author_email='jraiman at mit dot edu', 17 | url='https://github.com/JonathanRaiman/theano_lstm', 18 | download_url='https://github.com/JonathanRaiman/theano_lstm', 19 | keywords='Gradient Descent, Theano, LSTM, neural networks', 20 | license='MIT', 21 | platforms='any', 22 | zip_safe=False, 23 | classifiers=[ 24 | 'Intended Audience :: Science/Research', 25 | 'Operating System :: OS Independent', 26 | 'Programming Language :: Python :: 3.3', 27 | 'Programming Language :: Python :: 2.7', 28 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 29 | 'Topic :: Scientific/Engineering :: Mathematics' 30 | ], 31 | setup_requires = [], 32 | install_requires=[ 33 | 'theano', 34 | 'numpy' 35 | ], 36 | include_package_data=True, 37 | ) 38 | -------------------------------------------------------------------------------- /theano_lstm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small Theano LSTM recurrent network module. 3 | 4 | @author: Jonathan Raiman 5 | @date: December 10th 2014 6 | 7 | Implements most of the great things that came out 8 | in 2014 concerning recurrent neural networks, and 9 | some good optimizers for these types of networks. 
10 | 11 | Note (from 5 January 2015): Dropout api is a bit sophisticated due to the way 12 | random number generators are dealt with in Theano's scan. 13 | 14 | """ 15 | 16 | import theano, theano.tensor as T 17 | import numpy as np 18 | from collections import OrderedDict 19 | 20 | srng = theano.tensor.shared_randomstreams.RandomStreams(1234) 21 | np_rng = np.random.RandomState(1234) 22 | 23 | from .masked_loss import masked_loss, masked_loss_dx 24 | from .shared_memory import wrap_params, borrow_memory, borrow_all_memories 25 | 26 | class GradClip(theano.compile.ViewOp): 27 | """ 28 | Here we clip the gradients as Alex Graves does in his 29 | recurrent neural networks. In particular this prevents 30 | explosion of gradients during backpropagation. 31 | 32 | The original poster of this code was Alex Lamb, 33 | [here](https://groups.google.com/forum/#!topic/theano-dev/GaJwGw6emK0). 34 | 35 | """ 36 | 37 | def __init__(self, clip_lower_bound, clip_upper_bound): 38 | self.clip_lower_bound = clip_lower_bound 39 | self.clip_upper_bound = clip_upper_bound 40 | assert(self.clip_upper_bound >= self.clip_lower_bound) 41 | 42 | def grad(self, args, g_outs): 43 | return [T.clip(g_out, self.clip_lower_bound, self.clip_upper_bound) for g_out in g_outs] 44 | 45 | 46 | def clip_gradient(x, bound): 47 | grad_clip = GradClip(-bound, bound) 48 | try: 49 | T.opt.register_canonicalize(theano.gof.OpRemove(grad_clip), name='grad_clip_%.1f' % (bound)) 50 | except ValueError: 51 | pass 52 | return grad_clip(x) 53 | 54 | 55 | def create_shared(out_size, in_size=None, name=None): 56 | """ 57 | Creates a shared matrix or vector 58 | using the given in_size and out_size. 59 | 60 | Inputs 61 | ------ 62 | 63 | out_size int : outer dimension of the 64 | vector or matrix 65 | in_size int (optional) : for a matrix, the inner 66 | dimension. 
67 | 68 | Outputs 69 | ------- 70 | 71 | theano shared : the shared matrix, with random numbers in it 72 | 73 | """ 74 | 75 | if in_size is None: 76 | return theano.shared(random_initialization((out_size, )), name=name) 77 | else: 78 | return theano.shared(random_initialization((out_size, in_size)), name=name) 79 | 80 | 81 | def random_initialization(size): 82 | return (np_rng.standard_normal(size) * 1. / size[0]).astype(theano.config.floatX) 83 | 84 | 85 | def Dropout(shape, prob): 86 | """ 87 | Return a dropout mask on x. 88 | 89 | The probability of a value in x going to zero is prob. 90 | 91 | Inputs 92 | ------ 93 | 94 | x theano variable : the variable to add noise to 95 | prob float, variable : probability of dropping an element. 96 | size tuple(int, int) : size of the dropout mask. 97 | 98 | 99 | Outputs 100 | ------- 101 | 102 | y theano variable : x with the noise multiplied. 103 | 104 | """ 105 | 106 | mask = srng.binomial(n=1, p=1-prob, size=shape) 107 | return T.cast(mask, theano.config.floatX) 108 | 109 | 110 | def MultiDropout(shapes, dropout = 0.): 111 | """ 112 | Return all the masks needed for dropout outside of a scan loop. 113 | """ 114 | return [Dropout(shape, dropout) for shape in shapes] 115 | 116 | 117 | class Layer(object): 118 | """ 119 | Base object for neural network layers. 120 | 121 | A layer has an input set of neurons, and 122 | a hidden activation. The activation, f, is a 123 | function applied to the affine transformation 124 | of x by the connection matrix W, and the bias 125 | vector b. 
126 | 127 | > y = f ( W * x + b ) 128 | 129 | """ 130 | 131 | def __init__(self, input_size, hidden_size, activation, clip_gradients=False): 132 | self.input_size = input_size 133 | self.hidden_size = hidden_size 134 | self.activation = activation 135 | self.clip_gradients = clip_gradients 136 | self.is_recursive = False 137 | self.create_variables() 138 | 139 | def create_variables(self): 140 | """ 141 | Create the connection matrix and the bias vector 142 | """ 143 | self.linear_matrix = create_shared(self.hidden_size, self.input_size, name="Layer.linear_matrix") 144 | self.bias_matrix = create_shared(self.hidden_size, name="Layer.bias_matrix") 145 | 146 | def activate(self, x): 147 | """ 148 | The hidden activation of the network 149 | """ 150 | if self.clip_gradients is not False: 151 | x = clip_gradient(x, self.clip_gradients) 152 | 153 | if x.ndim > 1: 154 | return self.activation( 155 | T.dot(self.linear_matrix, x.T) + self.bias_matrix[:,None] ).T 156 | else: 157 | return self.activation( 158 | T.dot(self.linear_matrix, x) + self.bias_matrix ) 159 | 160 | @property 161 | def params(self): 162 | return [self.linear_matrix, self.bias_matrix] 163 | 164 | @params.setter 165 | def params(self, param_list): 166 | self.linear_matrix.set_value(param_list[0].get_value()) 167 | self.bias_matrix.set_value(param_list[1].get_value()) 168 | 169 | 170 | class Embedding(Layer): 171 | """ 172 | A Matrix useful for storing word vectors or other distributed 173 | representations. 174 | 175 | use #activate(T.iscalar()) or #activate(T.ivector()) to embed 176 | a symbol. 177 | """ 178 | def __init__(self, vocabulary_size, hidden_size): 179 | """ 180 | Vocabulary size is the number of different symbols to store, 181 | and hidden_size is the size of their embedding. 
182 | """ 183 | self.vocabulary_size = vocabulary_size 184 | self.hidden_size = hidden_size 185 | self.create_variables() 186 | self.is_recursive = False 187 | 188 | def create_variables(self): 189 | self.embedding_matrix = create_shared(self.vocabulary_size, self.hidden_size, name='Embedding.embedding_matrix') 190 | 191 | def activate(self, x): 192 | """ 193 | Inputs 194 | ------ 195 | 196 | x T.ivector() or T.iscalar() : indices to embed 197 | 198 | Output 199 | ------ 200 | 201 | embedding : self.embedding_matrix[x] 202 | 203 | """ 204 | 205 | return self.embedding_matrix[x] 206 | 207 | @property 208 | def params(self): 209 | return [self.embedding_matrix] 210 | 211 | @params.setter 212 | def params(self, param_list): 213 | self.embedding_matrix.set_value(param_list[0].get_value()) 214 | 215 | 216 | class RNN(Layer): 217 | """ 218 | Special recurrent layer than takes as input 219 | a hidden activation, h, from the past and 220 | an observation x. 221 | 222 | > y = f ( W * [x, h] + b ) 223 | 224 | Note: x and h are concatenated in the activation. 225 | 226 | """ 227 | def __init__(self, *args, **kwargs): 228 | super(RNN, self).__init__(*args, **kwargs) 229 | self.is_recursive = True 230 | 231 | def create_variables(self): 232 | """ 233 | Create the connection matrix and the bias vector, 234 | and the base hidden activation. 
235 | 236 | """ 237 | self.linear_matrix = create_shared(self.hidden_size, self.input_size+ self.hidden_size, name="RNN.linear_matrix") 238 | self.bias_matrix = create_shared(self.hidden_size, name="RNN.bias_matrix") 239 | self.initial_hidden_state = create_shared(self.hidden_size, name="RNN.initial_hidden_state") 240 | 241 | def activate(self, x, h): 242 | """ 243 | The hidden activation of the network 244 | """ 245 | if self.clip_gradients is not False: 246 | x = clip_gradient(x, self.clip_gradients) 247 | h = clip_gradient(h, self.clip_gradients) 248 | if x.ndim > 1: 249 | return self.activation( 250 | T.dot( 251 | self.linear_matrix, 252 | T.concatenate([x, h], axis=1).T 253 | ) + self.bias_matrix[:,None] ).T 254 | else: 255 | return self.activation( 256 | T.dot( 257 | self.linear_matrix, 258 | T.concatenate([x, h]) 259 | ) + self.bias_matrix ) 260 | 261 | @property 262 | def params(self): 263 | return [self.linear_matrix, self.bias_matrix] 264 | 265 | @params.setter 266 | def params(self, param_list): 267 | self.linear_matrix.set_value(param_list[0].get_value()) 268 | self.bias_matrix.set_value(param_list[1].get_value()) 269 | 270 | class GRU(RNN): 271 | def create_variables(self): 272 | self.reset_layer = theano_lstm.RNN(self.input_size, self.hidden_size, activation = T.nnet.sigmoid) 273 | self.memory_interpolation_layer = theano_lstm.RNN(self.input_size, self.hidden_size, activation = T.nnet.sigmoid) 274 | self.memory_to_memory_layer = theano_lstm.RNN(self.input_size, self.hidden_size, activation = T.tanh) 275 | self.internal_layers = [ 276 | self.reset_layer, 277 | self.memory_interpolation_layer, 278 | self.memory_to_memory_layer 279 | ] 280 | 281 | @property 282 | def params(self): 283 | return [param for layer in self.internal_layers for param in layer.params] 284 | 285 | @params.setter 286 | def params(self, param_list): 287 | assert(len(param_list) == 6) 288 | self.reset_layer.params = param_list[0:2] 289 | self.memory_interpolation_layer.params = 
param_list[2:4] 290 | self.memory_to_memory_layer.params = param_list[4:6] 291 | 292 | def activate(self, x, h): 293 | reset_gate = self.reset_layer.activate( 294 | x, 295 | h 296 | ) 297 | 298 | # the new state dampened by resetting 299 | reset_h = reset_gate * h; 300 | 301 | # the new hidden state: 302 | candidate_h = self.memory_to_memory_layer.activate( 303 | x, 304 | reset_h 305 | ) 306 | 307 | # how much to update the new hidden state: 308 | update_gate = self.memory_interpolation_layer.activate( 309 | x, 310 | h 311 | ) 312 | 313 | # the new state interploated between candidate and old: 314 | new_h = ( 315 | h * (1.0 - update_gate) + 316 | candidate_h * update_gate 317 | ) 318 | return new_h 319 | 320 | class LSTM(RNN): 321 | """ 322 | The structure of the LSTM allows it to learn on problems with 323 | long term dependencies relatively easily. The "long term" 324 | memory is stored in a vector of memory cells c. 325 | Although many LSTM architectures differ in their connectivity 326 | structure and activation functions, all LSTM architectures have 327 | memory cells that are suitable for storing information for long 328 | periods of time. Here we implement the LSTM from Graves et al. 329 | (2013). 330 | """ 331 | 332 | def create_variables(self): 333 | """ 334 | Create the different LSTM gates and 335 | their variables, along with the initial 336 | hidden state for the memory cells and 337 | the initial hidden activation. 
338 | 339 | """ 340 | # input gate for cells 341 | self.in_gate = Layer(self.input_size + self.hidden_size, self.hidden_size, T.nnet.sigmoid, self.clip_gradients) 342 | # forget gate for cells 343 | self.forget_gate = Layer(self.input_size + self.hidden_size, self.hidden_size, T.nnet.sigmoid, self.clip_gradients) 344 | # input modulation for cells 345 | self.in_gate2 = Layer(self.input_size + self.hidden_size, self.hidden_size, self.activation, self.clip_gradients) 346 | # output modulation 347 | self.out_gate = Layer(self.input_size + self.hidden_size, self.hidden_size, T.nnet.sigmoid, self.clip_gradients) 348 | 349 | # keep these layers organized 350 | self.internal_layers = [self.in_gate, self.forget_gate, self.in_gate2, self.out_gate] 351 | 352 | # store the memory cells in first n spots, and store the current 353 | # output in the next n spots: 354 | self.initial_hidden_state = create_shared(self.hidden_size * 2, name="LSTM.initial_hidden_state") 355 | @property 356 | def params(self): 357 | """ 358 | Parameters given by the 4 gates and the 359 | initial hidden activation of this LSTM cell 360 | layer. 
def activate(self, x, h):
    """
    Run one LSTM step.

    The recurrent state `h` packs the previous memory cells and the
    previous hidden activations side by side:

    > past = [prev_c, prev_h]

    and the value returned uses the same layout:

    > [c, h] = f( x, [prev_c, prev_h] )

    When `h` has more than one dimension the split and the
    concatenations are done along axis 1, otherwise along axis 0.
    """
    n = self.hidden_size
    batched = h.ndim > 1

    # first n entries: memory cells, remaining entries: hidden activations
    if batched:
        prev_c, prev_h = h[:, :n], h[:, n:]
    else:
        prev_c, prev_h = h[:n], h[n:]

    join_axis = 1 if batched else 0

    # the gates observe the input together with the previous hidden output
    obs = T.concatenate([x, prev_h], axis=join_axis)

    # TODO could we combine these 4 linear transformations for efficiency?
    # (e.g., http://arxiv.org/pdf/1410.4615.pdf, page 5)

    # how much new content is written to the memory cells
    in_gate = self.in_gate.activate(obs)

    # how much of the current memory content is kept
    forget_gate = self.forget_gate.activate(obs)

    # the content proposed for the memory cells
    in_gate2 = self.in_gate2.activate(obs)

    # updated memory cells
    next_c = forget_gate * prev_c + in_gate2 * in_gate

    # how much of the memory is exposed as output
    out_gate = self.out_gate.activate(obs)

    # updated hidden output
    next_h = out_gate * T.tanh(next_c)

    return T.concatenate([next_c, next_h], axis=join_axis)
def apply_dropout(x, mask):
    """
    Multiply `x` by a dropout `mask`.

    A `mask` of None means "no dropout": `x` is returned untouched.
    Any other mask (including a falsy one such as 0) is applied as
    `mask * x`.
    """
    if mask is None:
        return x
    return mask * x
def create_layers(self, layer_sizes, activation_type, celltype):
    """
    Build the stack of layers and store it in `self.layers`.

    One `celltype` instance is created per entry of `layer_sizes`.
    The first layer reads `self.input_size` inputs; every later layer
    reads as many inputs as the layer below it produces. All layers
    share `activation_type` and `self.clip_gradients`.
    """
    stack = self.layers = []
    fan_in = self.input_size
    for fan_out in layer_sizes:
        stack.append(
            celltype(fan_in, fan_out, activation_type,
                     clip_gradients=self.clip_gradients)
        )
        # the next layer consumes this layer's output
        fan_in = fan_out
def create_optimization_updates(cost, params, updates=None, max_norm=5.0,
                                lr=0.01, eps=1e-6, rho=0.95,
                                method="adadelta", gradients=None):
    """
    Build the update rules for a gradient descent optimizer
    (SGD, AdaGrad or AdaDelta) ready to be compiled into a
    theano function.

    Inputs
    ------

    cost     theano variable : what to minimize
    params   list            : shared variables with respect to
                               which the gradient is taken
    updates  dict            : existing updates to extend; a new
                               OrderedDict is created when None
    max_norm float           : cap on the L2 norm of each
                               parameter's gradient; None or False
                               disables clipping
    lr       float           : base learning rate for adagrad
                               and SGD
    eps      float           : small constant that keeps divisions
                               and square roots away from zero
    rho      float           : adadelta decay hyperparameter
    method   str             : 'adagrad' or 'adadelta'; any other
                               value selects plain SGD
    gradients list           : precomputed gradients, one per
                               parameter; T.grad(cost, params) is
                               used when None

    Outputs:
    --------

    updates  OrderedDict   : the updates to pass to a theano
                             function
    gsums    list          : squared gradient caches (adagrad and
                             adadelta), None entries otherwise
    xsums    list          : squared step caches (adadelta only),
                             None entries otherwise
    lr       theano shared : the learning rate, or rho when the
                             method is adadelta
    max_norm theano shared : the clipping threshold, or the value
                             passed in when clipping is disabled

    """
    floatX = theano.config.floatX

    def as_floatX(value):
        # scalar converted to the configured float precision
        return np.float64(value).astype(floatX)

    def zeros_for(param):
        # shared accumulator with the same shape and dtype as `param`
        return theano.shared(np.zeros_like(param.get_value(borrow=True)))

    lr = theano.shared(as_floatX(lr))
    eps = as_floatX(eps)
    rho = theano.shared(as_floatX(rho))

    clip = max_norm is not None and max_norm is not False
    if clip:
        max_norm = theano.shared(as_floatX(max_norm))

    is_adadelta = method == 'adadelta'
    is_adagrad = method == 'adagrad'

    gsums = [zeros_for(p) if (is_adadelta or is_adagrad) else None
             for p in params]
    xsums = [zeros_for(p) if is_adadelta else None
             for p in params]

    if gradients is None:
        gparams = T.grad(cost, params)
    else:
        gparams = gradients

    if updates is None:
        updates = OrderedDict()

    for g, param, gsum, xsum in zip(gparams, params, gsums, xsums):
        if clip:
            # rescale this parameter's gradient when its norm is too big
            norm = g.norm(L=2)
            g = (T.minimum(max_norm, norm) / (norm + eps)) * g

        if is_adadelta:
            updates[gsum] = T.cast(rho * gsum + (1. - rho) * (g ** 2), floatX)
            # the step uses the freshly updated gradient cache
            step = -T.sqrt((xsum + eps) / (updates[gsum] + eps)) * g
            updates[xsum] = T.cast(rho * xsum + (1. - rho) * (step ** 2), floatX)
            updates[param] = T.cast(param + step, floatX)
        elif is_adagrad:
            updates[gsum] = T.cast(gsum + (g ** 2), floatX)
            updates[param] = T.cast(
                param - lr * (g / (T.sqrt(updates[gsum] + eps))), floatX)
        else:
            updates[param] = param - g * lr

    # adadelta has no learning rate: its tunable knob is rho
    returned_rate = rho if is_adadelta else lr

    return updates, gsums, xsums, returned_rate, max_norm
def perform(self, node, input_storage, output_storage):
    """
    Python implementation of the masked loss gradient.

    Inputs (in `input_storage`)
    ---------------------------

    softmaxes    : 3-d float array indexed as [example, step, label]
    y_idxes      : 2-d integer array of target labels, [example, step]
    y_lengths    : 1-d integer array, number of scored steps per example
    y_startidxes : 1-d integer array, first scored step per example
    g_costs      : 1-d float array, upstream gradient per example

    Output
    ------

    An array shaped like `softmaxes`, written to `output_storage[0][0]`.
    It is zero everywhere except at the scored (step, target label)
    positions.

    The per-example cost is -sum(log(softmax[target])), whose derivative
    with respect to a scored probability p is -1 / p. The chain rule
    therefore gives -g_cost / p. The previous expression,
    -1 / (p * g_cost), is only correct when g_cost is +1 or -1 and
    produces infinities when g_cost is 0.

    NOTE(review): c_code in this class still evaluates
    -1 / (softmax * g_cost + eps); it needs the same correction so that
    the C and Python implementations agree.
    """
    softmaxes, y_idxes, y_lengths, y_startidxes, g_costs = input_storage

    dx = np.zeros_like(softmaxes)
    for i in range(y_lengths.shape[0]):
        start = y_startidxes[i]
        stop = start + y_lengths[i]
        steps = np.arange(start, stop)
        labels = y_idxes[i, start:stop]
        # d(-log p)/dp = -1/p, scaled by the upstream gradient
        dx[i, steps, labels] -= g_costs[i] / softmaxes[i, steps, labels]

    output_storage[0][0] = dx
NPY_DOUBLE) && 77 | (PyArray_TYPE(%(g_costs)s) != NPY_FLOAT)) 78 | { 79 | PyErr_SetString(PyExc_TypeError, 80 | "g_costs type should be float32 or float64"); 81 | %(fail)s; 82 | } 83 | if ((PyArray_TYPE(%(softmaxes)s) != NPY_DOUBLE) && 84 | (PyArray_TYPE(%(softmaxes)s) != NPY_FLOAT)) 85 | { 86 | PyErr_SetString(PyExc_TypeError, 87 | "softmaxes type should be float32 or float64"); 88 | %(fail)s; 89 | } 90 | if ((PyArray_NDIM(%(g_costs)s) != 1) 91 | || (PyArray_NDIM(%(softmaxes)s) != 3) 92 | || (PyArray_NDIM(%(y_idxes)s) != 2) 93 | || (PyArray_NDIM(%(y_lengths)s) != 1) 94 | || (PyArray_NDIM(%(y_startidxes)s) != 1)) 95 | { 96 | PyErr_SetString(PyExc_ValueError, "rank error"); 97 | %(fail)s; 98 | } 99 | if (PyArray_DIMS(%(g_costs)s)[0] != PyArray_DIMS(%(softmaxes)s)[0]) 100 | { 101 | PyErr_Format(PyExc_ValueError, 102 | "g_costs.shape[0] (%%ld) != softmaxes.shape[0] (%%ld)", 103 | (long int)PyArray_DIMS(%(g_costs)s)[0], 104 | (long int)PyArray_DIMS(%(softmaxes)s)[0]); 105 | %(fail)s; 106 | } 107 | if (PyArray_DIMS(%(g_costs)s)[0] != PyArray_DIMS(%(y_idxes)s)[0]) 108 | { 109 | PyErr_Format(PyExc_ValueError, 110 | "g_costs.shape[0] (%%ld) != y_idxes.shape[0] (%%ld)", 111 | (long int)PyArray_DIMS(%(g_costs)s)[0], 112 | (long int)PyArray_DIMS(%(y_idxes)s)[0]); 113 | %(fail)s; 114 | } 115 | if ((NULL == %(dx)s) 116 | || (PyArray_DIMS(%(dx)s)[0] != PyArray_DIMS(%(softmaxes)s)[0]) 117 | || (PyArray_DIMS(%(dx)s)[1] != PyArray_DIMS(%(softmaxes)s)[1]) 118 | || (PyArray_DIMS(%(dx)s)[2] != PyArray_DIMS(%(softmaxes)s)[2])) 119 | { 120 | if (NULL != %(dx)s) Py_XDECREF(%(dx)s); 121 | %(dx)s = (PyArrayObject*) PyArray_Zeros(3, 122 | PyArray_DIMS(%(softmaxes)s), 123 | PyArray_DescrFromType(%(out_typenum)s), 0); 124 | if(!%(dx)s) { 125 | PyErr_SetString(PyExc_MemoryError, 126 | "failed to alloc dx output"); 127 | %(fail)s 128 | } 129 | } 130 | 131 | 132 | 133 | // for all examples index i is used 134 | for (size_t i = 0; i < PyArray_DIMS(%(y_lengths)s)[0]; ++i) 135 | { 136 | const 
dtype_%(softmaxes)s eps = (dtype_%(softmaxes)s)1e-9; 137 | 138 | // the temporal slice size for updates is given by the stride 139 | // length of dx along its second dimension 140 | npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s); 141 | npy_intp Ssm = PyArray_STRIDES(%(softmaxes)s)[1]/sizeof(dtype_%(softmaxes)s); 142 | 143 | // the distribution slice size for updates: 144 | npy_intp Sdx_dist = PyArray_STRIDES(%(dx)s)[2]/sizeof(dtype_%(dx)s); 145 | npy_intp Ssm_dist = PyArray_STRIDES(%(softmaxes)s)[2]/sizeof(dtype_%(softmaxes)s); 146 | 147 | // stride size for each example: 148 | npy_intp g_cost_stride = PyArray_STRIDES(%(g_costs)s)[0]; 149 | npy_intp dx_stride = PyArray_STRIDES(%(dx)s)[0]; 150 | npy_intp softmax_stride = PyArray_STRIDES(%(softmaxes)s)[0]; 151 | npy_intp y_idxes_stride = PyArray_STRIDES(%(y_idxes)s)[0]; 152 | npy_intp y_startidxes_stride = PyArray_STRIDES(%(y_startidxes)s)[0]; 153 | npy_intp y_lengths_stride = PyArray_STRIDES(%(y_lengths)s)[0]; 154 | 155 | npy_intp y_idxes_temp_stride = PyArray_STRIDES(%(y_idxes)s)[1]/sizeof(dtype_%(y_idxes)s); 156 | 157 | 158 | // slices for example i: 159 | dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_BYTES(%(dx)s) + dx_stride * i); 160 | dtype_%(y_idxes) s* __restrict__ idxes_i = (dtype_%(y_idxes)s*)(PyArray_BYTES(%(y_idxes)s) + y_idxes_stride * i); 161 | const dtype_%(softmaxes)s* __restrict__ softmaxes_i = (dtype_%(softmaxes)s*)(PyArray_BYTES(%(softmaxes)s) + softmax_stride * i); 162 | const dtype_%(g_costs)s g_costs_i = ((dtype_%(g_costs)s*)(PyArray_BYTES(%(g_costs)s) + g_cost_stride * i))[0]; 163 | const dtype_%(y_lengths) s y_lengths_i = ((dtype_%(y_lengths)s*)(PyArray_BYTES(%(y_lengths)s) + y_lengths_stride * i))[0]; 164 | const dtype_%(y_startidxes) s y_startidxes_i = ((dtype_%(y_startidxes)s*)(PyArray_BYTES(%(y_startidxes)s) + y_startidxes_stride * i))[0]; 165 | 166 | for (size_t j = 0 ; j < y_lengths_i; ++j) 167 | { 168 | if (idxes_i[(y_startidxes_i + j) * 
def perform(self, node, input_storage, output_storage):
    """
    Python implementation of the masked loss.

    For every example the cost is the negated sum of the log of the
    entries of `softmaxes` selected by the target labels in `y_idxes`,
    restricted to the `y_lengths[i]` steps starting at
    `y_startidxes[i]`. The resulting 1-d array (one cost per example,
    same dtype as `softmaxes`) is written to `output_storage[0][0]`.
    """
    softmaxes, y_idxes, y_lengths, y_startidxes = input_storage

    n_examples = y_lengths.shape[0]
    costs = np.zeros(n_examples, dtype=softmaxes.dtype)

    for row in range(n_examples):
        first = y_startidxes[row]
        last = first + y_lengths[row]
        # probability given to the target label at each scored step
        picked = softmaxes[row, np.arange(first, last), y_idxes[row, first:last]]
        costs[row] -= np.log(picked).sum()

    output_storage[0][0] = costs
(PyArray_NDIM(%(y_idxes)s) != 2) 219 | || (PyArray_NDIM(%(y_lengths)s) != 1) 220 | || (PyArray_NDIM(%(y_startidxes)s) != 1)) 221 | { 222 | PyErr_SetString(PyExc_ValueError, "rank error"); 223 | %(fail)s; 224 | } 225 | if (PyArray_DIMS(%(softmaxes)s)[0] != PyArray_DIMS(%(y_lengths)s)[0]) 226 | { 227 | PyErr_Format(PyExc_ValueError, 228 | "softmaxes.shape[0] (%%ld) != y_lengths.shape[0] (%%ld)", 229 | (long int)PyArray_DIMS(%(softmaxes)s)[0], 230 | (long int)PyArray_DIMS(%(y_lengths)s)[0]); 231 | %(fail)s; 232 | } 233 | if (PyArray_DIMS(%(softmaxes)s)[0] != PyArray_DIMS(%(y_startidxes)s)[0]) 234 | { 235 | PyErr_Format(PyExc_ValueError, 236 | "softmaxes.shape[0] (%%ld) != y_startidxes.shape[0] (%%ld)", 237 | (long int)PyArray_DIMS(%(softmaxes)s)[0], 238 | (long int)PyArray_DIMS(%(y_startidxes)s)[0]); 239 | %(fail)s; 240 | } 241 | if (PyArray_DIMS(%(softmaxes)s)[0] != PyArray_DIMS(%(y_idxes)s)[0]) 242 | { 243 | PyErr_Format(PyExc_ValueError, 244 | "softmaxes.shape[0] (%%ld) != y_idxes.shape[0] (%%ld)", 245 | (long int)PyArray_DIMS(%(softmaxes)s)[0], 246 | (long int)PyArray_DIMS(%(y_idxes)s)[0]); 247 | %(fail)s; 248 | } 249 | if ((NULL == %(errors)s) 250 | || (PyArray_DIMS(%(errors)s)[0] != PyArray_DIMS(%(softmaxes)s)[0])) 251 | { 252 | if (NULL != %(errors)s) Py_XDECREF(%(errors)s); 253 | %(errors)s = (PyArrayObject*) PyArray_Zeros(1, 254 | PyArray_DIMS(%(softmaxes)s), 255 | PyArray_DescrFromType(%(out_typenum)s), 0); 256 | if(!%(errors)s) { 257 | PyErr_SetString(PyExc_MemoryError, 258 | "failed to alloc errors output"); 259 | %(fail)s 260 | } 261 | } 262 | 263 | // for all examples index i is used 264 | for (size_t i = 0; i < PyArray_DIMS(%(y_lengths)s)[0]; ++i) 265 | { 266 | 267 | // the temporal slice size for updates is given by the stride 268 | // length of dx along its second dimension 269 | npy_intp Ssm = PyArray_STRIDES(%(softmaxes)s)[1]/sizeof(dtype_%(softmaxes)s); 270 | 271 | // the distribution slice size for updates: 272 | npy_intp Ssm_dist = 
PyArray_STRIDES(%(softmaxes)s)[2]/sizeof(dtype_%(softmaxes)s); 273 | 274 | // stride size for each example: 275 | npy_intp error_stride = PyArray_STRIDES(%(errors)s)[0]; 276 | npy_intp softmax_stride = PyArray_STRIDES(%(softmaxes)s)[0]; 277 | npy_intp y_idxes_stride = PyArray_STRIDES(%(y_idxes)s)[0]; 278 | npy_intp y_startidxes_stride = PyArray_STRIDES(%(y_startidxes)s)[0]; 279 | npy_intp y_lengths_stride = PyArray_STRIDES(%(y_lengths)s)[0]; 280 | 281 | npy_intp y_idxes_temp_stride = PyArray_STRIDES(%(y_idxes)s)[1]/sizeof(dtype_%(y_idxes)s); 282 | 283 | 284 | // slices for example i: 285 | dtype_%(errors) s* __restrict__ errors_i = (dtype_%(errors)s*)(PyArray_BYTES(%(errors)s) + error_stride * i); 286 | dtype_%(y_idxes) s* __restrict__ idxes_i = (dtype_%(y_idxes)s*)(PyArray_BYTES(%(y_idxes)s) + y_idxes_stride * i); 287 | const dtype_%(softmaxes)s* __restrict__ softmaxes_i = (dtype_%(softmaxes)s*)(PyArray_BYTES(%(softmaxes)s) + softmax_stride * i); 288 | const dtype_%(y_lengths) s y_lengths_i = ((dtype_%(y_lengths)s*)(PyArray_BYTES(%(y_lengths)s) + y_lengths_stride * i))[0]; 289 | const dtype_%(y_startidxes) s y_startidxes_i = ((dtype_%(y_startidxes)s*)(PyArray_BYTES(%(y_startidxes)s) + y_startidxes_stride * i))[0]; 290 | 291 | for (size_t j = 0 ; j < y_lengths_i; ++j) { 292 | if (idxes_i[(y_startidxes_i + j) * y_idxes_temp_stride] < 0 || idxes_i[(y_startidxes_i + j) * y_idxes_temp_stride] >= PyArray_DIMS(%(softmaxes)s)[2]) { 293 | PyErr_Format(PyExc_ValueError, 294 | "Softmax Index for KL Divergence is out of range ( %%ld not in [0, %%ld]", 295 | (long int)idxes_i[(y_startidxes_i + j) * y_idxes_temp_stride], 296 | (long int)PyArray_DIMS(%(softmaxes)s)[2]); 297 | %(fail)s; 298 | } 299 | errors_i[0] -= log( softmaxes_i[(y_startidxes_i + j) * Ssm + idxes_i[(y_startidxes_i + j) * y_idxes_temp_stride] * Ssm_dist]); 300 | } 301 | 302 | } 303 | """ % dict(locals(), **sub) 304 | 305 | def make_node(self, softmaxes, y_idxes, y_lengths, y_startidxes, **kwargs): 306 | 
def grad(self, inp, grads):
    """
    Gradient of the masked loss with respect to its inputs.

    Only `softmaxes` (input 0) has a gradient, computed by
    `masked_loss_dx`. The three integer inputs are marked as
    not differentiable, each tagged with its own position in the
    op's input list (1, 2 and 3) so that an error raised for one of
    them names the right input.
    """
    softmaxes, y_idxes, y_lengths, y_startidxes = inp
    g_costs, = grads
    return [
        masked_loss_dx(softmaxes, y_idxes, y_lengths, y_startidxes, g_costs),
        grad_not_implemented(self, 1, y_idxes),
        grad_not_implemented(self, 2, y_lengths),
        grad_not_implemented(self, 3, y_startidxes),
    ]
def perform(self, node, input_storage, output_storage):
    """
    Python implementation of the masked sum gradient.

    Inputs (in `input_storage`)
    ---------------------------

    y         : 3-d float array
    y_starts  : 1-d integer array, first summed step per example
    y_lengths : 1-d integer array, number of summed steps per example
    g_costs   : 1-d float array, upstream gradient per example

    Output
    ------

    An array shaped like `y`, written to `output_storage[0][0]`. It
    holds `g_costs[i]` on the steps
    `y_starts[i] : y_starts[i] + y_lengths[i]` of example i and zero
    elsewhere.
    """
    y, y_starts, y_lengths, g_costs = input_storage

    dx = np.zeros_like(y)
    for i in range(y_starts.shape[0]):
        start = y_starts[i]
        # d/dx x = 1, so every summed entry receives the upstream gradient.
        # The slice stop must be built from this example's own start; using
        # the whole `y_starts` array as the stop is not a valid slice bound.
        dx[i, start:start + y_lengths[i], :] = g_costs[i]

    output_storage[0][0] = dx
(PyArray_TYPE(%(g_costs)s) != NPY_FLOAT)) 402 | { 403 | PyErr_SetString(PyExc_TypeError, 404 | "g_costs type should be float32 or float64"); 405 | %(fail)s; 406 | } 407 | if ((PyArray_TYPE(%(y)s) != NPY_DOUBLE) && 408 | (PyArray_TYPE(%(y)s) != NPY_FLOAT)) 409 | { 410 | PyErr_SetString(PyExc_TypeError, 411 | "y type should be float32 or float64"); 412 | %(fail)s; 413 | } 414 | if ((PyArray_NDIM(%(g_costs)s) != 1) 415 | || (PyArray_NDIM(%(y)s) != 3) 416 | || (PyArray_NDIM(%(y_starts)s) != 1) 417 | || (PyArray_NDIM(%(y_lengths)s) != 1)) 418 | { 419 | PyErr_SetString(PyExc_ValueError, "rank error"); 420 | %(fail)s; 421 | } 422 | if (PyArray_DIMS(%(g_costs)s)[0] != PyArray_DIMS(%(y)s)[0]) 423 | { 424 | PyErr_Format(PyExc_ValueError, 425 | "g_costs.shape[0] (%%ld) != y.shape[0] (%%ld)", 426 | (long int)PyArray_DIMS(%(g_costs)s)[0], 427 | (long int)PyArray_DIMS(%(y)s)[0]); 428 | %(fail)s; 429 | } 430 | if (PyArray_DIMS(%(g_costs)s)[0] != PyArray_DIMS(%(y_starts)s)[0]) 431 | { 432 | PyErr_Format(PyExc_ValueError, 433 | "g_costs.shape[0] (%%ld) != y_starts.shape[0] (%%ld)", 434 | (long int)PyArray_DIMS(%(g_costs)s)[0], 435 | (long int)PyArray_DIMS(%(y_starts)s)[0]); 436 | %(fail)s; 437 | } 438 | if (PyArray_DIMS(%(g_costs)s)[0] != PyArray_DIMS(%(y_lengths)s)[0]) 439 | { 440 | PyErr_Format(PyExc_ValueError, 441 | "g_costs.shape[0] (%%ld) != y_lengths.shape[0] (%%ld)", 442 | (long int)PyArray_DIMS(%(g_costs)s)[0], 443 | (long int)PyArray_DIMS(%(y_lengths)s)[0]); 444 | %(fail)s; 445 | } 446 | if ((NULL == %(dx)s) 447 | || (PyArray_DIMS(%(dx)s)[0] != PyArray_DIMS(%(y)s)[0]) 448 | || (PyArray_DIMS(%(dx)s)[1] != PyArray_DIMS(%(y)s)[1]) 449 | || (PyArray_DIMS(%(dx)s)[2] != PyArray_DIMS(%(y)s)[2])) 450 | { 451 | if (NULL != %(dx)s) Py_XDECREF(%(dx)s); 452 | %(dx)s = (PyArrayObject*) PyArray_Zeros(3, 453 | PyArray_DIMS(%(y)s), 454 | PyArray_DescrFromType(%(out_typenum)s), 0); 455 | if(!%(dx)s) { 456 | PyErr_SetString(PyExc_MemoryError, 457 | "failed to alloc dx output"); 458 | 
class MaskedSum(gof.Op):
    """
    Masked sum for sequences.

    For each example i of a 3-d tensor `y`, sums every entry of
    `y[i, y_starts[i]:y_starts[i] + y_lengths[i], :]` and returns one
    value per example.
    """
    # number of inputs (y, y_starts, y_lengths) and outputs
    nin = 3
    nout = 1

    def make_node(self, y, y_starts, y_lengths, **kwargs):
        """
        Validate the inputs and declare the output.

        Raises ValueError unless `y` is a 3-d float tensor and
        `y_starts` / `y_lengths` are 1-d integer tensors. The output is
        a 1-d tensor with the dtype of `y`.
        """
        y = T.as_tensor_variable(y)
        y_lengths = T.as_tensor_variable(y_lengths)
        y_starts = T.as_tensor_variable(y_starts)

        if (y.type.ndim != 3 or
                y.type.dtype not in T.float_dtypes):
            raise ValueError('y must be 3-d tensor of floats', y.type)

        if (y_lengths.type.ndim != 1 or
                y_lengths.type.dtype not in T.discrete_dtypes):
            raise ValueError('y_lengths must be 1-d tensor of integers', y_lengths.type)

        if (y_starts.type.ndim != 1 or
                y_starts.type.dtype not in T.discrete_dtypes):
            raise ValueError('y_starts must be 1-d tensor of integers', y_starts.type)

        # perform() produces one value per example, so the output is a
        # vector, not a tensor with the 3-d pattern of `y`.
        return Apply(self, [y, y_starts, y_lengths],
                     [T.Tensor(dtype=y.dtype, broadcastable=[False])()])

    def perform(self, node, input_storage, output_storage):
        """
        Compute the masked sums with numpy and store the 1-d result
        (length `y.shape[0]`, dtype of `y`) in `output_storage[0][0]`.
        """
        y, y_starts, y_lengths = input_storage

        masked_acc = np.zeros([y.shape[0]], dtype=y.dtype)
        for i in range(y_starts.shape[0]):
            start = y_starts[i]
            # sum over the selected steps of example i; the stop must be
            # derived from this example's own start
            masked_acc[i] = y[i, start:start + y_lengths[i], :].sum()

        output_storage[0][0] = masked_acc

    # NOTE: no c_code is defined here on purpose. The previous one was a
    # copy of MaskedLoss.c_code: it unpacked four inputs although this op
    # has three, and it accumulated -log(softmax) terms rather than a sum,
    # so it could never be generated for this op. Without c_code the op is
    # expected to run through perform() — TODO confirm against the Theano
    # version in use, and add a dedicated C implementation if this op
    # becomes a bottleneck.

    def grad(self, inp, grads):
        """
        Gradient of the masked sum.

        Only `y` (input 0) has a gradient, computed by `masked_sum_dx`.
        `y_starts` (input 1) and `y_lengths` (input 2) are marked as not
        differentiable.
        """
        y, y_starts, y_lengths = inp
        g_costs, = grads
        return [
            masked_sum_dx(y, y_starts, y_lengths, g_costs),
            grad_not_implemented(self, 1, y_starts),
            grad_not_implemented(self, 2, y_lengths),
        ]
def wrap_params(params):
    """
    For each parameter in a list of Theano TensorSharedVariable
    we substitute the memory with a sharedctype using the
    multiprocessing library.

    The wrapped memory can then be used by other child processes
    thereby synchronising different instances of a model across
    processes (e.g. for multi cpu gradient descent using single cpu
    Theano code).

    The shared buffer is allocated with the ctypes element type that
    matches each parameter's dtype, so integer parameters keep their
    values as well as float32 / float64 ones. The array returned by
    `get_value` is only read from, never reshaped in place.

    Inputs:
    -------

    params list<TensorSharedVariable> : the list of shared Theano
                                        variables

    Outputs:
    --------

    wrapped_instances list<multiprocessing.sharedctypes> : list of
        sharedctypes (shared memory arrays) that point to the memory
        used by the current process's Theano variable.

    Raises:
    -------

    NotImplementedError : when a parameter's dtype has no ctypes
        equivalent (raised by `numpy.ctypeslib.as_ctypes_type`).

    Usage:
    ------

        # define some theano model:
        mymodel = MyModel(20, 50, etc...)

        # wrap the memory of the Theano variables:
        shared_ctypes = wrap_params(mymodel.params)

    Then you can use this memory in child processes
    (See usage of `borrow_memory`)

    """
    wrapped_instances = []
    for param in params:
        # positional arguments are passed through unchanged; for Theano
        # shared variables they are (borrow, return_internal_type)
        original = param.get_value(True, True)

        # derive the element type from the real dtype rather than assuming
        # that everything which is not float32 is a double
        element_type = ctypeslib.as_ctypes_type(original.dtype)
        shared = sharedctypes.RawArray(element_type, original.size)

        # view the shared buffer as an array of the original shape and
        # copy the current values into it
        wrapped = np.frombuffer(shared, dtype=original.dtype, count=original.size)
        wrapped = wrapped.reshape(original.shape)
        wrapped[...] = original

        param.set_value(wrapped, borrow=True)
        wrapped_instances.append(shared)

    return wrapped_instances
def borrow_memory(param, memory):
    """
    Point a Theano shared variable at memory shared across processes.

    The shared ctypes array is viewed as a numpy array, given the shape
    of the parameter's current value, and installed as the parameter's
    value with `borrow=True`.

    Inputs:
    -------

    param  TensorSharedVariable         : the Theano shared variable where
                                          shared memory should be used instead.
    memory multiprocessing.sharedctypes : the memory shared across processes (e.g.
                                          from `wrap_params`)

    Outputs:
    --------

    None

    Usage
    -----

    In the target function of each process, call `borrow_memory` on every
    parameter that should share its memory across processes.

    In this example we have a model called "mymodel" with parameters stored in
    a list called "params". We loop through each theano shared variable and
    call `borrow_memory` on it to share memory across processes.

        def spawn_model(path, wrapped_params):
            # prevent recompilation and arbitrary locks
            theano.config.reoptimize_unpickled_function = False
            theano.gof.compilelock.set_lock_status(False)

            # load your model from its pickled instance (from path)
            mymodel = MyModel.load(path)

            # for each parameter in your model
            # apply the borrow memory strategy to replace
            # the internal parameter's memory with the
            # across-process memory
            for param, memory in zip(mymodel.params, wrapped_params):
                borrow_memory(param, memory)

            # acquire your dataset (either through some smart shared memory
            # or by reloading it for each process)
            dataset, dataset_labels = acquire_dataset()

            # then run your model forward in this process
            epochs = 20
            for epoch in range(epochs):
                mymodel.update_fun(dataset, dataset_labels)

    See `borrow_all_memories` for list usage.

    """
    # flat numpy view over the shared buffer (no copy)
    flat_view = ctypeslib.as_array(memory)
    # give the view the shape the parameter currently has
    target_shape = param.get_value(True, True).shape
    param.set_value(flat_view.reshape(target_shape), borrow=True)
def borrow_all_memories(params, memory_handlers):
    """
    Run `borrow_memory` on a list of params and shared memory
    sharedctypes, pairing them up in order.

    Inputs:
    -------

    params          list<TensorSharedVariable> : list of Theano shared variables
                                                 where shared memory should be
                                                 used instead.
    memory_handlers list<multiprocessing.sharedctypes> : list of memory shared
                                                 across processes (e.g. from
                                                 `wrap_params`)

    Outputs:
    --------

    None

    Usage:
    ------

    Same as `borrow_memory` but for lists of shared memories and
    theano variables. See `borrow_memory`

    """
    # zip stops at the shorter of the two lists
    for pairing in zip(params, memory_handlers):
        borrow_memory(*pairing)