├── .gitignore ├── Cython LSTM.ipynb ├── README.md ├── Recurrent Net.ipynb └── cython_lstm ├── __init__.py ├── cython_utils.pyx ├── dataset.py ├── error.py ├── layers ├── __init__.py ├── activation_layer.py ├── base_layer.py ├── connectible_layer.py ├── element_wise.py ├── layer.py ├── linear_layer.py ├── loop_layer.py ├── recurrent_averaging_layer.py ├── recurrent_layer.py ├── recurrent_multistage_layer.py ├── slice_layer.py ├── temporal_layer.py └── tile_layer.py ├── network.py ├── network_viewer.py ├── neuron.py ├── topology.py └── trainer.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Compiled source # 3 | ################### 4 | *.com 5 | *.class 6 | *.dll 7 | *.exe 8 | *.o 9 | *.so 10 | 11 | nohup.out 12 | 13 | # Packages # 14 | ############ 15 | # it's better to unpack these files and commit the raw source 16 | # git has its own built-in compression methods 17 | *.7z 18 | *.dmg 19 | *.gz 20 | *.iso 21 | *.jar 22 | *.rar 23 | *.tar 24 | *.zip 25 | *.gem 26 | *.pem 27 | 28 | # Saves # 29 | ######### 30 | saves/* 31 | imported_saves/* 32 | pvdm_snapshots/* 33 | sentiment_data/* 34 | *.npy 35 | *.vocab 36 | *.svocab 37 | text8 38 | __pycache__/* 39 | *.pyc 40 | .ipynb_checkpoints 41 | __pycache__ 42 | 43 | # Logs and databases # 44 | ###################### 45 | *.log 46 | *.sql 47 | *.sqlite 48 | 49 | # OS generated files # 50 | ###################### 51 | .DS_Store 52 | .DS_Store? 53 | ._* 54 | .Spotlight-V100 55 | .Trashes 56 | ehthumbs.db 57 | Thumbs.db -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Cython LSTM 2 | ----------- 3 | 4 | @author Jonathan Raiman 5 | @date 3rd November 2014 6 | 7 | See the current implementation [on this notebook](http://nbviewer.ipython.org/github/JonathanRaiman/cython_lstm/blob/master/Cython%20LSTM.ipynb). 8 | 9 | ## Capabilities: 10 | 11 | * Multi Layer Perceptrons 12 | 13 | * Backprop over the network 14 | 15 | * Tanh, Logistic, Softmax, Rectifier, Linear activations 16 | 17 | * Recurrent Neural Networks (Hidden states only, no memory) 18 | 19 | * Backprop through time 20 | 21 | * Draw graph of network using matplotlib ([see notebook](http://nbviewer.ipython.org/github/JonathanRaiman/cython_lstm/blob/master/Cython%20LSTM.ipynb#drawing-the-network)) 22 | 23 | * Training using SGD or batch gradient descent 24 | 25 | * Tensor networks (quadratic form connecting layers) 26 | 27 | ### Key design goals 28 | 29 | * Mimic the simplicity and practicality of Pynnet and Cybrain / Pybrain.
30 | 31 | * Model connections using matrices not explicit connections (to get vector algebra involved) 32 | 33 | * Construct and run million parameter models for LSTM and RNN type models 34 | 35 | * Be able to run AdaGrad / RMSprop on gradients easily 36 | 37 | #### Icing on the cake 38 | 39 | * Support dtype float32, float64 (currently float32), and int32 / int64 for indices 40 | 41 | * BackProp through structure 42 | 43 | * Variable input size indices for RNN (so batches of different sequence sizes can be run adjacent to one another -- currently difficult given numpy array size restrictions) 44 | 45 | * Language Models / Hiearchical Softmax parameters 46 | 47 | * Have an interface for Theano variables if needed (avoid compilation times and make everything cythonish) -------------------------------------------------------------------------------- /Recurrent Net.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:d61aca7b09d27726e1d16c369db0862155a496068f0deeccaf27f7f8a50294ac" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "% load_ext autoreload\n", 16 | "% autoreload 2\n", 17 | "% matplotlib inline\n", 18 | "% load_ext cythonmagic\n", 19 | "% config InlineBackend.figure_format = 'svg'\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "import numpy as np, matplotlib\n", 22 | "\n", 23 | "from cython_lstm.network import Network\n", 24 | "from cython_lstm.neuron import Neuron, LogisticNeuron, TanhNeuron, SoftmaxNeuron\n", 25 | "from cython_lstm.layers import Layer, LoopLayer, SliceLayer, TileLayer, TemporalLayer, RecurrentLayer, RecurrentAveragingLayer, RecurrentMultiStageLayer\n", 26 | "from cython_lstm.trainer import Trainer\n", 27 | "from cython_lstm.dataset import create_xor_dataset, create_digit_dataset\n", 28 | "\n", 29 | "SIZE = 10\n", 30 | "INTERNAL_SIZE = 5\n", 31 | "TIMESTEPS = 2\n", 32 | "STREAMS = 2" 33 | ], 34 | "language": "python", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "stream": "stdout", 40 | "text": [ 41 | "The autoreload extension is already loaded. To reload it, use:\n", 42 | " %reload_ext autoreload\n", 43 | "The cythonmagic extension is already loaded. To reload it, use:\n", 44 | " %reload_ext cythonmagic\n" 45 | ] 46 | } 47 | ], 48 | "prompt_number": 48 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Testing recurrent behavior\n", 55 | "\n", 56 | "## Forward propagation\n", 57 | "\n", 58 | "Forward propagation through time is simple, and should be efficient memory wise. It requires some planning at the wiring stage to know when the output of one stage is ready for the next. Here we test this assumption by constructing two very similar networks.\n", 59 | "\n", 60 | "Both have an input layer, a gate, and a layer that averages the previous activation with the new one using the gate. In the first network the gate is fed by the input, in the second network, the gate is fed by the activation of the first layer.\n", 61 | "\n", 62 | "Here we perform both sets of calculations using an internal for loop that does book keeping for us, and we also perform the same calculation using an explicit loop for inspectability. Both operations should return the same result, regardless of the network wiring." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [ 69 | "net1 = Network()\n", 70 | "\n", 71 | "# create the layers\n", 72 | "linear_layer = TemporalLayer(SIZE, INTERNAL_SIZE, neuron = TanhNeuron)\n", 73 | "\n", 74 | "# a gate has internal size 1\n", 75 | "gate = TemporalLayer(SIZE, 1, neuron = LogisticNeuron)\n", 76 | "\n", 77 | "# this is a multiplier layer, with a sum gate, and another multiplier layer.\n", 78 | "# although it's not expressed as such\n", 79 | "averaging_layer = RecurrentAveragingLayer(gate, linear_layer)\n", 80 | "\n", 81 | "# input should be dispatched to the gate and the linear layer\n", 82 | "tiles = TileLayer()\n", 83 | "tiles.connect_to(gate, temporal=True)\n", 84 | "tiles.connect_to(linear_layer, temporal=True)\n", 85 | "\n", 86 | "linear_layer.connect_to(averaging_layer, temporal=True)\n", 87 | "\n", 88 | "averaging_layer._initial_hidden_state += np.random.standard_normal(averaging_layer._initial_hidden_state.shape)\n", 89 | "\n", 90 | "net1.add_layer(tiles, input=True)\n", 91 | "net1.add_layer(linear_layer)\n", 92 | "net1.add_layer(gate)\n", 93 | "net1.add_layer(averaging_layer, output=True)\n", 94 | "\n", 95 | "recurrent_data = np.random.standard_normal([TIMESTEPS, STREAMS, SIZE]).astype(np.float32)\n", 96 | "out = net1.activate(recurrent_data)[-1]\n", 97 | "net1.clear()\n", 98 | "\n", 99 | "# manual pass\n", 100 | "\n", 101 | "net1.layers[1].allocate_activation(TIMESTEPS, STREAMS)\n", 102 | "net1.layers[2].allocate_activation(TIMESTEPS, STREAMS)\n", 103 | "net1.layers[3].allocate_activation(TIMESTEPS, STREAMS)\n", 104 | "\n", 105 | "for t in range(TIMESTEPS):\n", 106 | " out1 = net1.layers[1].forward_propagate(recurrent_data[t, :, :])\n", 107 | " out2 = net1.layers[2].forward_propagate(recurrent_data[t, :, :])\n", 108 | " out3 = net1.layers[3].forward_propagate(out1)\n", 109 | " net1.layers[1].step += 1\n", 110 | " net1.layers[2].step += 1\n", 111 | " net1.layers[3].step += 1\n", 112 | "net1.clear()\n", 113 | "\n", 114 | "# comparison\n", 115 | "print(\"Outputs are identical => \", np.allclose(out, out3))" 116 | ], 117 | "language": "python", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "output_type": "stream", 122 | "stream": "stdout", 123 | "text": [ 124 | "Outputs are identical => True\n" 125 | ] 126 | } 127 | ], 128 | "prompt_number": 6 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Alternate wiring diagram, now the output of the first Layer feeds the gate:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "collapsed": false, 140 | "input": [ 141 | "net2 = Network()\n", 142 | "\n", 143 | "# create the layers\n", 144 | "recurrent_layer = TemporalLayer(SIZE, INTERNAL_SIZE, neuron = TanhNeuron)\n", 145 | "gate = RecurrentGatedLayer(INTERNAL_SIZE, neuron = LogisticNeuron)\n", 146 | "averaging_layer = RecurrentAveragingLayer(gate, recurrent_layer)\n", 147 | "\n", 148 | "# connect them in the order the computation should proceed\n", 149 | "recurrent_layer.connect_to(gate, temporal=True)\n", 150 | "recurrent_layer.connect_to(averaging_layer, temporal=True)\n", 151 | "\n", 152 | "#averaging_layer.connect_to(recurrent_layer, temporal=True)\n", 153 | "\n", 154 | "averaging_layer._initial_hidden_state += np.random.standard_normal(averaging_layer._initial_hidden_state.shape)\n", 155 | "\n", 156 | "net2.add_layer(recurrent_layer, input=True)\n", 157 | "net2.add_layer(gate)\n", 158 | "net2.add_layer(averaging_layer, output=True)\n", 159 | "\n", 160 | "recurrent_data = 
np.random.standard_normal([TIMESTEPS, STREAMS, SIZE]).astype(np.float32)\n", 161 | "out = net2.activate(recurrent_data)[-1]\n", 162 | "net2.clear()\n", 163 | "\n", 164 | "net2.layers[0].allocate_activation(TIMESTEPS, STREAMS)\n", 165 | "net2.layers[1].allocate_activation(TIMESTEPS, STREAMS)\n", 166 | "net2.layers[2].allocate_activation(TIMESTEPS, STREAMS)\n", 167 | "\n", 168 | "for t in range(TIMESTEPS):\n", 169 | " out1 = net2.layers[0].forward_propagate(recurrent_data[t, :, :])\n", 170 | " out2 = net2.layers[1].forward_propagate(out1)\n", 171 | " out3 = net2.layers[2].forward_propagate(out1)\n", 172 | " net2.layers[0].step += 1\n", 173 | " net2.layers[1].step += 1\n", 174 | " net2.layers[2].step += 1\n", 175 | "net2.clear()\n", 176 | "print(\"Outputs are identical => \", np.allclose(out, out3))" 177 | ], 178 | "language": "python", 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "output_type": "stream", 183 | "stream": "stdout", 184 | "text": [ 185 | "Outputs are identical => True\n" 186 | ] 187 | } 188 | ], 189 | "prompt_number": 28 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "## Backward propagation\n", 196 | "\n", 197 | "Backward propagation in general can be done nicely for well designed graphs. However whenever cycles are introduced we need to perform backpropagation through time. In this instance we want to make sure these operations are well defined, and that the internal bookkeeping is done correctly, so that the error signal is sent through all stages of the computational graph correctly.\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "collapsed": false, 203 | "input": [ 204 | "PREDICTION_SIZE = 20\n", 205 | "\n", 206 | "net2 = Network()\n", 207 | "\n", 208 | "# create the layers\n", 209 | "recurrent_layer = TemporalLayer(SIZE, INTERNAL_SIZE, neuron = TanhNeuron)\n", 210 | "gate = RecurrentGatedLayer(INTERNAL_SIZE, neuron = LogisticNeuron)\n", 211 | "averaging_layer = RecurrentAveragingLayer(gate, recurrent_layer)\n", 212 | "\n", 213 | "temporal_slice = SliceLayer(-1)\n", 214 | "\n", 215 | "softmax_output_layer = Layer(INTERNAL_SIZE, PREDICTION_SIZE, neuron=SoftmaxNeuron)\n", 216 | "\n", 217 | "# connect them in the order the computation should proceed\n", 218 | "recurrent_layer.connect_to(gate, temporal=True)\n", 219 | "recurrent_layer.connect_to(averaging_layer, temporal=True)\n", 220 | "\n", 221 | "averaging_layer.connect_to(temporal_slice)\n", 222 | "temporal_slice.connect_to(softmax_output_layer)\n", 223 | "averaging_layer._initial_hidden_state += np.random.standard_normal(averaging_layer._initial_hidden_state.shape)\n", 224 | "\n", 225 | "net2.add_layer(recurrent_layer, input=True)\n", 226 | "net2.add_layer(gate)\n", 227 | "net2.add_layer(averaging_layer)\n", 228 | "net2.add_layer(temporal_slice)\n", 229 | "net2.add_layer(softmax_output_layer, output=True)\n", 230 | "\n", 231 | "prediction_data = np.random.multinomial(1, np.arange(0., PREDICTION_SIZE) / np.arange(0., PREDICTION_SIZE).sum(), size=(STREAMS))" 232 | ], 233 | "language": "python", 234 | "metadata": {}, 235 | "outputs": [], 236 | "prompt_number": 50 237 | }, 238 | { 239 | "cell_type": "code", 240 | "collapsed": false, 241 | "input": [ 242 | "net2.layers" 243 | ], 244 | "language": "python", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "metadata": {}, 249 | "output_type": "pyout", 250 | "prompt_number": 51, 251 | "text": [ 252 | "[,\n", 253 | " ,\n", 254 | " ,\n", 255 | " ,\n", 256 | " ]" 257 | ] 258 | } 259 | ], 260 | 
"prompt_number": 51 261 | }, 262 | { 263 | "cell_type": "code", 264 | "collapsed": false, 265 | "input": [ 266 | "net2.clear()\n", 267 | "net2.activate(recurrent_data)\n", 268 | "net2.backpropagate(prediction_data)" 269 | ], 270 | "language": "python", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "output_type": "stream", 275 | "stream": "stdout", 276 | "text": [ 277 | "Activating forward layers from TemporalLayer\n" 278 | ] 279 | }, 280 | { 281 | "ename": "AttributeError", 282 | "evalue": "'NoneType' object has no attribute 'copy'", 283 | "output_type": "pyerr", 284 | "traceback": [ 285 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 286 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mnet2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mnet2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mactivate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecurrent_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnet2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackpropagate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprediction_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 287 | "\u001b[0;32m/Users/jonathanraiman/Desktop/Coding/cython_lstm/cython_lstm/network.py\u001b[0m in \u001b[0;36mbackpropagate\u001b[0;34m(self, target)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0mtarget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 53\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output_layer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_activate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 288 | "\u001b[0;32m/Users/jonathanraiman/Desktop/Coding/cython_lstm/cython_lstm/layers/layer.py\u001b[0m in \u001b[0;36merror_activate\u001b[0;34m(self, target)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0;31m# get the error here\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 138\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackpropagate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdEdy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mactivation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 139\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclear_weight_caches\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 289 | "\u001b[0;32m/Users/jonathanraiman/Desktop/Coding/cython_lstm/cython_lstm/neuron.py\u001b[0m in \u001b[0;36mdEdy\u001b[0;34m(y, t)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdEdy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mdEdy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0mdEdy\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdEdy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 290 | "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'copy'" 291 | ] 292 | } 293 | ], 294 | "prompt_number": 47 295 | }, 296 | { 297 | "cell_type": "code", 298 | "collapsed": false, 299 | "input": [ 300 | "a.pop()" 301 | ], 302 | "language": "python", 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "metadata": {}, 307 | "output_type": "pyout", 308 | "prompt_number": 7, 309 | "text": [ 310 | "1" 311 | ] 312 | } 313 | ], 314 | "prompt_number": 7 315 | }, 316 | { 317 | "cell_type": "code", 318 | "collapsed": false, 319 | "input": [ 320 | "a" 321 | ], 322 | "language": "python", 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "metadata": {}, 327 | "output_type": "pyout", 328 | "prompt_number": 8, 329 | "text": [ 330 | "[]" 331 | ] 332 | } 333 | ], 334 | "prompt_number": 8 335 | }, 336 | { 337 | "cell_type": "code", 338 | "collapsed": false, 339 | "input": [ 340 | "a = [1, 2, 3]" 341 | ], 342 | "language": "python", 343 | "metadata": {}, 344 | "outputs": [], 345 | "prompt_number": 10 346 | }, 347 | { 348 | "cell_type": "code", 349 | "collapsed": false, 350 | "input": [ 351 | "a.index(3)" 352 | ], 353 | "language": "python", 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "metadata": {}, 358 | "output_type": "pyout", 359 | "prompt_number": 14, 360 | "text": [ 361 | "2" 362 | ] 363 | } 364 | ], 365 | "prompt_number": 14 366 | }, 367 | { 368 | "cell_type": "code", 369 | "collapsed": false, 370 | "input": [], 371 | "language": "python", 372 | "metadata": {}, 373 | "outputs": [] 374 | } 375 | ], 376 | "metadata": {} 377 | } 378 | ] 379 | } -------------------------------------------------------------------------------- /cython_lstm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple Package for creating artificial 3 | neural networks including recurrent, 4 | recursive, and LSTM neural networks. 
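A minimal usage sketch, mirroring the example notebooks in this
repository (illustrative only; the calls follow the notebook usage of
Network, Layer and the neuron classes, not a frozen API):

    from cython_lstm.network import Network
    from cython_lstm.layers import Layer
    from cython_lstm.neuron import TanhNeuron, SoftmaxNeuron
    from cython_lstm.dataset import create_xor_dataset

    data, labels = create_xor_dataset()
    net = Network()
    hidden = Layer(2, 5, neuron=TanhNeuron)
    output = Layer(5, 2, neuron=SoftmaxNeuron)
    hidden.connect_to(output)
    net.add_layer(hidden, input=True)
    net.add_layer(output, output=True)
    net.activate(data)
    prediction = output.activation()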
5 | 6 | """ 7 | import pyximport, numpy as np 8 | pyximport.install(setup_args={"include_dirs": np.get_include()}) 9 | from .cython_utils import vector_outer_product, tensor_delta_down, tensor_delta_down_with_output 10 | 11 | __all__ = ["vector_outer_product", "tensor_delta_down", "tensor_delta_down_with_output"] -------------------------------------------------------------------------------- /cython_lstm/cython_utils.pyx: -------------------------------------------------------------------------------- 1 | import cython 2 | from cython cimport view 3 | import numpy as np 4 | cimport numpy as np 5 | 6 | from cpython cimport PyCapsule_GetPointer # PyCObject_AsVoidPtr 7 | from scipy.linalg.blas import fblas 8 | 9 | REAL = np.float32 10 | ctypedef np.float32_t REAL_t 11 | 12 | cdef int ONE = 1 13 | cdef REAL_t ONEF = 1.0 14 | 15 | ctypedef void (*sger_ptr) (const int *M, const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY, float *A, const int * LDA) nogil 16 | cdef sger_ptr sger=PyCapsule_GetPointer(fblas.sger._cpointer , NULL) # A := alpha*x*y.T + A 17 | 18 | cdef void outer_prod(REAL_t* x, REAL_t* y, REAL_t * out, int x_len, int y_len): 19 | sger(&y_len, &x_len, &ONEF, y, &ONE, x, &ONE, out, &y_len) 20 | 21 | ctypedef void (*dgemv_ptr) (char *trans, int *m, int *n,\ 22 | float *alpha, float *a, int *lda, float *x, int *incx,\ 23 | float *beta, float *y, int *incy) 24 | 25 | ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 26 | 27 | 28 | cdef sdot_ptr sdot=PyCapsule_GetPointer(fblas.sdot._cpointer, NULL) # float = dot(x, y) 29 | cdef dgemv_ptr dgemv=PyCapsule_GetPointer(fblas.dgemv._cpointer, NULL) 30 | 31 | cpdef np.ndarray[REAL_t, ndim=3] vector_outer_product(np.ndarray[REAL_t, ndim=2] _x, np.ndarray[REAL_t, ndim=2] _y): 32 | 33 | cdef int i, length = _x.shape[0], x_len = _x.shape[1], y_len = _y.shape[1] 34 | cdef int box_size = x_len * y_len 35 | cdef np.ndarray[REAL_t, ndim=3] result 36 | result = np.zeros([length, x_len, y_len], dtype = REAL) 37 | 38 | cdef REAL_t* x = (np.PyArray_DATA(_x)) 39 | cdef REAL_t* y = (np.PyArray_DATA(_y)) 40 | 41 | cdef REAL_t[:,:] x_view = _x 42 | cdef REAL_t[:,:] y_view = _y 43 | 44 | for i in range(length): 45 | outer_prod(&x_view[i,0], &y_view[i,0], &result[i,0,0], x_len, y_len) 46 | 47 | return result.transpose((1,2,0)) 48 | 49 | def tensor_delta_down_with_output( 50 | np.ndarray[REAL_t, ndim=3] tensor, 51 | np.ndarray[REAL_t, ndim=2] dEdz, 52 | np.ndarray[REAL_t, ndim=2] input, 53 | np.ndarray[REAL_t, ndim=2] out): 54 | 55 | cdef: 56 | int size = dEdz.shape[1] 57 | np.ndarray[REAL_t, ndim=3] outer_dotted = vector_outer_product(dEdz, input) 58 | 59 | for i in range(size): 60 | out += np.dot(tensor[i,:,:], outer_dotted[i,:,:]).T 61 | out += np.dot(tensor[i,:,:].T, outer_dotted[i,:,:]).T 62 | 63 | def tensor_delta_down( 64 | np.ndarray[REAL_t, ndim=3] tensor, 65 | np.ndarray[REAL_t, ndim=2] dEdz, 66 | np.ndarray[REAL_t, ndim=2] input): 67 | 68 | cdef: 69 | int size = dEdz.shape[1] 70 | np.ndarray[REAL_t, ndim=2] delta_unbiased = np.zeros_like(input, dtype=REAL) 71 | np.ndarray[REAL_t, ndim=3] outer_dotted = vector_outer_product(dEdz, input) 72 | 73 | for i in range(size): 74 | delta_unbiased += np.dot(tensor[i,:,:], outer_dotted[i,:,:]).T 75 | delta_unbiased += np.dot(tensor[i,:,:].T, outer_dotted[i,:,:]).T 76 | 77 | return delta_unbiased -------------------------------------------------------------------------------- /cython_lstm/dataset.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | def create_digit_dataset(): 3 | return (np.array([ 4 | [0, 0, 0, 0], 5 | [0, 0, 0, 1], 6 | [0, 0, 1, 0], 7 | [0, 0, 1, 1], 8 | [0, 1, 0, 0], 9 | [0, 1, 0, 1], 10 | [0, 1, 1, 0], 11 | [0, 1, 1, 1], 12 | [1, 0, 0, 0], 13 | [1, 0, 0, 1], 14 | [1, 0, 1, 0]], dtype=np.float32), 15 | np.arange(0, 11).astype(np.int32)) 16 | 17 | def create_xor_dataset(): 18 | return (np.array([ 19 | [0, 0], 20 | [0, 1], 21 | [1, 0], 22 | [1, 1]], dtype=np.float32), 23 | np.array([ 24 | [0, 1], 25 | [1, 0], 26 | [1, 0], 27 | [0, 1]], dtype=np.float32)) 28 | 29 | __all__ = ["create_digit_dataset", "create_xor_dataset"] -------------------------------------------------------------------------------- /cython_lstm/error.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Error(object): 4 | pass 5 | 6 | def softmax_error_one_hot(x, t): 7 | return -np.log(x[np.arange(0,t.shape[0]), t]).sum() 8 | 9 | class MSE(Error): 10 | @staticmethod 11 | def dEdy(y, t): 12 | return y - t 13 | 14 | @staticmethod 15 | def error(y, target): 16 | """ 17 | Mean squared error : 1/2 (y-t)^2 18 | """ 19 | return 0.5 * (y - target)**2 20 | 21 | class BinaryCrossEntropy(MSE): 22 | @staticmethod 23 | def error(y, target): 24 | """ 25 | Binary Cross entropy error: 26 | D_{KL}(p || q) = sum_i { (1-p_i) * log(1 - q_i) -p_i log (q_i) } 27 | """ 28 | return -(target * np.log(y) + (1.0 - target) * np.log1p(-y.clip(max=0.99999999))) 29 | 30 | class TanhBinayCrossEntropy(MSE): 31 | @staticmethod 32 | def error(y, target): 33 | """ 34 | Cross entropy error (we reshape tanh activation into 35 | a sigmoid like activation and then apply cross entropy 36 | criterion to it) 37 | """ 38 | resized_activation = (y + 1.0) / 2.0 39 | return -(target * np.log(resized_activation) + (1.0 - target) * np.log1p(- resized_activation)) 40 | 41 | class CategoricalCrossEntropy(MSE): 42 | @staticmethod 43 | def dEdy(y, t): 44 | dEdy = y.copy() 45 | dEdy[np.arange(0,t.shape[0]), t] -= 1. 
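        # For a softmax output y with integer class targets t, the combined
        # softmax + cross-entropy gradient with respect to the pre-activation
        # is y - one_hot(t): subtracting 1 at each target index does exactly that.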
46 | return dEdy 47 | 48 | @staticmethod 49 | def error(y, target): 50 | """ 51 | Cross entropy error (we reshape tanh activation into 52 | a sigmoid like activation and then apply cross entropy 53 | criterion to it) (One hot outputs) 54 | """ 55 | return softmax_error_one_hot(y, target) -------------------------------------------------------------------------------- /cython_lstm/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer import Layer 2 | from .temporal_layer import TemporalLayer 3 | from .recurrent_layer import RecurrentLayer 4 | from .recurrent_averaging_layer import RecurrentAveragingLayer 5 | from .recurrent_multistage_layer import RecurrentMultiStageLayer 6 | from .tile_layer import TileLayer 7 | from .slice_layer import SliceLayer 8 | from .loop_layer import LoopLayer 9 | from .linear_layer import LinearLayer 10 | from .activation_layer import ActivationLayer 11 | 12 | __all__ = [ 13 | "Layer", 14 | "LoopLayer", 15 | "LinearLayer", 16 | "ActivationLayer", 17 | "SliceLayer", 18 | "TemporalLayer", 19 | "TileLayer", 20 | "RecurrentLayer", 21 | "RecurrentAveragingLayer", 22 | "RecurrentMultiStageLayer" 23 | ] -------------------------------------------------------------------------------- /cython_lstm/layers/activation_layer.py: -------------------------------------------------------------------------------- 1 | from .base_layer import BaseLayer 2 | import numpy as np 3 | 4 | class ActivationLayer(BaseLayer): 5 | """ 6 | An activation layer takes a neuron as input 7 | and using this it defines the gradient 8 | with respect to its input. 9 | """ 10 | def __init__(self, neuron): 11 | BaseLayer.__init__(self) 12 | self.activation_function = neuron.activation_function 13 | self.dydz = neuron.dydz 14 | self.gradients = [] 15 | 16 | def activate(self, x, out=None): 17 | """ 18 | Operations of an activation layer are elementwise, 19 | and thus the out shape is defined from the in shape, 20 | x. 21 | """ 22 | if out is None: 23 | self._activation = self.activation_function(x[0],out) 24 | return self._activation 25 | else: 26 | return self.activation_function(x[0],out) 27 | 28 | def update_grad_input(self, input, output, grad_output): 29 | """ 30 | Here we take the neuron's update method for the gradient 31 | which is usually a function of its output. 32 | this is a form of code smell, but for a neural network 33 | module this is habitutal 34 | """ 35 | if self.gradinput is None: 36 | self.gradinput = np.zeros_like(input) 37 | self.gradinput += grad_output * self.dydz(output) 38 | return self.gradinput -------------------------------------------------------------------------------- /cython_lstm/layers/base_layer.py: -------------------------------------------------------------------------------- 1 | from .connectible_layer import ConnectibleLayer 2 | 3 | class BaseLayer(ConnectibleLayer): 4 | """ 5 | Base Neural Network Layer, defines the key 6 | methods that need to be implemented. Is not 7 | useful until inherited from.""" 8 | 9 | def __init__(self, dtype='float32'): 10 | ConnectibleLayer.__init__(self) 11 | self.dtype = dtype 12 | self._activation = None 13 | self.gradinput = None 14 | self.params = [] 15 | 16 | def clear(self): 17 | self.gradinput = None 18 | for grad in self.gradients: 19 | grad.fill(0) 20 | 21 | def activate(self, x, out=None): 22 | """ 23 | Each layer must have an activate method 24 | that can output to the out parameter 25 | or returns its out allocated output. 
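        A sketch of the contract that the concrete layers in this package
        follow (compare LinearLayer and ActivationLayer; illustrative only):

            def activate(self, x, out=None):
                result = f(x[0])             # x is a list of input arrays
                if out is None:
                    self._activation = result
                    return self._activation
                out[:] = result
                return out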
26 | """ 27 | raise NotImplementedError 28 | 29 | def update_grad_input(self, input, output, grad_output): 30 | """ 31 | Each layer must have an update grad input method 32 | that takes care of updating its gradients and passing 33 | those down. 34 | """ 35 | raise NotImplementedError 36 | 37 | def activate_forward_layers(self): 38 | # first activate forward. 39 | for layer in self._forward_layers: 40 | layer.activate(self.activation()) 41 | # then pass the message onwards: 42 | for layer in self._forward_layers: 43 | layer.activate_forward_layers() 44 | 45 | def __repr__(self): 46 | return "<" + self.__class__.__name__ + " " + str({"activation": self.activation_function.__doc__ if hasattr(self, 'activation_function') else '', "input_size": self.input_size if hasattr(self, 'input_size') else '', "output_size": self.output_size if hasattr(self, 'output_size') else ''})+">" -------------------------------------------------------------------------------- /cython_lstm/layers/connectible_layer.py: -------------------------------------------------------------------------------- 1 | class ConnectibleLayer(object): 2 | """ 3 | This class only handles layer 4 | interconnections. Only knowledge 5 | about a supposed graph is assumed here. 6 | 7 | Note: a layer can have multiple output 8 | layers, but only a single input layer. 9 | Use a layer stack to have multiple inputs 10 | go to a single input (backprop through 11 | structure essentially). 12 | 13 | """ 14 | def __init__(self): 15 | self.parents = [] 16 | self.children = [] 17 | self._forward_layer = None 18 | self._backward_layer = None 19 | 20 | def add_backward_layer(self, layer): 21 | """ 22 | Connect a layer to the antecedents 23 | of this layer in the graph. 24 | """ 25 | if self._backward_layer is not None: self._backward_layer.remove_forward_layer(self) 26 | self.parents.append(layer) 27 | self._backward_layer = layer 28 | 29 | def __sub__(self, layer): 30 | from .element_wise import ElementWiseSum 31 | result = ElementWiseSum(self, layer, -1) 32 | return result 33 | 34 | def __mul__(self, layer): 35 | from .element_wise import ElementWiseProd 36 | result = ElementWiseProd(self, layer) 37 | return result 38 | 39 | def __add__(self, layer): 40 | from .element_wise import ElementWiseSum 41 | result = ElementWiseSum(self, layer) 42 | return result 43 | 44 | def remove_forward_layer(self, layer): 45 | """ 46 | Remove a layer from the forward layers 47 | of this layer. Stops the propagation 48 | of activations in the graph here. 49 | 50 | """ 51 | if self._forward_layer is layer: self._forward_layer = None 52 | 53 | def remove_backward_layer(self, layer): 54 | """ 55 | Remove a layer from the antecedents 56 | of this layer. 57 | """ 58 | if self._backward_layer is layer: self._backward_layer = None 59 | 60 | def connect_to(self, layer): 61 | """ 62 | Adds the layer to the forward and 63 | backward lists of layers to connect 64 | the graph. 65 | """ 66 | self.children.append(layer) 67 | self._forward_layer = layer 68 | layer.add_backward_layer(self) -------------------------------------------------------------------------------- /cython_lstm/layers/element_wise.py: -------------------------------------------------------------------------------- 1 | from .base_layer import BaseLayer 2 | import numpy as np 3 | 4 | class ElementWise(BaseLayer): 5 | """ 6 | Sum the elements of both parent layers 7 | element wise. The gradient is equally 8 | shared among the parents. 
9 | 10 | Note: untested 11 | """ 12 | def __init__(self, a, b): 13 | BaseLayer.__init__(self) 14 | self.a = a 15 | self.b = b 16 | # inform the topological sort 17 | # of this dependency 18 | self.parents.append(a) 19 | self.parents.append(b) 20 | a.children.append(self) 21 | b.children.append(self) 22 | 23 | class ElementWiseSum(ElementWise): 24 | def activate(self, x, out=None): 25 | """ 26 | Activate by passing the list of activations 27 | for both parents. 28 | """ 29 | if out is not None: 30 | out[:] = x[0] + x[1] 31 | else: 32 | return x[0] + x[1] 33 | def update_grad_input(self, input, output, grad_output): 34 | num_singletons = [len(grad_output.shape) - len(x.shape) for x in input] 35 | 36 | self.gradinput = [np.sum(grad_output, 37 | axis=tuple(range(num_singletons[i])), 38 | keepdims=False) if num_singletons[i] > 0 else grad_output 39 | for i in range(len(input))] 40 | return self.gradinput 41 | class ElementWiseSub(ElementWise): 42 | def activate(self, x, out=None): 43 | if out is not None: 44 | out[:] = x[0] - x[1] 45 | else: 46 | return x[0] - x[1] 47 | def update_grad_input(self, input, output, grad_output): 48 | num_singletons = [len(grad_output.shape) - len(x.shape) for x in input] 49 | 50 | self.gradinput = [np.sum(grad_output, 51 | axis=tuple(range(num_singletons[i])), 52 | keepdims=False) if num_singletons[i] > 0 else grad_output 53 | for i in range(len(input))] 54 | # this one has a negative gradient (negate a copy so grad_output is not modified in place): 55 | self.gradinput[1] = -self.gradinput[1] 56 | return self.gradinput 57 | class ElementWiseProd(ElementWise): 58 | def activate(self, x, out=None): 59 | if out is not None: 60 | out[:] = x[0] * x[1] 61 | else: 62 | return x[0] * x[1] 63 | def update_grad_input(self, input, output, grad_output): 64 | raise NotImplementedError() 65 | # self.gradinput = map(np.ones_like, input) 66 | # self.gradinput[1] *= -1 67 | # for grad, grad_out in zip(self.gradinput, grad_output): 68 | # grad *= grad_out 69 | # return self.gradinput -------------------------------------------------------------------------------- /cython_lstm/layers/layer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Network Layer 3 | -------------------- 4 | 5 | Note: Deprecated ! 6 | 7 | Missing: Dropout, Language Models, Hierarchical Softmax 8 | 9 | Note: it appears that Dropout is a weightless layer. 10 | Layer should be generalized to the weightless case.
11 | 12 | """ 13 | from .base_layer import BaseLayer 14 | from ..cython_utils import vector_outer_product, tensor_delta_down_with_output 15 | from ..neuron import Neuron 16 | import numpy as np 17 | REAL = np.float32 18 | 19 | def quadratic_form(tensor, x): 20 | return (np.dot(tensor, x.T) * x.T).sum(axis=1).T 21 | 22 | def quadratic_form_gradient(error, x): 23 | return (vector_outer_product(x, x)[:,:,:, np.newaxis] * error).transpose((3,1,0,2)) 24 | 25 | class Layer(BaseLayer): 26 | """ 27 | Create a feedforward layer with identity activation 28 | """ 29 | def __init__(self, 30 | input_size = 10, 31 | output_size = None, 32 | tensor = False, 33 | neuron = Neuron, 34 | dtype=REAL): 35 | BaseLayer.__init__(self) 36 | 37 | self.dimensionless = False 38 | self.step = 0 39 | self.dtype = dtype 40 | self.activation_function = neuron.activation_function 41 | self.error = neuron.error 42 | self.dydz = neuron.dydz 43 | self.dEdy = neuron.dEdy 44 | self._weight_matrix = None 45 | self._bias_units = None 46 | self.input_size = input_size 47 | self.output_size = output_size 48 | 49 | self.tensor = tensor 50 | 51 | self._dEdy = None 52 | self.dEdz = None 53 | 54 | self._weight_matrix_diff = None 55 | self._bias_units_diff = None 56 | 57 | if self.tensor: 58 | self._weight_tensor = None 59 | self._weight_tensor_diff = None 60 | 61 | if self.input_size is not None and self.output_size is not None: 62 | self.create_weights() 63 | 64 | def activate(self, input): 65 | # run net forward using input 66 | self.forward_propagate(input) 67 | # transfer activation as input to next layers: 68 | self.activate_forward_layers() 69 | 70 | def allocate_activation(self, timesteps, streams): 71 | pass 72 | 73 | def backpropagate_dEdy(self): 74 | """ 75 | Backpropagate error signal to the weights 76 | and prepare for lower layers to use by getting 77 | dEdz. 78 | """ 79 | 80 | self.dEdz = np.dot(self._dEdy, self._weight_matrix) 81 | 82 | # can be a costlyish operation if requires addition 83 | # of hidden state vector: 84 | layer_input = self.layer_input() 85 | 86 | # updates to weight matrix are given by outer 87 | # product of signal with input: 88 | self._weight_matrix_diff += vector_outer_product(self._dEdy, layer_input).sum(axis=-1) 89 | 90 | # updates to bias units are given by signal 91 | self._bias_units_diff += self._dEdy.T.sum(axis=-1) 92 | 93 | if self.tensor: 94 | # propagate signal backwards through the tensor: 95 | tensor_delta_down_with_output(self._weight_tensor, self._dEdy, layer_input, self.dEdz) 96 | # obtain gradient for a tensor: 97 | self._weight_tensor_diff += quadratic_form_gradient(self._dEdy, layer_input).sum(axis=-1) 98 | 99 | def backpropagate(self, signal): 100 | """ 101 | Get local error responsability using 102 | the derivative of error with respect 103 | to output times the derivative of the 104 | local parameters dy / dz 105 | """ 106 | # signal backwards is given by taking weight matrix 107 | # with signal with derivative 108 | self._dEdy = signal * self.dydz(self._activation) 109 | 110 | # given we know the error signal at this stage, 111 | # constitute the local error responsability dEdz 112 | # and mark the updates to the weights: 113 | self.backpropagate_dEdy() 114 | 115 | # Send dEdz backwards as new error signal 116 | self._backward_layer.backpropagate(self.dEdz) 117 | 118 | def layer_input(self): 119 | """ 120 | Input is sum of activations of backward 121 | layers. 
122 | """ 123 | return self._backward_layer.activation() 124 | 125 | def activation(self): 126 | return self._activation 127 | 128 | def error_activate(self, target): 129 | """ 130 | Start the backpropagation using a target 131 | by getting the initial error responsability 132 | as dE / dy = y - t 133 | 134 | dEdW is then provided for the backward layers 135 | iteratively: 136 | dE / dW = (dy_l / dW) * (...) * (dy_l / dy) * (dE / dy) 137 | """ 138 | 139 | # get the error here 140 | self.backpropagate(self.dEdy(self.activation(),target)) 141 | 142 | def clear_weight_caches(self): 143 | for grad in self.gradients: 144 | grad.fill(0) 145 | 146 | def clear(self): 147 | """ 148 | Clears the activation and the local 149 | error responsibility for this layer 150 | """ 151 | self.step = 0 152 | self._activation = None 153 | self._dEdy = None 154 | self.dEdz = None 155 | self.clear_weight_caches() 156 | 157 | def reset_weights(self): 158 | """ 159 | Reset to random weights this 160 | layer 161 | """ 162 | self.clear() 163 | self._weight_matrix += self._random_weight_matrix() 164 | self._bias_units += self._random_bias_units() 165 | if self.tensor: 166 | self._weight_tensor += self._random_weight_tensor() 167 | 168 | def _random_weight_tensor(self): 169 | return ( 170 | (1. / self.input_size) * 171 | np.random.standard_normal([ 172 | self.output_size, 173 | self.input_size, 174 | self.input_size]) 175 | ).astype(self.dtype) 176 | 177 | def _random_weight_matrix(self): 178 | return ( 179 | (1. / self.input_size) * 180 | np.random.standard_normal([ 181 | self.output_size, 182 | self.input_size]) 183 | ).astype(self.dtype) 184 | 185 | def _random_bias_units(self): 186 | return ( 187 | (1. / self.input_size) * 188 | np.random.standard_normal(self.output_size) 189 | ).astype(self.dtype) 190 | 191 | def create_weights(self): 192 | """ 193 | Randomly initialize the weights for this layer 194 | with gaussian noise with std 1 / input size 195 | """ 196 | self._weight_matrix = self._random_weight_matrix() 197 | self._weight_matrix_diff = np.zeros_like(self._weight_matrix) 198 | 199 | self._bias_units = self._random_bias_units() 200 | self._bias_units_diff = np.zeros_like(self._bias_units) 201 | 202 | 203 | self.params = [self._weight_matrix, self._bias_units] 204 | self.gradients = [self._weight_matrix_diff, self._bias_units_diff] 205 | 206 | if self.tensor: 207 | self._weight_tensor = self._random_weight_tensor() 208 | self._weight_tensor_diff = np.zeros_like(self._weight_tensor) 209 | self.params.append(self._weight_tensor) 210 | self.gradients.append(self._weight_tensor_diff) 211 | 212 | def forward_propagate(self, input): 213 | """ 214 | use the weights and the activation function 215 | to react to the input 216 | 217 | TODO: use the `out' parameter of numpy dot to 218 | gain speed on memory allocation. 
219 | 220 | Inputs 221 | ------ 222 | 223 | inputs ndarray : the input data 224 | 225 | Outputs 226 | ------- 227 | 228 | activation ndarray : the activation for this input 229 | 230 | """ 231 | if self.tensor: 232 | self._activation = self.activation_function( quadratic_form(self._weight_tensor, input) + np.dot(input, self._weight_matrix.T) + self._bias_units) 233 | else: 234 | self._activation = self.activation_function( np.dot(input, self._weight_matrix.T) + self._bias_units) 235 | return self._activation 236 | 237 | def _zero_initial_state(self): 238 | return np.zeros(self.output_size, dtype=self.dtype) 239 | 240 | -------------------------------------------------------------------------------- /cython_lstm/layers/linear_layer.py: -------------------------------------------------------------------------------- 1 | from .base_layer import BaseLayer 2 | import numpy as np 3 | from ..cython_utils import vector_outer_product, tensor_delta_down_with_output 4 | 5 | def quadratic_form(tensor, x): 6 | return (np.dot(tensor, x.T) * x.T).sum(axis=1).T 7 | 8 | def quadratic_form_gradient(error, x): 9 | return (vector_outer_product(x, x)[:,:,:, np.newaxis] * error).transpose((3,1,0,2)) 10 | 11 | class LinearLayer(BaseLayer): 12 | """ 13 | A layer that takes an input and performs 14 | an affine, or tensor based, transformation 15 | of its inputs (e.g. using a matrix, a bias 16 | vector, and optionally a tensor). 17 | This is a feedforward layer in the traditional 18 | sense. 19 | No activation function is used here. 20 | """ 21 | def __init__(self, input_size, output_size, tensor = False): 22 | """ 23 | Initialize a layer that projects its input into another 24 | dimension. 25 | 26 | Inputs 27 | ------ 28 | 29 | input_size int : the size of the input 30 | dimensions 31 | output_size int : the size of the output 32 | dimensions 33 | tensor boolean (optional) : whether to use a bilinear 34 | form in the projection of 35 | the input. 36 | 37 | Note: You may also opt to have a tensor also perform 38 | a transformation on the input. However this adds as many 39 | matrices as there are output dimensions, so this can 40 | become a very costly operation quickly. 41 | 42 | """ 43 | BaseLayer.__init__(self) 44 | self.input_size = input_size 45 | self.output_size = output_size 46 | self.tensor = tensor 47 | self.create_weights() 48 | 49 | def random_weight_matrix(self): 50 | return ( (1. / self.input_size) * np.random.standard_normal([ self.output_size, self.input_size]) ).astype(self.dtype) 51 | 52 | def random_bias_units(self): 53 | return ( (1. / self.input_size) * np.random.standard_normal( self.output_size) ).astype(self.dtype) 54 | 55 | def random_weight_tensor(self): 56 | return ( (1. 
/ self.input_size) * np.random.standard_normal([ self.output_size, self.input_size, self.input_size]) ).astype(self.dtype) 57 | 58 | def create_weights(self): 59 | """ 60 | Randomly initialize the weights for this layer 61 | with gaussian noise with std 1 / input size 62 | """ 63 | self.weight_matrix = self.random_weight_matrix() 64 | self.weight_matrix_diff = np.zeros_like(self.weight_matrix) 65 | 66 | self.bias_units = self.random_bias_units() 67 | self.bias_units_diff = np.zeros_like(self.bias_units) 68 | 69 | self.params = [self.weight_matrix, self.bias_units] 70 | self.gradients = [self.weight_matrix_diff, self.bias_units_diff] 71 | 72 | if self.tensor: 73 | self.weight_tensor = self.random_weight_tensor() 74 | self.weight_tensor_diff = np.zeros_like(self.weight_tensor) 75 | self.params.append(self.weight_tensor) 76 | self.gradients.append(self.weight_tensor_diff) 77 | 78 | def reset_weights(self): 79 | """ 80 | Reset this layer to random weights. 81 | """ 82 | self.clear() 83 | self.weight_matrix.fill(0) 84 | self.bias_units.fill(0) 85 | self.weight_matrix += self.random_weight_matrix() 86 | self.bias_units += self.random_bias_units() 87 | if self.tensor: 88 | self.weight_tensor.fill(0) 89 | self.weight_tensor += self.random_weight_tensor() 90 | 91 | def activate(self, x, out=None): 92 | """ 93 | Projects the input into the output dimension. 94 | 95 | Inputs 96 | ------ 97 | 98 | x list : the input to this layer. 99 | 100 | Outputs 101 | ------- 102 | 103 | activation ndarray : the activation for this input 104 | 105 | """ 106 | if out is None: 107 | if self.tensor: 108 | self._activation = quadratic_form(self.weight_tensor, x[0]) + np.dot(x[0], self.weight_matrix.T) + self.bias_units 109 | else: 110 | self._activation = np.dot(x[0], self.weight_matrix.T) + self.bias_units 111 | return self._activation 112 | else: 113 | if self.tensor: 114 | np.dot(x[0], self.weight_matrix.T, out=out) 115 | out += self.bias_units 116 | out += quadratic_form(self.weight_tensor, x[0]) 117 | return out 118 | else: 119 | np.dot(x[0], self.weight_matrix.T, out=out) 120 | out += self.bias_units 121 | return out 122 | 123 | def update_grad_input(self, input, output, grad_output): 124 | """ 125 | Here we use the input and the output of this layer to 126 | get the gradient of this layer. Usually the output of 127 | a linear layer is not needed, but because a second 128 | order method (a bilinear form, using a tensor) is 129 | possible, the output of this layer can be useful for 130 | backpropagation.
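        In the purely affine case the updates below amount to

            gradinput          += np.dot(grad_output, weight_matrix)
            weight_matrix_diff += the batch-summed outer product of grad_output and input
            bias_units_diff    += grad_output summed over the batch

        while the extra tensor terms are handled by tensor_delta_down_with_output
        and quadratic_form_gradient.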
131 | """ 132 | if self.gradinput is None: 133 | self.gradinput = np.dot(grad_output, self.weight_matrix) 134 | else: 135 | self.gradinput += np.dot(grad_output, self.weight_matrix) 136 | 137 | # updates to weight matrix are given by outer 138 | # product of signal with input: 139 | self.weight_matrix_diff += vector_outer_product(grad_output, input).sum(axis=-1) 140 | 141 | # updates to bias units are given by signal 142 | self.bias_units_diff += grad_output.T.sum(axis=-1) 143 | 144 | if self.tensor: 145 | # propagate signal backwards through the tensor: 146 | tensor_delta_down_with_output(self.weight_tensor, grad_output, input, self.gradinput) 147 | # obtain gradient for a tensor: 148 | self.weight_tensor_diff += quadratic_form_gradient(grad_output, input).sum(axis=-1) 149 | 150 | return self.gradinput -------------------------------------------------------------------------------- /cython_lstm/layers/loop_layer.py: -------------------------------------------------------------------------------- 1 | from .base_layer import BaseLayer 2 | import numpy as np 3 | 4 | class LoopLayer(BaseLayer): 5 | 6 | def __init__(self, input_size, internal_layer): 7 | BaseLayer.__init__(self) 8 | self.input_size = input_size 9 | self.internal_layer = internal_layer 10 | self.layers = [] 11 | last_layer = self.internal_layer 12 | self.gradients = [] 13 | self.params = [] 14 | while last_layer is not None: 15 | self.layers.append(last_layer) 16 | self.gradients.extend(last_layer.gradients) 17 | self.params.extend(last_layer.params) 18 | last_layer = last_layer._forward_layer 19 | 20 | def clear(self): 21 | self.gradinput = None 22 | for layer in self.layers: 23 | layer.clear() 24 | 25 | def activation_holder(self, timesteps, streams): 26 | """ 27 | Construct an empty array of arrays for holding the result 28 | of the internal computations at every timesteps. 29 | 30 | For now this computation is done without recurrence built in, 31 | e.g. layers do not get to see the past. 32 | 33 | Note: We'll upgrade this in a minute. 34 | """ 35 | holder = [] 36 | output_size = self.input_size 37 | for layer in self.layers: 38 | if hasattr(layer, 'output_size'): 39 | output_size = layer.output_size 40 | holder.append(np.zeros([timesteps, streams, output_size], dtype=layer.dtype)) 41 | return holder 42 | 43 | def activate(self, x, out = None): 44 | """ 45 | Activate through time by taking 46 | slices along the input's first dimensions. 47 | 48 | Implementation Note: for now let's not worry 49 | too much about memory and speed and consider 50 | that the input x is a Python list with different 51 | useful pieces for the computation passed in in 52 | an order reasonable for the internal layer (and 53 | this responsability is on the user, but not on 54 | the loop layer). 55 | 56 | """ 57 | 58 | timesteps = x[0].shape[0] 59 | streams = x[0].shape[1] 60 | 61 | if out is None: 62 | self._activation = self.activation_holder(timesteps, streams) 63 | out = self._activation 64 | 65 | for t in range(timesteps): 66 | # this way of doing things is a bit clumsy. 67 | input = x[0][t] 68 | for layer, holder in zip(self.layers, out): 69 | input = layer.activate([input], out = holder[t]) 70 | return out 71 | 72 | def reset_internal_gradient_cache(self): 73 | """ 74 | Reset gradient input to the layers inside 75 | the loop, thereby simulating the unfolding 76 | through time of the layers (each layer 77 | is thereby unique to each time point 78 | with regard to its gradient cache). 
79 | """ 80 | for layer in self.layers: 81 | # in theory memory allocation 82 | # could be saved here, 83 | # be np dot erases the output 84 | # destination 85 | layer.gradinput = None 86 | 87 | 88 | def update_grad_input(self, input, output, grad_output): 89 | """ 90 | Here we take the gradient with respect to all time points. The error 91 | signal is the grad_output. The input is the original input provided 92 | to the loop. 93 | """ 94 | # go backwards in time 95 | timesteps = input.shape[0] 96 | if self.gradinput is None: 97 | self.gradinput = np.zeros_like(input) 98 | for t in reversed(range(timesteps)): 99 | grad_down = None 100 | for layer, layer_in, grad, out in zip(reversed(self.layers), reversed([input] + output[:-1]), reversed(grad_output), reversed(output)): 101 | if grad_down is None: 102 | grad_down = layer.update_grad_input(layer_in[t], out[t], grad[t]) 103 | else: 104 | grad_down = layer.update_grad_input(layer_in[t], out[t], grad[t] + grad_down) 105 | # next we clear the grad 106 | # input at this level, 107 | # since for each timestep 108 | # the layers are unique. 109 | self.reset_internal_gradient_cache() 110 | self.gradinput[t] = grad_down 111 | 112 | return self.gradinput -------------------------------------------------------------------------------- /cython_lstm/layers/recurrent_averaging_layer.py: -------------------------------------------------------------------------------- 1 | from .recurrent_layer import RecurrentLayer, REAL 2 | import numpy as np 3 | 4 | class RecurrentAveragingLayer(RecurrentLayer): 5 | 6 | def __init__(self, a_layer, bc_layer, dtype = REAL): 7 | """ 8 | Strange layer used for multi stage recurrence. 9 | """ 10 | self.step = 0 11 | self._temporal_forward_layers = [] 12 | 13 | self.dimensionless = False 14 | self._backward_layer = None 15 | self._forward_layers = [] 16 | self.input_size = bc_layer.output_size 17 | self.output_size = bc_layer.output_size 18 | 19 | self._a_layer = a_layer 20 | self._bc_layer = bc_layer 21 | self._activation = None 22 | self.tensor = False 23 | self.dtype = dtype 24 | self.create_weights() 25 | 26 | def create_weights(self): 27 | self.params = [] 28 | self.gradients = [] 29 | 30 | self._initial_hidden_state = self._zero_initial_state() 31 | self._initial_hidden_state_diff = np.zeros_like(self._initial_hidden_state) 32 | 33 | self.params.append(self._initial_hidden_state) 34 | self.gradients.append(self._initial_hidden_state_diff) 35 | 36 | def backpropagate_one_step(self, signal): 37 | """ 38 | Get local error responsability using 39 | the derivative of error with respect 40 | to output times the derivative of the 41 | local parameters dy / dz 42 | 43 | Derivative for a * b + (1-a) * c 44 | for a is : b - c 45 | for b is : a 46 | for c is : 1-a 47 | 48 | """ 49 | t = self.step 50 | self.dEda = signal * (self._bc_layer._activation[t] - self._bc_layer._activation[t - 1]) 51 | self.dEdb = signal * self._a_layer._activation[t] 52 | self.dEdc = signal * (1. 
- self._a_layer._activation[t]) 53 | self.step -=1 54 | 55 | def backpropagate(self, signal): 56 | raise NotImplementedError("Cannot backpropagate multiple timesteps using this layer.") 57 | 58 | def forward_propagate(self, input): 59 | """ 60 | Average b and c using a: 61 | out = a * b + (1 - a) * c 62 | """ 63 | t = self.step 64 | hidden = self._activation[t-1] if t > 0 else np.tile(self._initial_hidden_state, (input.shape[0], 1)) 65 | self._activation[t] = ( 66 | self._a_layer._activation[t] * input + 67 | (1-self._a_layer._activation[t]) * hidden) 68 | return self._activation[t] 69 | 70 | def error_activate(self, target): 71 | raise NotImplementedError("Cannot error activate multiple timesteps using this layer.") 72 | 73 | def __repr__(self): 74 | return "<" + self.__class__.__name__ + " " + str({"activation": "a * b + (1 - a) * c", "input_size": "%d + %d" % (self._a_layer.output_size, self._bc_layer.output_size), "output_size": self._bc_layer.output_size})+">" 75 | -------------------------------------------------------------------------------- /cython_lstm/layers/recurrent_layer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Recurrent Neural Network Layer 3 | ------------------------------ 4 | 5 | Missing: LSTM, Recursive Gated, Language Models, Hierachical Softmax 6 | 7 | """ 8 | from .temporal_layer import TemporalLayer, quadratic_form 9 | from .layer import Layer 10 | import numpy as np 11 | REAL = np.float32 12 | 13 | class RecurrentLayer(TemporalLayer): 14 | """ 15 | Recurrent Neural net layer with a linear activation, 16 | with backpropagation through time implemented 17 | for an error in the future. 18 | 19 | """ 20 | 21 | def activate_timestep(self, input): 22 | if self.step < input.shape[0]: 23 | self.forward_propagate(input[self.step]) 24 | self.step += 1 25 | for layer in self._temporal_forward_layers: 26 | #print("(%d) %s => %s" % (self.step, self.__class__.__name__, layer.__class__.__name__)) 27 | layer.activate_timestep(self._activation) 28 | 29 | def layer_input(self): 30 | """ 31 | Input is sum of activations of backward 32 | layers. 33 | 34 | Activation dimensions are: 35 | 1. time step 36 | 2. which stream (for batch training) 37 | 3. dimensions of observation 38 | """ 39 | # what was given as an input: 40 | observation = self._backward_layer._activation[self.step] 41 | 42 | if self.step == 0: 43 | # repeat initial hidden state as many times as the data is observed 44 | hidden = np.tile(self._initial_hidden_state, (observation.shape[0], 1)) 45 | else: 46 | # previous hidden state is concatenated with the current observation: 47 | hidden = self._activation[self.step-1] 48 | return np.concatenate([ 49 | hidden, # repeated hidden state 50 | observation # timestep data observation 51 | ], axis=1) 52 | 53 | def create_weights(self): 54 | """ 55 | Randomly initialize the weights for this recurrent layer 56 | with gaussian noise with std 1 / input size. 
57 | Weights have size corresponding to output: 58 | visible + hidden => hidden 59 | 60 | """ 61 | Layer.create_weights(self) 62 | 63 | self._initial_hidden_state = self._zero_initial_state() 64 | self._initial_hidden_state_diff = np.zeros_like(self._initial_hidden_state) 65 | 66 | self.params.append(self._initial_hidden_state) 67 | self.gradients.append(self._initial_hidden_state_diff) 68 | 69 | def reset_weights(self): 70 | """ 71 | Reset to random weights this 72 | layer 73 | """ 74 | Layer.reset_weights() 75 | self._initial_hidden_state.fill(0) 76 | 77 | def _random_weight_matrix(self): 78 | return (1. / (self.input_size + self.output_size) * 79 | np.random.standard_normal([ 80 | self.output_size, 81 | self.input_size + self.output_size]) 82 | ).astype(self.dtype) 83 | 84 | def _random_weight_tensor(self): 85 | return (1. / (self.input_size + self.output_size) * 86 | np.random.standard_normal([ 87 | self.output_size, 88 | self.input_size + self.output_size, 89 | self.input_size + self.output_size]) 90 | ).astype(self.dtype) 91 | 92 | def forward_propagate(self, input): 93 | """ 94 | use the weights and the activation function 95 | to react to the input 96 | 97 | TODO: use the `out' parameter of numpy dot to 98 | gain speed on memory allocation. 99 | 100 | Inputs 101 | ------ 102 | 103 | inputs ndarray : the input data 104 | 105 | Outputs 106 | ------- 107 | 108 | activation ndarray : the activation for this input 109 | 110 | """ 111 | t = self.step 112 | hidden = self._activation[t-1] if t > 0 else np.tile(self._initial_hidden_state, (input.shape[0], 1)) 113 | x = np.concatenate([hidden, input], axis=1) 114 | if self.tensor: 115 | self._activation[t] = self.activation_function( 116 | quadratic_form(self._weight_tensor, x) + 117 | np.dot(self._weight_matrix, x.T).T + 118 | self._bias_units ) 119 | else: 120 | self._activation[t] = self.activation_function( 121 | np.dot(self._weight_matrix, x.T).T + 122 | self._bias_units ) 123 | return self._activation[t] 124 | 125 | def backpropagate(self, signal): 126 | """ 127 | Get local error responsability using 128 | the derivative of error with respect 129 | to output times the derivative of the 130 | local parameters dy / dz 131 | """ 132 | if self.step == -1: 133 | # signal backwards is transmitted to initial hidden state: 134 | # only use top part of dE/dy to get hidden state error: 135 | self._initial_hidden_state_diff += signal[:, 0:self.output_size].T.sum(axis=-1) 136 | 137 | # reset step: 138 | self.step = 0 139 | 140 | self._backward_layer.backpropagate(self.dEdz) 141 | else: 142 | # signal backwards is given by taking weight matrix 143 | # with signal with derivative 144 | # take beginning part since remainder is attributable 145 | # to observation 146 | self._dEdy = signal[:, 0:self.output_size] * self.dydz(self._activation[self.step]) 147 | 148 | # given we know the error signal at this stage, 149 | # constitute the local error responsability dEdz 150 | # and mark the updates to the weights: 151 | self.backpropagate_dEdy() 152 | 153 | self.step -=1 154 | 155 | return self.backpropagate(self.dEdz) -------------------------------------------------------------------------------- /cython_lstm/layers/recurrent_multistage_layer.py: -------------------------------------------------------------------------------- 1 | from .recurrent_layer import RecurrentLayer 2 | 3 | import numpy as np 4 | REAL = np.float32 5 | 6 | class RecurrentMultiStageLayer(RecurrentLayer): 7 | """ 8 | Wrapper around multiple temporal layers. 
9 |     Handles the flow for forward and backward
10 |     propagation of signal.
11 | 
12 |     """
13 | 
14 |     def __init__(self, layers, dtype=REAL):
15 | 
16 |         self.dtype = dtype
17 |         self._dEdy = None
18 |         self.dEdz = None
19 |         self.tensor = False
20 |         self._forward_layers = []
21 |         self._backward_layer = None
22 | 
23 |         self._internal_layers = layers
24 |         self._output_layer = self._internal_layers[-1]
25 | 
26 |         # collect these using the layers:
27 |         self.params = []
28 |         self.gradients = []
29 | 
30 |     def create_weights(self):
31 |         pass
32 | 
33 |     def reset_weights(self):
34 |         for layer in self._internal_layers:
35 |             layer.reset_weights()
36 | 
37 |     def clear(self):
38 |         """
39 | 
40 |         Clears the activation and the local
41 |         error responsibility for this layer,
42 |         and clears the internal layers too.
43 | 
44 |         """
45 |         self.step = 0
46 |         self._dEdy = None
47 |         self.dEdz = None
48 |         for layer in self._internal_layers:
49 |             layer.clear()
50 | 
51 |     def activation(self):
52 |         return self._output_layer._activation[self.step]
53 | 
54 |     def activate_forward_layers(self):
55 |         """
56 |         Pass the last timestep activation forward
57 |         """
58 |         for layer in self._forward_layers:
59 |             layer.activate(self.activation())
60 | 
61 |     def layer_input(self):
62 |         return self._backward_layer._activation[self.step]
63 | 
64 |     def error_activate(self, target):
65 |         raise NotImplementedError("Not implemented")
66 | 
67 |     def backpropagate_one_step(self, signal):
68 |         pass
69 | 
70 |     def backpropagate(self, signal):
71 |         # TODO: generalize so that all layers use backprop one step
72 |         # within backpropagate
73 |         if self.step == -1:
74 |             self.step = 0
75 |             self._backward_layer.backpropagate(self.dEdz)
76 |         else:
77 |             for layer in reversed(self._internal_layers):
78 |                 layer.backpropagate_one_step(signal)
79 |             self.step -= 1
80 |             self.backpropagate(self.dEdz)
81 | 
82 |     def allocate_activation(self, timesteps, streams):
83 |         for layer in self._internal_layers:
84 |             layer.allocate_activation(timesteps, streams)
85 | 
86 |     def forward_propagate(self, input):
87 |         self._internal_layers[0].activate(input)
88 | 
--------------------------------------------------------------------------------
/cython_lstm/layers/slice_layer.py:
--------------------------------------------------------------------------------
1 | from .base_layer import BaseLayer
2 | import numpy as np
3 | 
4 | class SliceLayer(BaseLayer):
5 |     """
6 |     Takes a subset of its input, and passes
7 |     it forward.
8 |     """
9 | 
10 |     def __init__(self, index):
11 |         BaseLayer.__init__(self)
12 |         if not hasattr(index, '__len__'):
13 |             index = (index,)
14 |         self.index = index
15 |         self.gradients = []
16 | 
17 |     def select_slice(self, input):
18 |         if type(input) is list:
19 |             if len(self.index) > 1:
20 |                 return input[self.index[0]][self.index[1:]]
21 |             else:
22 |                 return input[self.index[0]]
23 |         else:
24 |             return input[self.index]
25 | 
26 |     def activate(self, x, out=None):
27 |         """
28 |         Activation for a slice layer is dead simple:
29 | 
30 |         > y = x[index]
31 | 
32 |         """
33 |         input = self.select_slice(x[0])
34 | 
35 |         if out is None:
36 |             # no copy is performed here
37 |             # thankfully
38 |             self._activation = input
39 |             return self._activation
40 |         else:
41 |             out[:] = input
42 |             return out
43 | 
44 |     def update_grad_input(self, input, output, grad_output):
45 |         """
46 |         Gradient for a slice is simply zeros for all non
47 |         sliced dimensions, and pass the grad_output inside
48 |         the sliced piece.
49 | 50 | > grad = np.zeros_like(input) 51 | > grad[index] = ParentGradient 52 | 53 | """ 54 | if self.gradinput is None: 55 | if type(input) is list: 56 | self.gradinput = [np.zeros_like(piece) for piece in input] 57 | else: 58 | self.gradinput = np.zeros_like(input) 59 | self.select_slice(self.gradinput)[:] = grad_output 60 | return self.gradinput -------------------------------------------------------------------------------- /cython_lstm/layers/temporal_layer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Network Temporal Layer 3 | ----------------------------- 4 | 5 | """ 6 | """ 7 | Recurrent Neural Network Layer 8 | ------------------------------ 9 | 10 | Missing: LSTM, Recursive Gated, Language Models, Hierachical Softmax 11 | 12 | """ 13 | from .layer import Layer, quadratic_form 14 | import numpy as np 15 | REAL = np.float32 16 | 17 | class TemporalLayer(Layer): 18 | """ 19 | Recurrent Neural net layer with a linear activation, 20 | with backpropagation through time implemented 21 | for an error in the future, and no hidden activation. 22 | """ 23 | 24 | def connect_to(self, layer, temporal = False, **kwargs): 25 | if temporal: 26 | self.connect_through_time(layer) 27 | else: 28 | Layer.connect_to(self, layer, **kwargs) 29 | 30 | def connect_through_time(self, layer): 31 | self._temporal_forward_layers.append(layer) 32 | layer.add_backward_layer(self) 33 | 34 | def activation(self): 35 | return self._activation[self.step] 36 | 37 | def allocate_activation(self, timesteps, streams): 38 | #print(self.__class__.__name__ + " is allocating memory for its activations.") 39 | self._activation = np.zeros([timesteps, streams, self.output_size] , dtype=self.dtype) 40 | 41 | def activate_timestep(self, input): 42 | if self.step < input.shape[0]: 43 | self.forward_propagate(input[self.step]) 44 | self.step += 1 45 | for layer in self._temporal_forward_layers: 46 | #print("(%d) %s => %s" % (self.step, self.__class__.__name__, layer.__class__.__name__)) 47 | layer.activate_timestep(self._activation) 48 | 49 | def recursive_activate_timestep(self, input): 50 | self.activate_timestep(input) 51 | if self.step < input.shape[0]: 52 | self.recursive_activate_timestep(input) 53 | 54 | def activate(self, input): 55 | """ 56 | Activate a recurrent neural layer 57 | by advancing a step for each of the 58 | dimensions in the first axis of the 59 | data. 60 | """ 61 | self.step = 0 62 | self.recursive_activate_timestep(input) 63 | self.step -= 1 64 | 65 | # transfer activation as input to next layers: 66 | print("Activating forward layers from %s" % (self.__class__.__name__,)) 67 | self.activate_forward_layers() 68 | 69 | def layer_input(self): 70 | """ 71 | Input is sum of activations of backward 72 | layers. 73 | 74 | """ 75 | return self._backward_layer._activation[self.step] 76 | 77 | def forward_propagate(self, x): 78 | """ 79 | use the weights and the activation function 80 | to react to the input 81 | 82 | TODO: use the `out' parameter of numpy dot to 83 | gain speed on memory allocation. 
84 | 
85 |         Inputs
86 |         ------
87 | 
88 |         inputs ndarray : the input data
89 | 
90 |         Outputs
91 |         -------
92 | 
93 |         activation ndarray : the activation for this input
94 | 
95 |         """
96 |         if self.tensor:
97 |             self._activation[self.step] = self.activation_function(
98 |                 quadratic_form(self._weight_tensor, x) +
99 |                 np.dot(self._weight_matrix, x.T).T +
100 |                 self._bias_units )
101 |         else:
102 |             self._activation[self.step] = self.activation_function(
103 |                 np.dot(self._weight_matrix, x.T).T + self._bias_units )
104 |         return self._activation[self.step]
105 | 
106 |     def backpropagate(self, signal):
107 |         """
108 |         Get local error responsibility using
109 |         the derivative of error with respect
110 |         to output times the derivative of the
111 |         local parameters dy / dz
112 |         """
113 |         if self.step == -1:
114 |             # reset step:
115 |             self.step = 0
116 |             self._backward_layer.backpropagate(self.dEdz)
117 |         else:
118 |             # signal backwards is given by taking weight matrix
119 |             # with signal with derivative
120 |             # take beginning part since remainder is attributable
121 |             # to observation
122 |             self._dEdy = signal[:,:] * self.dydz(self._activation[self.step])
123 | 
124 |             # given we know the error signal at this stage,
125 |             # constitute the local error responsibility dEdz
126 |             # and mark the updates to the weights:
127 |             self.backpropagate_dEdy()
128 |             self.step -= 1
129 | 
130 |             return self.backpropagate(self.dEdz)
--------------------------------------------------------------------------------
/cython_lstm/layers/tile_layer.py:
--------------------------------------------------------------------------------
1 | from .base_layer import BaseLayer
2 | 
3 | class TileLayer(BaseLayer):
4 |     """
5 |     Repeats an input sequence to all its outgoing
6 |     layers. Useful for repeating a data layer
7 |     to multiple nodes that need to listen to it.
8 |     """
9 | 
10 |     def forward_propagate(self, x):
11 |         return [layer.forward_propagate(x) for layer in self._temporal_forward_layers]
12 | 
13 |     def clear(self):
14 |         self.step = 0
15 |         self._activation = None
16 | 
17 |     def allocate_activation(self, *args):
18 |         pass
19 | 
20 |     @property
21 |     def input_size(self):
22 |         return self._temporal_forward_layers[0].input_size if len(self._temporal_forward_layers) > 0 else None
23 | 
24 |     @property
25 |     def output_size(self):
26 |         return self._temporal_forward_layers[0].input_size if len(self._temporal_forward_layers) > 0 else None
27 | 
28 |     def activate(self, input):
29 |         """
30 |         For each layer within this repeat
31 |         layer, the input is sent.
32 | 33 | """ 34 | self._activation = input 35 | 36 | self.step = 0 37 | self.recursive_activate_timestep(input) 38 | 39 | # transfer activation as input to next layers: 40 | self.activate_forward_layers() 41 | 42 | def activate_timestep(self, input): 43 | if self.step < input.shape[0]: 44 | self.forward_propagate(input[self.step]) 45 | self.step += 1 46 | for layer in self._temporal_forward_layers: 47 | #print("(%d) %s => %s" % (self.step, self.__class__.__name__, layer.__class__.__name__)) 48 | layer.activate_timestep(self._activation) 49 | 50 | def recursive_activate_timestep(self, input): 51 | self.activate_timestep(input) 52 | if self.step < input.shape[0]: 53 | self.recursive_activate_timestep(input) -------------------------------------------------------------------------------- /cython_lstm/network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .error import MSE 3 | from .layers.connectible_layer import ConnectibleLayer 4 | from .topology import topological_sort, Node 5 | 6 | class Network(): 7 | """ 8 | Create a simple neural network 9 | """ 10 | def __init__(self, metric = MSE, dtype=np.float32): 11 | self.dtype = dtype 12 | self.layers = [] 13 | assert(hasattr(metric, 'error') and hasattr(metric, 'dEdy')), "Metric must implement error and error derivative." 14 | self.error_metric = metric 15 | self._output_layer = None 16 | self._input_layer = None 17 | self._data_layer = DataLayer() 18 | 19 | def reset_weights(self): 20 | for layer in self.layers: 21 | layer.reset_weights() 22 | 23 | def add_layer(self, layer, input=False, output = False): 24 | self.layers.append(layer) 25 | if input: 26 | self._input_layer = layer 27 | # connect the data layer to the first network 28 | # layer: 29 | if self._data_layer._forward_layer is not None: 30 | self._data_layer._forward_layer.remove_backward_layer(self._data_layer) 31 | 32 | self._data_layer.connect_to(layer) 33 | # self._data_layer.children.append(layer) 34 | # layer.add_backward_layer(self._data_layer) 35 | if output: 36 | self._output_layer = layer 37 | 38 | def remove_layer(self, layer): 39 | self.layers.remove(layer) 40 | 41 | def allocate_activation(self, timesteps, streams): 42 | for layer in self.layers: 43 | if hasattr(layer, 'allocate_activation'): 44 | layer.allocate_activation(timesteps, streams) 45 | 46 | def activate(self, input): 47 | """ 48 | Activate takes the input to the network 49 | and dispatches it forward starting from 50 | the data layer (the lowest input). 51 | 52 | Note: in the future this activation procedure 53 | will instead rely on the topological sort of the 54 | nodes in graph, and multiple inputs will be usable. 55 | """ 56 | input = np.asarray(input, dtype=self.dtype) 57 | if input.ndim == 1: 58 | input = input.reshape(1,-1) 59 | # deprecated method: 60 | self.allocate_activation(input.shape[0], input.shape[1]) 61 | # activate first layer 62 | last_layer = self._data_layer 63 | out = input 64 | while last_layer is not None: 65 | out = last_layer.activate([out]) 66 | last_layer = last_layer._forward_layer 67 | 68 | return self._output_layer._activation 69 | 70 | def topsort(self): 71 | """ 72 | Return the input layers to the network (the 73 | data layers) and use those as roots for a 74 | topological sort of the network. 
75 | """ 76 | mapping = Node.to_layer_mapping([self._data_layer]) 77 | roots = [value 78 | for key, value in mapping.items() 79 | if isinstance(key, DataLayer)] 80 | return topological_sort(roots) 81 | 82 | def backpropagate(self, target): 83 | if not target.dtype.kind == 'i': 84 | target = np.asarray(target, dtype=self.dtype) 85 | if target.ndim == 1: 86 | target = target.reshape(1,-1) 87 | if hasattr(self._output_layer, 'error_activate'): 88 | self._output_layer.error_activate(target) 89 | else: 90 | self.manual_backpropagation(target) 91 | 92 | 93 | def manual_backpropagation(self, target): 94 | last_layer = self._output_layer 95 | # special error is used here to initialize the backpropagation 96 | # procedure 97 | error_signal = self.error_metric.dEdy(last_layer._activation, target) 98 | while last_layer is not None and last_layer is not self._data_layer: 99 | # we pass down the current error signal 100 | error_signal = last_layer.update_grad_input( 101 | last_layer._backward_layer._activation, 102 | last_layer._activation, 103 | error_signal) 104 | # step down by a layer. 105 | last_layer = last_layer._backward_layer 106 | 107 | def set_error(self, metric): 108 | """ 109 | Set the error metric that should 110 | be used. The choices are: 111 | * MSE 112 | * BinaryCrossEntropy 113 | * TanhBinayCrossEntropy 114 | * CategoricalCrossEntropy 115 | """ 116 | assert(hasattr(metric, 'error') and hasattr(metric, 'dEdy')), "Metric must implement error and error derivative." 117 | self.error_metric = metric 118 | 119 | def error(self, target): 120 | return self.error_metric.error(self._output_layer._activation, target) 121 | 122 | def clear(self): 123 | """ 124 | Resets the state of the layers in this 125 | neural network 126 | """ 127 | for layer in self.layers: 128 | layer.clear() 129 | 130 | def get_parameters(self): 131 | """ 132 | Collect the parameters of the net into a list. 133 | """ 134 | 135 | parameters = [] 136 | for layer in self.layers: 137 | parameters.extend(layer.params) 138 | return parameters 139 | 140 | def get_gradients(self): 141 | """ 142 | Collect the gradients of the net into a list. 143 | """ 144 | gradients = [] 145 | for layer in self.layers: 146 | gradients.extend(layer.gradients) 147 | return gradients 148 | 149 | def __repr__(self): 150 | if len(self.layers) == 1: 151 | return str({"layer":self.layers[0]}) 152 | else: 153 | return str({ 154 | "layers": self.layers, 155 | "output_layer": self._output_layer.activation_function.__doc__ if hasattr(self._output_layer, 'activation_function') else '', 156 | "input_layer": self._input_layer.activation_function.__doc__ if hasattr(self._input_layer, 'activation_function') else '' 157 | }) 158 | 159 | class DataLayer(ConnectibleLayer): 160 | def __init__(self): 161 | self.parents = [] 162 | self.children = [] 163 | self._activation = None 164 | self._forward_layer = None 165 | 166 | def activate(self, input): 167 | self._activation = input[0] 168 | return self._activation 169 | 170 | def activation(self): 171 | return self._activation 172 | 173 | def backpropagate(self, signal): 174 | pass -------------------------------------------------------------------------------- /cython_lstm/network_viewer.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import matplotlib.pyplot as plt 3 | 4 | def create_network_graph(network): 5 | """ 6 | Create a networkx representation 7 | of the current layer arrangement 8 | for this network. 
9 | """ 10 | graph = nx.DiGraph() 11 | nodes_pos = {} 12 | nodes_label = {} 13 | input_units = [] 14 | output_units = [] 15 | outputted_units = [] 16 | hidden_units = [] 17 | 18 | layer_index = {} 19 | 20 | # add the units per layer 21 | for layer_i, layer in enumerate(network.layers): 22 | layer_index[layer] = layer_i 23 | 24 | # add the nodes for this layer 25 | for node_i in range(layer.input_size): 26 | node = "%d_%d" % (layer_i, node_i) # simple encoding of a node 27 | graph.add_node(node) 28 | nodes_pos[node] = (layer_i, float(layer.input_size) / 2.0 - float(node_i)) 29 | 30 | if layer == network._input_layer: 31 | # label for input layer 32 | input_units.append(node) 33 | nodes_label[node] = r"$X_{%d}$" % node_i 34 | elif layer == network._output_layer: 35 | # label for output layer 36 | output_units.append(node) 37 | nodes_label[node] = r"$Y_{%d}$" % node_i 38 | else: 39 | # hidden layer 40 | hidden_units.append(node) 41 | nodes_label[node] = r"$Z_{%d, %d}$" % (layer_i, node_i) 42 | 43 | if layer is network._output_layer: 44 | for i in range(layer.output_size): 45 | node = "%d_%d" % (len(network.layers), i) # simple encoding of a node 46 | graph.add_node(node) 47 | outputted_units.append(node) 48 | nodes_label[node] = r"$O_{%d}$" % i 49 | nodes_pos[node] = (len(network.layers), layer.output_size / 2.0 - i) 50 | 51 | for layer_i, layer in enumerate(network.layers): 52 | for forward_layer in layer._forward_layers: 53 | graph.add_edges_from([ 54 | ("%d_%d" % (layer_i, k), "%d_%d" % (layer_index[forward_layer], l)) 55 | for k in range(layer.input_size) for l in range(forward_layer.input_size) 56 | ]) 57 | if layer is network._output_layer: 58 | # map to fictional output nodes for net 59 | graph.add_edges_from([ 60 | ("%d_%d" % (layer_i, k), "%d_%d" % (len(network.layers), l)) 61 | for k in range(layer.input_size) for l in range(layer.output_size) 62 | ]) 63 | 64 | return (graph, nodes_pos, nodes_label, input_units, output_units, outputted_units, hidden_units) 65 | 66 | def draw(network, ax = None): 67 | """ Draw the neural network 68 | """ 69 | if ax is None: 70 | ax = plt.figure(figsize=(10, 6)).add_subplot(1, 1, 1) 71 | 72 | graph, nodes_pos, nodes_label, input_units, output_units, outputted_units, hidden_units = create_network_graph(network) 73 | 74 | nx.draw_networkx_edges(graph, pos=nodes_pos, alpha=0.7, ax=ax) 75 | nx.draw_networkx_nodes(graph, nodelist=input_units, 76 | pos=nodes_pos, ax=ax, 77 | node_color='#66FFFF', node_size=700) 78 | nx.draw_networkx_nodes(graph, nodelist=hidden_units, 79 | pos=nodes_pos, ax=ax, 80 | node_color='#CCCCCC', node_size=900) 81 | nx.draw_networkx_nodes(graph, nodelist=output_units, 82 | pos=nodes_pos, ax=ax, 83 | node_color='#FFFF99', node_size=700) 84 | nx.draw_networkx_labels(graph, labels=nodes_label, 85 | pos=nodes_pos, font_size=14, ax=ax) 86 | 87 | nx.draw_networkx_nodes(graph, nodelist=outputted_units, 88 | pos=nodes_pos, ax=ax, 89 | node_color='#f2276e', node_size=470) 90 | ax.axis('off'); 91 | 92 | layer_sizes = [layer.input_size for layer in network.layers] + [network._output_layer.output_size] 93 | activation_func = [layer.activation_function.__doc__ for layer in network.layers] + ["Output"] 94 | max_heights_for_layer = [max([nodes_pos["%d_%d" % (k, node)][1] for node in range(size)]) for k, size in enumerate(layer_sizes)] 95 | 96 | for i, layer_height in enumerate(max_heights_for_layer): 97 | ax.text(i, layer_height + 0.8, activation_func[i], horizontalalignment ='center'); 98 | ax.axis('off') 
-------------------------------------------------------------------------------- /cython_lstm/neuron.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | try: 3 | import scipy.special 4 | sigmoid = scipy.special.expit 5 | except ImportError: 6 | def sigmoid(x, out=None): 7 | """ 8 | Sigmoid implemented using proxy 9 | ufuncs from numpy. 10 | """ 11 | return np.divide(1., 1. + np.exp(-x), out, dtype=x.dtype) 12 | 13 | def softmax(x, out=None): 14 | layer_max = x.max(axis=-1) 15 | exped_distributions = np.exp(x.T - layer_max) 16 | total_distribution = exped_distributions.sum(axis=0) 17 | if out is None: 18 | return (exped_distributions / total_distribution).T 19 | else: 20 | out[:] = (exped_distributions / total_distribution).T 21 | return out 22 | 23 | def softmax_unsafe(x): 24 | exped_distributions = np.exp(x.T) 25 | total_distribution = exped_distributions.sum(axis=0) 26 | return (exped_distributions / total_distribution).T 27 | 28 | class Neuron(): 29 | @staticmethod 30 | def activation_function(x, out=None): 31 | """Identity""" 32 | if out is None: 33 | return x 34 | else: 35 | out[:] = x 36 | return out 37 | 38 | @staticmethod 39 | def dydz(x): 40 | return np.float64(1.0).astype(x.dtype) 41 | 42 | class RectifierNeuron(Neuron): 43 | @staticmethod 44 | def activation_function(x, out=None): 45 | """Rectifier""" 46 | return np.fmax(0,x, out) 47 | 48 | @staticmethod 49 | def dydz(x): 50 | return np.sign(x) 51 | 52 | class LogisticNeuron(Neuron): 53 | @staticmethod 54 | def activation_function(x, out=None): 55 | """Sigmoid""" 56 | return sigmoid(x,out) 57 | 58 | @staticmethod 59 | def dydz(x): 60 | """ 61 | Sigmoid derivative 62 | d/dx 1/ ( 1 + e^-x) = d/dx sig(x) = sig(x) - sig(x)^2 63 | """ 64 | return x - x**2 65 | 66 | class TanhNeuron(Neuron): 67 | @staticmethod 68 | def activation_function(x, out=None): 69 | """Tanh""" 70 | return np.tanh(x, out) 71 | 72 | @staticmethod 73 | def dydz(x): 74 | """ 75 | hyperbolic tangent Derivative 76 | """ 77 | return 1.0 - x**2 78 | 79 | class SoftmaxNeuron(Neuron): 80 | @staticmethod 81 | def activation_function(x, out=None): 82 | """Softmax""" 83 | return softmax(x, out) -------------------------------------------------------------------------------- /cython_lstm/topology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Topology submodule handling topological 3 | sorts for handling of forward and backward 4 | propagation over arbitrary graphs (similar to 5 | nn-graph for Torch). 6 | """ 7 | class Node(object): 8 | """ 9 | Takes a layer (inheriting from BaseLayer) 10 | and a mapping function taking layers to 11 | Nodes, and recursively maps all layers 12 | into a graph. 13 | """ 14 | def __init__(self, layer, mapping): 15 | mapping[layer] = self 16 | self.layer = layer 17 | self.parents = [] 18 | for parent in layer.parents: 19 | if parent in mapping: 20 | self.parents.append(mapping[parent]) 21 | else: 22 | Node(parent, mapping) 23 | self.parents.append(mapping[parent]) 24 | self.children = [] 25 | for child in layer.children: 26 | if child in mapping: 27 | self.children.append(mapping[child]) 28 | else: 29 | Node(child, mapping) 30 | self.children.append(mapping[child]) 31 | 32 | @staticmethod 33 | def to_layer_mapping(layers): 34 | """ 35 | Convert several layers to Nodes, 36 | recursively cross the graph starting at those 37 | layers and convert the remainder to nodes. 38 | Return the mapping dictionary. 
39 | 40 | Inputs 41 | ------ 42 | 43 | nodes list: the roots of the graph 44 | 45 | Outputs 46 | ------- 47 | 48 | mapped_layers dict: a mapping from layer to Node. 49 | """ 50 | mapped_layers = {} 51 | for layer in layers: 52 | if not layer in mapped_layers: 53 | mapped_layers[layer] = Node(layer, mapped_layers) 54 | return mapped_layers 55 | 56 | @staticmethod 57 | def to_nodes(layers): 58 | """ 59 | Convert several layers to Nodes, 60 | recursively cross the graph starting at those 61 | layers and convert the remainder to nodes. 62 | Return the input layers as nodes. 63 | 64 | Inputs 65 | ------ 66 | 67 | nodes list: the roots of the graph 68 | 69 | Outputs 70 | ------- 71 | 72 | mapped_nodes list: Converted layers 73 | into Nodes of the 74 | calculation graph. 75 | """ 76 | mapped_layers = Node.to_layer_mapping(layers) 77 | return [mapped_layers[layer] for layer in layers] 78 | 79 | def topological_sort(nodes): 80 | """ 81 | Find an ordering of nodes that respects 82 | all dependencies in the graph. 83 | 84 | Inputs 85 | ------ 86 | 87 | nodes list: the roots of the graph 88 | 89 | Outputs 90 | ------- 91 | 92 | L list: an ordering of the nodes in the graph 93 | such that all dependencies are respected 94 | by proceeding in the provided order. 95 | 96 | Note: this function does not check whether underlying 97 | graph is a directed acyclic graph. Since cycles are 98 | not detected, certain nodes may never be called, and 99 | hence the behavior in those cases will differ from 100 | user-intended. 101 | """ 102 | L = [] 103 | S = nodes 104 | while len(S) > 0: 105 | node = S.pop() 106 | L.append(node) 107 | children = [m for m in node.children] 108 | for child in children: 109 | # remove edge from graph 110 | del node.children[node.children.index(child)] 111 | del child.parents[child.parents.index(node)] 112 | 113 | # if this was the last connection 114 | # then we can add this node to 115 | # the nodes with no incoming connections: 116 | # S 117 | if len(child.parents) == 0: 118 | S.append(child) 119 | return L -------------------------------------------------------------------------------- /cython_lstm/trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Trainer(): 4 | """ 5 | Get the updates for a gradient descent optimizer using 6 | SGD, AdaDelta, or AdaGrad. 7 | 8 | Call train with a pair of input output to perform 9 | gradient descent on a model. 10 | 11 | Parameters are updated in place. 12 | 13 | Inputs 14 | ------ 15 | 16 | model Network : The model to optimizer. 17 | Must have the methods: 18 | * `get_parameters()`, 19 | * `get_gradients()` 20 | * `clear()`, 21 | * `activate(input)`, 22 | * `backpropagate(output)`, 23 | * `error(output)`. 24 | lr float : base learning rate for 25 | adagrad and SGD 26 | eps float : numerical stability value 27 | to not divide by zero 28 | sometimes 29 | rho float : adadelta hyperparameter. 30 | method str : 'adagrad', 'adadelta', or 'sgd'. 
31 | 32 | """ 33 | def __init__(self, model, 34 | eps=1e-6, 35 | rho=0.95, 36 | lr = 0.01, 37 | max_norm=5.0, 38 | method = "adadelta"): 39 | # should freeze the structure of the network or have 40 | # robust method of linking to the elements inside 41 | self.model = model 42 | self._method = method 43 | self.parameters = model.get_parameters() 44 | self.gradients = model.get_gradients() 45 | self.lr = lr 46 | self.rho = rho 47 | self.eps = eps 48 | 49 | if method == "adadelta": 50 | self.gsums = [np.zeros_like(param) for param in self.parameters] 51 | self.xsums = [np.zeros_like(param) for param in self.parameters] 52 | self._grad_update = self.adadelta_update 53 | elif method == "adagrad": 54 | self.gsums = [np.zeros_like(param) for param in self.parameters] 55 | self._grad_update = self.adagrad_update 56 | else: 57 | self._grad_update = self.sgd_update 58 | 59 | @property 60 | def method(self): 61 | return self._method 62 | 63 | def adadelta_update(self): 64 | for gparam, param, gsum, xsum in zip(self.gradients, self.parameters, self.gsums, self.xsums): 65 | gsum[:] = (self.rho * gsum + (1. - self.rho) * (gparam **2)).astype(param.dtype, False) 66 | dparam = -np.sqrt((xsum + self.eps) / (gsum + self.eps)) * gparam 67 | xsum[:] = (self.rho * xsum + (1. - self.rho) * (dparam **2)).astype(param.dtype, False) 68 | param += dparam 69 | def adagrad_update(self): 70 | for gparam, param, gsum in zip(self.gradients, self.parameters, self.gsums): 71 | gsum[:] = (gsum + (gparam ** 2)).astype(param.dtype, False) 72 | param -= self.lr * (gparam / (np.sqrt(gsum + self.eps))) 73 | 74 | def sgd_update(self): 75 | for gparam, param in zip(self.gradients, self.parameters): 76 | param -= (self.lr * gparam) 77 | 78 | 79 | def train(self, input, output): 80 | 81 | # reset model activations 82 | self.model.clear() 83 | 84 | # run data through model 85 | self.model.activate(input) 86 | 87 | # backpropagate error through net: 88 | self.model.backpropagate(output) 89 | 90 | # collect cost: 91 | cost = self.model.error(output).sum() 92 | 93 | # update weights: 94 | self._grad_update() 95 | 96 | return cost --------------------------------------------------------------------------------
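
For readers skimming the dump, the single-timestep arithmetic inside RecurrentLayer.forward_propagate (non-tensor branch) can be restated in plain NumPy. The sizes and the choice of tanh below are illustrative assumptions, not values taken from the package:

    import numpy as np

    # One recurrent step: concatenate previous hidden state with the observation,
    # multiply by a (output, input + output) weight matrix, add bias, apply tanh.
    input_size, output_size, streams = 4, 3, 2

    W = (1. / (input_size + output_size) *
         np.random.standard_normal((output_size, input_size + output_size))).astype(np.float32)
    b = np.zeros(output_size, dtype=np.float32)        # bias units
    h0 = np.zeros(output_size, dtype=np.float32)       # initial hidden state

    x_t = np.random.standard_normal((streams, input_size)).astype(np.float32)
    hidden = np.tile(h0, (x_t.shape[0], 1))            # step == 0: repeat initial state per stream
    z = np.concatenate([hidden, x_t], axis=1)          # hidden state ++ observation
    h_t = np.tanh(np.dot(W, z.T).T + b)                # activation for this timestep
    print(h_t.shape)                                   # -> (2, 3), i.e. (streams, output_size)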
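
The graph utilities in cython_lstm/topology.py only read `parents` and `children` lists from whatever objects they are given, so they can be exercised with small stand-in objects. A minimal sketch (the `Stub` class is hypothetical and is not a library layer):

    from cython_lstm.topology import Node, topological_sort

    class Stub(object):
        # Anything exposing .parents and .children lists can be wrapped by Node.
        def __init__(self, name):
            self.name = name
            self.parents = []
            self.children = []
        def __repr__(self):
            return self.name

    data, hidden, output = Stub("data"), Stub("hidden"), Stub("output")
    data.children, hidden.parents = [hidden], [data]
    hidden.children, output.parents = [output], [hidden]

    order = topological_sort(Node.to_nodes([data]))
    print([node.layer for node in order])              # expected: [data, hidden, output]

Network.topsort builds exactly this kind of ordering, using its DataLayer instances as the roots.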
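
Finally, the update rules implemented by Trainer can be replayed in isolation. The sketch below mirrors adadelta_update on a toy quadratic loss; the loss, parameter shape, and iteration count are invented for illustration, while the hyperparameters match the Trainer defaults shown above:

    import numpy as np

    # Toy objective: minimize 0.5 * ||w||^2, whose gradient is simply w.
    rho, eps, lr = 0.95, 1e-6, 0.01

    w = np.random.standard_normal(5).astype(np.float32)
    gsum = np.zeros_like(w)   # running average of squared gradients (AdaGrad / AdaDelta)
    xsum = np.zeros_like(w)   # running average of squared updates (AdaDelta only)

    for step in range(200):
        grad = w                                       # dE/dw for the toy loss
        # AdaDelta, as in Trainer.adadelta_update:
        gsum[:] = rho * gsum + (1. - rho) * grad ** 2
        dw = -np.sqrt((xsum + eps) / (gsum + eps)) * grad
        xsum[:] = rho * xsum + (1. - rho) * dw ** 2
        w += dw
        # The AdaGrad variant would instead accumulate gsum += grad ** 2 and apply
        # w -= lr * grad / np.sqrt(gsum + eps); plain SGD is w -= lr * grad.

    print(float(np.abs(w).max()))                      # shrinks towards zero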