├── .gitignore ├── 01_scalar_soln.py ├── 02_vector_mat_soln.py ├── 03_tensor_soln.py ├── 04_function_soln.py ├── 05_shared_soln.py ├── 06_grad_soln.py ├── 07_mode.py ├── 08_scan_polynomial_soln.py ├── Exercices.ipynb ├── README.md ├── imdb.py ├── lstm.py ├── lstm_double.diff ├── lstm_double.py ├── lstm_reverse.diff └── lstm_reverse.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /01_scalar_soln.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano import function 3 | import theano.tensor as T 4 | 5 | 6 | def make_scalar(): 7 | """ 8 | Returns a new Theano scalar. 9 | """ 10 | 11 | return T.scalar() 12 | 13 | 14 | def log(x): 15 | """ 16 | Returns the logarithm of a Theano scalar x. 17 | """ 18 | 19 | return T.log(x) 20 | 21 | 22 | def add(x, y): 23 | """ 24 | Adds two theano scalars together and returns the result. 25 | """ 26 | 27 | return x + y 28 | 29 | a = make_scalar() 30 | b = make_scalar() 31 | c = log(b) 32 | d = add(a, c) 33 | f = function([a, b], d) 34 | a = np.cast[a.dtype](1.) 35 | b = np.cast[b.dtype](2.) 36 | actual = f(a, b) 37 | expected = 1. + np.log(2.) 38 | assert np.allclose(actual, expected) 39 | print "SUCCESS!" 40 | -------------------------------------------------------------------------------- /02_vector_mat_soln.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano import function 3 | import theano.tensor as T 4 | 5 | 6 | def make_vector(): 7 | """ 8 | Returns a new Theano vector. 9 | """ 10 | 11 | return T.vector() 12 | 13 | 14 | def make_matrix(): 15 | """ 16 | Returns a new Theano matrix. 
17 | """ 18 | 19 | return T.matrix() 20 | 21 | 22 | def elemwise_mul(a, b): 23 | """ 24 | a: A theano matrix 25 | b: A theano matrix 26 | Returns the elementwise product of a and b 27 | """ 28 | 29 | return a * b 30 | 31 | 32 | def matrix_vector_mul(a, b): 33 | """ 34 | a: A theano matrix 35 | b: A theano vector 36 | Returns the matrix-vector product of a and b 37 | """ 38 | 39 | return T.dot(a, b) 40 | 41 | a = make_vector() 42 | b = make_vector() 43 | c = elemwise_mul(a, b) 44 | d = make_matrix() 45 | e = matrix_vector_mul(d, c) 46 | 47 | f = function([a, b, d], e) 48 | 49 | rng = np.random.RandomState([1, 2, 3]) 50 | a_value = rng.randn(5).astype(a.dtype) 51 | b_value = rng.rand(5).astype(b.dtype) 52 | c_value = a_value * b_value 53 | d_value = rng.randn(5, 5).astype(d.dtype) 54 | expected = np.dot(d_value, c_value) 55 | 56 | actual = f(a_value, b_value, d_value) 57 | 58 | assert np.allclose(actual, expected) 59 | print "SUCCESS!" 60 | -------------------------------------------------------------------------------- /03_tensor_soln.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano import function 3 | import theano.tensor as T 4 | 5 | 6 | def make_tensor(dim): 7 | """ 8 | Returns a new Theano tensor with no broadcastable dimensions. 9 | dim: the total number of dimensions of the tensor. 10 | """ 11 | 12 | return T.TensorType(broadcastable=tuple([False] * dim), dtype='float32')() 13 | 14 | 15 | def broadcasted_add(a, b): 16 | """ 17 | a: a 3D theano tensor 18 | b: a 4D theano tensor 19 | Returns c, a 4D theano tensor, where 20 | 21 | c[i, j, k, l] = a[l, k, i] + b[i, j, k, l] 22 | 23 | for all i, j, k, l 24 | """ 25 | 26 | return a.dimshuffle(2, 'x', 1, 0) + b 27 | 28 | 29 | def partial_max(a): 30 | """ 31 | a: a 4D theano tensor 32 | 33 | Returns b, a theano matrix, where 34 | 35 | b[i, j] = max_{k,l} a[i, k, l, j] 36 | 37 | for all i, j 38 | """ 39 | 40 | return a.max(axis=(1, 2)) 41 | 42 | a = make_tensor(3) 43 | b = make_tensor(4) 44 | c = broadcasted_add(a, b) 45 | d = partial_max(c) 46 | 47 | f = function([a, b], d) 48 | 49 | rng = np.random.RandomState([1, 2, 3]) 50 | a_value = rng.randn(2, 2, 2).astype(a.dtype) 51 | b_value = rng.rand(2, 2, 2, 2).astype(b.dtype) 52 | c_value = np.transpose(a_value, (2, 1, 0))[:, None, :, :] + b_value 53 | expected = c_value.max(axis=1).max(axis=1) 54 | 55 | actual = f(a_value, b_value) 56 | 57 | assert np.allclose(actual, expected), (actual, expected) 58 | print "SUCCESS!" 59 | -------------------------------------------------------------------------------- /04_function_soln.py: -------------------------------------------------------------------------------- 1 | from theano import tensor as T 2 | from theano import function 3 | 4 | 5 | def evaluate(x, y, expr, x_value, y_value): 6 | """ 7 | x: A theano variable 8 | y: A theano variable 9 | expr: A theano expression involving x and y 10 | x_value: A numpy value 11 | y_value: A numpy value 12 | 13 | Returns the value of expr when x_value is substituted for x 14 | and y_value is substituted for y 15 | """ 16 | 17 | return function([x, y], expr)(x_value, y_value) 18 | 19 | 20 | x = T.iscalar() 21 | y = T.iscalar() 22 | z = x + y 23 | assert evaluate(x, y, z, 1, 2) == 3 24 | print "SUCCESS!" 
25 | -------------------------------------------------------------------------------- /05_shared_soln.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano.compat.python2x import OrderedDict 3 | from theano import function 4 | from theano import shared 5 | 6 | 7 | def make_shared(shape): 8 | """ 9 | Returns a theano shared variable containing a tensor of the specified 10 | shape. 11 | You can use any value you want. 12 | """ 13 | return shared(np.zeros(shape)) 14 | 15 | 16 | def exchange_shared(a, b): 17 | """ 18 | a: a theano shared variable 19 | b: a theano shared variable 20 | Uses get_value and set_value to swap the values stored in a and b 21 | """ 22 | temp = a.get_value() 23 | a.set_value(b.get_value()) 24 | b.set_value(temp) 25 | 26 | 27 | def make_exchange_func(a, b): 28 | """ 29 | a: a theano shared variable 30 | b: a theano shared variable 31 | Returns f 32 | where f is a theano function, that, when called, swaps the 33 | values in a and b 34 | f should not return anything 35 | """ 36 | 37 | updates = OrderedDict() 38 | updates[a] = b 39 | updates[b] = a 40 | f = function([], updates=updates) 41 | return f 42 | 43 | 44 | a = make_shared((5, 4, 3)) 45 | assert a.get_value().shape == (5, 4, 3) 46 | b = make_shared((5, 4, 3)) 47 | assert b.get_value().shape == (5, 4, 3) 48 | a.set_value(np.zeros((5, 4, 3), dtype=a.dtype)) 49 | b.set_value(np.ones((5, 4, 3), dtype=b.dtype)) 50 | exchange_shared(a, b) 51 | assert np.all(a.get_value() == 1.) 52 | assert np.all(b.get_value() == 0.) 53 | f = make_exchange_func(a, b) 54 | rval = f() 55 | assert isinstance(rval, list) 56 | assert len(rval) == 0 57 | assert np.all(a.get_value() == 0.) 58 | assert np.all(b.get_value() == 1.) 59 | 60 | print "SUCCESS!" 61 | -------------------------------------------------------------------------------- /06_grad_soln.py: -------------------------------------------------------------------------------- 1 | # Fill in the TODOs in this exercise, then run 2 | # this file with python to see if your solution works! 3 | # 4 | from theano import tensor as T 5 | 6 | 7 | def grad_sum(x, y, z): 8 | """ 9 | x: A theano variable 10 | y: A theano variable 11 | z: A theano expression involving x and y 12 | 13 | Returns dz / dx + dz / dy 14 | """ 15 | 16 | return sum(T.grad(z, [x, y])) 17 | 18 | x = T.scalar() 19 | y = T.scalar() 20 | z = x + y 21 | s = grad_sum(x, y, z) 22 | assert s.eval({x: 0, y: 0}) == 2 23 | print "SUCCESS!"
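# A slightly less trivial check (a sketch, not part of the original file):
# with z = x * y, T.grad gives dz/dx = y and dz/dy = x, so grad_sum
# returns y + x.
z2 = x * y
s2 = grad_sum(x, y, z2)
assert s2.eval({x: 2, y: 3}) == 5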
24 | -------------------------------------------------------------------------------- /07_mode.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from theano import function 3 | from theano import tensor as T 4 | from theano import config 5 | config.compute_test_value = 'raise' 6 | a = T.vector() 7 | a.tag.test_value = np.ones((3,)).astype(a.dtype) 8 | b = T.log(a) 9 | c = T.nnet.sigmoid(b) 10 | d = T.sqrt(c) 11 | e = T.concatenate((d, c), axis=0) 12 | f = b * c * d 13 | # This is the first bad line 14 | g = e + f 15 | h = g / c 16 | fn = function([a], h) 17 | fn(np.ones((3,)).astype(a.dtype)) 18 | -------------------------------------------------------------------------------- /08_scan_polynomial_soln.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import theano 4 | import theano.tensor as tt 5 | 6 | theano.config.warn.subtensor_merge_bug = False 7 | 8 | coefficients = tt.vector("coefficients") 9 | x = tt.scalar("x") 10 | max_coefficients_supported = 10000 11 | 12 | # Generate the components of the polynomial 13 | full_range = tt.arange(max_coefficients_supported) 14 | 15 | 16 | outputs_info = tt.as_tensor_variable(numpy.asarray(0, 'float64')) 17 | 18 | components, updates = theano.scan( 19 | fn=lambda coeff, power, prior_value, free_var: 20 | prior_value + (coeff * (free_var ** power)), 21 | sequences=[coefficients, full_range], 22 | outputs_info=outputs_info, 23 | non_sequences=x) 24 | 25 | polynomial = components[-1] 26 | calculate_polynomial = theano.function( 27 | inputs=[coefficients, x], 28 | outputs=polynomial, updates=updates) 29 | 30 | test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32) 31 | print calculate_polynomial(test_coeff, 3) 32 | # 19.0 33 | -------------------------------------------------------------------------------- /Exercices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "Exercices", 4 | "signature": "sha256:77d5115ebf6c96f00122775403f827b913b9144aa466e24779ec3e5729e36d94" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [], 15 | "language": "python", 16 | "metadata": {}, 17 | "outputs": [] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "The exercises work as follows:\n", 24 | "\n", 25 | "1) You have a cell with TODOs that raise errors describing what is needed. Fill them in.\n", 26 | "2) Then run the cell (ctrl-enter) to execute it.\n", 27 | "3) It should print \"SUCCESS!\" at the end (there is validation code in the cell). If not, try again.\n", 28 | "4) If you want to see the solution, execute the cell that starts with \"%load\" after the exercise.\n", 29 | "\n", 30 | "First, there are general Theano exercises, then one scan-specific exercise, then some exercises related to the LSTM example."
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "collapsed": false, 36 | "input": [ 37 | "# Exercise 1\n", 38 | "# This exercise asks you to create Theano variables and do some\n", 39 | "# computation on them.\n", 40 | "import numpy as np\n", 41 | "from theano import function\n", 42 | "raise NotImplementedError(\"TODO: add any other imports you need\")\n", 43 | "\n", 44 | "\n", 45 | "def make_scalar():\n", 46 | " \"\"\"\n", 47 | " Returns a new Theano scalar.\n", 48 | " \"\"\"\n", 49 | "\n", 50 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 51 | "\n", 52 | "\n", 53 | "def log(x):\n", 54 | " \"\"\"\n", 55 | " Returns the logarithm of a Theano scalar x.\n", 56 | " \"\"\"\n", 57 | "\n", 58 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 59 | "\n", 60 | "\n", 61 | "def add(x, y):\n", 62 | " \"\"\"\n", 63 | " Adds two theano scalars together and returns the result.\n", 64 | " \"\"\"\n", 65 | "\n", 66 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 67 | " \n", 68 | "# The following code uses your functions and tests them.\n", 69 | "a = make_scalar()\n", 70 | "b = make_scalar()\n", 71 | "c = log(b)\n", 72 | "d = add(a, c)\n", 73 | "f = function([a, b], d)\n", 74 | "a = np.cast[a.dtype](1.)\n", 75 | "b = np.cast[b.dtype](2.)\n", 76 | "actual = f(a, b)\n", 77 | "expected = 1. + np.log(2.)\n", 78 | "assert np.allclose(actual, expected)\n", 79 | "print \"SUCCESS!\"\n" 80 | ], 81 | "language": "python", 82 | "metadata": {}, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "collapsed": false, 88 | "input": [ 89 | "%load 01_scalar_soln.py" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "collapsed": false, 98 | "input": [ 99 | "# Exercise 2\n", 100 | "# This exercise asks you to make Theano variables and do elementwise\n", 101 | "# multiplication and a matrix/vector dot product.\n", 102 | "import numpy as np\n", 103 | "from theano import function\n", 104 | "raise NotImplementedError(\"TODO: add any other imports you need\")\n", 105 | "\n", 106 | "\n", 107 | "def make_vector():\n", 108 | " \"\"\"\n", 109 | " Returns a new Theano vector.\n", 110 | " \"\"\"\n", 111 | "\n", 112 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 113 | "\n", 114 | "\n", 115 | "def make_matrix():\n", 116 | " \"\"\"\n", 117 | " Returns a new Theano matrix.\n", 118 | " \"\"\"\n", 119 | "\n", 120 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 121 | "\n", 122 | "def elemwise_mul(a, b):\n", 123 | " \"\"\"\n", 124 | " a: A theano matrix\n", 125 | " b: A theano matrix\n", 126 | " Returns the elementwise product of a and b\n", 127 | " \"\"\"\n", 128 | "\n", 129 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 130 | "\n", 131 | "\n", 132 | "def matrix_vector_mul(a, b):\n", 133 | " \"\"\"\n", 134 | " a: A theano matrix\n", 135 | " b: A theano vector\n", 136 | " Returns the matrix-vector product of a and b\n", 137 | " \"\"\"\n", 138 | "\n", 139 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 140 | "\n", 141 | "# The following code uses your functions and tests them.\n", 142 | "a = make_vector()\n", 143 | "b = make_vector()\n", 144 | "c = elemwise_mul(a, b)\n", 145 | "d = make_matrix()\n", 146 | "e = matrix_vector_mul(d, c)\n", 147 | "\n", 148 | "f = function([a, b, d], e)\n", 149 | "\n", 150 | "rng = np.random.RandomState([1, 2, 3])\n", 151 | "a_value = rng.randn(5).astype(a.dtype)\n", 152 | "b_value = 
rng.rand(5).astype(b.dtype)\n", 153 | "c_value = a_value * b_value\n", 154 | "d_value = rng.randn(5, 5).astype(d.dtype)\n", 155 | "expected = np.dot(d_value, c_value)\n", 156 | "\n", 157 | "actual = f(a_value, b_value, d_value)\n", 158 | "assert np.allclose(actual, expected)\n", 159 | "print \"SUCCESS!\"\n" 160 | ], 161 | "language": "python", 162 | "metadata": {}, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "collapsed": false, 168 | "input": [ 169 | "%load 02_vector_mat_soln.py" 170 | ], 171 | "language": "python", 172 | "metadata": {}, 173 | "outputs": [] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "collapsed": false, 178 | "input": [ 179 | "# Exercise 3\n", 180 | "# This exercise asks you to create Theano tensor variables, do\n", 181 | "# broadcasted addition and compute the max over part of a tensor.\n", 182 | "import numpy as np\n", 183 | "from theano import function\n", 184 | "raise NotImplementedError(\"TODO: add any other imports you need\")\n", 185 | "\n", 186 | "\n", 187 | "def make_tensor(dim):\n", 188 | " \"\"\"\n", 189 | " Returns a new Theano tensor with no broadcastable dimensions.\n", 190 | " dim: the total number of dimensions of the tensor.\n", 191 | " (You can use any dtype you like)\n", 192 | " \"\"\"\n", 193 | "\n", 194 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 195 | "\n", 196 | "\n", 197 | "def broadcasted_add(a, b):\n", 198 | " \"\"\"\n", 199 | " a: a 3D theano tensor\n", 200 | " b: a 4D theano tensor\n", 201 | " Returns c, a 4D theano tensor, where\n", 202 | "\n", 203 | " c[i, j, k, l] = a[l, k, i] + b[i, j, k, l]\n", 204 | "\n", 205 | " for all i, j, k, l\n", 206 | " \"\"\"\n", 207 | "\n", 208 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 209 | "\n", 210 | "def partial_max(a):\n", 211 | " \"\"\"\n", 212 | " a: a 4D theano tensor\n", 213 | "\n", 214 | " Returns b, a theano matrix, where\n", 215 | "\n", 216 | " b[i, j] = max_{k,l} a[i, k, l, j]\n", 217 | "\n", 218 | " for all i, j\n", 219 | " \"\"\"\n", 220 | "\n", 221 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 222 | "\n", 223 | "# The following code uses your functions and tests them.\n", 224 | "a = make_tensor(3)\n", 225 | "b = make_tensor(4)\n", 226 | "c = broadcasted_add(a, b)\n", 227 | "d = partial_max(c)\n", 228 | "\n", 229 | "f = function([a, b], d)\n", 230 | "\n", 231 | "rng = np.random.RandomState([1, 2, 3])\n", 232 | "a_value = rng.randn(2, 2, 2).astype(a.dtype)\n", 233 | "b_value = rng.rand(2, 2, 2, 2).astype(b.dtype)\n", 234 | "c_value = np.transpose(a_value, (2, 1, 0))[:, None, :, :] + b_value\n", 235 | "expected = c_value.max(axis=1).max(axis=1)\n", 236 | "\n", 237 | "actual = f(a_value, b_value)\n", 238 | "\n", 239 | "assert np.allclose(actual, expected), (actual, expected)\n", 240 | "print \"SUCCESS!\"" 241 | ], 242 | "language": "python", 243 | "metadata": {}, 244 | "outputs": [] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "collapsed": false, 249 | "input": [ 250 | "%load 03_tensor_soln.py" 251 | ], 252 | "language": "python", 253 | "metadata": {}, 254 | "outputs": [] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "collapsed": false, 259 | "input": [ 260 | "# Exercise 4\n", 261 | "# This exercise asks you to compile a Theano function and call it to\n", 262 | "# execute \"x + y\".\n", 263 | "from theano import tensor as T\n", 264 | "raise NotImplementedError(\"TODO: add any other imports you need\")\n", 265 | "\n", 266 | "\n", 267 | "def evaluate(x, y, expr, x_value, y_value):\n", 268 | 
" \"\"\"\n", 269 | " x: A theano variable\n", 270 | " y: A theano variable\n", 271 | " expr: A theano expression involving x and y\n", 272 | " x_value: A numpy value\n", 273 | " y_value: A numpy value\n", 274 | "\n", 275 | " Returns the value of expr when x_value is substituted for x\n", 276 | " and y_value is substituted for y\n", 277 | " \"\"\"\n", 278 | "\n", 279 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 280 | "\n", 281 | "\n", 282 | "# The following code use your code and test it.\n", 283 | "x = T.iscalar()\n", 284 | "y = T.iscalar()\n", 285 | "z = x + y\n", 286 | "assert evaluate(x, y, z, 1, 2) == 3\n", 287 | "print \"SUCCESS!\"" 288 | ], 289 | "language": "python", 290 | "metadata": {}, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "collapsed": false, 296 | "input": [ 297 | "%load 04_function_soln.py" 298 | ], 299 | "language": "python", 300 | "metadata": {}, 301 | "outputs": [] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "collapsed": false, 306 | "input": [ 307 | "# Exercices 5\n", 308 | "# This exercice make you use shared variable. You must create them and\n", 309 | "# update them by swapping 2 shared variables values.\n", 310 | "import numpy as np\n", 311 | "raise NotImplementedError(\"TODO: add any other imports you need\")\n", 312 | "\n", 313 | "\n", 314 | "def make_shared(shape):\n", 315 | " \"\"\"\n", 316 | " Returns a theano shared variable containing a tensor of the specified\n", 317 | " shape.\n", 318 | " You can use any value you want.\n", 319 | " \"\"\"\n", 320 | " raise NotImplementedError(\"TODO: implement the function\")\n", 321 | "\n", 322 | "\n", 323 | "def exchange_shared(a, b):\n", 324 | " \"\"\"\n", 325 | " a: a theano shared variable\n", 326 | " b: a theano shared variable\n", 327 | " Uses get_value and set_value to swap the values stored in a and b\n", 328 | " \"\"\"\n", 329 | " raise NotImplementedError(\"TODO: implement the function\")\n", 330 | "\n", 331 | "\n", 332 | "def make_exchange_func(a, b):\n", 333 | " \"\"\"\n", 334 | " a: a theano shared variable\n", 335 | " b: a theano shared variable\n", 336 | " Returns f\n", 337 | " where f is a theano function, that, when called, swaps the\n", 338 | " values in a and b\n", 339 | " f should not return anything\n", 340 | " \"\"\"\n", 341 | " raise NotImplementedError(\"TODO: implement the function\")\n", 342 | "\n", 343 | "\n", 344 | "# The following code use your code and test it.\n", 345 | "a = make_shared((5, 4, 3))\n", 346 | "assert a.get_value().shape == (5, 4, 3)\n", 347 | "b = make_shared((5, 4, 3))\n", 348 | "assert a.get_value().shape == (5, 4, 3)\n", 349 | "a.set_value(np.zeros((5, 4, 3), dtype=a.dtype))\n", 350 | "b.set_value(np.ones((5, 4, 3), dtype=b.dtype))\n", 351 | "exchange_shared(a, b)\n", 352 | "assert np.all(a.get_value() == 1.)\n", 353 | "assert np.all(b.get_value() == 0.)\n", 354 | "f = make_exchange_func(a, b)\n", 355 | "rval = f()\n", 356 | "assert isinstance(rval, list)\n", 357 | "assert len(rval) == 0\n", 358 | "assert np.all(a.get_value() == 0.)\n", 359 | "assert np.all(b.get_value() == 1.)\n", 360 | "\n", 361 | "print \"SUCCESS!\"" 362 | ], 363 | "language": "python", 364 | "metadata": {}, 365 | "outputs": [] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "collapsed": false, 370 | "input": [ 371 | "%load 05_shared_soln.py" 372 | ], 373 | "language": "python", 374 | "metadata": {}, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "collapsed": false, 380 | "input": [ 381 | "# Exercices 6\n", 382 | "# This 
exercise makes use of Theano's symbolic grad\n", 383 | "from theano import tensor as T\n", 384 | "\n", 385 | "\n", 386 | "def grad_sum(x, y, z):\n", 387 | " \"\"\"\n", 388 | " x: A theano variable\n", 389 | " y: A theano variable\n", 390 | " z: A theano expression involving x and y\n", 391 | "\n", 392 | " Returns dz / dx + dz / dy\n", 393 | " \"\"\"\n", 394 | " raise NotImplementedError(\"TODO: implement this function.\")\n", 395 | "\n", 396 | "\n", 397 | "# The following code uses your function and tests it.\n", 398 | "x = T.scalar()\n", 399 | "y = T.scalar()\n", 400 | "z = x + y\n", 401 | "s = grad_sum(x, y, z)\n", 402 | "assert s.eval({x: 0, y: 0}) == 2\n", 403 | "print \"SUCCESS!\"" 404 | ], 405 | "language": "python", 406 | "metadata": {}, 407 | "outputs": [] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "collapsed": false, 412 | "input": [ 413 | "%load 06_grad_soln.py" 414 | ], 415 | "language": "python", 416 | "metadata": {}, 417 | "outputs": [] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "collapsed": false, 422 | "input": [ 423 | "# Exercise 7 # TODO: talk about mode=FAST_COMPILE\n", 424 | "# This code has a bug. Run this cell to see it.\n", 425 | "# Use a Theano flag (easy in a shell, harder in ipython) or an extra parameter to a function\n", 426 | "# to find the cause and fix it.\n", 427 | "# Do not find the bug by inspecting the code. This is to help you find bugs\n", 428 | "# in more complicated cases where code inspection doesn't work well.\n", 429 | "\n", 430 | "import numpy as np\n", 431 | "from theano import function\n", 432 | "from theano import tensor as T\n", 433 | "a = T.vector()\n", 434 | "b = T.log(a)\n", 435 | "c = T.nnet.sigmoid(b)\n", 436 | "d = T.sqrt(c)\n", 437 | "e = T.concatenate((d, c), axis=0)\n", 438 | "f = b * c * d\n", 439 | "g = e + f\n", 440 | "h = g / c\n", 441 | "fn = function([a], h)\n", 442 | "fn(np.ones((3,)).astype(a.dtype))\n" 443 | ], 444 | "language": "python", 445 | "metadata": {}, 446 | "outputs": [] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "collapsed": false, 451 | "input": [ 452 | "%load 07_mode.py" 453 | ], 454 | "language": "python", 455 | "metadata": {}, 456 | "outputs": [] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "collapsed": false, 461 | "input": [ 462 | "# Exercise 8\n", 463 | "# This exercise is different. 
The initial version works.\n", 464 | "# So you must modify it as described below, and it should still give the same output.\n", 465 | "\n", 466 | "# Modify and execute the polynomial example to have the reduction (the sum() call) done by scan.\n", 467 | "import numpy\n", 468 | "import theano\n", 469 | "import theano.tensor as T\n", 470 | "theano.config.warn.subtensor_merge_bug = False\n", 471 | "\n", 472 | "coefficients = theano.tensor.vector(\"coefficients\")\n", 473 | "x = T.scalar(\"x\")\n", 474 | "max_coefficients_supported = 10000\n", 475 | "\n", 476 | "# Generate the components of the polynomial\n", 477 | "full_range=theano.tensor.arange(max_coefficients_supported)\n", 478 | "components, updates = theano.scan(fn=lambda coeff, power, free_var:\n", 479 | " coeff * (free_var ** power),\n", 480 | " outputs_info=None,\n", 481 | " sequences=[coefficients, full_range],\n", 482 | " non_sequences=x)\n", 483 | "\n", 484 | "polynomial = components.sum()\n", 485 | "calculate_polynomial = theano.function(inputs=[coefficients, x],\n", 486 | " outputs=polynomial)\n", 487 | "\n", 488 | "test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)\n", 489 | "print calculate_polynomial(test_coeff, 3)\n", 490 | "# 19.0" 491 | ], 492 | "language": "python", 493 | "metadata": {}, 494 | "outputs": [] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "collapsed": false, 499 | "input": [ 500 | "%load 08_scan_polynomial_soln.py" 501 | ], 502 | "language": "python", 503 | "metadata": {}, 504 | "outputs": [] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "LSTM Exercises" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "collapsed": false, 516 | "input": [ 517 | "1) Modify the LSTM: Reverse the input sequence and train it like that, as in\n", 518 | " Sutskever et al., NIPS 2014. (No change to the Theano code, but useful to better understand how to do 2.)\n", 519 | "2) Modify the LSTM: Add a second LSTM layer. The new one takes\n", 520 | " the input in reverse order. Then concatenate the mean\n", 521 | " of the outputs of both LSTMs and feed that to the logistic regression.\n", 522 | "3) Modify the LSTM: Add the V_o parameter and use it. (No solution provided.)\n", 523 | " \n", 524 | "Note: 2) needs more epochs before we start to see that it learns something. With max_epochs=16, we start to see it for all versions.\n", 525 | "\n", 526 | "You can load the original example code in the next cell.\n", 527 | "Run it once. It will load the data.\n", 528 | "At the end of that code, there is a commented-out example of how to run it for a short time (~10 min on my laptop, Core i5).\n", 529 | "During that time, we can see that it starts to learn, but I do not let it run too long for this tutorial."
530 | ], 531 | "language": "python", 532 | "metadata": {}, 533 | "outputs": [] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "collapsed": false, 538 | "input": [ 539 | "%load lstm.py" 540 | ], 541 | "language": "python", 542 | "metadata": {}, 543 | "outputs": [], 544 | "prompt_number": 5 545 | }, 546 | { 547 | "cell_type": "code", 548 | "collapsed": false, 549 | "input": [ 550 | "%load lstm_reverse.diff" 551 | ], 552 | "language": "python", 553 | "metadata": {}, 554 | "outputs": [], 555 | "prompt_number": 2 556 | }, 557 | { 558 | "cell_type": "code", 559 | "collapsed": false, 560 | "input": [ 561 | "%load lstm_double.diff" 562 | ], 563 | "language": "python", 564 | "metadata": {}, 565 | "outputs": [], 566 | "prompt_number": 4 567 | } 568 | ], 569 | "metadata": {} 570 | } 571 | ] 572 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![intro](http://i.imgur.com/7vDvQnY.png) 2 | 3 | ### slides 4 | https://drive.google.com/a/startup.ml/file/d/0B6VRjP1VYAtTSWdaZ1BLTmQ0Qm8/view 5 | -------------------------------------------------------------------------------- /imdb.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import gzip 3 | import os 4 | 5 | import numpy 6 | 7 | 8 | def prepare_data(seqs, labels, maxlen=None): 9 | """Create the matrices from the datasets. 10 | 11 | This pads each sequence to the same length: the length of the 12 | longest sequence or maxlen. 13 | 14 | If maxlen is set, we will cut all sequences to this maximum 15 | length. 16 | 17 | This swaps the axes! 18 | """ 19 | # x: a list of sentences 20 | lengths = [len(s) for s in seqs] 21 | 22 | if maxlen is not None: 23 | new_seqs = [] 24 | new_labels = [] 25 | new_lengths = [] 26 | for l, s, y in zip(lengths, seqs, labels): 27 | if l < maxlen: 28 | new_seqs.append(s) 29 | new_labels.append(y) 30 | new_lengths.append(l) 31 | lengths = new_lengths 32 | labels = new_labels 33 | seqs = new_seqs 34 | 35 | if len(lengths) < 1: 36 | return None, None, None 37 | 38 | n_samples = len(seqs) 39 | maxlen = numpy.max(lengths) 40 | 41 | x = numpy.zeros((maxlen, n_samples)).astype('int64') 42 | x_mask = numpy.zeros((maxlen, n_samples)).astype('float32') 43 | for idx, s in enumerate(seqs): 44 | x[:lengths[idx], idx] = s 45 | x_mask[:lengths[idx], idx] = 1. 46 | 47 | return x, x_mask, labels 48 | 49 | 50 | def get_dataset_file(dataset, default_dataset, origin): 51 | '''Look for it as if it were a full path; if not, try a local file; 52 | if not, look in the data directory. 53 | 54 | Download the dataset if it is not present. 55 | 56 | ''' 57 | data_dir, data_file = os.path.split(dataset) 58 | if data_dir == "" and not os.path.isfile(dataset): 59 | # Check if dataset is in the data directory. 60 | new_path = os.path.join( 61 | os.path.split(__file__)[0], 62 | dataset 63 | ) 64 | if os.path.isfile(new_path) or data_file == default_dataset: 65 | dataset = new_path 66 | 67 | if (not os.path.isfile(dataset)) and data_file == default_dataset: 68 | import urllib 69 | print 'Downloading data from %s' % origin 70 | urllib.urlretrieve(origin, dataset) 71 | return dataset 72 | 73 | 74 | def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None): 75 | ''' Loads the dataset 76 | 77 | :type path: String 78 | :param path: The path to the dataset (here IMDB) 79 | :type n_words: int 80 | :param n_words: The number of words to keep in the vocabulary. 
81 | All extra words are set to unknow (1). 82 | :type valid_portion: float 83 | :param valid_portion: The proportion of the full train set used for 84 | the validation set. 85 | :type maxlen: None or positive int 86 | :param maxlen: the max sequence length we use in the train/valid set. 87 | ''' 88 | 89 | ############# 90 | # LOAD DATA # 91 | ############# 92 | 93 | # Load the dataset 94 | path = get_dataset_file( 95 | path, "imdb.pkl", 96 | "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") 97 | 98 | if path.endswith(".gz"): 99 | f = gzip.open(path, 'rb') 100 | else: 101 | f = open(path, 'rb') 102 | 103 | train_set = cPickle.load(f) 104 | test_set = cPickle.load(f) 105 | f.close() 106 | if maxlen: 107 | new_train_set_x = [] 108 | new_train_set_y = [] 109 | for x, y in zip(train_set[0], train_set[1]): 110 | if len(x) < maxlen: 111 | new_train_set_x.append(x) 112 | new_train_set_y.append(y) 113 | train_set = (new_train_set_x, new_train_set_y) 114 | del new_train_set_x, new_train_set_y 115 | 116 | # split training set into validation set 117 | train_set_x, train_set_y = train_set 118 | n_samples = len(train_set_x) 119 | sidx = numpy.random.permutation(n_samples) 120 | n_train = int(numpy.round(n_samples * (1. - valid_portion))) 121 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 122 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 123 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 124 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 125 | 126 | train_set = (train_set_x, train_set_y) 127 | valid_set = (valid_set_x, valid_set_y) 128 | 129 | def remove_unk(x): 130 | return [[1 if w >= n_words else w for w in sen] for sen in x] 131 | 132 | test_set_x, test_set_y = test_set 133 | valid_set_x, valid_set_y = valid_set 134 | train_set_x, train_set_y = train_set 135 | 136 | train_set_x = remove_unk(train_set_x) 137 | valid_set_x = remove_unk(valid_set_x) 138 | test_set_x = remove_unk(test_set_x) 139 | 140 | train = (train_set_x, train_set_y) 141 | valid = (valid_set_x, valid_set_y) 142 | test = (test_set_x, test_set_y) 143 | 144 | return train, valid, test 145 | -------------------------------------------------------------------------------- /lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a tweet sentiment analyzer 3 | ''' 4 | from collections import OrderedDict 5 | import cPickle as pkl 6 | import random 7 | import sys 8 | import time 9 | 10 | import numpy 11 | import theano 12 | import theano.tensor as tensor 13 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 14 | 15 | import imdb 16 | 17 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)} 18 | 19 | 20 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 21 | """ 22 | Used to shuffle the dataset at each iteration. 23 | """ 24 | 25 | idx_list = numpy.arange(n, dtype="int32") 26 | 27 | if shuffle: 28 | random.shuffle(idx_list) 29 | 30 | minibatches = [] 31 | minibatch_start = 0 32 | for i in range(n // minibatch_size): 33 | minibatches.append(idx_list[minibatch_start: 34 | minibatch_start + minibatch_size]) 35 | minibatch_start += minibatch_size 36 | 37 | if (minibatch_start != n): 38 | # Make a minibatch out of what is left 39 | minibatches.append(idx_list[minibatch_start:]) 40 | 41 | return zip(range(len(minibatches)), minibatches) 42 | 43 | 44 | def get_dataset(name): 45 | return datasets[name][0], datasets[name][1] 46 | 47 | 48 | def zipp(params, tparams): 49 | """ 50 | When we reload the model. 
Needed for the GPU stuff. 51 | """ 52 | for kk, vv in params.iteritems(): 53 | tparams[kk].set_value(vv) 54 | 55 | 56 | def unzip(zipped): 57 | """ 58 | When we pickle the model. Needed for the GPU stuff. 59 | """ 60 | new_params = OrderedDict() 61 | for kk, vv in zipped.iteritems(): 62 | new_params[kk] = vv.get_value() 63 | return new_params 64 | 65 | 66 | def dropout_layer(state_before, use_noise, trng): 67 | proj = tensor.switch(use_noise, 68 | (state_before * 69 | trng.binomial(state_before.shape, 70 | p=0.5, n=1, 71 | dtype=state_before.dtype)), 72 | state_before * 0.5) 73 | return proj 74 | 75 | 76 | def _p(pp, name): 77 | return '%s_%s' % (pp, name) 78 | 79 | 80 | def init_params(options): 81 | """ 82 | Global (not LSTM) parameter. For the embeding and the classifier. 83 | """ 84 | params = OrderedDict() 85 | # embedding 86 | randn = numpy.random.rand(options['n_words'], 87 | options['dim_proj']) 88 | params['Wemb'] = (0.01 * randn).astype('float32') 89 | params = get_layer(options['encoder'])[0](options, 90 | params, 91 | prefix=options['encoder']) 92 | # classifier 93 | params['U'] = 0.01 * numpy.random.randn(options['dim_proj'], 94 | options['ydim']).astype('float32') 95 | params['b'] = numpy.zeros((options['ydim'],)).astype('float32') 96 | 97 | return params 98 | 99 | 100 | def load_params(path, params): 101 | pp = numpy.load(path) 102 | for kk, vv in params.iteritems(): 103 | if kk not in pp: 104 | raise Warning('%s is not in the archive' % kk) 105 | params[kk] = pp[kk] 106 | 107 | return params 108 | 109 | 110 | def init_tparams(params): 111 | tparams = OrderedDict() 112 | for kk, pp in params.iteritems(): 113 | tparams[kk] = theano.shared(params[kk], name=kk) 114 | return tparams 115 | 116 | 117 | def get_layer(name): 118 | fns = layers[name] 119 | return fns 120 | 121 | 122 | def ortho_weight(ndim): 123 | W = numpy.random.randn(ndim, ndim) 124 | u, s, v = numpy.linalg.svd(W) 125 | return u.astype('float32') 126 | 127 | 128 | def param_init_lstm(options, params, prefix='lstm'): 129 | """ 130 | Init the LSTM parameter: 131 | 132 | :see: init_params 133 | """ 134 | W = numpy.concatenate([ortho_weight(options['dim_proj']), 135 | ortho_weight(options['dim_proj']), 136 | ortho_weight(options['dim_proj']), 137 | ortho_weight(options['dim_proj'])], axis=1) 138 | params[_p(prefix, 'W')] = W 139 | U = numpy.concatenate([ortho_weight(options['dim_proj']), 140 | ortho_weight(options['dim_proj']), 141 | ortho_weight(options['dim_proj']), 142 | ortho_weight(options['dim_proj'])], axis=1) 143 | params[_p(prefix, 'U')] = U 144 | b = numpy.zeros((4 * options['dim_proj'],)) 145 | params[_p(prefix, 'b')] = b.astype('float32') 146 | 147 | return params 148 | 149 | 150 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 151 | nsteps = state_below.shape[0] 152 | if state_below.ndim == 3: 153 | n_samples = state_below.shape[1] 154 | else: 155 | n_samples = 1 156 | 157 | assert mask is not None 158 | 159 | def _slice(_x, n, dim): 160 | if _x.ndim == 3: 161 | return _x[:, :, n * dim:(n + 1) * dim] 162 | return _x[:, n * dim:(n + 1) * dim] 163 | 164 | def _step(m_, x_, h_, c_): 165 | preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) 166 | preact += x_ 167 | preact += tparams[_p(prefix, 'b')] 168 | 169 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) 170 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) 171 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) 172 | c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) 173 | 174 | c 
= f * c_ + i * c 175 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 176 | 177 | h = o * tensor.tanh(c) 178 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 179 | 180 | return h, c 181 | 182 | state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 183 | tparams[_p(prefix, 'b')]) 184 | 185 | dim_proj = options['dim_proj'] 186 | rval, updates = theano.scan(_step, 187 | sequences=[mask, state_below], 188 | outputs_info=[tensor.alloc(0., n_samples, 189 | dim_proj), 190 | tensor.alloc(0., n_samples, 191 | dim_proj)], 192 | name=_p(prefix, '_layers'), 193 | n_steps=nsteps) 194 | return rval[0] 195 | 196 | 197 | # ff: Feed Forward (normal neural net), only useful to put after lstm 198 | # before the classifier. 199 | layers = {'lstm': (param_init_lstm, lstm_layer)} 200 | 201 | 202 | def sgd(lr, tparams, grads, x, mask, y, cost): 203 | """ Stochastic Gradient Descent 204 | 205 | :note: A more complicated version of sgd then needed. This is 206 | done like that for adadelta and rmsprop. 207 | 208 | """ 209 | # New set of shared variable that will contain the gradient 210 | # for a mini-batch. 211 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 212 | for k, p in tparams.iteritems()] 213 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 214 | 215 | # Function that computes gradients for a mini-batch, but do not 216 | # updates the weights. 217 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 218 | name='sgd_f_grad_shared') 219 | 220 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 221 | 222 | # Function that updates the weights from the previously computed 223 | # gradient. 224 | f_update = theano.function([lr], [], updates=pup, 225 | name='sgd_f_update') 226 | 227 | return f_grad_shared, f_update 228 | 229 | 230 | def adadelta(lr, tparams, grads, x, mask, y, cost): 231 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 232 | name='%s_grad' % k) 233 | for k, p in tparams.iteritems()] 234 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 235 | name='%s_rup2' % k) 236 | for k, p in tparams.iteritems()] 237 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 238 | name='%s_rgrad2' % k) 239 | for k, p in tparams.iteritems()] 240 | 241 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 242 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 243 | for rg2, g in zip(running_grads2, grads)] 244 | 245 | f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up, 246 | name='adadelta_f_grad_shared') 247 | 248 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 249 | for zg, ru2, rg2 in zip(zipped_grads, 250 | running_up2, 251 | running_grads2)] 252 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 253 | for ru2, ud in zip(running_up2, updir)] 254 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 255 | 256 | f_update = theano.function([lr], [], updates=ru2up + param_up, 257 | on_unused_input='ignore', 258 | name='adadelta_f_update') 259 | 260 | return f_grad_shared, f_update 261 | 262 | 263 | def rmsprop(lr, tparams, grads, x, mask, y, cost): 264 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 265 | name='%s_grad' % k) 266 | for k, p in tparams.iteritems()] 267 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 268 | name='%s_rgrad' % k) 269 | for k, p in tparams.iteritems()] 270 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 271 | name='%s_rgrad2' % k) 272 | for k, p in 
tparams.iteritems()] 273 | 274 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 275 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 276 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 277 | for rg2, g in zip(running_grads2, grads)] 278 | 279 | f_grad_shared = theano.function([x, mask, y], cost, 280 | updates=zgup + rgup + rg2up, 281 | name='rmsprop_f_grad_shared') 282 | 283 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 284 | name='%s_updir' % k) 285 | for k, p in tparams.iteritems()] 286 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 287 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 288 | running_grads2)] 289 | param_up = [(p, p + udn[1]) 290 | for p, udn in zip(tparams.values(), updir_new)] 291 | f_update = theano.function([lr], [], updates=updir_new + param_up, 292 | on_unused_input='ignore', 293 | name='rmsprop_f_update') 294 | 295 | return f_grad_shared, f_update 296 | 297 | 298 | def build_model(tparams, options): 299 | trng = RandomStreams(1234) 300 | 301 | # Used for dropout. 302 | use_noise = theano.shared(numpy.float32(0.)) 303 | 304 | x = tensor.matrix('x', dtype='int64') 305 | mask = tensor.matrix('mask', dtype='float32') 306 | y = tensor.vector('y', dtype='int64') 307 | 308 | n_timesteps = x.shape[0] 309 | n_samples = x.shape[1] 310 | 311 | emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, 312 | n_samples, 313 | options['dim_proj']]) 314 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 315 | prefix=options['encoder'], 316 | mask=mask) 317 | if options['encoder'] == 'lstm': 318 | proj = (proj * mask[:, :, None]).sum(axis=0) 319 | proj = proj / mask.sum(axis=0)[:, None] 320 | if options['use_dropout']: 321 | proj = dropout_layer(proj, use_noise, trng) 322 | 323 | pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b']) 324 | 325 | f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') 326 | f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') 327 | 328 | cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean() 329 | 330 | return use_noise, x, mask, y, f_pred_prob, f_pred, cost 331 | 332 | 333 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): 334 | """ If you want to use a trained model, this is useful to compute 335 | the probabilities of new examples. 336 | """ 337 | n_samples = len(data[0]) 338 | probs = numpy.zeros((n_samples, 2)).astype('float32') 339 | 340 | n_done = 0 341 | 342 | for _, valid_index in iterator: 343 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 344 | numpy.array(data[1])[valid_index], 345 | maxlen=None) 346 | pred_probs = f_pred_prob(x, mask) 347 | probs[valid_index, :] = pred_probs 348 | 349 | n_done += len(valid_index) 350 | if verbose: 351 | print '%d/%d samples classified' % (n_done, n_samples) 352 | 353 | return probs 354 | 355 | 356 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False): 357 | """ 358 | Just compute the error 359 | f_pred: Theano fct computing the prediction 360 | prepare_data: usual prepare_data for that dataset. 361 | """ 362 | valid_err = 0 363 | for _, valid_index in iterator: 364 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 365 | numpy.array(data[1])[valid_index], 366 | maxlen=None) 367 | preds = f_pred(x, mask) 368 | targets = numpy.array(data[1])[valid_index] 369 | valid_err += (preds == targets).sum() 370 | valid_err = 1. 
- numpy.float32(valid_err) / len(data[0]) 371 | 372 | return valid_err 373 | 374 | 375 | def train_lstm( 376 | train, valid, test, 377 | dim_proj=128, # word embedding dimension and LSTM number of hidden units. 378 | patience=10, # Number of epochs to wait before early stopping if no progress 379 | max_epochs=5000, # The maximum number of epochs to run 380 | dispFreq=10, # Display to stdout the training progress every N updates 381 | decay_c=0., # Weight decay for the classifier applied to the U weights. 382 | lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) 383 | n_words=10000, # Vocabulary size 384 | # sgd, adadelta and rmsprop available, 385 | # sgd very hard to use, not recommended (probably needs momentum and a decaying learning rate). 386 | optimizer=adadelta, 387 | encoder='lstm', # TODO: can be removed; must be lstm. 388 | saveto='lstm_model.npz', # The best model will be saved there 389 | validFreq=370, # Compute the validation error after this number of updates. 390 | saveFreq=1110, # Save the parameters after every saveFreq updates 391 | batch_size=16, # The batch size during training. 392 | valid_batch_size=64, # The batch size used for the validation/test set. 393 | dataset='imdb', 394 | 395 | # Parameters for extra options 396 | noise_std=0., 397 | use_dropout=True, # if False slightly faster, but worse test error 398 | # This frequently needs a bigger model. 399 | reload_model="", # Path to a saved model we want to start from. 400 | test_size=-1, # If >0, we will truncate the test set to this number of examples. 401 | ): 402 | 403 | # Model options 404 | model_options = locals().copy() 405 | del model_options['train'] 406 | del model_options['valid'] 407 | del model_options['test'] 408 | print "model options", model_options 409 | 410 | if test_size > 0: 411 | test = (test[0][:test_size], test[1][:test_size]) 412 | 413 | ydim = numpy.max(train[1]) + 1 414 | 415 | model_options['ydim'] = ydim 416 | 417 | print 'Building model' 418 | # This creates the initial parameters as numpy ndarrays. 419 | # Dict name (string) -> numpy ndarray 420 | params = init_params(model_options) 421 | 422 | if reload_model: 423 | load_params('lstm_model.npz', params) 424 | 425 | # This creates Theano shared variables from the parameters. 426 | # Dict name (string) -> Theano Tensor Shared Variable 427 | # params and tparams have different copies of the weights. 428 | tparams = init_tparams(params) 429 | 430 | # use_noise is for dropout 431 | (use_noise, x, mask, 432 | y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) 433 | 434 | if decay_c > 0.: 435 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 436 | weight_decay = 0.
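# The next lines implement L2 weight decay on the classifier weights
# only: cost <- cost + decay_c * sum(U ** 2). The LSTM and embedding
# weights are not regularized here.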
437 | weight_decay += (tparams['U'] ** 2).sum() 438 | weight_decay *= decay_c 439 | cost += weight_decay 440 | 441 | f_cost = theano.function([x, mask, y], cost, name='f_cost') 442 | 443 | grads = tensor.grad(cost, wrt=tparams.values()) 444 | f_grad = theano.function([x, mask, y], grads, name='f_grad') 445 | 446 | lr = tensor.scalar(name='lr') 447 | f_grad_shared, f_update = optimizer(lr, tparams, grads, 448 | x, mask, y, cost) 449 | 450 | print 'Training' 451 | 452 | kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size, 453 | shuffle=True) 454 | kf_test = get_minibatches_idx(len(test[0]), valid_batch_size, 455 | shuffle=True) 456 | 457 | print "%d train examples" % len(train[0]) 458 | print "%d valid examples" % len(valid[0]) 459 | print "%d test examples" % len(test[0]) 460 | history_errs = [] 461 | best_p = None 462 | bad_count = 0 463 | 464 | if validFreq == -1: 465 | validFreq = len(train[0]) / batch_size 466 | if saveFreq == -1: 467 | saveFreq = len(train[0]) / batch_size 468 | 469 | uidx = 0 # the number of update done 470 | estop = False # early stop 471 | start_time = time.clock() 472 | try: 473 | for eidx in xrange(max_epochs): 474 | n_samples = 0 475 | 476 | # Get new shuffled index for the training set. 477 | kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) 478 | 479 | for _, train_index in kf: 480 | uidx += 1 481 | use_noise.set_value(1.) 482 | 483 | # Select the random examples for this minibatch 484 | y = [train[1][t] for t in train_index] 485 | x = [train[0][t] for t in train_index] 486 | 487 | # Get the data in numpy.ndarray format 488 | # This swap the axis! 489 | # Return something of shape (minibatch maxlen, n samples) 490 | x, mask, y = prepare_data(x, y) 491 | n_samples += x.shape[1] 492 | 493 | cost = f_grad_shared(x, mask, y) 494 | f_update(lrate) 495 | 496 | if numpy.isnan(cost) or numpy.isinf(cost): 497 | print 'NaN detected' 498 | return 1., 1., 1. 499 | 500 | if numpy.mod(uidx, dispFreq) == 0: 501 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost 502 | 503 | if numpy.mod(uidx, saveFreq) == 0: 504 | print 'Saving...', 505 | 506 | if best_p is not None: 507 | params = best_p 508 | else: 509 | params = unzip(tparams) 510 | numpy.savez(saveto, history_errs=history_errs, **params) 511 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 512 | print 'Done' 513 | 514 | if numpy.mod(uidx, validFreq) == 0: 515 | use_noise.set_value(0.) 516 | train_err = pred_error(f_pred, prepare_data, train, kf) 517 | valid_err = pred_error(f_pred, prepare_data, valid, 518 | kf_valid) 519 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 520 | 521 | history_errs.append([valid_err, test_err]) 522 | 523 | if (uidx == 0 or 524 | valid_err <= numpy.array(history_errs)[:, 525 | 0].min()): 526 | 527 | best_p = unzip(tparams) 528 | bad_counter = 0 529 | 530 | print ('Train ', train_err, 'Valid ', valid_err, 531 | 'Test ', test_err) 532 | 533 | if (len(history_errs) > patience and 534 | valid_err >= numpy.array(history_errs)[:-patience, 535 | 0].min()): 536 | bad_counter += 1 537 | if bad_counter > patience: 538 | print 'Early Stop!' 539 | estop = True 540 | break 541 | 542 | print 'Seen %d samples' % n_samples 543 | 544 | if estop: 545 | break 546 | 547 | except KeyboardInterrupt: 548 | print "Training interupted" 549 | 550 | end_time = time.clock() 551 | print "Training done" 552 | if best_p is not None: 553 | zipp(best_p, tparams) 554 | else: 555 | best_p = unzip(tparams) 556 | 557 | print "Computing errors" 558 | use_noise.set_value(0.) 
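# With use_noise set to 0, dropout_layer deterministically scales
# activations by 0.5 instead of sampling a binary mask, so the errors
# computed below are noise-free.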
559 | train_err = pred_error(f_pred, prepare_data, train, kf) 560 | valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) 561 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 562 | 563 | print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err 564 | 565 | numpy.savez(saveto, train_err=train_err, 566 | valid_err=valid_err, test_err=test_err, 567 | history_errs=history_errs, **best_p) 568 | print 'The code run for %d epochs, with %f sec/epochs' % ( 569 | (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))) 570 | print >> sys.stderr, ('Training took %.1fs' % 571 | (end_time - start_time)) 572 | return train_err, valid_err, test_err 573 | 574 | 575 | # We must have floatX=float32 for this tutorial to work correctly. 576 | theano.config.floatX = "float32" 577 | # The next line is the new Theano default. This is a speed up. 578 | theano.config.scan.allow_gc = False 579 | 580 | print 'Loading data' 581 | n_words = 10000 582 | load_data, prepare_data = get_dataset("imdb") 583 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05, 584 | maxlen=100) 585 | print 'Loading data: Done' 586 | print "See the comment at the end of this cell to train the model." 587 | 588 | # See function train for all possible parameter and there definition. 589 | #train_lstm( 590 | # train, valid, test, 591 | # I set max_epochs to only 16, as this is enought to 592 | # show that the network learn. A real job should try for longer. 593 | # max_epochs=16, 594 | # test_size=500, 595 | # n_words=n_words, 596 | #) 597 | -------------------------------------------------------------------------------- /lstm_double.diff: -------------------------------------------------------------------------------- 1 | --- lstm.py.orig 2015-01-16 17:20:22.075153409 -0800 2 | +++ lstm_double.py 2015-01-16 17:20:16.627153500 -0800 3 | @@ -227,7 +227,7 @@ 4 | return f_grad_shared, f_update 5 | 6 | 7 | -def adadelta(lr, tparams, grads, x, mask, y, cost): 8 | +def adadelta(lr, tparams, grads, x, rx, mask, y, cost): 9 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 10 | name='%s_grad' % k) 11 | for k, p in tparams.iteritems()] 12 | @@ -242,7 +242,7 @@ 13 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 14 | for rg2, g in zip(running_grads2, grads)] 15 | 16 | - f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up, 17 | + f_grad_shared = theano.function([x, rx, mask, y], cost, updates=zgup + rg2up, 18 | name='adadelta_f_grad_shared') 19 | 20 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 21 | @@ -260,7 +260,7 @@ 22 | return f_grad_shared, f_update 23 | 24 | 25 | -def rmsprop(lr, tparams, grads, x, mask, y, cost): 26 | +def rmsprop(lr, tparams, grads, x, rx, mask, y, cost): 27 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 28 | name='%s_grad' % k) 29 | for k, p in tparams.iteritems()] 30 | @@ -276,7 +276,7 @@ 31 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 32 | for rg2, g in zip(running_grads2, grads)] 33 | 34 | - f_grad_shared = theano.function([x, mask, y], cost, 35 | + f_grad_shared = theano.function([x, rx, mask, y], cost, 36 | updates=zgup + rgup + rg2up, 37 | name='rmsprop_f_grad_shared') 38 | 39 | @@ -302,32 +302,39 @@ 40 | use_noise = theano.shared(numpy.float32(0.)) 41 | 42 | x = tensor.matrix('x', dtype='int64') 43 | + rx = tensor.matrix('rx', dtype='int64') 44 | mask = tensor.matrix('mask', dtype='float32') 45 | y = tensor.vector('y', dtype='int64') 46 | 47 | n_timesteps = x.shape[0] 48 | n_samples = x.shape[1] 49 | 
50 | - emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, 51 | - n_samples, 52 | - options['dim_proj']]) 53 | - proj = get_layer(options['encoder'])[1](tparams, emb, options, 54 | - prefix=options['encoder'], 55 | - mask=mask) 56 | - if options['encoder'] == 'lstm': 57 | - proj = (proj * mask[:, :, None]).sum(axis=0) 58 | - proj = proj / mask.sum(axis=0)[:, None] 59 | - if options['use_dropout']: 60 | - proj = dropout_layer(proj, use_noise, trng) 61 | + lstm_outs = [] 62 | + for inp in [x, rx]: 63 | + emb = tparams['Wemb'][inp.flatten()].reshape([n_timesteps, 64 | + n_samples, 65 | + options['dim_proj']]) 66 | + proj = get_layer(options['encoder'])[1](tparams, emb, options, 67 | + prefix=options['encoder'], 68 | + mask=mask) 69 | + if options['encoder'] == 'lstm': 70 | + proj = (proj * mask[:, :, None]).sum(axis=0) 71 | + proj = proj / mask.sum(axis=0)[:, None] 72 | + if options['use_dropout']: 73 | + proj = dropout_layer(proj, use_noise, trng) 74 | + lstm_outs.append(proj) 75 | + 76 | + del proj 77 | + pred = tensor.nnet.softmax(tensor.dot(theano.tensor.concatenate(lstm_outs), 78 | + tparams['U']) + tparams['b']) 79 | + pred = pred.reshape((2, pred.shape[0]/2, pred.shape[1])).mean(axis=0) 80 | 81 | - pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b']) 82 | - 83 | - f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') 84 | - f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') 85 | + f_pred_prob = theano.function([x, rx, mask], pred, name='f_pred_prob') 86 | + f_pred = theano.function([x, rx, mask], pred.argmax(axis=1), name='f_pred') 87 | 88 | cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean() 89 | 90 | - return use_noise, x, mask, y, f_pred_prob, f_pred, cost 91 | + return use_noise, x, rx, mask, y, f_pred_prob, f_pred, cost 92 | 93 | 94 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): 95 | @@ -343,6 +350,9 @@ 96 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 97 | numpy.array(data[1])[valid_index], 98 | maxlen=None) 99 | + rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index], 100 | + numpy.array(data[1])[valid_index], 101 | + maxlen=None) 102 | pred_probs = f_pred_prob(x, mask) 103 | probs[valid_index, :] = pred_probs 104 | 105 | @@ -364,7 +374,10 @@ 106 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 107 | numpy.array(data[1])[valid_index], 108 | maxlen=None) 109 | - preds = f_pred(x, mask) 110 | + rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index], 111 | + numpy.array(data[1])[valid_index], 112 | + maxlen=None) 113 | + preds = f_pred(x, rx, mask) 114 | targets = numpy.array(data[1])[valid_index] 115 | valid_err += (preds == targets).sum() 116 | valid_err = 1. 
- numpy.float32(valid_err) / len(data[0]) 117 | @@ -428,7 +441,7 @@ 118 | tparams = init_tparams(params) 119 | 120 | # use_noise is for dropout 121 | - (use_noise, x, mask, 122 | + (use_noise, x, rx, mask, 123 | y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) 124 | 125 | if decay_c > 0.: 126 | @@ -438,14 +451,14 @@ 127 | weight_decay *= decay_c 128 | cost += weight_decay 129 | 130 | - f_cost = theano.function([x, mask, y], cost, name='f_cost') 131 | + f_cost = theano.function([x, rx, mask, y], cost, name='f_cost') 132 | 133 | grads = tensor.grad(cost, wrt=tparams.values()) 134 | - f_grad = theano.function([x, mask, y], grads, name='f_grad') 135 | + f_grad = theano.function([x, rx, mask, y], grads, name='f_grad') 136 | 137 | lr = tensor.scalar(name='lr') 138 | f_grad_shared, f_update = optimizer(lr, tparams, grads, 139 | - x, mask, y, cost) 140 | + x, rx, mask, y, cost) 141 | 142 | print 'Training' 143 | 144 | @@ -488,9 +501,10 @@ 145 | # This swap the axis! 146 | # Return something of shape (minibatch maxlen, n samples) 147 | x, mask, y = prepare_data(x, y) 148 | + rx, _, Y = prepare_data([t[::-1] for t in x], y) 149 | n_samples += x.shape[1] 150 | 151 | - cost = f_grad_shared(x, mask, y) 152 | + cost = f_grad_shared(x, rx, mask, y) 153 | f_update(lrate) 154 | 155 | if numpy.isnan(cost) or numpy.isinf(cost): 156 | -------------------------------------------------------------------------------- /lstm_double.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a tweet sentiment analyzer 3 | ''' 4 | from collections import OrderedDict 5 | import cPickle as pkl 6 | import random 7 | import sys 8 | import time 9 | 10 | import numpy 11 | import theano 12 | import theano.tensor as tensor 13 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 14 | 15 | import imdb 16 | 17 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)} 18 | 19 | 20 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 21 | """ 22 | Used to shuffle the dataset at each iteration. 23 | """ 24 | 25 | idx_list = numpy.arange(n, dtype="int32") 26 | 27 | if shuffle: 28 | random.shuffle(idx_list) 29 | 30 | minibatches = [] 31 | minibatch_start = 0 32 | for i in range(n // minibatch_size): 33 | minibatches.append(idx_list[minibatch_start: 34 | minibatch_start + minibatch_size]) 35 | minibatch_start += minibatch_size 36 | 37 | if (minibatch_start != n): 38 | # Make a minibatch out of what is left 39 | minibatches.append(idx_list[minibatch_start:]) 40 | 41 | return zip(range(len(minibatches)), minibatches) 42 | 43 | 44 | def get_dataset(name): 45 | return datasets[name][0], datasets[name][1] 46 | 47 | 48 | def zipp(params, tparams): 49 | """ 50 | When we reload the model. Needed for the GPU stuff. 51 | """ 52 | for kk, vv in params.iteritems(): 53 | tparams[kk].set_value(vv) 54 | 55 | 56 | def unzip(zipped): 57 | """ 58 | When we pickle the model. Needed for the GPU stuff. 59 | """ 60 | new_params = OrderedDict() 61 | for kk, vv in zipped.iteritems(): 62 | new_params[kk] = vv.get_value() 63 | return new_params 64 | 65 | 66 | def dropout_layer(state_before, use_noise, trng): 67 | proj = tensor.switch(use_noise, 68 | (state_before * 69 | trng.binomial(state_before.shape, 70 | p=0.5, n=1, 71 | dtype=state_before.dtype)), 72 | state_before * 0.5) 73 | return proj 74 | 75 | 76 | def _p(pp, name): 77 | return '%s_%s' % (pp, name) 78 | 79 | 80 | def init_params(options): 81 | """ 82 | Global (not LSTM) parameter. 
For the embeding and the classifier. 83 | """ 84 | params = OrderedDict() 85 | # embedding 86 | randn = numpy.random.rand(options['n_words'], 87 | options['dim_proj']) 88 | params['Wemb'] = (0.01 * randn).astype('float32') 89 | params = get_layer(options['encoder'])[0](options, 90 | params, 91 | prefix=options['encoder']) 92 | # classifier 93 | params['U'] = 0.01 * numpy.random.randn(options['dim_proj'], 94 | options['ydim']).astype('float32') 95 | params['b'] = numpy.zeros((options['ydim'],)).astype('float32') 96 | 97 | return params 98 | 99 | 100 | def load_params(path, params): 101 | pp = numpy.load(path) 102 | for kk, vv in params.iteritems(): 103 | if kk not in pp: 104 | raise Warning('%s is not in the archive' % kk) 105 | params[kk] = pp[kk] 106 | 107 | return params 108 | 109 | 110 | def init_tparams(params): 111 | tparams = OrderedDict() 112 | for kk, pp in params.iteritems(): 113 | tparams[kk] = theano.shared(params[kk], name=kk) 114 | return tparams 115 | 116 | 117 | def get_layer(name): 118 | fns = layers[name] 119 | return fns 120 | 121 | 122 | def ortho_weight(ndim): 123 | W = numpy.random.randn(ndim, ndim) 124 | u, s, v = numpy.linalg.svd(W) 125 | return u.astype('float32') 126 | 127 | 128 | def param_init_lstm(options, params, prefix='lstm'): 129 | """ 130 | Init the LSTM parameter: 131 | 132 | :see: init_params 133 | """ 134 | W = numpy.concatenate([ortho_weight(options['dim_proj']), 135 | ortho_weight(options['dim_proj']), 136 | ortho_weight(options['dim_proj']), 137 | ortho_weight(options['dim_proj'])], axis=1) 138 | params[_p(prefix, 'W')] = W 139 | U = numpy.concatenate([ortho_weight(options['dim_proj']), 140 | ortho_weight(options['dim_proj']), 141 | ortho_weight(options['dim_proj']), 142 | ortho_weight(options['dim_proj'])], axis=1) 143 | params[_p(prefix, 'U')] = U 144 | b = numpy.zeros((4 * options['dim_proj'],)) 145 | params[_p(prefix, 'b')] = b.astype('float32') 146 | 147 | return params 148 | 149 | 150 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 151 | nsteps = state_below.shape[0] 152 | if state_below.ndim == 3: 153 | n_samples = state_below.shape[1] 154 | else: 155 | n_samples = 1 156 | 157 | assert mask is not None 158 | 159 | def _slice(_x, n, dim): 160 | if _x.ndim == 3: 161 | return _x[:, :, n * dim:(n + 1) * dim] 162 | return _x[:, n * dim:(n + 1) * dim] 163 | 164 | def _step(m_, x_, h_, c_): 165 | preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) 166 | preact += x_ 167 | preact += tparams[_p(prefix, 'b')] 168 | 169 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) 170 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) 171 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) 172 | c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) 173 | 174 | c = f * c_ + i * c 175 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 176 | 177 | h = o * tensor.tanh(c) 178 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 179 | 180 | return h, c 181 | 182 | state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 183 | tparams[_p(prefix, 'b')]) 184 | 185 | dim_proj = options['dim_proj'] 186 | rval, updates = theano.scan(_step, 187 | sequences=[mask, state_below], 188 | outputs_info=[tensor.alloc(0., n_samples, 189 | dim_proj), 190 | tensor.alloc(0., n_samples, 191 | dim_proj)], 192 | name=_p(prefix, '_layers'), 193 | n_steps=nsteps) 194 | return rval[0] 195 | 196 | 197 | # ff: Feed Forward (normal neural net), only useful to put after lstm 198 | # before the classifier. 
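# The registry below maps an encoder name to its (param_init, layer)
# pair: get_layer(name)[0] builds the numpy parameters and
# get_layer(name)[1] builds the recurrent part of the graph.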
199 | layers = {'lstm': (param_init_lstm, lstm_layer)} 200 | 201 | 202 | def sgd(lr, tparams, grads, x, mask, y, cost): 203 | """ Stochastic Gradient Descent 204 | 205 | :note: A more complicated version of sgd then needed. This is 206 | done like that for adadelta and rmsprop. 207 | 208 | """ 209 | # New set of shared variable that will contain the gradient 210 | # for a mini-batch. 211 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 212 | for k, p in tparams.iteritems()] 213 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 214 | 215 | # Function that computes gradients for a mini-batch, but do not 216 | # updates the weights. 217 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 218 | name='sgd_f_grad_shared') 219 | 220 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 221 | 222 | # Function that updates the weights from the previously computed 223 | # gradient. 224 | f_update = theano.function([lr], [], updates=pup, 225 | name='sgd_f_update') 226 | 227 | return f_grad_shared, f_update 228 | 229 | 230 | def adadelta(lr, tparams, grads, x, rx, mask, y, cost): 231 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 232 | name='%s_grad' % k) 233 | for k, p in tparams.iteritems()] 234 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 235 | name='%s_rup2' % k) 236 | for k, p in tparams.iteritems()] 237 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 238 | name='%s_rgrad2' % k) 239 | for k, p in tparams.iteritems()] 240 | 241 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 242 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 243 | for rg2, g in zip(running_grads2, grads)] 244 | 245 | f_grad_shared = theano.function([x, rx, mask, y], cost, updates=zgup + rg2up, 246 | name='adadelta_f_grad_shared') 247 | 248 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 249 | for zg, ru2, rg2 in zip(zipped_grads, 250 | running_up2, 251 | running_grads2)] 252 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 253 | for ru2, ud in zip(running_up2, updir)] 254 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 255 | 256 | f_update = theano.function([lr], [], updates=ru2up + param_up, 257 | on_unused_input='ignore', 258 | name='adadelta_f_update') 259 | 260 | return f_grad_shared, f_update 261 | 262 | 263 | def rmsprop(lr, tparams, grads, x, rx, mask, y, cost): 264 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 265 | name='%s_grad' % k) 266 | for k, p in tparams.iteritems()] 267 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 268 | name='%s_rgrad' % k) 269 | for k, p in tparams.iteritems()] 270 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 271 | name='%s_rgrad2' % k) 272 | for k, p in tparams.iteritems()] 273 | 274 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 275 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 276 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 277 | for rg2, g in zip(running_grads2, grads)] 278 | 279 | f_grad_shared = theano.function([x, rx, mask, y], cost, 280 | updates=zgup + rgup + rg2up, 281 | name='rmsprop_f_grad_shared') 282 | 283 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 284 | name='%s_updir' % k) 285 | for k, p in tparams.iteritems()] 286 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 287 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 288 | running_grads2)] 289 | param_up 
= [(p, p + udn[1])
290 |                 for p, udn in zip(tparams.values(), updir_new)]
291 |     f_update = theano.function([lr], [], updates=updir_new + param_up,
292 |                                on_unused_input='ignore',
293 |                                name='rmsprop_f_update')
294 | 
295 |     return f_grad_shared, f_update
296 | 
297 | 
298 | def build_model(tparams, options):
299 |     trng = RandomStreams(1234)
300 | 
301 |     # Used for dropout.
302 |     use_noise = theano.shared(numpy.float32(0.))
303 | 
304 |     x = tensor.matrix('x', dtype='int64')
305 |     rx = tensor.matrix('rx', dtype='int64')
306 |     mask = tensor.matrix('mask', dtype='float32')
307 |     y = tensor.vector('y', dtype='int64')
308 | 
309 |     n_timesteps = x.shape[0]
310 |     n_samples = x.shape[1]
311 | 
312 |     lstm_outs = []
313 |     for inp in [x, rx]:
314 |         emb = tparams['Wemb'][inp.flatten()].reshape([n_timesteps,
315 |                                                       n_samples,
316 |                                                       options['dim_proj']])
317 |         proj = get_layer(options['encoder'])[1](tparams, emb, options,
318 |                                                 prefix=options['encoder'],
319 |                                                 mask=mask)
320 |         if options['encoder'] == 'lstm':
321 |             proj = (proj * mask[:, :, None]).sum(axis=0)
322 |             proj = proj / mask.sum(axis=0)[:, None]
323 |         if options['use_dropout']:
324 |             proj = dropout_layer(proj, use_noise, trng)
325 |         lstm_outs.append(proj)
326 | 
327 |     del proj
328 |     pred = tensor.nnet.softmax(tensor.dot(theano.tensor.concatenate(lstm_outs),
329 |                                           tparams['U']) + tparams['b'])
330 |     pred = pred.reshape((2, pred.shape[0]/2, pred.shape[1])).mean(axis=0)
331 | 
332 |     f_pred_prob = theano.function([x, rx, mask], pred, name='f_pred_prob')
333 |     f_pred = theano.function([x, rx, mask], pred.argmax(axis=1), name='f_pred')
334 | 
335 |     cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
336 | 
337 |     return use_noise, x, rx, mask, y, f_pred_prob, f_pred, cost
338 | 
339 | 
340 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
341 |     """ If you want to use a trained model, this is useful to compute
342 |     the probabilities of new examples.
343 |     """
344 |     n_samples = len(data[0])
345 |     probs = numpy.zeros((n_samples, 2)).astype('float32')
346 | 
347 |     n_done = 0
348 | 
349 |     for _, valid_index in iterator:
350 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
351 |                                   numpy.array(data[1])[valid_index],
352 |                                   maxlen=None)
353 |         rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index],
354 |                                 numpy.array(data[1])[valid_index],
355 |                                 maxlen=None)
356 |         pred_probs = f_pred_prob(x, rx, mask)
357 |         probs[valid_index, :] = pred_probs
358 | 
359 |         n_done += len(valid_index)
360 |         if verbose:
361 |             print '%d/%d samples classified' % (n_done, n_samples)
362 | 
363 |     return probs
364 | 
365 | 
366 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
367 |     """
368 |     Compute the error rate.
369 |     f_pred: Theano function computing the prediction
370 |     prepare_data: usual prepare_data for that dataset.
371 |     """
372 |     valid_err = 0
373 |     for _, valid_index in iterator:
374 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
375 |                                   numpy.array(data[1])[valid_index],
376 |                                   maxlen=None)
377 |         rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index],
378 |                                 numpy.array(data[1])[valid_index],
379 |                                 maxlen=None)
380 |         preds = f_pred(x, rx, mask)
381 |         targets = numpy.array(data[1])[valid_index]
382 |         valid_err += (preds == targets).sum()
383 |     valid_err = 1. - numpy.float32(valid_err) / len(data[0])
384 | 
385 |     return valid_err
386 | 
387 | 
388 | def train_lstm(
389 |     train, valid, test,
390 |     dim_proj=128,  # word embedding dimension and LSTM number of hidden units.
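    # (lstm_double.diff leaves every default below untouched; the doubled
    # model changes the graph and the function signatures, not these values.)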
391 | patience=10, # Number of epoch to wait before early stop if no progress 392 | max_epochs=5000, # The maximum number of epoch to run 393 | dispFreq=10, # Display to stdout the training progress every N updates 394 | decay_c=0., # Weight decay for the classifier applied to the U weights. 395 | lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) 396 | n_words=10000, # Vocabulary size 397 | # sgd, adadelta and rmsprop available, 398 | # sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). 399 | optimizer=adadelta, 400 | encoder='lstm', # TODO: can be removed must be lstm. 401 | saveto='lstm_model.npz', # The best model will be saved there 402 | validFreq=370, # Compute the validation error after this number of update. 403 | saveFreq=1110, # Save the parameters after every saveFreq updates 404 | batch_size=16, # The batch size during training. 405 | valid_batch_size=64, # The batch size used for validation/test set. 406 | dataset='imdb', 407 | 408 | # Parameter for extra option 409 | noise_std=0., 410 | use_dropout=True, # if False slightly faster, but worst test error 411 | # This frequently need a bigger model. 412 | reload_model="", # Path to a saved model we want to start from. 413 | test_size=-1, # If >0, we will trunc the test set to this number of example. 414 | ): 415 | 416 | # Model options 417 | model_options = locals().copy() 418 | del model_options['train'] 419 | del model_options['valid'] 420 | del model_options['test'] 421 | print "model options", model_options 422 | 423 | if test_size > 0: 424 | test = (test[0][:test_size], test[1][:test_size]) 425 | 426 | ydim = numpy.max(train[1]) + 1 427 | 428 | model_options['ydim'] = ydim 429 | 430 | print 'Building model' 431 | # This create the initial parameters as numpy ndarrays. 432 | # Dict name (string) -> numpy ndarray 433 | params = init_params(model_options) 434 | 435 | if reload_model: 436 | load_params('lstm_model.npz', params) 437 | 438 | # This create Theano Shared Variable from the parameters. 439 | # Dict name (string) -> Theano Tensor Shared Variable 440 | # params and tparams have different copy of the weights. 441 | tparams = init_tparams(params) 442 | 443 | # use_noise is for dropout 444 | (use_noise, x, rx, mask, 445 | y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) 446 | 447 | if decay_c > 0.: 448 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 449 | weight_decay = 0. 
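        # L2 penalty on the classifier weights only: the cost becomes
        # cost + decay_c * sum(U ** 2); Wemb and the LSTM weights are
        # left unregularized.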
450 |         weight_decay += (tparams['U'] ** 2).sum()
451 |         weight_decay *= decay_c
452 |         cost += weight_decay
453 | 
454 |     f_cost = theano.function([x, rx, mask, y], cost, name='f_cost')
455 | 
456 |     grads = tensor.grad(cost, wrt=tparams.values())
457 |     f_grad = theano.function([x, rx, mask, y], grads, name='f_grad')
458 | 
459 |     lr = tensor.scalar(name='lr')
460 |     f_grad_shared, f_update = optimizer(lr, tparams, grads,
461 |                                         x, rx, mask, y, cost)
462 | 
463 |     print 'Training'
464 | 
465 |     kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
466 |                                    shuffle=True)
467 |     kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
468 |                                   shuffle=True)
469 | 
470 |     print "%d train examples" % len(train[0])
471 |     print "%d valid examples" % len(valid[0])
472 |     print "%d test examples" % len(test[0])
473 |     history_errs = []
474 |     best_p = None
475 |     bad_counter = 0
476 | 
477 |     if validFreq == -1:
478 |         validFreq = len(train[0]) / batch_size
479 |     if saveFreq == -1:
480 |         saveFreq = len(train[0]) / batch_size
481 | 
482 |     uidx = 0  # the number of updates done
483 |     estop = False  # early stop
484 |     start_time = time.clock()
485 |     try:
486 |         for eidx in xrange(max_epochs):
487 |             n_samples = 0
488 | 
489 |             # Get new shuffled index for the training set.
490 |             kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
491 | 
492 |             for _, train_index in kf:
493 |                 uidx += 1
494 |                 use_noise.set_value(1.)
495 | 
496 |                 # Select the random examples for this minibatch
497 |                 y = [train[1][t] for t in train_index]
498 |                 x = [train[0][t] for t in train_index]
499 | 
500 |                 # Get the data in numpy.ndarray format. This swaps the
501 |                 # axes and returns arrays of shape (maxlen, n_samples).
502 |                 # rx must be built from the raw lists, before padding.
503 |                 rx, _, _ = prepare_data([s[::-1] for s in x], y)
504 |                 x, mask, y = prepare_data(x, y)
505 |                 n_samples += x.shape[1]
506 | 
507 |                 cost = f_grad_shared(x, rx, mask, y)
508 |                 f_update(lrate)
509 | 
510 |                 if numpy.isnan(cost) or numpy.isinf(cost):
511 |                     print 'NaN detected'
512 |                     return 1., 1., 1.
513 | 
514 |                 if numpy.mod(uidx, dispFreq) == 0:
515 |                     print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
516 | 
517 |                 if numpy.mod(uidx, saveFreq) == 0:
518 |                     print 'Saving...',
519 | 
520 |                     if best_p is not None:
521 |                         params = best_p
522 |                     else:
523 |                         params = unzip(tparams)
524 |                     numpy.savez(saveto, history_errs=history_errs, **params)
525 |                     pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
526 |                     print 'Done'
527 | 
528 |                 if numpy.mod(uidx, validFreq) == 0:
529 |                     use_noise.set_value(0.)
530 |                     train_err = pred_error(f_pred, prepare_data, train, kf)
531 |                     valid_err = pred_error(f_pred, prepare_data, valid,
532 |                                            kf_valid)
533 |                     test_err = pred_error(f_pred, prepare_data, test, kf_test)
534 | 
535 |                     history_errs.append([valid_err, test_err])
536 | 
537 |                     if (uidx == 0 or
538 |                         valid_err <= numpy.array(history_errs)[:,
539 |                                                                0].min()):
540 | 
541 |                         best_p = unzip(tparams)
542 |                         bad_counter = 0
543 | 
544 |                     print ('Train ', train_err, 'Valid ', valid_err,
545 |                            'Test ', test_err)
546 | 
547 |                     if (len(history_errs) > patience and
548 |                         valid_err >= numpy.array(history_errs)[:-patience,
549 |                                                                0].min()):
550 |                         bad_counter += 1
551 |                         if bad_counter > patience:
552 |                             print 'Early Stop!'
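                            # valid_err has stopped improving on the best
                            # value recorded at least `patience` validations
                            # ago, so give up and fall back to best_p.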
553 | estop = True 554 | break 555 | 556 | print 'Seen %d samples' % n_samples 557 | 558 | if estop: 559 | break 560 | 561 | except KeyboardInterrupt: 562 | print "Training interupted" 563 | 564 | end_time = time.clock() 565 | print "Training done" 566 | if best_p is not None: 567 | zipp(best_p, tparams) 568 | else: 569 | best_p = unzip(tparams) 570 | 571 | print "Computing errors" 572 | use_noise.set_value(0.) 573 | train_err = pred_error(f_pred, prepare_data, train, kf) 574 | valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) 575 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 576 | 577 | print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err 578 | 579 | numpy.savez(saveto, train_err=train_err, 580 | valid_err=valid_err, test_err=test_err, 581 | history_errs=history_errs, **best_p) 582 | print 'The code run for %d epochs, with %f sec/epochs' % ( 583 | (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))) 584 | print >> sys.stderr, ('Training took %.1fs' % 585 | (end_time - start_time)) 586 | return train_err, valid_err, test_err 587 | 588 | 589 | # We must have floatX=float32 for this tutorial to work correctly. 590 | theano.config.floatX = "float32" 591 | # The next line is the new Theano default. This is a speed up. 592 | theano.config.scan.allow_gc = False 593 | 594 | print 'Loading data' 595 | n_words = 10000 596 | load_data, prepare_data = get_dataset("imdb") 597 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05, 598 | maxlen=100) 599 | print 'Loading data: Done' 600 | print "See the comment at the end of this cell to train the model." 601 | 602 | train_lstm( 603 | train, valid, test, 604 | # Setting max_epochs to 15 will show that it start to learn ~10m on my laptop. 605 | max_epochs=16, 606 | test_size=500, 607 | n_words=n_words, 608 | ) -------------------------------------------------------------------------------- /lstm_reverse.diff: -------------------------------------------------------------------------------- 1 | --- lstm.py.orig 2015-01-16 17:20:22.075153409 -0800 2 | +++ lstm_reverse.py 2015-01-16 17:21:10.243152608 -0800 3 | @@ -582,6 +582,9 @@ 4 | load_data, prepare_data = get_dataset("imdb") 5 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05, 6 | maxlen=100) 7 | +train = ([r[::-1] for r in train[0]], train[1]) 8 | +valid = ([r[::-1] for r in valid[0]], valid[1]) 9 | +test = ([r[::-1] for r in test[0]], test[1]) 10 | print 'Loading data: Done' 11 | print "See the comment at the end of this cell to train the model." 12 | 13 | -------------------------------------------------------------------------------- /lstm_reverse.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a tweet sentiment analyzer 3 | ''' 4 | from collections import OrderedDict 5 | import cPickle as pkl 6 | import random 7 | import sys 8 | import time 9 | 10 | import numpy 11 | import theano 12 | import theano.tensor as tensor 13 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 14 | 15 | import imdb 16 | 17 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)} 18 | 19 | 20 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 21 | """ 22 | Used to shuffle the dataset at each iteration. 
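    Returns a list of (minibatch index, array of example indices) pairs
    that together cover all n examples.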
23 | """ 24 | 25 | idx_list = numpy.arange(n, dtype="int32") 26 | 27 | if shuffle: 28 | random.shuffle(idx_list) 29 | 30 | minibatches = [] 31 | minibatch_start = 0 32 | for i in range(n // minibatch_size): 33 | minibatches.append(idx_list[minibatch_start: 34 | minibatch_start + minibatch_size]) 35 | minibatch_start += minibatch_size 36 | 37 | if (minibatch_start != n): 38 | # Make a minibatch out of what is left 39 | minibatches.append(idx_list[minibatch_start:]) 40 | 41 | return zip(range(len(minibatches)), minibatches) 42 | 43 | 44 | def get_dataset(name): 45 | return datasets[name][0], datasets[name][1] 46 | 47 | 48 | def zipp(params, tparams): 49 | """ 50 | When we reload the model. Needed for the GPU stuff. 51 | """ 52 | for kk, vv in params.iteritems(): 53 | tparams[kk].set_value(vv) 54 | 55 | 56 | def unzip(zipped): 57 | """ 58 | When we pickle the model. Needed for the GPU stuff. 59 | """ 60 | new_params = OrderedDict() 61 | for kk, vv in zipped.iteritems(): 62 | new_params[kk] = vv.get_value() 63 | return new_params 64 | 65 | 66 | def dropout_layer(state_before, use_noise, trng): 67 | proj = tensor.switch(use_noise, 68 | (state_before * 69 | trng.binomial(state_before.shape, 70 | p=0.5, n=1, 71 | dtype=state_before.dtype)), 72 | state_before * 0.5) 73 | return proj 74 | 75 | 76 | def _p(pp, name): 77 | return '%s_%s' % (pp, name) 78 | 79 | 80 | def init_params(options): 81 | """ 82 | Global (not LSTM) parameter. For the embeding and the classifier. 83 | """ 84 | params = OrderedDict() 85 | # embedding 86 | randn = numpy.random.rand(options['n_words'], 87 | options['dim_proj']) 88 | params['Wemb'] = (0.01 * randn).astype('float32') 89 | params = get_layer(options['encoder'])[0](options, 90 | params, 91 | prefix=options['encoder']) 92 | # classifier 93 | params['U'] = 0.01 * numpy.random.randn(options['dim_proj'], 94 | options['ydim']).astype('float32') 95 | params['b'] = numpy.zeros((options['ydim'],)).astype('float32') 96 | 97 | return params 98 | 99 | 100 | def load_params(path, params): 101 | pp = numpy.load(path) 102 | for kk, vv in params.iteritems(): 103 | if kk not in pp: 104 | raise Warning('%s is not in the archive' % kk) 105 | params[kk] = pp[kk] 106 | 107 | return params 108 | 109 | 110 | def init_tparams(params): 111 | tparams = OrderedDict() 112 | for kk, pp in params.iteritems(): 113 | tparams[kk] = theano.shared(params[kk], name=kk) 114 | return tparams 115 | 116 | 117 | def get_layer(name): 118 | fns = layers[name] 119 | return fns 120 | 121 | 122 | def ortho_weight(ndim): 123 | W = numpy.random.randn(ndim, ndim) 124 | u, s, v = numpy.linalg.svd(W) 125 | return u.astype('float32') 126 | 127 | 128 | def param_init_lstm(options, params, prefix='lstm'): 129 | """ 130 | Init the LSTM parameter: 131 | 132 | :see: init_params 133 | """ 134 | W = numpy.concatenate([ortho_weight(options['dim_proj']), 135 | ortho_weight(options['dim_proj']), 136 | ortho_weight(options['dim_proj']), 137 | ortho_weight(options['dim_proj'])], axis=1) 138 | params[_p(prefix, 'W')] = W 139 | U = numpy.concatenate([ortho_weight(options['dim_proj']), 140 | ortho_weight(options['dim_proj']), 141 | ortho_weight(options['dim_proj']), 142 | ortho_weight(options['dim_proj'])], axis=1) 143 | params[_p(prefix, 'U')] = U 144 | b = numpy.zeros((4 * options['dim_proj'],)) 145 | params[_p(prefix, 'b')] = b.astype('float32') 146 | 147 | return params 148 | 149 | 150 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 151 | nsteps = state_below.shape[0] 152 | if state_below.ndim == 3: 
153 | n_samples = state_below.shape[1] 154 | else: 155 | n_samples = 1 156 | 157 | assert mask is not None 158 | 159 | def _slice(_x, n, dim): 160 | if _x.ndim == 3: 161 | return _x[:, :, n * dim:(n + 1) * dim] 162 | return _x[:, n * dim:(n + 1) * dim] 163 | 164 | def _step(m_, x_, h_, c_): 165 | preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) 166 | preact += x_ 167 | preact += tparams[_p(prefix, 'b')] 168 | 169 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) 170 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) 171 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) 172 | c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) 173 | 174 | c = f * c_ + i * c 175 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 176 | 177 | h = o * tensor.tanh(c) 178 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 179 | 180 | return h, c 181 | 182 | state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 183 | tparams[_p(prefix, 'b')]) 184 | 185 | dim_proj = options['dim_proj'] 186 | rval, updates = theano.scan(_step, 187 | sequences=[mask, state_below], 188 | outputs_info=[tensor.alloc(0., n_samples, 189 | dim_proj), 190 | tensor.alloc(0., n_samples, 191 | dim_proj)], 192 | name=_p(prefix, '_layers'), 193 | n_steps=nsteps) 194 | return rval[0] 195 | 196 | 197 | # ff: Feed Forward (normal neural net), only useful to put after lstm 198 | # before the classifier. 199 | layers = {'lstm': (param_init_lstm, lstm_layer)} 200 | 201 | 202 | def sgd(lr, tparams, grads, x, mask, y, cost): 203 | """ Stochastic Gradient Descent 204 | 205 | :note: A more complicated version of sgd then needed. This is 206 | done like that for adadelta and rmsprop. 207 | 208 | """ 209 | # New set of shared variable that will contain the gradient 210 | # for a mini-batch. 211 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 212 | for k, p in tparams.iteritems()] 213 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 214 | 215 | # Function that computes gradients for a mini-batch, but do not 216 | # updates the weights. 217 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 218 | name='sgd_f_grad_shared') 219 | 220 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 221 | 222 | # Function that updates the weights from the previously computed 223 | # gradient. 
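    # Calling f_update(lr) applies p := p - lr * g using the gradients
    # stored in gshared by the last call to f_grad_shared.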
224 | f_update = theano.function([lr], [], updates=pup, 225 | name='sgd_f_update') 226 | 227 | return f_grad_shared, f_update 228 | 229 | 230 | def adadelta(lr, tparams, grads, x, mask, y, cost): 231 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 232 | name='%s_grad' % k) 233 | for k, p in tparams.iteritems()] 234 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 235 | name='%s_rup2' % k) 236 | for k, p in tparams.iteritems()] 237 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 238 | name='%s_rgrad2' % k) 239 | for k, p in tparams.iteritems()] 240 | 241 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 242 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 243 | for rg2, g in zip(running_grads2, grads)] 244 | 245 | f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up, 246 | name='adadelta_f_grad_shared') 247 | 248 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 249 | for zg, ru2, rg2 in zip(zipped_grads, 250 | running_up2, 251 | running_grads2)] 252 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 253 | for ru2, ud in zip(running_up2, updir)] 254 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 255 | 256 | f_update = theano.function([lr], [], updates=ru2up + param_up, 257 | on_unused_input='ignore', 258 | name='adadelta_f_update') 259 | 260 | return f_grad_shared, f_update 261 | 262 | 263 | def rmsprop(lr, tparams, grads, x, mask, y, cost): 264 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 265 | name='%s_grad' % k) 266 | for k, p in tparams.iteritems()] 267 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 268 | name='%s_rgrad' % k) 269 | for k, p in tparams.iteritems()] 270 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 271 | name='%s_rgrad2' % k) 272 | for k, p in tparams.iteritems()] 273 | 274 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 275 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 276 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 277 | for rg2, g in zip(running_grads2, grads)] 278 | 279 | f_grad_shared = theano.function([x, mask, y], cost, 280 | updates=zgup + rgup + rg2up, 281 | name='rmsprop_f_grad_shared') 282 | 283 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 284 | name='%s_updir' % k) 285 | for k, p in tparams.iteritems()] 286 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 287 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 288 | running_grads2)] 289 | param_up = [(p, p + udn[1]) 290 | for p, udn in zip(tparams.values(), updir_new)] 291 | f_update = theano.function([lr], [], updates=updir_new + param_up, 292 | on_unused_input='ignore', 293 | name='rmsprop_f_update') 294 | 295 | return f_grad_shared, f_update 296 | 297 | 298 | def build_model(tparams, options): 299 | trng = RandomStreams(1234) 300 | 301 | # Used for dropout. 
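    # use_noise is a shared flag: at 1., dropout_layer samples a fresh
    # binomial mask on every call (training); at 0., it instead scales
    # activations by 0.5 (test time).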
302 | use_noise = theano.shared(numpy.float32(0.)) 303 | 304 | x = tensor.matrix('x', dtype='int64') 305 | mask = tensor.matrix('mask', dtype='float32') 306 | y = tensor.vector('y', dtype='int64') 307 | 308 | n_timesteps = x.shape[0] 309 | n_samples = x.shape[1] 310 | 311 | emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, 312 | n_samples, 313 | options['dim_proj']]) 314 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 315 | prefix=options['encoder'], 316 | mask=mask) 317 | if options['encoder'] == 'lstm': 318 | proj = (proj * mask[:, :, None]).sum(axis=0) 319 | proj = proj / mask.sum(axis=0)[:, None] 320 | if options['use_dropout']: 321 | proj = dropout_layer(proj, use_noise, trng) 322 | 323 | pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b']) 324 | 325 | f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') 326 | f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') 327 | 328 | cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean() 329 | 330 | return use_noise, x, mask, y, f_pred_prob, f_pred, cost 331 | 332 | 333 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): 334 | """ If you want to use a trained model, this is useful to compute 335 | the probabilities of new examples. 336 | """ 337 | n_samples = len(data[0]) 338 | probs = numpy.zeros((n_samples, 2)).astype('float32') 339 | 340 | n_done = 0 341 | 342 | for _, valid_index in iterator: 343 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 344 | numpy.array(data[1])[valid_index], 345 | maxlen=None) 346 | pred_probs = f_pred_prob(x, mask) 347 | probs[valid_index, :] = pred_probs 348 | 349 | n_done += len(valid_index) 350 | if verbose: 351 | print '%d/%d samples classified' % (n_done, n_samples) 352 | 353 | return probs 354 | 355 | 356 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False): 357 | """ 358 | Just compute the error 359 | f_pred: Theano fct computing the prediction 360 | prepare_data: usual prepare_data for that dataset. 361 | """ 362 | valid_err = 0 363 | for _, valid_index in iterator: 364 | x, mask, y = prepare_data([data[0][t] for t in valid_index], 365 | numpy.array(data[1])[valid_index], 366 | maxlen=None) 367 | preds = f_pred(x, mask) 368 | targets = numpy.array(data[1])[valid_index] 369 | valid_err += (preds == targets).sum() 370 | valid_err = 1. - numpy.float32(valid_err) / len(data[0]) 371 | 372 | return valid_err 373 | 374 | 375 | def train_lstm( 376 | train, valid, test, 377 | dim_proj=128, # word embeding dimension and LSTM number of hidden units. 378 | patience=10, # Number of epoch to wait before early stop if no progress 379 | max_epochs=5000, # The maximum number of epoch to run 380 | dispFreq=10, # Display to stdout the training progress every N updates 381 | decay_c=0., # Weight decay for the classifier applied to the U weights. 382 | lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) 383 | n_words=10000, # Vocabulary size 384 | # sgd, adadelta and rmsprop available, 385 | # sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). 386 | optimizer=adadelta, 387 | encoder='lstm', # TODO: can be removed must be lstm. 388 | saveto='lstm_model.npz', # The best model will be saved there 389 | validFreq=370, # Compute the validation error after this number of update. 390 | saveFreq=1110, # Save the parameters after every saveFreq updates 391 | batch_size=16, # The batch size during training. 
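    # Evaluation can afford larger batches: only forward passes are run,
    # so no gradients have to be stored.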
392 | valid_batch_size=64, # The batch size used for validation/test set. 393 | dataset='imdb', 394 | 395 | # Parameter for extra option 396 | noise_std=0., 397 | use_dropout=True, # if False slightly faster, but worst test error 398 | # This frequently need a bigger model. 399 | reload_model="", # Path to a saved model we want to start from. 400 | test_size=-1, # If >0, we will trunc the test set to this number of example. 401 | ): 402 | 403 | # Model options 404 | model_options = locals().copy() 405 | del model_options['train'] 406 | del model_options['valid'] 407 | del model_options['test'] 408 | print "model options", model_options 409 | 410 | if test_size > 0: 411 | test = (test[0][:test_size], test[1][:test_size]) 412 | 413 | ydim = numpy.max(train[1]) + 1 414 | 415 | model_options['ydim'] = ydim 416 | 417 | print 'Building model' 418 | # This create the initial parameters as numpy ndarrays. 419 | # Dict name (string) -> numpy ndarray 420 | params = init_params(model_options) 421 | 422 | if reload_model: 423 | load_params('lstm_model.npz', params) 424 | 425 | # This create Theano Shared Variable from the parameters. 426 | # Dict name (string) -> Theano Tensor Shared Variable 427 | # params and tparams have different copy of the weights. 428 | tparams = init_tparams(params) 429 | 430 | # use_noise is for dropout 431 | (use_noise, x, mask, 432 | y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) 433 | 434 | if decay_c > 0.: 435 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 436 | weight_decay = 0. 437 | weight_decay += (tparams['U'] ** 2).sum() 438 | weight_decay *= decay_c 439 | cost += weight_decay 440 | 441 | f_cost = theano.function([x, mask, y], cost, name='f_cost') 442 | 443 | grads = tensor.grad(cost, wrt=tparams.values()) 444 | f_grad = theano.function([x, mask, y], grads, name='f_grad') 445 | 446 | lr = tensor.scalar(name='lr') 447 | f_grad_shared, f_update = optimizer(lr, tparams, grads, 448 | x, mask, y, cost) 449 | 450 | print 'Training' 451 | 452 | kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size, 453 | shuffle=True) 454 | kf_test = get_minibatches_idx(len(test[0]), valid_batch_size, 455 | shuffle=True) 456 | 457 | print "%d train examples" % len(train[0]) 458 | print "%d valid examples" % len(valid[0]) 459 | print "%d test examples" % len(test[0]) 460 | history_errs = [] 461 | best_p = None 462 | bad_count = 0 463 | 464 | if validFreq == -1: 465 | validFreq = len(train[0]) / batch_size 466 | if saveFreq == -1: 467 | saveFreq = len(train[0]) / batch_size 468 | 469 | uidx = 0 # the number of update done 470 | estop = False # early stop 471 | start_time = time.clock() 472 | try: 473 | for eidx in xrange(max_epochs): 474 | n_samples = 0 475 | 476 | # Get new shuffled index for the training set. 477 | kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) 478 | 479 | for _, train_index in kf: 480 | uidx += 1 481 | use_noise.set_value(1.) 482 | 483 | # Select the random examples for this minibatch 484 | y = [train[1][t] for t in train_index] 485 | x = [train[0][t] for t in train_index] 486 | 487 | # Get the data in numpy.ndarray format 488 | # This swap the axis! 489 | # Return something of shape (minibatch maxlen, n samples) 490 | x, mask, y = prepare_data(x, y) 491 | n_samples += x.shape[1] 492 | 493 | cost = f_grad_shared(x, mask, y) 494 | f_update(lrate) 495 | 496 | if numpy.isnan(cost) or numpy.isinf(cost): 497 | print 'NaN detected' 498 | return 1., 1., 1. 
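                # Everything below is bookkeeping: display progress every
                # dispFreq updates, checkpoint every saveFreq updates and
                # validate every validFreq updates.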
499 | 500 | if numpy.mod(uidx, dispFreq) == 0: 501 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost 502 | 503 | if numpy.mod(uidx, saveFreq) == 0: 504 | print 'Saving...', 505 | 506 | if best_p is not None: 507 | params = best_p 508 | else: 509 | params = unzip(tparams) 510 | numpy.savez(saveto, history_errs=history_errs, **params) 511 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 512 | print 'Done' 513 | 514 | if numpy.mod(uidx, validFreq) == 0: 515 | use_noise.set_value(0.) 516 | train_err = pred_error(f_pred, prepare_data, train, kf) 517 | valid_err = pred_error(f_pred, prepare_data, valid, 518 | kf_valid) 519 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 520 | 521 | history_errs.append([valid_err, test_err]) 522 | 523 | if (uidx == 0 or 524 | valid_err <= numpy.array(history_errs)[:, 525 | 0].min()): 526 | 527 | best_p = unzip(tparams) 528 | bad_counter = 0 529 | 530 | print ('Train ', train_err, 'Valid ', valid_err, 531 | 'Test ', test_err) 532 | 533 | if (len(history_errs) > patience and 534 | valid_err >= numpy.array(history_errs)[:-patience, 535 | 0].min()): 536 | bad_counter += 1 537 | if bad_counter > patience: 538 | print 'Early Stop!' 539 | estop = True 540 | break 541 | 542 | print 'Seen %d samples' % n_samples 543 | 544 | if estop: 545 | break 546 | 547 | except KeyboardInterrupt: 548 | print "Training interupted" 549 | 550 | end_time = time.clock() 551 | print "Training done" 552 | if best_p is not None: 553 | zipp(best_p, tparams) 554 | else: 555 | best_p = unzip(tparams) 556 | 557 | print "Computing errors" 558 | use_noise.set_value(0.) 559 | train_err = pred_error(f_pred, prepare_data, train, kf) 560 | valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) 561 | test_err = pred_error(f_pred, prepare_data, test, kf_test) 562 | 563 | print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err 564 | 565 | numpy.savez(saveto, train_err=train_err, 566 | valid_err=valid_err, test_err=test_err, 567 | history_errs=history_errs, **best_p) 568 | print 'The code run for %d epochs, with %f sec/epochs' % ( 569 | (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))) 570 | print >> sys.stderr, ('Training took %.1fs' % 571 | (end_time - start_time)) 572 | return train_err, valid_err, test_err 573 | 574 | 575 | # We must have floatX=float32 for this tutorial to work correctly. 576 | theano.config.floatX = "float32" 577 | # The next line is the new Theano default. This is a speed up. 578 | theano.config.scan.allow_gc = False 579 | 580 | print 'Loading data' 581 | n_words = 10000 582 | load_data, prepare_data = get_dataset("imdb") 583 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05, 584 | maxlen=100) 585 | train = ([r[::-1] for r in train[0]], train[1]) 586 | valid = ([r[::-1] for r in valid[0]], valid[1]) 587 | test = ([r[::-1] for r in test[0]], test[1]) 588 | print 'Loading data: Done' 589 | print "See the comment at the end of this cell to train the model." 590 | 591 | # See function train for all possible parameter and there definition. 592 | #train_lstm( 593 | # train, valid, test, 594 | # max_epochs=16, 595 | # test_size=500, 596 | # n_words=n_words, 597 | #) 598 | --------------------------------------------------------------------------------
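A note on the reversed inputs used by lstm_double.py and lstm_reverse.py: the
reversal must be applied to the raw, variable-length token lists before
prepare_data pads them. Reversing an already padded matrix moves the zero
padding in front of the tokens and shifts every real word. The sketch below
illustrates the pitfall with plain numpy; `pad` is a hypothetical stand-in for
imdb.prepare_data that only mimics its (maxlen, n_samples) layout and mask,
not its full behaviour.

import numpy

def pad(seqs):
    # Hypothetical stand-in for imdb.prepare_data: one column per sample,
    # one row per timestep, zero-padded to the longest sequence.
    maxlen = max(len(s) for s in seqs)
    x = numpy.zeros((maxlen, len(seqs)), dtype='int64')
    mask = numpy.zeros((maxlen, len(seqs)), dtype='float32')
    for i, s in enumerate(seqs):
        x[:len(s), i] = s
        mask[:len(s), i] = 1.
    return x, mask

seqs = [[1, 2, 3], [4, 5]]
x, mask = pad(seqs)
rx, _ = pad([s[::-1] for s in seqs])   # right: reverse the lists, then pad
assert rx[:2, 1].tolist() == [5, 4]
assert x[::-1][0, 1] == 0              # wrong: reversing the padded matrix
                                       # puts the padding first

# The same mask drives the mean pooling in build_model: summing h * mask
# over time and dividing by the true lengths averages only real timesteps.
h = numpy.ones((x.shape[0], x.shape[1], 2))  # fake (time, samples, dim) states
pooled = (h * mask[:, :, None]).sum(axis=0) / mask.sum(axis=0)[:, None]
assert numpy.allclose(pooled, 1.)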