├── .gitignore
├── 01_scalar_soln.py
├── 02_vector_mat_soln.py
├── 03_tensor_soln.py
├── 04_function_soln.py
├── 05_shared_soln.py
├── 06_grad_soln.py
├── 07_mode.py
├── 08_scan_polynomial_soln.py
├── Exercices.ipynb
├── README.md
├── imdb.py
├── lstm.py
├── lstm_double.diff
├── lstm_double.py
├── lstm_reverse.diff
└── lstm_reverse.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # PyInstaller
26 | #  Usually these files are written by a python script from a template
27 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 | 
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 | 
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 | 
43 | # Translations
44 | *.mo
45 | *.pot
46 | 
47 | # Django stuff:
48 | *.log
49 | 
50 | # Sphinx documentation
51 | docs/_build/
52 | 
53 | # PyBuilder
54 | target/
55 | 


--------------------------------------------------------------------------------
/01_scalar_soln.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from theano import function
 3 | import theano.tensor as T
 4 | 
 5 | 
 6 | def make_scalar():
 7 |     """
 8 |     Returns a new Theano scalar.
 9 |     """
10 | 
11 |     return T.scalar()
12 | 
13 | 
14 | def log(x):
15 |     """
16 |     Returns the logarithm of a Theano scalar x.
17 |     """
18 | 
19 |     return T.log(x)
20 | 
21 | 
22 | def add(x, y):
23 |     """
24 |     Adds two theano scalars together and returns the result.
25 |     """
26 | 
27 |     return x + y
28 | 
29 | a = make_scalar()
30 | b = make_scalar()
31 | c = log(b)
32 | d = add(a, c)
33 | f = function([a, b], d)
34 | a = np.cast[a.dtype](1.)
35 | b = np.cast[b.dtype](2.)
36 | actual = f(a, b)
37 | expected = 1. + np.log(2.)
38 | assert np.allclose(actual, expected)
39 | print "SUCCESS!"
40 | 


--------------------------------------------------------------------------------
/02_vector_mat_soln.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from theano import function
 3 | import theano.tensor as T
 4 | 
 5 | 
 6 | def make_vector():
 7 |     """
 8 |     Returns a new Theano vector.
 9 |     """
10 | 
11 |     return T.vector()
12 | 
13 | 
14 | def make_matrix():
15 |     """
16 |     Returns a new Theano matrix.
17 |     """
18 | 
19 |     return T.matrix()
20 | 
21 | 
22 | def elemwise_mul(a, b):
23 |     """
24 |     a: A theano matrix
25 |     b: A theano matrix
26 |     Returns the elementwise product of a and b
27 |     """
28 | 
29 |     return a * b
30 | 
31 | 
32 | def matrix_vector_mul(a, b):
33 |     """
34 |     a: A theano matrix
35 |     b: A theano vector
36 |     Returns the matrix-vector product of a and b
37 |     """
38 | 
39 |     return T.dot(a, b)
40 | 
41 | a = make_vector()
42 | b = make_vector()
43 | c = elemwise_mul(a, b)
44 | d = make_matrix()
45 | e = matrix_vector_mul(d, c)
46 | 
47 | f = function([a, b, d], e)
48 | 
49 | rng = np.random.RandomState([1, 2, 3])
50 | a_value = rng.randn(5).astype(a.dtype)
51 | b_value = rng.rand(5).astype(b.dtype)
52 | c_value = a_value * b_value
53 | d_value = rng.randn(5, 5).astype(d.dtype)
54 | expected = np.dot(d_value, c_value)
55 | 
56 | actual = f(a_value, b_value, d_value)
57 | 
58 | assert np.allclose(actual, expected)
59 | print "SUCCESS!"
60 | 


--------------------------------------------------------------------------------
/03_tensor_soln.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from theano import function
 3 | import theano.tensor as T
 4 | 
 5 | 
 6 | def make_tensor(dim):
 7 |     """
 8 |     Returns a new Theano tensor with no broadcastable dimensions.
 9 |     dim: the total number of dimensions of the tensor.
10 |     """
11 | 
12 |     return T.TensorType(broadcastable=tuple([False] * dim), dtype='float32')()
13 | 
14 | 
15 | def broadcasted_add(a, b):
16 |     """
17 |     a: a 3D theano tensor
18 |     b: a 4D theano tensor
19 |     Returns c, a 4D theano tensor, where
20 | 
21 |     c[i, j, k, l] = a[l, k, i] + b[i, j, k, l]
22 | 
23 |     for all i, j, k, l
24 |     """
25 | 
26 |     return a.dimshuffle(2, 'x', 1, 0) + b
27 | 
28 | 
29 | def partial_max(a):
30 |     """
31 |     a: a 4D theano tensor
32 | 
33 |     Returns b, a theano matrix, where
34 | 
35 |     b[i, j] = max_{k,l} a[i, k, l, j]
36 | 
37 |     for all i, j
38 |     """
39 | 
40 |     return a.max(axis=(1, 2))
41 | 
42 | a = make_tensor(3)
43 | b = make_tensor(4)
44 | c = broadcasted_add(a, b)
45 | d = partial_max(c)
46 | 
47 | f = function([a, b], d)
48 | 
49 | rng = np.random.RandomState([1, 2, 3])
50 | a_value = rng.randn(2, 2, 2).astype(a.dtype)
51 | b_value = rng.rand(2, 2, 2, 2).astype(b.dtype)
52 | c_value = np.transpose(a_value, (2, 1, 0))[:, None, :, :] + b_value
53 | expected = c_value.max(axis=1).max(axis=1)
54 | 
55 | actual = f(a_value, b_value)
56 | 
57 | assert np.allclose(actual, expected), (actual, expected)
58 | print "SUCCESS!"
59 | 


--------------------------------------------------------------------------------
/04_function_soln.py:
--------------------------------------------------------------------------------
 1 | from theano import tensor as T
 2 | from theano import function
 3 | 
 4 | 
 5 | def evaluate(x, y, expr, x_value, y_value):
 6 |     """
 7 |     x: A theano variable
 8 |     y: A theano variable
 9 |     expr: A theano expression involving x and y
10 |     x_value: A numpy value
11 |     y_value: A numpy value
12 | 
13 |     Returns the value of expr when x_value is substituted for x
14 |     and y_value is substituted for y
15 |     """
16 | 
17 |     return function([x, y], expr)(x_value, y_value)
18 | 
19 | 
20 | x = T.iscalar()
21 | y = T.iscalar()
22 | z = x + y
23 | assert evaluate(x, y, z, 1, 2) == 3
24 | print "SUCCESS!"
25 | 


--------------------------------------------------------------------------------
/05_shared_soln.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from theano.compat.python2x import OrderedDict
 3 | from theano import function
 4 | from theano import shared
 5 | 
 6 | 
 7 | def make_shared(shape):
 8 |     """
 9 |     Returns a theano shared variable containing a tensor of the specified
10 |     shape.
11 |     You can use any value you want.
12 |     """
13 |     return shared(np.zeros(shape))
14 | 
15 | 
16 | def exchange_shared(a, b):
17 |     """
18 |     a: a theano shared variable
19 |     b: a theano shared variable
20 |     Uses get_value and set_value to swap the values stored in a and b
21 |     """
22 |     temp = a.get_value()
23 |     a.set_value(b.get_value())
24 |     b.set_value(temp)
25 | 
26 | 
27 | def make_exchange_func(a, b):
28 |     """
29 |     a: a theano shared variable
30 |     b: a theano shared variable
31 |     Returns f
32 |     where f is a theano function, that, when called, swaps the
33 |     values in a and b
34 |     f should not return anything
35 |     """
36 | 
37 |     updates = OrderedDict()
38 |     updates[a] = b
39 |     updates[b] = a
40 |     f = function([], updates=updates)
41 |     return f
42 | 
43 | 
44 | a = make_shared((5, 4, 3))
45 | assert a.get_value().shape == (5, 4, 3)
46 | b = make_shared((5, 4, 3))
47 | assert a.get_value().shape == (5, 4, 3)
48 | a.set_value(np.zeros((5, 4, 3), dtype=a.dtype))
49 | b.set_value(np.ones((5, 4, 3), dtype=b.dtype))
50 | exchange_shared(a, b)
51 | assert np.all(a.get_value() == 1.)
52 | assert np.all(b.get_value() == 0.)
53 | f = make_exchange_func(a, b)
54 | rval = f()
55 | assert isinstance(rval, list)
56 | assert len(rval) == 0
57 | assert np.all(a.get_value() == 0.)
58 | assert np.all(b.get_value() == 1.)
59 | 
60 | print "SUCCESS!"
61 | 


--------------------------------------------------------------------------------
/06_grad_soln.py:
--------------------------------------------------------------------------------
 1 | # Fill in the TODOs in this exercise, then run
 2 | # python 01_grad.py to see if your solution works!
 3 | #
 4 | from theano import tensor as T
 5 | 
 6 | 
 7 | def grad_sum(x, y, z):
 8 |     """
 9 |     x: A theano variable
10 |     y: A theano variable
11 |     z: A theano expression involving x and y
12 | 
13 |     Returns dz / dx + dz / dy
14 |     """
15 | 
16 |     return sum(T.grad(z, [x, y]))
17 | 
18 | x = T.scalar()
19 | y = T.scalar()
20 | z = x + y
21 | s = grad_sum(x, y, z)
22 | assert s.eval({x: 0, y: 0}) == 2
23 | print "SUCCESS!"
24 | 


--------------------------------------------------------------------------------
/07_mode.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from theano import function
 3 | from theano import tensor as T
 4 | from theano import config
 5 | config.compute_test_value = 'raise'
 6 | a = T.vector()
 7 | a.tag.test_value = np.ones((3,)).astype(a.dtype)
 8 | b = T.log(a)
 9 | c = T.nnet.sigmoid(b)
10 | d = T.sqrt(c)
11 | e = T.concatenate((d, c), axis=0)
12 | f = b * c * d
13 | # This is the first bad line
14 | g = e + f
15 | h = g / c
16 | fn = function([a], h)
17 | fn(np.ones((3,)).astype(a.dtype))
18 | 


--------------------------------------------------------------------------------
/08_scan_polynomial_soln.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | 
 3 | import theano
 4 | import theano.tensor as tt
 5 | 
 6 | theano.config.warn.subtensor_merge_bug = False
 7 | 
 8 | coefficients = tt.vector("coefficients")
 9 | x = tt.scalar("x")
10 | max_coefficients_supported = 10000
11 | 
12 | # Generate the components of the polynomial
13 | full_range = tt.arange(max_coefficients_supported)
14 | 
15 | 
16 | outputs_info = tt.as_tensor_variable(numpy.asarray(0, 'float64'))
17 | 
18 | components, updates = theano.scan(
19 |     fn=lambda coeff, power, prior_value, free_var:
20 |     prior_value + (coeff * (free_var ** power)),
21 |     sequences=[coefficients, full_range],
22 |     outputs_info=outputs_info,
23 |     non_sequences=x)
24 | 
25 | polynomial = components[-1]
26 | calculate_polynomial = theano.function(
27 |     inputs=[coefficients, x],
28 |     outputs=polynomial, updates=updates)
29 | 
30 | test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
31 | print calculate_polynomial(test_coeff, 3)
32 | # 19.0
33 | 


--------------------------------------------------------------------------------
/Exercices.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "Exercices",
  4 |   "signature": "sha256:77d5115ebf6c96f00122775403f827b913b9144aa466e24779ec3e5729e36d94"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "code",
 13 |      "collapsed": false,
 14 |      "input": [],
 15 |      "language": "python",
 16 |      "metadata": {},
 17 |      "outputs": []
 18 |     },
 19 |     {
 20 |      "cell_type": "markdown",
 21 |      "metadata": {},
 22 |      "source": [
 23 |       "The exercices work that way:\n",
 24 |       "\n",
 25 |       "1) You have a cell with TODOs that raise errors with a description of what is needed. Do that.\n",
 26 |       "2) Then run the cell(ctrl-enter) to execute it.\n",
 27 |       "3) It should print \"Success\" at the end (there is validation code in the cell). If not, try again.\n",
 28 |       "4) If you want to see the solution, execute the cell that start with \"%load\" after the exercice.\n",
 29 |       "\n",
 30 |       "First, there are Theano exercices, then 1 scan specific exercics, then some exercices related to the LSTM example."
 31 |      ]
 32 |     },
 33 |     {
 34 |      "cell_type": "code",
 35 |      "collapsed": false,
 36 |      "input": [
 37 |       "# Exercices 1\n",
 38 |       "# This exercice ask you to create Theano variable and do some\n",
 39 |       "# computation on them.\n",
 40 |       "import numpy as np\n",
 41 |       "from theano import function\n",
 42 |       "raise NotImplementedError(\"TODO: add any other imports you need\")\n",
 43 |       "\n",
 44 |       "\n",
 45 |       "def make_scalar():\n",
 46 |       "    \"\"\"\n",
 47 |       "    Returns a new Theano scalar.\n",
 48 |       "    \"\"\"\n",
 49 |       "\n",
 50 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
 51 |       "\n",
 52 |       "\n",
 53 |       "def log(x):\n",
 54 |       "    \"\"\"\n",
 55 |       "    Returns the logarithm of a Theano scalar x.\n",
 56 |       "    \"\"\"\n",
 57 |       "\n",
 58 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
 59 |       "\n",
 60 |       "\n",
 61 |       "def add(x, y):\n",
 62 |       "    \"\"\"\n",
 63 |       "    Adds two theano scalars together and returns the result.\n",
 64 |       "    \"\"\"\n",
 65 |       "\n",
 66 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
 67 |       "    \n",
 68 |       "# The following code use your code and test it.\n",
 69 |       "a = make_scalar()\n",
 70 |       "b = make_scalar()\n",
 71 |       "c = log(b)\n",
 72 |       "d = add(a, c)\n",
 73 |       "f = function([a, b], d)\n",
 74 |       "a = np.cast[a.dtype](1.)\n",
 75 |       "b = np.cast[b.dtype](2.)\n",
 76 |       "actual = f(a, b)\n",
 77 |       "expected = 1. + np.log(2.)\n",
 78 |       "assert np.allclose(actual, expected)\n",
 79 |       "print \"SUCCESS!\"\n"
 80 |      ],
 81 |      "language": "python",
 82 |      "metadata": {},
 83 |      "outputs": []
 84 |     },
 85 |     {
 86 |      "cell_type": "code",
 87 |      "collapsed": false,
 88 |      "input": [
 89 |       "%load 01_scalar_soln.py"
 90 |      ],
 91 |      "language": "python",
 92 |      "metadata": {},
 93 |      "outputs": []
 94 |     },
 95 |     {
 96 |      "cell_type": "code",
 97 |      "collapsed": false,
 98 |      "input": [
 99 |       "# Exercices 2\n",
100 |       "# This exercices ask you to make Theano variable, elemwise\n",
101 |       "# multiplication and matrix/vector dot product.\n",
102 |       "import numpy as np\n",
103 |       "from theano import function\n",
104 |       "raise NotImplementedError(\"TODO: add any other imports you need\")\n",
105 |       "\n",
106 |       "\n",
107 |       "def make_vector():\n",
108 |       "    \"\"\"\n",
109 |       "    Returns a new Theano vector.\n",
110 |       "    \"\"\"\n",
111 |       "\n",
112 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
113 |       "\n",
114 |       "\n",
115 |       "def make_matrix():\n",
116 |       "    \"\"\"\n",
117 |       "    Returns a new Theano matrix.\n",
118 |       "    \"\"\"\n",
119 |       "\n",
120 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
121 |       "\n",
122 |       "def elemwise_mul(a, b):\n",
123 |       "    \"\"\"\n",
124 |       "    a: A theano matrix\n",
125 |       "    b: A theano matrix\n",
126 |       "    Returns the elementwise product of a and b\n",
127 |       "    \"\"\"\n",
128 |       "\n",
129 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
130 |       "\n",
131 |       "\n",
132 |       "def matrix_vector_mul(a, b):\n",
133 |       "    \"\"\"\n",
134 |       "    a: A theano matrix\n",
135 |       "    b: A theano vector\n",
136 |       "    Returns the matrix-vector product of a and b\n",
137 |       "    \"\"\"\n",
138 |       "\n",
139 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
140 |       "\n",
141 |       "# The following code use your code and test it.\n",
142 |       "a = make_vector()\n",
143 |       "b = make_vector()\n",
144 |       "c = elemwise_mul(a, b)\n",
145 |       "d = make_matrix()\n",
146 |       "e = matrix_vector_mul(d, c)\n",
147 |       "\n",
148 |       "f = function([a, b, d], e)\n",
149 |       "\n",
150 |       "rng = np.random.RandomState([1, 2, 3])\n",
151 |       "a_value = rng.randn(5).astype(a.dtype)\n",
152 |       "b_value = rng.rand(5).astype(b.dtype)\n",
153 |       "c_value = a_value * b_value\n",
154 |       "d_value = rng.randn(5, 5).astype(d.dtype)\n",
155 |       "expected = np.dot(d_value, c_value)\n",
156 |       "\n",
157 |       "actual = f(a_value, b_value, d_value)\n",
158 |       "assert np.allclose(actual, expected)\n",
159 |       "print \"SUCCESS!\"\n"
160 |      ],
161 |      "language": "python",
162 |      "metadata": {},
163 |      "outputs": []
164 |     },
165 |     {
166 |      "cell_type": "code",
167 |      "collapsed": false,
168 |      "input": [
169 |       "%load 02_vector_mat_soln.py"
170 |      ],
171 |      "language": "python",
172 |      "metadata": {},
173 |      "outputs": []
174 |     },
175 |     {
176 |      "cell_type": "code",
177 |      "collapsed": false,
178 |      "input": [
179 |       "# Exercices 3\n",
180 |       "# This exercices ask you to create Theano tensor variable, do\n",
181 |       "# broadcastable addition and to compute the max over part of a tensor.\n",
182 |       "import numpy as np\n",
183 |       "from theano import function\n",
184 |       "raise NotImplementedError(\"TODO: add any other imports you need\")\n",
185 |       "\n",
186 |       "\n",
187 |       "def make_tensor(dim):\n",
188 |       "    \"\"\"\n",
189 |       "    Returns a new Theano tensor with no broadcastable dimensions.\n",
190 |       "    dim: the total number of dimensions of the tensor.\n",
191 |       "    (You can use any dtype you like)\n",
192 |       "    \"\"\"\n",
193 |       "\n",
194 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
195 |       "\n",
196 |       "\n",
197 |       "def broadcasted_add(a, b):\n",
198 |       "    \"\"\"\n",
199 |       "    a: a 3D theano tensor\n",
200 |       "    b: a 4D theano tensor\n",
201 |       "    Returns c, a 4D theano tensor, where\n",
202 |       "\n",
203 |       "    c[i, j, k, l] = a[l, k, i] + b[i, j, k, l]\n",
204 |       "\n",
205 |       "    for all i, j, k, l\n",
206 |       "    \"\"\"\n",
207 |       "\n",
208 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
209 |       "\n",
210 |       "def partial_max(a):\n",
211 |       "    \"\"\"\n",
212 |       "    a: a 4D theano tensor\n",
213 |       "\n",
214 |       "    Returns b, a theano matrix, where\n",
215 |       "\n",
216 |       "    b[i, j] = max_{k,l} a[i, k, l, j]\n",
217 |       "\n",
218 |       "    for all i, j\n",
219 |       "    \"\"\"\n",
220 |       "\n",
221 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
222 |       "\n",
223 |       "# The following code use your code and test it.\n",
224 |       "a = make_tensor(3)\n",
225 |       "b = make_tensor(4)\n",
226 |       "c = broadcasted_add(a, b)\n",
227 |       "d = partial_max(c)\n",
228 |       "\n",
229 |       "f = function([a, b], d)\n",
230 |       "\n",
231 |       "rng = np.random.RandomState([1, 2, 3])\n",
232 |       "a_value = rng.randn(2, 2, 2).astype(a.dtype)\n",
233 |       "b_value = rng.rand(2, 2, 2, 2).astype(b.dtype)\n",
234 |       "c_value = np.transpose(a_value, (2, 1, 0))[:, None, :, :] + b_value\n",
235 |       "expected = c_value.max(axis=1).max(axis=1)\n",
236 |       "\n",
237 |       "actual = f(a_value, b_value)\n",
238 |       "\n",
239 |       "assert np.allclose(actual, expected), (actual, expected)\n",
240 |       "print \"SUCCESS!\""
241 |      ],
242 |      "language": "python",
243 |      "metadata": {},
244 |      "outputs": []
245 |     },
246 |     {
247 |      "cell_type": "code",
248 |      "collapsed": false,
249 |      "input": [
250 |       "%load 03_tensor_soln.py"
251 |      ],
252 |      "language": "python",
253 |      "metadata": {},
254 |      "outputs": []
255 |     },
256 |     {
257 |      "cell_type": "code",
258 |      "collapsed": false,
259 |      "input": [
260 |       "# Exercices 4\n",
261 |       "# This exercice ask you to compile a Theano functiont and call it to\n",
262 |       "# execute \"x + y\".\n",
263 |       "from theano import tensor as T\n",
264 |       "raise NotImplementedError(\"TODO: add any other imports you need\")\n",
265 |       "\n",
266 |       "\n",
267 |       "def evaluate(x, y, expr, x_value, y_value):\n",
268 |       "    \"\"\"\n",
269 |       "    x: A theano variable\n",
270 |       "    y: A theano variable\n",
271 |       "    expr: A theano expression involving x and y\n",
272 |       "    x_value: A numpy value\n",
273 |       "    y_value: A numpy value\n",
274 |       "\n",
275 |       "    Returns the value of expr when x_value is substituted for x\n",
276 |       "    and y_value is substituted for y\n",
277 |       "    \"\"\"\n",
278 |       "\n",
279 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
280 |       "\n",
281 |       "\n",
282 |       "# The following code use your code and test it.\n",
283 |       "x = T.iscalar()\n",
284 |       "y = T.iscalar()\n",
285 |       "z = x + y\n",
286 |       "assert evaluate(x, y, z, 1, 2) == 3\n",
287 |       "print \"SUCCESS!\""
288 |      ],
289 |      "language": "python",
290 |      "metadata": {},
291 |      "outputs": []
292 |     },
293 |     {
294 |      "cell_type": "code",
295 |      "collapsed": false,
296 |      "input": [
297 |       "%load 04_function_soln.py"
298 |      ],
299 |      "language": "python",
300 |      "metadata": {},
301 |      "outputs": []
302 |     },
303 |     {
304 |      "cell_type": "code",
305 |      "collapsed": false,
306 |      "input": [
307 |       "# Exercices 5\n",
308 |       "# This exercice make you use shared variable. You must create them and\n",
309 |       "# update them by swapping 2 shared variables values.\n",
310 |       "import numpy as np\n",
311 |       "raise NotImplementedError(\"TODO: add any other imports you need\")\n",
312 |       "\n",
313 |       "\n",
314 |       "def make_shared(shape):\n",
315 |       "    \"\"\"\n",
316 |       "    Returns a theano shared variable containing a tensor of the specified\n",
317 |       "    shape.\n",
318 |       "    You can use any value you want.\n",
319 |       "    \"\"\"\n",
320 |       "    raise NotImplementedError(\"TODO: implement the function\")\n",
321 |       "\n",
322 |       "\n",
323 |       "def exchange_shared(a, b):\n",
324 |       "    \"\"\"\n",
325 |       "    a: a theano shared variable\n",
326 |       "    b: a theano shared variable\n",
327 |       "    Uses get_value and set_value to swap the values stored in a and b\n",
328 |       "    \"\"\"\n",
329 |       "    raise NotImplementedError(\"TODO: implement the function\")\n",
330 |       "\n",
331 |       "\n",
332 |       "def make_exchange_func(a, b):\n",
333 |       "    \"\"\"\n",
334 |       "    a: a theano shared variable\n",
335 |       "    b: a theano shared variable\n",
336 |       "    Returns f\n",
337 |       "    where f is a theano function, that, when called, swaps the\n",
338 |       "    values in a and b\n",
339 |       "    f should not return anything\n",
340 |       "    \"\"\"\n",
341 |       "    raise NotImplementedError(\"TODO: implement the function\")\n",
342 |       "\n",
343 |       "\n",
344 |       "# The following code use your code and test it.\n",
345 |       "a = make_shared((5, 4, 3))\n",
346 |       "assert a.get_value().shape == (5, 4, 3)\n",
347 |       "b = make_shared((5, 4, 3))\n",
348 |       "assert a.get_value().shape == (5, 4, 3)\n",
349 |       "a.set_value(np.zeros((5, 4, 3), dtype=a.dtype))\n",
350 |       "b.set_value(np.ones((5, 4, 3), dtype=b.dtype))\n",
351 |       "exchange_shared(a, b)\n",
352 |       "assert np.all(a.get_value() == 1.)\n",
353 |       "assert np.all(b.get_value() == 0.)\n",
354 |       "f = make_exchange_func(a, b)\n",
355 |       "rval = f()\n",
356 |       "assert isinstance(rval, list)\n",
357 |       "assert len(rval) == 0\n",
358 |       "assert np.all(a.get_value() == 0.)\n",
359 |       "assert np.all(b.get_value() == 1.)\n",
360 |       "\n",
361 |       "print \"SUCCESS!\""
362 |      ],
363 |      "language": "python",
364 |      "metadata": {},
365 |      "outputs": []
366 |     },
367 |     {
368 |      "cell_type": "code",
369 |      "collapsed": false,
370 |      "input": [
371 |       "%load 05_shared_soln.py"
372 |      ],
373 |      "language": "python",
374 |      "metadata": {},
375 |      "outputs": []
376 |     },
377 |     {
378 |      "cell_type": "code",
379 |      "collapsed": false,
380 |      "input": [
381 |       "# Exercices 6\n",
382 |       "# This exercice make use Theano symbolic grad\n",
383 |       "from theano import tensor as T\n",
384 |       "\n",
385 |       "\n",
386 |       "def grad_sum(x, y, z):\n",
387 |       "    \"\"\"\n",
388 |       "    x: A theano variable\n",
389 |       "    y: A theano variable\n",
390 |       "    z: A theano expression involving x and y\n",
391 |       "\n",
392 |       "    Returns dz / dx + dz / dy\n",
393 |       "    \"\"\"\n",
394 |       "    raise NotImplementedError(\"TODO: implement this function.\")\n",
395 |       "\n",
396 |       "\n",
397 |       "# The following code use your code and test it.\n",
398 |       "x = T.scalar()\n",
399 |       "y = T.scalar()\n",
400 |       "z = x + y\n",
401 |       "s = grad_sum(x, y, z)\n",
402 |       "assert s.eval({x: 0, y: 0}) == 2\n",
403 |       "print \"SUCCESS!\""
404 |      ],
405 |      "language": "python",
406 |      "metadata": {},
407 |      "outputs": []
408 |     },
409 |     {
410 |      "cell_type": "code",
411 |      "collapsed": false,
412 |      "input": [
413 |       "%load 06_grad_soln.py"
414 |      ],
415 |      "language": "python",
416 |      "metadata": {},
417 |      "outputs": []
418 |     },
419 |     {
420 |      "cell_type": "code",
421 |      "collapsed": false,
422 |      "input": [
423 |       "# Exercice 7 #TODO: talk about mode=FAST_COMPILE\n",
424 |       "# This code have a bug. Run this cell to see it.\n",
425 |       "# Use Theano flag (easy in shell, harder in ipython) or extra parameter to a function \n",
426 |       "# to find the cause and fix it.\n",
427 |       "# Do not find the bug by inspecting the code. This is to help you find the bug\n",
428 |       "# in more complicated case when code inspection isn't working well.\n",
429 |       "\n",
430 |       "import numpy as np\n",
431 |       "from theano import function\n",
432 |       "from theano import tensor as T\n",
433 |       "a = T.vector()\n",
434 |       "b = T.log(a)\n",
435 |       "c = T.nnet.sigmoid(b)\n",
436 |       "d = T.sqrt(c)\n",
437 |       "e = T.concatenate((d, c), axis=0)\n",
438 |       "f = b * c * d\n",
439 |       "g = e + f\n",
440 |       "h = g / c\n",
441 |       "fn = function([a], h)\n",
442 |       "fn(np.ones((3,)).astype(a.dtype))\n"
443 |      ],
444 |      "language": "python",
445 |      "metadata": {},
446 |      "outputs": []
447 |     },
448 |     {
449 |      "cell_type": "code",
450 |      "collapsed": false,
451 |      "input": [
452 |       "%load 07_mode.py"
453 |      ],
454 |      "language": "python",
455 |      "metadata": {},
456 |      "outputs": []
457 |     },
458 |     {
459 |      "cell_type": "code",
460 |      "collapsed": false,
461 |      "input": [
462 |       "# Exercice 8\n",
463 |       "# This exercice is different. The initial version work.\n",
464 |       "# So you must modify it as described bellow and it should still give the same output.\n",
465 |       "\n",
466 |       "# Modify and execute the polynomial example to have the reduction(the sum() call) done by scan.\n",
467 |       "import numpy\n",
468 |       "import theano\n",
469 |       "import theano.tensor as T\n",
470 |       "theano.config.warn.subtensor_merge_bug = False\n",
471 |       "\n",
472 |       "coefficients = theano.tensor.vector(\"coefficients\")\n",
473 |       "x = T.scalar(\"x\")\n",
474 |       "max_coefficients_supported = 10000\n",
475 |       "\n",
476 |       "# Generate the components of the polynomial\n",
477 |       "full_range=theano.tensor.arange(max_coefficients_supported)\n",
478 |       "components, updates = theano.scan(fn=lambda coeff, power, free_var:\n",
479 |       "                                   coeff * (free_var ** power),\n",
480 |       "                                outputs_info=None,\n",
481 |       "                                sequences=[coefficients, full_range],\n",
482 |       "                                non_sequences=x)\n",
483 |       "\n",
484 |       "polynomial = components.sum()\n",
485 |       "calculate_polynomial = theano.function(inputs=[coefficients, x],\n",
486 |       "                                     outputs=polynomial)\n",
487 |       "\n",
488 |       "test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)\n",
489 |       "print calculate_polynomial(test_coeff, 3)\n",
490 |       "# 19.0"
491 |      ],
492 |      "language": "python",
493 |      "metadata": {},
494 |      "outputs": []
495 |     },
496 |     {
497 |      "cell_type": "code",
498 |      "collapsed": false,
499 |      "input": [
500 |       "%load 08_scan_polynomial_soln.py"
501 |      ],
502 |      "language": "python",
503 |      "metadata": {},
504 |      "outputs": []
505 |     },
506 |     {
507 |      "cell_type": "markdown",
508 |      "metadata": {},
509 |      "source": [
510 |       "LSTM Exercice"
511 |      ]
512 |     },
513 |     {
514 |      "cell_type": "code",
515 |      "collapsed": false,
516 |      "input": [
517 |       "1) Modif LSTM: Reverse the input sequence and try it like that:\n",
518 |       "   Sutskever-NIPS2014 (No change to Theano code, but useful to better understand how to make 2)\n",
519 |       "2) Modif LSTM: Add to have 2 LSTM layers. The new one take\n",
520 |       "   the input in the reverse order. Then you concatenate the mean\n",
521 |       "   of the outputs of both LSTM to the logistic regression.\n",
522 |       "3) Modif LSTM: Add the V_o parameter and use it. (No solutions provided)\n",
523 |       "    \n",
524 |       "Note. 2) need more epoch before we start to see that it learn something. With max_epochs=16, we start to see it for all version.\n",
525 |       "\n",
526 |       "You can load the original example code in the next cell.\n",
527 |       "Run it once. It will charge the data.\n",
528 |       "At the end of that code, there is in comment example how to run it for a short time (~10m on my laptop, core i5).\n",
529 |       "During that time, we see that is start to learn, but I do not let it go too long for this tutorial."
530 |      ],
531 |      "language": "python",
532 |      "metadata": {},
533 |      "outputs": []
534 |     },
535 |     {
536 |      "cell_type": "code",
537 |      "collapsed": false,
538 |      "input": [
539 |       "%load lstm.py"
540 |      ],
541 |      "language": "python",
542 |      "metadata": {},
543 |      "outputs": [],
544 |      "prompt_number": 5
545 |     },
546 |     {
547 |      "cell_type": "code",
548 |      "collapsed": false,
549 |      "input": [
550 |       "%load lstm_reverse.diff"
551 |      ],
552 |      "language": "python",
553 |      "metadata": {},
554 |      "outputs": [],
555 |      "prompt_number": 2
556 |     },
557 |     {
558 |      "cell_type": "code",
559 |      "collapsed": false,
560 |      "input": [
561 |       "%load lstm_double.diff"
562 |      ],
563 |      "language": "python",
564 |      "metadata": {},
565 |      "outputs": [],
566 |      "prompt_number": 4
567 |     }
568 |    ],
569 |    "metadata": {}
570 |   }
571 |  ]
572 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![intro](http://i.imgur.com/7vDvQnY.png)
2 | 
3 | ###slides
4 | https://drive.google.com/a/startup.ml/file/d/0B6VRjP1VYAtTSWdaZ1BLTmQ0Qm8/view
5 | 


--------------------------------------------------------------------------------
/imdb.py:
--------------------------------------------------------------------------------
  1 | import cPickle
  2 | import gzip
  3 | import os
  4 | 
  5 | import numpy
  6 | 
  7 | 
  8 | def prepare_data(seqs, labels, maxlen=None):
  9 |     """Create the matrices from the datasets.
 10 | 
 11 |     This pad each sequence to the same lenght: the lenght of the
 12 |     longuest sequence or maxlen.
 13 | 
 14 |     if maxlen is set, we will cut all sequence to this maximum
 15 |     lenght.
 16 | 
 17 |     This swap the axis!
 18 |     """
 19 |     # x: a list of sentences
 20 |     lengths = [len(s) for s in seqs]
 21 | 
 22 |     if maxlen is not None:
 23 |         new_seqs = []
 24 |         new_labels = []
 25 |         new_lengths = []
 26 |         for l, s, y in zip(lengths, seqs, labels):
 27 |             if l < maxlen:
 28 |                 new_seqs.append(s)
 29 |                 new_labels.append(y)
 30 |                 new_lengths.append(l)
 31 |         lengths = new_lengths
 32 |         labels = new_labels
 33 |         seqs = new_seqs
 34 | 
 35 |         if len(lengths) < 1:
 36 |             return None, None, None
 37 | 
 38 |     n_samples = len(seqs)
 39 |     maxlen = numpy.max(lengths)
 40 | 
 41 |     x = numpy.zeros((maxlen, n_samples)).astype('int64')
 42 |     x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
 43 |     for idx, s in enumerate(seqs):
 44 |         x[:lengths[idx], idx] = s
 45 |         x_mask[:lengths[idx], idx] = 1.
 46 | 
 47 |     return x, x_mask, labels
 48 | 
 49 | 
 50 | def get_dataset_file(dataset, default_dataset, origin):
 51 |     '''Look for it as if it was a full path, if not, try local file,
 52 |     if not try in the data directory.
 53 | 
 54 |     Download dataset if it is not present
 55 | 
 56 |     '''
 57 |     data_dir, data_file = os.path.split(dataset)
 58 |     if data_dir == "" and not os.path.isfile(dataset):
 59 |         # Check if dataset is in the data directory.
 60 |         new_path = os.path.join(
 61 |             os.path.split(__file__)[0],
 62 |             dataset
 63 |         )
 64 |         if os.path.isfile(new_path) or data_file == default_dataset:
 65 |             dataset = new_path
 66 | 
 67 |     if (not os.path.isfile(dataset)) and data_file == default_dataset:
 68 |         import urllib
 69 |         print 'Downloading data from %s' % origin
 70 |         urllib.urlretrieve(origin, dataset)
 71 |     return dataset
 72 | 
 73 | 
 74 | def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
 75 |     ''' Loads the dataset
 76 | 
 77 |     :type path: String
 78 |     :param path: The path to the dataset (here IMDB)
 79 |     :type n_words: int
 80 |     :param n_words: The number of word to keep in the vocabulary.
 81 |         All extra words are set to unknow (1).
 82 |     :type valid_portion: float
 83 |     :param valid_portion: The proportion of the full train set used for
 84 |         the validation set.
 85 |     :type maxlen: None or positive int
 86 |     :param maxlen: the max sequence length we use in the train/valid set.
 87 |     '''
 88 | 
 89 |     #############
 90 |     # LOAD DATA #
 91 |     #############
 92 | 
 93 |     # Load the dataset
 94 |     path = get_dataset_file(
 95 |         path, "imdb.pkl",
 96 |         "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
 97 | 
 98 |     if path.endswith(".gz"):
 99 |         f = gzip.open(path, 'rb')
100 |     else:
101 |         f = open(path, 'rb')
102 | 
103 |     train_set = cPickle.load(f)
104 |     test_set = cPickle.load(f)
105 |     f.close()
106 |     if maxlen:
107 |         new_train_set_x = []
108 |         new_train_set_y = []
109 |         for x, y in zip(train_set[0], train_set[1]):
110 |             if len(x) < maxlen:
111 |                 new_train_set_x.append(x)
112 |                 new_train_set_y.append(y)
113 |         train_set = (new_train_set_x, new_train_set_y)
114 |         del new_train_set_x, new_train_set_y
115 | 
116 |     # split training set into validation set
117 |     train_set_x, train_set_y = train_set
118 |     n_samples = len(train_set_x)
119 |     sidx = numpy.random.permutation(n_samples)
120 |     n_train = int(numpy.round(n_samples * (1. - valid_portion)))
121 |     valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
122 |     valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
123 |     train_set_x = [train_set_x[s] for s in sidx[:n_train]]
124 |     train_set_y = [train_set_y[s] for s in sidx[:n_train]]
125 | 
126 |     train_set = (train_set_x, train_set_y)
127 |     valid_set = (valid_set_x, valid_set_y)
128 | 
129 |     def remove_unk(x):
130 |         return [[1 if w >= n_words else w for w in sen] for sen in x]
131 | 
132 |     test_set_x, test_set_y = test_set
133 |     valid_set_x, valid_set_y = valid_set
134 |     train_set_x, train_set_y = train_set
135 | 
136 |     train_set_x = remove_unk(train_set_x)
137 |     valid_set_x = remove_unk(valid_set_x)
138 |     test_set_x = remove_unk(test_set_x)
139 | 
140 |     train = (train_set_x, train_set_y)
141 |     valid = (valid_set_x, valid_set_y)
142 |     test = (test_set_x, test_set_y)
143 | 
144 |     return train, valid, test
145 | 


--------------------------------------------------------------------------------
/lstm.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Build a tweet sentiment analyzer
  3 | '''
  4 | from collections import OrderedDict
  5 | import cPickle as pkl
  6 | import random
  7 | import sys
  8 | import time
  9 | 
 10 | import numpy
 11 | import theano
 12 | import theano.tensor as tensor
 13 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 14 | 
 15 | import imdb
 16 | 
 17 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 18 | 
 19 | 
 20 | def get_minibatches_idx(n, minibatch_size, shuffle=False):
 21 |     """
 22 |     Used to shuffle the dataset at each iteration.
 23 |     """
 24 | 
 25 |     idx_list = numpy.arange(n, dtype="int32")
 26 | 
 27 |     if shuffle:
 28 |         random.shuffle(idx_list)
 29 | 
 30 |     minibatches = []
 31 |     minibatch_start = 0
 32 |     for i in range(n // minibatch_size):
 33 |         minibatches.append(idx_list[minibatch_start:
 34 |                                     minibatch_start + minibatch_size])
 35 |         minibatch_start += minibatch_size
 36 | 
 37 |     if (minibatch_start != n):
 38 |         # Make a minibatch out of what is left
 39 |         minibatches.append(idx_list[minibatch_start:])
 40 | 
 41 |     return zip(range(len(minibatches)), minibatches)
 42 | 
 43 | 
 44 | def get_dataset(name):
 45 |     return datasets[name][0], datasets[name][1]
 46 | 
 47 | 
 48 | def zipp(params, tparams):
 49 |     """
 50 |     When we reload the model. Needed for the GPU stuff.
 51 |     """
 52 |     for kk, vv in params.iteritems():
 53 |         tparams[kk].set_value(vv)
 54 | 
 55 | 
 56 | def unzip(zipped):
 57 |     """
 58 |     When we pickle the model. Needed for the GPU stuff.
 59 |     """
 60 |     new_params = OrderedDict()
 61 |     for kk, vv in zipped.iteritems():
 62 |         new_params[kk] = vv.get_value()
 63 |     return new_params
 64 | 
 65 | 
 66 | def dropout_layer(state_before, use_noise, trng):
 67 |     proj = tensor.switch(use_noise,
 68 |                          (state_before *
 69 |                           trng.binomial(state_before.shape,
 70 |                                         p=0.5, n=1,
 71 |                                         dtype=state_before.dtype)),
 72 |                          state_before * 0.5)
 73 |     return proj
 74 | 
 75 | 
 76 | def _p(pp, name):
 77 |     return '%s_%s' % (pp, name)
 78 | 
 79 | 
 80 | def init_params(options):
 81 |     """
 82 |     Global (not LSTM) parameter. For the embeding and the classifier.
 83 |     """
 84 |     params = OrderedDict()
 85 |     # embedding
 86 |     randn = numpy.random.rand(options['n_words'],
 87 |                               options['dim_proj'])
 88 |     params['Wemb'] = (0.01 * randn).astype('float32')
 89 |     params = get_layer(options['encoder'])[0](options,
 90 |                                               params,
 91 |                                               prefix=options['encoder'])
 92 |     # classifier
 93 |     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
 94 |                                             options['ydim']).astype('float32')
 95 |     params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
 96 | 
 97 |     return params
 98 | 
 99 | 
100 | def load_params(path, params):
101 |     pp = numpy.load(path)
102 |     for kk, vv in params.iteritems():
103 |         if kk not in pp:
104 |             raise Warning('%s is not in the archive' % kk)
105 |         params[kk] = pp[kk]
106 | 
107 |     return params
108 | 
109 | 
110 | def init_tparams(params):
111 |     tparams = OrderedDict()
112 |     for kk, pp in params.iteritems():
113 |         tparams[kk] = theano.shared(params[kk], name=kk)
114 |     return tparams
115 | 
116 | 
117 | def get_layer(name):
118 |     fns = layers[name]
119 |     return fns
120 | 
121 | 
122 | def ortho_weight(ndim):
123 |     W = numpy.random.randn(ndim, ndim)
124 |     u, s, v = numpy.linalg.svd(W)
125 |     return u.astype('float32')
126 | 
127 | 
128 | def param_init_lstm(options, params, prefix='lstm'):
129 |     """
130 |     Init the LSTM parameter:
131 | 
132 |     :see: init_params
133 |     """
134 |     W = numpy.concatenate([ortho_weight(options['dim_proj']),
135 |                            ortho_weight(options['dim_proj']),
136 |                            ortho_weight(options['dim_proj']),
137 |                            ortho_weight(options['dim_proj'])], axis=1)
138 |     params[_p(prefix, 'W')] = W
139 |     U = numpy.concatenate([ortho_weight(options['dim_proj']),
140 |                            ortho_weight(options['dim_proj']),
141 |                            ortho_weight(options['dim_proj']),
142 |                            ortho_weight(options['dim_proj'])], axis=1)
143 |     params[_p(prefix, 'U')] = U
144 |     b = numpy.zeros((4 * options['dim_proj'],))
145 |     params[_p(prefix, 'b')] = b.astype('float32')
146 | 
147 |     return params
148 | 
149 | 
150 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
151 |     nsteps = state_below.shape[0]
152 |     if state_below.ndim == 3:
153 |         n_samples = state_below.shape[1]
154 |     else:
155 |         n_samples = 1
156 | 
157 |     assert mask is not None
158 | 
159 |     def _slice(_x, n, dim):
160 |         if _x.ndim == 3:
161 |             return _x[:, :, n * dim:(n + 1) * dim]
162 |         return _x[:, n * dim:(n + 1) * dim]
163 | 
164 |     def _step(m_, x_, h_, c_):
165 |         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
166 |         preact += x_
167 |         preact += tparams[_p(prefix, 'b')]
168 | 
169 |         i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
170 |         f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
171 |         o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
172 |         c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
173 | 
174 |         c = f * c_ + i * c
175 |         c = m_[:, None] * c + (1. - m_)[:, None] * c_
176 | 
177 |         h = o * tensor.tanh(c)
178 |         h = m_[:, None] * h + (1. - m_)[:, None] * h_
179 | 
180 |         return h, c
181 | 
182 |     state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
183 |                    tparams[_p(prefix, 'b')])
184 | 
185 |     dim_proj = options['dim_proj']
186 |     rval, updates = theano.scan(_step,
187 |                                 sequences=[mask, state_below],
188 |                                 outputs_info=[tensor.alloc(0., n_samples,
189 |                                                            dim_proj),
190 |                                               tensor.alloc(0., n_samples,
191 |                                                            dim_proj)],
192 |                                 name=_p(prefix, '_layers'),
193 |                                 n_steps=nsteps)
194 |     return rval[0]
195 | 
196 | 
197 | # ff: Feed Forward (normal neural net), only useful to put after lstm
198 | #     before the classifier.
199 | layers = {'lstm': (param_init_lstm, lstm_layer)}
200 | 
201 | 
202 | def sgd(lr, tparams, grads, x, mask, y, cost):
203 |     """ Stochastic Gradient Descent
204 | 
205 |     :note: A more complicated version of sgd then needed.  This is
206 |         done like that for adadelta and rmsprop.
207 | 
208 |     """
209 |     # New set of shared variable that will contain the gradient
210 |     # for a mini-batch.
211 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
212 |                for k, p in tparams.iteritems()]
213 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
214 | 
215 |     # Function that computes gradients for a mini-batch, but do not
216 |     # updates the weights.
217 |     f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
218 |                                     name='sgd_f_grad_shared')
219 | 
220 |     pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
221 | 
222 |     # Function that updates the weights from the previously computed
223 |     # gradient.
224 |     f_update = theano.function([lr], [], updates=pup,
225 |                                name='sgd_f_update')
226 | 
227 |     return f_grad_shared, f_update
228 | 
229 | 
230 | def adadelta(lr, tparams, grads, x, mask, y, cost):
231 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
232 |                                   name='%s_grad' % k)
233 |                     for k, p in tparams.iteritems()]
234 |     running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
235 |                                  name='%s_rup2' % k)
236 |                    for k, p in tparams.iteritems()]
237 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
238 |                                     name='%s_rgrad2' % k)
239 |                       for k, p in tparams.iteritems()]
240 | 
241 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
242 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
243 |              for rg2, g in zip(running_grads2, grads)]
244 | 
245 |     f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
246 |                                     name='adadelta_f_grad_shared')
247 | 
248 |     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
249 |              for zg, ru2, rg2 in zip(zipped_grads,
250 |                                      running_up2,
251 |                                      running_grads2)]
252 |     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
253 |              for ru2, ud in zip(running_up2, updir)]
254 |     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
255 | 
256 |     f_update = theano.function([lr], [], updates=ru2up + param_up,
257 |                                on_unused_input='ignore',
258 |                                name='adadelta_f_update')
259 | 
260 |     return f_grad_shared, f_update
261 | 
262 | 
263 | def rmsprop(lr, tparams, grads, x, mask, y, cost):
264 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
265 |                                   name='%s_grad' % k)
266 |                     for k, p in tparams.iteritems()]
267 |     running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
268 |                                    name='%s_rgrad' % k)
269 |                      for k, p in tparams.iteritems()]
270 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
271 |                                     name='%s_rgrad2' % k)
272 |                       for k, p in tparams.iteritems()]
273 | 
274 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
275 |     rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
276 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
277 |              for rg2, g in zip(running_grads2, grads)]
278 | 
279 |     f_grad_shared = theano.function([x, mask, y], cost,
280 |                                     updates=zgup + rgup + rg2up,
281 |                                     name='rmsprop_f_grad_shared')
282 | 
283 |     updir = [theano.shared(p.get_value() * numpy.float32(0.),
284 |                            name='%s_updir' % k)
285 |              for k, p in tparams.iteritems()]
286 |     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
287 |                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
288 |                                             running_grads2)]
289 |     param_up = [(p, p + udn[1])
290 |                 for p, udn in zip(tparams.values(), updir_new)]
291 |     f_update = theano.function([lr], [], updates=updir_new + param_up,
292 |                                on_unused_input='ignore',
293 |                                name='rmsprop_f_update')
294 | 
295 |     return f_grad_shared, f_update
296 | 
297 | 
298 | def build_model(tparams, options):
299 |     trng = RandomStreams(1234)
300 | 
301 |     # Used for dropout.
302 |     use_noise = theano.shared(numpy.float32(0.))
303 | 
304 |     x = tensor.matrix('x', dtype='int64')
305 |     mask = tensor.matrix('mask', dtype='float32')
306 |     y = tensor.vector('y', dtype='int64')
307 | 
308 |     n_timesteps = x.shape[0]
309 |     n_samples = x.shape[1]
310 | 
311 |     emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
312 |                                                 n_samples,
313 |                                                 options['dim_proj']])
314 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
315 |                                             prefix=options['encoder'],
316 |                                             mask=mask)
317 |     if options['encoder'] == 'lstm':
318 |         proj = (proj * mask[:, :, None]).sum(axis=0)
319 |         proj = proj / mask.sum(axis=0)[:, None]
320 |     if options['use_dropout']:
321 |         proj = dropout_layer(proj, use_noise, trng)
322 | 
323 |     pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
324 | 
325 |     f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
326 |     f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
327 | 
328 |     cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
329 | 
330 |     return use_noise, x, mask, y, f_pred_prob, f_pred, cost
331 | 
332 | 
333 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
334 |     """ If you want to use a trained model, this is useful to compute
335 |     the probabilities of new examples.
336 |     """
337 |     n_samples = len(data[0])
338 |     probs = numpy.zeros((n_samples, 2)).astype('float32')
339 | 
340 |     n_done = 0
341 | 
342 |     for _, valid_index in iterator:
343 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
344 |                                   numpy.array(data[1])[valid_index],
345 |                                   maxlen=None)
346 |         pred_probs = f_pred_prob(x, mask)
347 |         probs[valid_index, :] = pred_probs
348 | 
349 |         n_done += len(valid_index)
350 |         if verbose:
351 |             print '%d/%d samples classified' % (n_done, n_samples)
352 | 
353 |     return probs
354 | 
355 | 
356 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
357 |     """
358 |     Just compute the error
359 |     f_pred: Theano fct computing the prediction
360 |     prepare_data: usual prepare_data for that dataset.
361 |     """
362 |     valid_err = 0
363 |     for _, valid_index in iterator:
364 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
365 |                                   numpy.array(data[1])[valid_index],
366 |                                   maxlen=None)
367 |         preds = f_pred(x, mask)
368 |         targets = numpy.array(data[1])[valid_index]
369 |         valid_err += (preds == targets).sum()
370 |     valid_err = 1. - numpy.float32(valid_err) / len(data[0])
371 | 
372 |     return valid_err
373 | 
374 | 
375 | def train_lstm(
376 |     train, valid, test,
377 |     dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
378 |     patience=10,  # Number of epoch to wait before early stop if no progress
379 |     max_epochs=5000,  # The maximum number of epoch to run
380 |     dispFreq=10,  # Display to stdout the training progress every N updates
381 |     decay_c=0.,  # Weight decay for the classifier applied to the U weights.
382 |     lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
383 |     n_words=10000,  # Vocabulary size
384 |     # sgd, adadelta and rmsprop available,
385 |     # sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
386 |     optimizer=adadelta,
387 |     encoder='lstm',  # TODO: can be removed must be lstm.
388 |     saveto='lstm_model.npz',  # The best model will be saved there
389 |     validFreq=370,  # Compute the validation error after this number of update.
390 |     saveFreq=1110,  # Save the parameters after every saveFreq updates
391 |     batch_size=16,  # The batch size during training.
392 |     valid_batch_size=64,  # The batch size used for validation/test set.
393 |     dataset='imdb',
394 | 
395 |     # Parameter for extra option
396 |     noise_std=0.,
397 |     use_dropout=True,  # if False slightly faster, but worst test error
398 |                        # This frequently need a bigger model.
399 |     reload_model="",  # Path to a saved model we want to start from.
400 |     test_size=-1,  # If >0, we will trunc the test set to this number of example.
401 | ):
402 | 
403 |     # Model options
404 |     model_options = locals().copy()
405 |     del model_options['train']
406 |     del model_options['valid']
407 |     del model_options['test']
408 |     print "model options", model_options
409 | 
410 |     if test_size > 0:
411 |         test = (test[0][:test_size], test[1][:test_size])
412 | 
413 |     ydim = numpy.max(train[1]) + 1
414 | 
415 |     model_options['ydim'] = ydim
416 | 
417 |     print 'Building model'
418 |     # This create the initial parameters as numpy ndarrays.
419 |     # Dict name (string) -> numpy ndarray
420 |     params = init_params(model_options)
421 | 
422 |     if reload_model:
423 |         load_params('lstm_model.npz', params)
424 | 
425 |     # This create Theano Shared Variable from the parameters.
426 |     # Dict name (string) -> Theano Tensor Shared Variable
427 |     # params and tparams have different copy of the weights.
428 |     tparams = init_tparams(params)
429 | 
430 |     # use_noise is for dropout
431 |     (use_noise, x, mask,
432 |      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
433 | 
434 |     if decay_c > 0.:
435 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
436 |         weight_decay = 0.
437 |         weight_decay += (tparams['U'] ** 2).sum()
438 |         weight_decay *= decay_c
439 |         cost += weight_decay
440 | 
441 |     f_cost = theano.function([x, mask, y], cost, name='f_cost')
442 | 
443 |     grads = tensor.grad(cost, wrt=tparams.values())
444 |     f_grad = theano.function([x, mask, y], grads, name='f_grad')
445 | 
446 |     lr = tensor.scalar(name='lr')
447 |     f_grad_shared, f_update = optimizer(lr, tparams, grads,
448 |                                         x, mask, y, cost)
449 | 
450 |     print 'Training'
451 | 
452 |     kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
453 |                                    shuffle=True)
454 |     kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
455 |                                   shuffle=True)
456 | 
457 |     print "%d train examples" % len(train[0])
458 |     print "%d valid examples" % len(valid[0])
459 |     print "%d test examples" % len(test[0])
460 |     history_errs = []
461 |     best_p = None
462 |     bad_count = 0
463 | 
464 |     if validFreq == -1:
465 |         validFreq = len(train[0]) / batch_size
466 |     if saveFreq == -1:
467 |         saveFreq = len(train[0]) / batch_size
468 | 
469 |     uidx = 0  # the number of update done
470 |     estop = False  # early stop
471 |     start_time = time.clock()
472 |     try:
473 |         for eidx in xrange(max_epochs):
474 |             n_samples = 0
475 | 
476 |             # Get new shuffled index for the training set.
477 |             kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
478 | 
479 |             for _, train_index in kf:
480 |                 uidx += 1
481 |                 use_noise.set_value(1.)
482 | 
483 |                 # Select the random examples for this minibatch
484 |                 y = [train[1][t] for t in train_index]
485 |                 x = [train[0][t] for t in train_index]
486 | 
487 |                 # Get the data in numpy.ndarray format
488 |                 # This swap the axis!
489 |                 # Return something of shape (minibatch maxlen, n samples)
490 |                 x, mask, y = prepare_data(x, y)
491 |                 n_samples += x.shape[1]
492 | 
493 |                 cost = f_grad_shared(x, mask, y)
494 |                 f_update(lrate)
495 | 
496 |                 if numpy.isnan(cost) or numpy.isinf(cost):
497 |                     print 'NaN detected'
498 |                     return 1., 1., 1.
499 | 
500 |                 if numpy.mod(uidx, dispFreq) == 0:
501 |                     print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
502 | 
503 |                 if numpy.mod(uidx, saveFreq) == 0:
504 |                     print 'Saving...',
505 | 
506 |                     if best_p is not None:
507 |                         params = best_p
508 |                     else:
509 |                         params = unzip(tparams)
510 |                     numpy.savez(saveto, history_errs=history_errs, **params)
511 |                     pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
512 |                     print 'Done'
513 | 
514 |                 if numpy.mod(uidx, validFreq) == 0:
515 |                     use_noise.set_value(0.)
516 |                     train_err = pred_error(f_pred, prepare_data, train, kf)
517 |                     valid_err = pred_error(f_pred, prepare_data, valid,
518 |                                            kf_valid)
519 |                     test_err = pred_error(f_pred, prepare_data, test, kf_test)
520 | 
521 |                     history_errs.append([valid_err, test_err])
522 | 
523 |                     if (uidx == 0 or
524 |                         valid_err <= numpy.array(history_errs)[:,
525 |                                                                0].min()):
526 | 
527 |                         best_p = unzip(tparams)
528 |                         bad_counter = 0
529 | 
530 |                     print ('Train ', train_err, 'Valid ', valid_err,
531 |                            'Test ', test_err)
532 | 
533 |                     if (len(history_errs) > patience and
534 |                         valid_err >= numpy.array(history_errs)[:-patience,
535 |                                                                0].min()):
536 |                         bad_counter += 1
537 |                         if bad_counter > patience:
538 |                             print 'Early Stop!'
539 |                             estop = True
540 |                             break
541 | 
542 |             print 'Seen %d samples' % n_samples
543 | 
544 |             if estop:
545 |                 break
546 | 
547 |     except KeyboardInterrupt:
548 |         print "Training interupted"
549 | 
550 |     end_time = time.clock()
551 |     print "Training done"
552 |     if best_p is not None:
553 |         zipp(best_p, tparams)
554 |     else:
555 |         best_p = unzip(tparams)
556 | 
557 |     print "Computing errors"
558 |     use_noise.set_value(0.)
559 |     train_err = pred_error(f_pred, prepare_data, train, kf)
560 |     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
561 |     test_err = pred_error(f_pred, prepare_data, test, kf_test)
562 | 
563 |     print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
564 | 
565 |     numpy.savez(saveto, train_err=train_err,
566 |                 valid_err=valid_err, test_err=test_err,
567 |                 history_errs=history_errs, **best_p)
568 |     print 'The code run for %d epochs, with %f sec/epochs' % (
569 |         (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
570 |     print >> sys.stderr, ('Training took %.1fs' %
571 |                           (end_time - start_time))
572 |     return train_err, valid_err, test_err
573 | 
574 | 
575 | # We must have floatX=float32 for this tutorial to work correctly.
576 | theano.config.floatX = "float32"
577 | # The next line is the new Theano default. This is a speed up.
578 | theano.config.scan.allow_gc = False
579 | 
580 | print 'Loading data'
581 | n_words = 10000
582 | load_data, prepare_data = get_dataset("imdb")
583 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
584 |                                maxlen=100)
585 | print 'Loading data: Done'
586 | print "See the comment at the end of this cell to train the model."
587 | 
588 | # See function train for all possible parameter and there definition.
589 | #train_lstm(
590 | #    train, valid, test,
591 | # I set max_epochs to only 16, as this is enought to
592 | # show that the network learn. A real job should try for longer.
593 | #    max_epochs=16,
594 | #    test_size=500,
595 | #    n_words=n_words,
596 | #)
597 | 


--------------------------------------------------------------------------------
/lstm_double.diff:
--------------------------------------------------------------------------------
  1 | --- lstm.py.orig	2015-01-16 17:20:22.075153409 -0800
  2 | +++ lstm_double.py	2015-01-16 17:20:16.627153500 -0800
  3 | @@ -227,7 +227,7 @@
  4 |      return f_grad_shared, f_update
  5 |  
  6 |  
  7 | -def adadelta(lr, tparams, grads, x, mask, y, cost):
  8 | +def adadelta(lr, tparams, grads, x, rx, mask, y, cost):
  9 |      zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
 10 |                                    name='%s_grad' % k)
 11 |                      for k, p in tparams.iteritems()]
 12 | @@ -242,7 +242,7 @@
 13 |      rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
 14 |               for rg2, g in zip(running_grads2, grads)]
 15 |  
 16 | -    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
 17 | +    f_grad_shared = theano.function([x, rx, mask, y], cost, updates=zgup + rg2up,
 18 |                                      name='adadelta_f_grad_shared')
 19 |  
 20 |      updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
 21 | @@ -260,7 +260,7 @@
 22 |      return f_grad_shared, f_update
 23 |  
 24 |  
 25 | -def rmsprop(lr, tparams, grads, x, mask, y, cost):
 26 | +def rmsprop(lr, tparams, grads, x, rx, mask, y, cost):
 27 |      zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
 28 |                                    name='%s_grad' % k)
 29 |                      for k, p in tparams.iteritems()]
 30 | @@ -276,7 +276,7 @@
 31 |      rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
 32 |               for rg2, g in zip(running_grads2, grads)]
 33 |  
 34 | -    f_grad_shared = theano.function([x, mask, y], cost,
 35 | +    f_grad_shared = theano.function([x, rx, mask, y], cost,
 36 |                                      updates=zgup + rgup + rg2up,
 37 |                                      name='rmsprop_f_grad_shared')
 38 |  
 39 | @@ -302,32 +302,39 @@
 40 |      use_noise = theano.shared(numpy.float32(0.))
 41 |  
 42 |      x = tensor.matrix('x', dtype='int64')
 43 | +    rx = tensor.matrix('rx', dtype='int64')
 44 |      mask = tensor.matrix('mask', dtype='float32')
 45 |      y = tensor.vector('y', dtype='int64')
 46 |  
 47 |      n_timesteps = x.shape[0]
 48 |      n_samples = x.shape[1]
 49 |  
 50 | -    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
 51 | -                                                n_samples,
 52 | -                                                options['dim_proj']])
 53 | -    proj = get_layer(options['encoder'])[1](tparams, emb, options,
 54 | -                                            prefix=options['encoder'],
 55 | -                                            mask=mask)
 56 | -    if options['encoder'] == 'lstm':
 57 | -        proj = (proj * mask[:, :, None]).sum(axis=0)
 58 | -        proj = proj / mask.sum(axis=0)[:, None]
 59 | -    if options['use_dropout']:
 60 | -        proj = dropout_layer(proj, use_noise, trng)
 61 | +    lstm_outs = []
 62 | +    for inp in [x, rx]:
 63 | +        emb = tparams['Wemb'][inp.flatten()].reshape([n_timesteps,
 64 | +                                                    n_samples,
 65 | +                                                    options['dim_proj']])
 66 | +        proj = get_layer(options['encoder'])[1](tparams, emb, options,
 67 | +                                                prefix=options['encoder'],
 68 | +                                                mask=mask)
 69 | +        if options['encoder'] == 'lstm':
 70 | +            proj = (proj * mask[:, :, None]).sum(axis=0)
 71 | +            proj = proj / mask.sum(axis=0)[:, None]
 72 | +        if options['use_dropout']:
 73 | +            proj = dropout_layer(proj, use_noise, trng)
 74 | +        lstm_outs.append(proj)
 75 | +
 76 | +    del proj
 77 | +    pred = tensor.nnet.softmax(tensor.dot(theano.tensor.concatenate(lstm_outs),
 78 | +                                          tparams['U']) + tparams['b'])
 79 | +    pred = pred.reshape((2, pred.shape[0]/2, pred.shape[1])).mean(axis=0)
 80 |  
 81 | -    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
 82 | -
 83 | -    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
 84 | -    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
 85 | +    f_pred_prob = theano.function([x, rx, mask], pred, name='f_pred_prob')
 86 | +    f_pred = theano.function([x, rx, mask], pred.argmax(axis=1), name='f_pred')
 87 |  
 88 |      cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
 89 |  
 90 | -    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
 91 | +    return use_noise, x, rx, mask, y, f_pred_prob, f_pred, cost
 92 |  
 93 |  
 94 |  def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
 95 | @@ -343,6 +350,9 @@
 96 |          x, mask, y = prepare_data([data[0][t] for t in valid_index],
 97 |                                    numpy.array(data[1])[valid_index],
 98 |                                    maxlen=None)
 99 | +        rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index],
100 | +                                  numpy.array(data[1])[valid_index],
101 | +                                  maxlen=None)
102 |          pred_probs = f_pred_prob(x, mask)
103 |          probs[valid_index, :] = pred_probs
104 |  
105 | @@ -364,7 +374,10 @@
106 |          x, mask, y = prepare_data([data[0][t] for t in valid_index],
107 |                                    numpy.array(data[1])[valid_index],
108 |                                    maxlen=None)
109 | -        preds = f_pred(x, mask)
110 | +        rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index],
111 | +                                numpy.array(data[1])[valid_index],
112 | +                                maxlen=None)
113 | +        preds = f_pred(x, rx, mask)
114 |          targets = numpy.array(data[1])[valid_index]
115 |          valid_err += (preds == targets).sum()
116 |      valid_err = 1. - numpy.float32(valid_err) / len(data[0])
117 | @@ -428,7 +441,7 @@
118 |      tparams = init_tparams(params)
119 |  
120 |      # use_noise is for dropout
121 | -    (use_noise, x, mask,
122 | +    (use_noise, x, rx, mask,
123 |       y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
124 |  
125 |      if decay_c > 0.:
126 | @@ -438,14 +451,14 @@
127 |          weight_decay *= decay_c
128 |          cost += weight_decay
129 |  
130 | -    f_cost = theano.function([x, mask, y], cost, name='f_cost')
131 | +    f_cost = theano.function([x, rx, mask, y], cost, name='f_cost')
132 |  
133 |      grads = tensor.grad(cost, wrt=tparams.values())
134 | -    f_grad = theano.function([x, mask, y], grads, name='f_grad')
135 | +    f_grad = theano.function([x, rx, mask, y], grads, name='f_grad')
136 |  
137 |      lr = tensor.scalar(name='lr')
138 |      f_grad_shared, f_update = optimizer(lr, tparams, grads,
139 | -                                        x, mask, y, cost)
140 | +                                        x, rx, mask, y, cost)
141 |  
142 |      print 'Training'
143 |  
144 | @@ -488,9 +501,10 @@
145 |                  # This swap the axis!
146 |                  # Return something of shape (minibatch maxlen, n samples)
147 |                  x, mask, y = prepare_data(x, y)
148 | +                rx, _, Y = prepare_data([t[::-1] for t in x], y)
149 |                  n_samples += x.shape[1]
150 |  
151 | -                cost = f_grad_shared(x, mask, y)
152 | +                cost = f_grad_shared(x, rx, mask, y)
153 |                  f_update(lrate)
154 |  
155 |                  if numpy.isnan(cost) or numpy.isinf(cost):
156 | 


--------------------------------------------------------------------------------
/lstm_double.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Build a tweet sentiment analyzer
  3 | '''
  4 | from collections import OrderedDict
  5 | import cPickle as pkl
  6 | import random
  7 | import sys
  8 | import time
  9 | 
 10 | import numpy
 11 | import theano
 12 | import theano.tensor as tensor
 13 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 14 | 
 15 | import imdb
 16 | 
 17 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 18 | 
 19 | 
 20 | def get_minibatches_idx(n, minibatch_size, shuffle=False):
 21 |     """
 22 |     Used to shuffle the dataset at each iteration.
 23 |     """
 24 | 
 25 |     idx_list = numpy.arange(n, dtype="int32")
 26 | 
 27 |     if shuffle:
 28 |         random.shuffle(idx_list)
 29 | 
 30 |     minibatches = []
 31 |     minibatch_start = 0
 32 |     for i in range(n // minibatch_size):
 33 |         minibatches.append(idx_list[minibatch_start:
 34 |                                     minibatch_start + minibatch_size])
 35 |         minibatch_start += minibatch_size
 36 | 
 37 |     if (minibatch_start != n):
 38 |         # Make a minibatch out of what is left
 39 |         minibatches.append(idx_list[minibatch_start:])
 40 | 
 41 |     return zip(range(len(minibatches)), minibatches)
 42 | 
 43 | 
 44 | def get_dataset(name):
 45 |     return datasets[name][0], datasets[name][1]
 46 | 
 47 | 
 48 | def zipp(params, tparams):
 49 |     """
 50 |     When we reload the model. Needed for the GPU stuff.
 51 |     """
 52 |     for kk, vv in params.iteritems():
 53 |         tparams[kk].set_value(vv)
 54 | 
 55 | 
 56 | def unzip(zipped):
 57 |     """
 58 |     When we pickle the model. Needed for the GPU stuff.
 59 |     """
 60 |     new_params = OrderedDict()
 61 |     for kk, vv in zipped.iteritems():
 62 |         new_params[kk] = vv.get_value()
 63 |     return new_params
 64 | 
 65 | 
 66 | def dropout_layer(state_before, use_noise, trng):
 67 |     proj = tensor.switch(use_noise,
 68 |                          (state_before *
 69 |                           trng.binomial(state_before.shape,
 70 |                                         p=0.5, n=1,
 71 |                                         dtype=state_before.dtype)),
 72 |                          state_before * 0.5)
 73 |     return proj
 74 | 
 75 | 
 76 | def _p(pp, name):
 77 |     return '%s_%s' % (pp, name)
 78 | 
 79 | 
 80 | def init_params(options):
 81 |     """
 82 |     Global (not LSTM) parameter. For the embeding and the classifier.
 83 |     """
 84 |     params = OrderedDict()
 85 |     # embedding
 86 |     randn = numpy.random.rand(options['n_words'],
 87 |                               options['dim_proj'])
 88 |     params['Wemb'] = (0.01 * randn).astype('float32')
 89 |     params = get_layer(options['encoder'])[0](options,
 90 |                                               params,
 91 |                                               prefix=options['encoder'])
 92 |     # classifier
 93 |     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
 94 |                                             options['ydim']).astype('float32')
 95 |     params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
 96 | 
 97 |     return params
 98 | 
 99 | 
100 | def load_params(path, params):
101 |     pp = numpy.load(path)
102 |     for kk, vv in params.iteritems():
103 |         if kk not in pp:
104 |             raise Warning('%s is not in the archive' % kk)
105 |         params[kk] = pp[kk]
106 | 
107 |     return params
108 | 
109 | 
110 | def init_tparams(params):
111 |     tparams = OrderedDict()
112 |     for kk, pp in params.iteritems():
113 |         tparams[kk] = theano.shared(params[kk], name=kk)
114 |     return tparams
115 | 
116 | 
117 | def get_layer(name):
118 |     fns = layers[name]
119 |     return fns
120 | 
121 | 
122 | def ortho_weight(ndim):
123 |     W = numpy.random.randn(ndim, ndim)
124 |     u, s, v = numpy.linalg.svd(W)
125 |     return u.astype('float32')
126 | 
127 | 
128 | def param_init_lstm(options, params, prefix='lstm'):
129 |     """
130 |     Init the LSTM parameter:
131 | 
132 |     :see: init_params
133 |     """
134 |     W = numpy.concatenate([ortho_weight(options['dim_proj']),
135 |                            ortho_weight(options['dim_proj']),
136 |                            ortho_weight(options['dim_proj']),
137 |                            ortho_weight(options['dim_proj'])], axis=1)
138 |     params[_p(prefix, 'W')] = W
139 |     U = numpy.concatenate([ortho_weight(options['dim_proj']),
140 |                            ortho_weight(options['dim_proj']),
141 |                            ortho_weight(options['dim_proj']),
142 |                            ortho_weight(options['dim_proj'])], axis=1)
143 |     params[_p(prefix, 'U')] = U
144 |     b = numpy.zeros((4 * options['dim_proj'],))
145 |     params[_p(prefix, 'b')] = b.astype('float32')
146 | 
147 |     return params
148 | 
149 | 
150 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
151 |     nsteps = state_below.shape[0]
152 |     if state_below.ndim == 3:
153 |         n_samples = state_below.shape[1]
154 |     else:
155 |         n_samples = 1
156 | 
157 |     assert mask is not None
158 | 
159 |     def _slice(_x, n, dim):
160 |         if _x.ndim == 3:
161 |             return _x[:, :, n * dim:(n + 1) * dim]
162 |         return _x[:, n * dim:(n + 1) * dim]
163 | 
164 |     def _step(m_, x_, h_, c_):
165 |         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
166 |         preact += x_
167 |         preact += tparams[_p(prefix, 'b')]
168 | 
169 |         i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
170 |         f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
171 |         o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
172 |         c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
173 | 
174 |         c = f * c_ + i * c
175 |         c = m_[:, None] * c + (1. - m_)[:, None] * c_
176 | 
177 |         h = o * tensor.tanh(c)
178 |         h = m_[:, None] * h + (1. - m_)[:, None] * h_
179 | 
180 |         return h, c
181 | 
182 |     state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
183 |                    tparams[_p(prefix, 'b')])
184 | 
185 |     dim_proj = options['dim_proj']
186 |     rval, updates = theano.scan(_step,
187 |                                 sequences=[mask, state_below],
188 |                                 outputs_info=[tensor.alloc(0., n_samples,
189 |                                                            dim_proj),
190 |                                               tensor.alloc(0., n_samples,
191 |                                                            dim_proj)],
192 |                                 name=_p(prefix, '_layers'),
193 |                                 n_steps=nsteps)
194 |     return rval[0]
195 | 
196 | 
197 | # ff: Feed Forward (normal neural net), only useful to put after lstm
198 | #     before the classifier.
199 | layers = {'lstm': (param_init_lstm, lstm_layer)}
200 | 
201 | 
202 | def sgd(lr, tparams, grads, x, mask, y, cost):
203 |     """ Stochastic Gradient Descent
204 | 
205 |     :note: A more complicated version of sgd then needed.  This is
206 |         done like that for adadelta and rmsprop.
207 | 
208 |     """
209 |     # New set of shared variable that will contain the gradient
210 |     # for a mini-batch.
211 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
212 |                for k, p in tparams.iteritems()]
213 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
214 | 
215 |     # Function that computes gradients for a mini-batch, but do not
216 |     # updates the weights.
217 |     f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
218 |                                     name='sgd_f_grad_shared')
219 | 
220 |     pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
221 | 
222 |     # Function that updates the weights from the previously computed
223 |     # gradient.
224 |     f_update = theano.function([lr], [], updates=pup,
225 |                                name='sgd_f_update')
226 | 
227 |     return f_grad_shared, f_update
228 | 
229 | 
230 | def adadelta(lr, tparams, grads, x, rx, mask, y, cost):
231 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
232 |                                   name='%s_grad' % k)
233 |                     for k, p in tparams.iteritems()]
234 |     running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
235 |                                  name='%s_rup2' % k)
236 |                    for k, p in tparams.iteritems()]
237 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
238 |                                     name='%s_rgrad2' % k)
239 |                       for k, p in tparams.iteritems()]
240 | 
241 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
242 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
243 |              for rg2, g in zip(running_grads2, grads)]
244 | 
245 |     f_grad_shared = theano.function([x, rx, mask, y], cost, updates=zgup + rg2up,
246 |                                     name='adadelta_f_grad_shared')
247 | 
248 |     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
249 |              for zg, ru2, rg2 in zip(zipped_grads,
250 |                                      running_up2,
251 |                                      running_grads2)]
252 |     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
253 |              for ru2, ud in zip(running_up2, updir)]
254 |     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
255 | 
256 |     f_update = theano.function([lr], [], updates=ru2up + param_up,
257 |                                on_unused_input='ignore',
258 |                                name='adadelta_f_update')
259 | 
260 |     return f_grad_shared, f_update
261 | 
262 | 
263 | def rmsprop(lr, tparams, grads, x, rx, mask, y, cost):
264 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
265 |                                   name='%s_grad' % k)
266 |                     for k, p in tparams.iteritems()]
267 |     running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
268 |                                    name='%s_rgrad' % k)
269 |                      for k, p in tparams.iteritems()]
270 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
271 |                                     name='%s_rgrad2' % k)
272 |                       for k, p in tparams.iteritems()]
273 | 
274 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
275 |     rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
276 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
277 |              for rg2, g in zip(running_grads2, grads)]
278 | 
279 |     f_grad_shared = theano.function([x, rx, mask, y], cost,
280 |                                     updates=zgup + rgup + rg2up,
281 |                                     name='rmsprop_f_grad_shared')
282 | 
283 |     updir = [theano.shared(p.get_value() * numpy.float32(0.),
284 |                            name='%s_updir' % k)
285 |              for k, p in tparams.iteritems()]
286 |     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
287 |                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
288 |                                             running_grads2)]
289 |     param_up = [(p, p + udn[1])
290 |                 for p, udn in zip(tparams.values(), updir_new)]
291 |     f_update = theano.function([lr], [], updates=updir_new + param_up,
292 |                                on_unused_input='ignore',
293 |                                name='rmsprop_f_update')
294 | 
295 |     return f_grad_shared, f_update
296 | 
297 | 
298 | def build_model(tparams, options):
299 |     trng = RandomStreams(1234)
300 | 
301 |     # Used for dropout.
302 |     use_noise = theano.shared(numpy.float32(0.))
303 | 
304 |     x = tensor.matrix('x', dtype='int64')
305 |     rx = tensor.matrix('rx', dtype='int64')
306 |     mask = tensor.matrix('mask', dtype='float32')
307 |     y = tensor.vector('y', dtype='int64')
308 | 
309 |     n_timesteps = x.shape[0]
310 |     n_samples = x.shape[1]
311 | 
312 |     lstm_outs = []
313 |     for inp in [x, rx]:
314 |         emb = tparams['Wemb'][inp.flatten()].reshape([n_timesteps,
315 |                                                     n_samples,
316 |                                                     options['dim_proj']])
317 |         proj = get_layer(options['encoder'])[1](tparams, emb, options,
318 |                                                 prefix=options['encoder'],
319 |                                                 mask=mask)
320 |         if options['encoder'] == 'lstm':
321 |             proj = (proj * mask[:, :, None]).sum(axis=0)
322 |             proj = proj / mask.sum(axis=0)[:, None]
323 |         if options['use_dropout']:
324 |             proj = dropout_layer(proj, use_noise, trng)
325 |         lstm_outs.append(proj)
326 | 
327 |     del proj
328 |     pred = tensor.nnet.softmax(tensor.dot(theano.tensor.concatenate(lstm_outs),
329 |                                           tparams['U']) + tparams['b'])
330 |     pred = pred.reshape((2, pred.shape[0]/2, pred.shape[1])).mean(axis=0)
331 | 
332 |     f_pred_prob = theano.function([x, rx, mask], pred, name='f_pred_prob')
333 |     f_pred = theano.function([x, rx, mask], pred.argmax(axis=1), name='f_pred')
334 | 
335 |     cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
336 | 
337 |     return use_noise, x, rx, mask, y, f_pred_prob, f_pred, cost
338 | 
339 | 
340 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
341 |     """ If you want to use a trained model, this is useful to compute
342 |     the probabilities of new examples.
343 |     """
344 |     n_samples = len(data[0])
345 |     probs = numpy.zeros((n_samples, 2)).astype('float32')
346 | 
347 |     n_done = 0
348 | 
349 |     for _, valid_index in iterator:
350 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
351 |                                   numpy.array(data[1])[valid_index],
352 |                                   maxlen=None)
353 |         rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index],
354 |                                   numpy.array(data[1])[valid_index],
355 |                                   maxlen=None)
356 |         pred_probs = f_pred_prob(x, mask)
357 |         probs[valid_index, :] = pred_probs
358 | 
359 |         n_done += len(valid_index)
360 |         if verbose:
361 |             print '%d/%d samples classified' % (n_done, n_samples)
362 | 
363 |     return probs
364 | 
365 | 
366 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
367 |     """
368 |     Just compute the error
369 |     f_pred: Theano fct computing the prediction
370 |     prepare_data: usual prepare_data for that dataset.
371 |     """
372 |     valid_err = 0
373 |     for _, valid_index in iterator:
374 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
375 |                                   numpy.array(data[1])[valid_index],
376 |                                   maxlen=None)
377 |         rx, _, _ = prepare_data([data[0][t][::-1] for t in valid_index],
378 |                                 numpy.array(data[1])[valid_index],
379 |                                 maxlen=None)
380 |         preds = f_pred(x, rx, mask)
381 |         targets = numpy.array(data[1])[valid_index]
382 |         valid_err += (preds == targets).sum()
383 |     valid_err = 1. - numpy.float32(valid_err) / len(data[0])
384 | 
385 |     return valid_err
386 | 
387 | 
388 | def train_lstm(
389 |     train, valid, test,
390 |     dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
391 |     patience=10,  # Number of epoch to wait before early stop if no progress
392 |     max_epochs=5000,  # The maximum number of epoch to run
393 |     dispFreq=10,  # Display to stdout the training progress every N updates
394 |     decay_c=0.,  # Weight decay for the classifier applied to the U weights.
395 |     lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
396 |     n_words=10000,  # Vocabulary size
397 |     # sgd, adadelta and rmsprop available,
398 |     # sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
399 |     optimizer=adadelta,
400 |     encoder='lstm',  # TODO: can be removed must be lstm.
401 |     saveto='lstm_model.npz',  # The best model will be saved there
402 |     validFreq=370,  # Compute the validation error after this number of update.
403 |     saveFreq=1110,  # Save the parameters after every saveFreq updates
404 |     batch_size=16,  # The batch size during training.
405 |     valid_batch_size=64,  # The batch size used for validation/test set.
406 |     dataset='imdb',
407 | 
408 |     # Parameter for extra option
409 |     noise_std=0.,
410 |     use_dropout=True,  # if False slightly faster, but worst test error
411 |                        # This frequently need a bigger model.
412 |     reload_model="",  # Path to a saved model we want to start from.
413 |     test_size=-1,  # If >0, we will trunc the test set to this number of example.
414 | ):
415 | 
416 |     # Model options
417 |     model_options = locals().copy()
418 |     del model_options['train']
419 |     del model_options['valid']
420 |     del model_options['test']
421 |     print "model options", model_options
422 | 
423 |     if test_size > 0:
424 |         test = (test[0][:test_size], test[1][:test_size])
425 | 
426 |     ydim = numpy.max(train[1]) + 1
427 | 
428 |     model_options['ydim'] = ydim
429 | 
430 |     print 'Building model'
431 |     # This create the initial parameters as numpy ndarrays.
432 |     # Dict name (string) -> numpy ndarray
433 |     params = init_params(model_options)
434 | 
435 |     if reload_model:
436 |         load_params('lstm_model.npz', params)
437 | 
438 |     # This create Theano Shared Variable from the parameters.
439 |     # Dict name (string) -> Theano Tensor Shared Variable
440 |     # params and tparams have different copy of the weights.
441 |     tparams = init_tparams(params)
442 | 
443 |     # use_noise is for dropout
444 |     (use_noise, x, rx, mask,
445 |      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
446 | 
447 |     if decay_c > 0.:
448 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
449 |         weight_decay = 0.
450 |         weight_decay += (tparams['U'] ** 2).sum()
451 |         weight_decay *= decay_c
452 |         cost += weight_decay
453 | 
454 |     f_cost = theano.function([x, rx, mask, y], cost, name='f_cost')
455 | 
456 |     grads = tensor.grad(cost, wrt=tparams.values())
457 |     f_grad = theano.function([x, rx, mask, y], grads, name='f_grad')
458 | 
459 |     lr = tensor.scalar(name='lr')
460 |     f_grad_shared, f_update = optimizer(lr, tparams, grads,
461 |                                         x, rx, mask, y, cost)
462 | 
463 |     print 'Training'
464 | 
465 |     kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
466 |                                    shuffle=True)
467 |     kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
468 |                                   shuffle=True)
469 | 
470 |     print "%d train examples" % len(train[0])
471 |     print "%d valid examples" % len(valid[0])
472 |     print "%d test examples" % len(test[0])
473 |     history_errs = []
474 |     best_p = None
475 |     bad_count = 0
476 | 
477 |     if validFreq == -1:
478 |         validFreq = len(train[0]) / batch_size
479 |     if saveFreq == -1:
480 |         saveFreq = len(train[0]) / batch_size
481 | 
482 |     uidx = 0  # the number of update done
483 |     estop = False  # early stop
484 |     start_time = time.clock()
485 |     try:
486 |         for eidx in xrange(max_epochs):
487 |             n_samples = 0
488 | 
489 |             # Get new shuffled index for the training set.
490 |             kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
491 | 
492 |             for _, train_index in kf:
493 |                 uidx += 1
494 |                 use_noise.set_value(1.)
495 | 
496 |                 # Select the random examples for this minibatch
497 |                 y = [train[1][t] for t in train_index]
498 |                 x = [train[0][t] for t in train_index]
499 | 
500 |                 # Get the data in numpy.ndarray format
501 |                 # This swap the axis!
502 |                 # Return something of shape (minibatch maxlen, n samples)
503 |                 x, mask, y = prepare_data(x, y)
504 |                 rx, _, Y = prepare_data([t[::-1] for t in x], y)
505 |                 n_samples += x.shape[1]
506 | 
507 |                 cost = f_grad_shared(x, rx, mask, y)
508 |                 f_update(lrate)
509 | 
510 |                 if numpy.isnan(cost) or numpy.isinf(cost):
511 |                     print 'NaN detected'
512 |                     return 1., 1., 1.
513 | 
514 |                 if numpy.mod(uidx, dispFreq) == 0:
515 |                     print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
516 | 
517 |                 if numpy.mod(uidx, saveFreq) == 0:
518 |                     print 'Saving...',
519 | 
520 |                     if best_p is not None:
521 |                         params = best_p
522 |                     else:
523 |                         params = unzip(tparams)
524 |                     numpy.savez(saveto, history_errs=history_errs, **params)
525 |                     pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
526 |                     print 'Done'
527 | 
528 |                 if numpy.mod(uidx, validFreq) == 0:
529 |                     use_noise.set_value(0.)
530 |                     train_err = pred_error(f_pred, prepare_data, train, kf)
531 |                     valid_err = pred_error(f_pred, prepare_data, valid,
532 |                                            kf_valid)
533 |                     test_err = pred_error(f_pred, prepare_data, test, kf_test)
534 | 
535 |                     history_errs.append([valid_err, test_err])
536 | 
537 |                     if (uidx == 0 or
538 |                         valid_err <= numpy.array(history_errs)[:,
539 |                                                                0].min()):
540 | 
541 |                         best_p = unzip(tparams)
542 |                         bad_counter = 0
543 | 
544 |                     print ('Train ', train_err, 'Valid ', valid_err,
545 |                            'Test ', test_err)
546 | 
547 |                     if (len(history_errs) > patience and
548 |                         valid_err >= numpy.array(history_errs)[:-patience,
549 |                                                                0].min()):
550 |                         bad_counter += 1
551 |                         if bad_counter > patience:
552 |                             print 'Early Stop!'
553 |                             estop = True
554 |                             break
555 | 
556 |             print 'Seen %d samples' % n_samples
557 | 
558 |             if estop:
559 |                 break
560 | 
561 |     except KeyboardInterrupt:
562 |         print "Training interupted"
563 | 
564 |     end_time = time.clock()
565 |     print "Training done"
566 |     if best_p is not None:
567 |         zipp(best_p, tparams)
568 |     else:
569 |         best_p = unzip(tparams)
570 | 
571 |     print "Computing errors"
572 |     use_noise.set_value(0.)
573 |     train_err = pred_error(f_pred, prepare_data, train, kf)
574 |     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
575 |     test_err = pred_error(f_pred, prepare_data, test, kf_test)
576 | 
577 |     print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
578 | 
579 |     numpy.savez(saveto, train_err=train_err,
580 |                 valid_err=valid_err, test_err=test_err,
581 |                 history_errs=history_errs, **best_p)
582 |     print 'The code run for %d epochs, with %f sec/epochs' % (
583 |         (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
584 |     print >> sys.stderr, ('Training took %.1fs' %
585 |                           (end_time - start_time))
586 |     return train_err, valid_err, test_err
587 | 
588 | 
589 | # We must have floatX=float32 for this tutorial to work correctly.
590 | theano.config.floatX = "float32"
591 | # The next line is the new Theano default. This is a speed up.
592 | theano.config.scan.allow_gc = False
593 | 
594 | print 'Loading data'
595 | n_words = 10000
596 | load_data, prepare_data = get_dataset("imdb")
597 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
598 |                                maxlen=100)
599 | print 'Loading data: Done'
600 | print "See the comment at the end of this cell to train the model."
601 | 
602 | train_lstm(
603 |     train, valid, test,
604 | # Setting max_epochs to 15 will show that it start to learn ~10m on my laptop.
605 |     max_epochs=16,
606 |     test_size=500,
607 |     n_words=n_words,
608 |     )


--------------------------------------------------------------------------------
/lstm_reverse.diff:
--------------------------------------------------------------------------------
 1 | --- lstm.py.orig	2015-01-16 17:20:22.075153409 -0800
 2 | +++ lstm_reverse.py	2015-01-16 17:21:10.243152608 -0800
 3 | @@ -582,6 +582,9 @@
 4 |  load_data, prepare_data = get_dataset("imdb")
 5 |  train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
 6 |                                 maxlen=100)
 7 | +train = ([r[::-1] for r in train[0]], train[1])
 8 | +valid = ([r[::-1] for r in valid[0]], valid[1])
 9 | +test = ([r[::-1] for r in test[0]], test[1])
10 |  print 'Loading data: Done'
11 |  print "See the comment at the end of this cell to train the model."
12 |  
13 | 


--------------------------------------------------------------------------------
/lstm_reverse.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Build a tweet sentiment analyzer
  3 | '''
  4 | from collections import OrderedDict
  5 | import cPickle as pkl
  6 | import random
  7 | import sys
  8 | import time
  9 | 
 10 | import numpy
 11 | import theano
 12 | import theano.tensor as tensor
 13 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 14 | 
 15 | import imdb
 16 | 
 17 | datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 18 | 
 19 | 
 20 | def get_minibatches_idx(n, minibatch_size, shuffle=False):
 21 |     """
 22 |     Used to shuffle the dataset at each iteration.
 23 |     """
 24 | 
 25 |     idx_list = numpy.arange(n, dtype="int32")
 26 | 
 27 |     if shuffle:
 28 |         random.shuffle(idx_list)
 29 | 
 30 |     minibatches = []
 31 |     minibatch_start = 0
 32 |     for i in range(n // minibatch_size):
 33 |         minibatches.append(idx_list[minibatch_start:
 34 |                                     minibatch_start + minibatch_size])
 35 |         minibatch_start += minibatch_size
 36 | 
 37 |     if (minibatch_start != n):
 38 |         # Make a minibatch out of what is left
 39 |         minibatches.append(idx_list[minibatch_start:])
 40 | 
 41 |     return zip(range(len(minibatches)), minibatches)
 42 | 
 43 | 
 44 | def get_dataset(name):
 45 |     return datasets[name][0], datasets[name][1]
 46 | 
 47 | 
 48 | def zipp(params, tparams):
 49 |     """
 50 |     When we reload the model. Needed for the GPU stuff.
 51 |     """
 52 |     for kk, vv in params.iteritems():
 53 |         tparams[kk].set_value(vv)
 54 | 
 55 | 
 56 | def unzip(zipped):
 57 |     """
 58 |     When we pickle the model. Needed for the GPU stuff.
 59 |     """
 60 |     new_params = OrderedDict()
 61 |     for kk, vv in zipped.iteritems():
 62 |         new_params[kk] = vv.get_value()
 63 |     return new_params
 64 | 
 65 | 
 66 | def dropout_layer(state_before, use_noise, trng):
 67 |     proj = tensor.switch(use_noise,
 68 |                          (state_before *
 69 |                           trng.binomial(state_before.shape,
 70 |                                         p=0.5, n=1,
 71 |                                         dtype=state_before.dtype)),
 72 |                          state_before * 0.5)
 73 |     return proj
 74 | 
 75 | 
 76 | def _p(pp, name):
 77 |     return '%s_%s' % (pp, name)
 78 | 
 79 | 
 80 | def init_params(options):
 81 |     """
 82 |     Global (not LSTM) parameter. For the embeding and the classifier.
 83 |     """
 84 |     params = OrderedDict()
 85 |     # embedding
 86 |     randn = numpy.random.rand(options['n_words'],
 87 |                               options['dim_proj'])
 88 |     params['Wemb'] = (0.01 * randn).astype('float32')
 89 |     params = get_layer(options['encoder'])[0](options,
 90 |                                               params,
 91 |                                               prefix=options['encoder'])
 92 |     # classifier
 93 |     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
 94 |                                             options['ydim']).astype('float32')
 95 |     params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
 96 | 
 97 |     return params
 98 | 
 99 | 
100 | def load_params(path, params):
101 |     pp = numpy.load(path)
102 |     for kk, vv in params.iteritems():
103 |         if kk not in pp:
104 |             raise Warning('%s is not in the archive' % kk)
105 |         params[kk] = pp[kk]
106 | 
107 |     return params
108 | 
109 | 
110 | def init_tparams(params):
111 |     tparams = OrderedDict()
112 |     for kk, pp in params.iteritems():
113 |         tparams[kk] = theano.shared(params[kk], name=kk)
114 |     return tparams
115 | 
116 | 
117 | def get_layer(name):
118 |     fns = layers[name]
119 |     return fns
120 | 
121 | 
122 | def ortho_weight(ndim):
123 |     W = numpy.random.randn(ndim, ndim)
124 |     u, s, v = numpy.linalg.svd(W)
125 |     return u.astype('float32')
126 | 
127 | 
128 | def param_init_lstm(options, params, prefix='lstm'):
129 |     """
130 |     Init the LSTM parameter:
131 | 
132 |     :see: init_params
133 |     """
134 |     W = numpy.concatenate([ortho_weight(options['dim_proj']),
135 |                            ortho_weight(options['dim_proj']),
136 |                            ortho_weight(options['dim_proj']),
137 |                            ortho_weight(options['dim_proj'])], axis=1)
138 |     params[_p(prefix, 'W')] = W
139 |     U = numpy.concatenate([ortho_weight(options['dim_proj']),
140 |                            ortho_weight(options['dim_proj']),
141 |                            ortho_weight(options['dim_proj']),
142 |                            ortho_weight(options['dim_proj'])], axis=1)
143 |     params[_p(prefix, 'U')] = U
144 |     b = numpy.zeros((4 * options['dim_proj'],))
145 |     params[_p(prefix, 'b')] = b.astype('float32')
146 | 
147 |     return params
148 | 
149 | 
150 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
151 |     nsteps = state_below.shape[0]
152 |     if state_below.ndim == 3:
153 |         n_samples = state_below.shape[1]
154 |     else:
155 |         n_samples = 1
156 | 
157 |     assert mask is not None
158 | 
159 |     def _slice(_x, n, dim):
160 |         if _x.ndim == 3:
161 |             return _x[:, :, n * dim:(n + 1) * dim]
162 |         return _x[:, n * dim:(n + 1) * dim]
163 | 
164 |     def _step(m_, x_, h_, c_):
165 |         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
166 |         preact += x_
167 |         preact += tparams[_p(prefix, 'b')]
168 | 
169 |         i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
170 |         f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
171 |         o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
172 |         c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
173 | 
174 |         c = f * c_ + i * c
175 |         c = m_[:, None] * c + (1. - m_)[:, None] * c_
176 | 
177 |         h = o * tensor.tanh(c)
178 |         h = m_[:, None] * h + (1. - m_)[:, None] * h_
179 | 
180 |         return h, c
181 | 
182 |     state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
183 |                    tparams[_p(prefix, 'b')])
184 | 
185 |     dim_proj = options['dim_proj']
186 |     rval, updates = theano.scan(_step,
187 |                                 sequences=[mask, state_below],
188 |                                 outputs_info=[tensor.alloc(0., n_samples,
189 |                                                            dim_proj),
190 |                                               tensor.alloc(0., n_samples,
191 |                                                            dim_proj)],
192 |                                 name=_p(prefix, '_layers'),
193 |                                 n_steps=nsteps)
194 |     return rval[0]
195 | 
196 | 
197 | # ff: Feed Forward (normal neural net), only useful to put after lstm
198 | #     before the classifier.
199 | layers = {'lstm': (param_init_lstm, lstm_layer)}
200 | 
201 | 
202 | def sgd(lr, tparams, grads, x, mask, y, cost):
203 |     """ Stochastic Gradient Descent
204 | 
205 |     :note: A more complicated version of sgd then needed.  This is
206 |         done like that for adadelta and rmsprop.
207 | 
208 |     """
209 |     # New set of shared variable that will contain the gradient
210 |     # for a mini-batch.
211 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
212 |                for k, p in tparams.iteritems()]
213 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
214 | 
215 |     # Function that computes gradients for a mini-batch, but do not
216 |     # updates the weights.
217 |     f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
218 |                                     name='sgd_f_grad_shared')
219 | 
220 |     pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
221 | 
222 |     # Function that updates the weights from the previously computed
223 |     # gradient.
224 |     f_update = theano.function([lr], [], updates=pup,
225 |                                name='sgd_f_update')
226 | 
227 |     return f_grad_shared, f_update
228 | 
229 | 
230 | def adadelta(lr, tparams, grads, x, mask, y, cost):
231 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
232 |                                   name='%s_grad' % k)
233 |                     for k, p in tparams.iteritems()]
234 |     running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
235 |                                  name='%s_rup2' % k)
236 |                    for k, p in tparams.iteritems()]
237 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
238 |                                     name='%s_rgrad2' % k)
239 |                       for k, p in tparams.iteritems()]
240 | 
241 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
242 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
243 |              for rg2, g in zip(running_grads2, grads)]
244 | 
245 |     f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
246 |                                     name='adadelta_f_grad_shared')
247 | 
248 |     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
249 |              for zg, ru2, rg2 in zip(zipped_grads,
250 |                                      running_up2,
251 |                                      running_grads2)]
252 |     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
253 |              for ru2, ud in zip(running_up2, updir)]
254 |     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
255 | 
256 |     f_update = theano.function([lr], [], updates=ru2up + param_up,
257 |                                on_unused_input='ignore',
258 |                                name='adadelta_f_update')
259 | 
260 |     return f_grad_shared, f_update
261 | 
262 | 
263 | def rmsprop(lr, tparams, grads, x, mask, y, cost):
264 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
265 |                                   name='%s_grad' % k)
266 |                     for k, p in tparams.iteritems()]
267 |     running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
268 |                                    name='%s_rgrad' % k)
269 |                      for k, p in tparams.iteritems()]
270 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
271 |                                     name='%s_rgrad2' % k)
272 |                       for k, p in tparams.iteritems()]
273 | 
274 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
275 |     rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
276 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
277 |              for rg2, g in zip(running_grads2, grads)]
278 | 
279 |     f_grad_shared = theano.function([x, mask, y], cost,
280 |                                     updates=zgup + rgup + rg2up,
281 |                                     name='rmsprop_f_grad_shared')
282 | 
283 |     updir = [theano.shared(p.get_value() * numpy.float32(0.),
284 |                            name='%s_updir' % k)
285 |              for k, p in tparams.iteritems()]
286 |     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
287 |                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
288 |                                             running_grads2)]
289 |     param_up = [(p, p + udn[1])
290 |                 for p, udn in zip(tparams.values(), updir_new)]
291 |     f_update = theano.function([lr], [], updates=updir_new + param_up,
292 |                                on_unused_input='ignore',
293 |                                name='rmsprop_f_update')
294 | 
295 |     return f_grad_shared, f_update
296 | 
297 | 
298 | def build_model(tparams, options):
299 |     trng = RandomStreams(1234)
300 | 
301 |     # Used for dropout.
302 |     use_noise = theano.shared(numpy.float32(0.))
303 | 
304 |     x = tensor.matrix('x', dtype='int64')
305 |     mask = tensor.matrix('mask', dtype='float32')
306 |     y = tensor.vector('y', dtype='int64')
307 | 
308 |     n_timesteps = x.shape[0]
309 |     n_samples = x.shape[1]
310 | 
311 |     emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
312 |                                                 n_samples,
313 |                                                 options['dim_proj']])
314 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
315 |                                             prefix=options['encoder'],
316 |                                             mask=mask)
317 |     if options['encoder'] == 'lstm':
318 |         proj = (proj * mask[:, :, None]).sum(axis=0)
319 |         proj = proj / mask.sum(axis=0)[:, None]
320 |     if options['use_dropout']:
321 |         proj = dropout_layer(proj, use_noise, trng)
322 | 
323 |     pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
324 | 
325 |     f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
326 |     f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
327 | 
328 |     cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
329 | 
330 |     return use_noise, x, mask, y, f_pred_prob, f_pred, cost
331 | 
332 | 
333 | def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
334 |     """ If you want to use a trained model, this is useful to compute
335 |     the probabilities of new examples.
336 |     """
337 |     n_samples = len(data[0])
338 |     probs = numpy.zeros((n_samples, 2)).astype('float32')
339 | 
340 |     n_done = 0
341 | 
342 |     for _, valid_index in iterator:
343 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
344 |                                   numpy.array(data[1])[valid_index],
345 |                                   maxlen=None)
346 |         pred_probs = f_pred_prob(x, mask)
347 |         probs[valid_index, :] = pred_probs
348 | 
349 |         n_done += len(valid_index)
350 |         if verbose:
351 |             print '%d/%d samples classified' % (n_done, n_samples)
352 | 
353 |     return probs
354 | 
355 | 
356 | def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
357 |     """
358 |     Just compute the error
359 |     f_pred: Theano fct computing the prediction
360 |     prepare_data: usual prepare_data for that dataset.
361 |     """
362 |     valid_err = 0
363 |     for _, valid_index in iterator:
364 |         x, mask, y = prepare_data([data[0][t] for t in valid_index],
365 |                                   numpy.array(data[1])[valid_index],
366 |                                   maxlen=None)
367 |         preds = f_pred(x, mask)
368 |         targets = numpy.array(data[1])[valid_index]
369 |         valid_err += (preds == targets).sum()
370 |     valid_err = 1. - numpy.float32(valid_err) / len(data[0])
371 | 
372 |     return valid_err
373 | 
374 | 
375 | def train_lstm(
376 |     train, valid, test,
377 |     dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
378 |     patience=10,  # Number of epoch to wait before early stop if no progress
379 |     max_epochs=5000,  # The maximum number of epoch to run
380 |     dispFreq=10,  # Display to stdout the training progress every N updates
381 |     decay_c=0.,  # Weight decay for the classifier applied to the U weights.
382 |     lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
383 |     n_words=10000,  # Vocabulary size
384 |     # sgd, adadelta and rmsprop available,
385 |     # sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
386 |     optimizer=adadelta,
387 |     encoder='lstm',  # TODO: can be removed must be lstm.
388 |     saveto='lstm_model.npz',  # The best model will be saved there
389 |     validFreq=370,  # Compute the validation error after this number of update.
390 |     saveFreq=1110,  # Save the parameters after every saveFreq updates
391 |     batch_size=16,  # The batch size during training.
392 |     valid_batch_size=64,  # The batch size used for validation/test set.
393 |     dataset='imdb',
394 | 
395 |     # Parameter for extra option
396 |     noise_std=0.,
397 |     use_dropout=True,  # if False slightly faster, but worst test error
398 |                        # This frequently need a bigger model.
399 |     reload_model="",  # Path to a saved model we want to start from.
400 |     test_size=-1,  # If >0, we will trunc the test set to this number of example.
401 | ):
402 | 
403 |     # Model options
404 |     model_options = locals().copy()
405 |     del model_options['train']
406 |     del model_options['valid']
407 |     del model_options['test']
408 |     print "model options", model_options
409 | 
410 |     if test_size > 0:
411 |         test = (test[0][:test_size], test[1][:test_size])
412 | 
413 |     ydim = numpy.max(train[1]) + 1
414 | 
415 |     model_options['ydim'] = ydim
416 | 
417 |     print 'Building model'
418 |     # This create the initial parameters as numpy ndarrays.
419 |     # Dict name (string) -> numpy ndarray
420 |     params = init_params(model_options)
421 | 
422 |     if reload_model:
423 |         load_params('lstm_model.npz', params)
424 | 
425 |     # This create Theano Shared Variable from the parameters.
426 |     # Dict name (string) -> Theano Tensor Shared Variable
427 |     # params and tparams have different copy of the weights.
428 |     tparams = init_tparams(params)
429 | 
430 |     # use_noise is for dropout
431 |     (use_noise, x, mask,
432 |      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
433 | 
434 |     if decay_c > 0.:
435 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
436 |         weight_decay = 0.
437 |         weight_decay += (tparams['U'] ** 2).sum()
438 |         weight_decay *= decay_c
439 |         cost += weight_decay
440 | 
441 |     f_cost = theano.function([x, mask, y], cost, name='f_cost')
442 | 
443 |     grads = tensor.grad(cost, wrt=tparams.values())
444 |     f_grad = theano.function([x, mask, y], grads, name='f_grad')
445 | 
446 |     lr = tensor.scalar(name='lr')
447 |     f_grad_shared, f_update = optimizer(lr, tparams, grads,
448 |                                         x, mask, y, cost)
449 | 
450 |     print 'Training'
451 | 
452 |     kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
453 |                                    shuffle=True)
454 |     kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
455 |                                   shuffle=True)
456 | 
457 |     print "%d train examples" % len(train[0])
458 |     print "%d valid examples" % len(valid[0])
459 |     print "%d test examples" % len(test[0])
460 |     history_errs = []
461 |     best_p = None
462 |     bad_count = 0
463 | 
464 |     if validFreq == -1:
465 |         validFreq = len(train[0]) / batch_size
466 |     if saveFreq == -1:
467 |         saveFreq = len(train[0]) / batch_size
468 | 
469 |     uidx = 0  # the number of update done
470 |     estop = False  # early stop
471 |     start_time = time.clock()
472 |     try:
473 |         for eidx in xrange(max_epochs):
474 |             n_samples = 0
475 | 
476 |             # Get new shuffled index for the training set.
477 |             kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
478 | 
479 |             for _, train_index in kf:
480 |                 uidx += 1
481 |                 use_noise.set_value(1.)
482 | 
483 |                 # Select the random examples for this minibatch
484 |                 y = [train[1][t] for t in train_index]
485 |                 x = [train[0][t] for t in train_index]
486 | 
487 |                 # Get the data in numpy.ndarray format
488 |                 # This swap the axis!
489 |                 # Return something of shape (minibatch maxlen, n samples)
490 |                 x, mask, y = prepare_data(x, y)
491 |                 n_samples += x.shape[1]
492 | 
493 |                 cost = f_grad_shared(x, mask, y)
494 |                 f_update(lrate)
495 | 
496 |                 if numpy.isnan(cost) or numpy.isinf(cost):
497 |                     print 'NaN detected'
498 |                     return 1., 1., 1.
499 | 
500 |                 if numpy.mod(uidx, dispFreq) == 0:
501 |                     print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
502 | 
503 |                 if numpy.mod(uidx, saveFreq) == 0:
504 |                     print 'Saving...',
505 | 
506 |                     if best_p is not None:
507 |                         params = best_p
508 |                     else:
509 |                         params = unzip(tparams)
510 |                     numpy.savez(saveto, history_errs=history_errs, **params)
511 |                     pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
512 |                     print 'Done'
513 | 
514 |                 if numpy.mod(uidx, validFreq) == 0:
515 |                     use_noise.set_value(0.)
516 |                     train_err = pred_error(f_pred, prepare_data, train, kf)
517 |                     valid_err = pred_error(f_pred, prepare_data, valid,
518 |                                            kf_valid)
519 |                     test_err = pred_error(f_pred, prepare_data, test, kf_test)
520 | 
521 |                     history_errs.append([valid_err, test_err])
522 | 
523 |                     if (uidx == 0 or
524 |                         valid_err <= numpy.array(history_errs)[:,
525 |                                                                0].min()):
526 | 
527 |                         best_p = unzip(tparams)
528 |                         bad_counter = 0
529 | 
530 |                     print ('Train ', train_err, 'Valid ', valid_err,
531 |                            'Test ', test_err)
532 | 
533 |                     if (len(history_errs) > patience and
534 |                         valid_err >= numpy.array(history_errs)[:-patience,
535 |                                                                0].min()):
536 |                         bad_counter += 1
537 |                         if bad_counter > patience:
538 |                             print 'Early Stop!'
539 |                             estop = True
540 |                             break
541 | 
542 |             print 'Seen %d samples' % n_samples
543 | 
544 |             if estop:
545 |                 break
546 | 
547 |     except KeyboardInterrupt:
548 |         print "Training interupted"
549 | 
550 |     end_time = time.clock()
551 |     print "Training done"
552 |     if best_p is not None:
553 |         zipp(best_p, tparams)
554 |     else:
555 |         best_p = unzip(tparams)
556 | 
557 |     print "Computing errors"
558 |     use_noise.set_value(0.)
559 |     train_err = pred_error(f_pred, prepare_data, train, kf)
560 |     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
561 |     test_err = pred_error(f_pred, prepare_data, test, kf_test)
562 | 
563 |     print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
564 | 
565 |     numpy.savez(saveto, train_err=train_err,
566 |                 valid_err=valid_err, test_err=test_err,
567 |                 history_errs=history_errs, **best_p)
568 |     print 'The code run for %d epochs, with %f sec/epochs' % (
569 |         (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
570 |     print >> sys.stderr, ('Training took %.1fs' %
571 |                           (end_time - start_time))
572 |     return train_err, valid_err, test_err
573 | 
574 | 
575 | # We must have floatX=float32 for this tutorial to work correctly.
576 | theano.config.floatX = "float32"
577 | # The next line is the new Theano default. This is a speed up.
578 | theano.config.scan.allow_gc = False
579 | 
580 | print 'Loading data'
581 | n_words = 10000
582 | load_data, prepare_data = get_dataset("imdb")
583 | train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
584 |                                maxlen=100)
585 | train = ([r[::-1] for r in train[0]], train[1])
586 | valid = ([r[::-1] for r in valid[0]], valid[1])
587 | test = ([r[::-1] for r in test[0]], test[1])
588 | print 'Loading data: Done'
589 | print "See the comment at the end of this cell to train the model."
590 | 
591 | # See function train for all possible parameter and there definition.
592 | #train_lstm(
593 | #    train, valid, test,
594 | #    max_epochs=16,
595 | #    test_size=500,
596 | #    n_words=n_words,
597 | #)
598 | 


--------------------------------------------------------------------------------