├── .gitignore
├── LICENSE
├── README.md
├── setup.py
└── spaghetti
    ├── __init__.py
    ├── layers.py
    └── objectives.py
/.gitignore:
--------------------------------------------------------------------------------
1 | scratch
2 | 
3 | # Created by https://www.gitignore.io/api/python
4 | 
5 | ### Python ###
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *,cover
51 | .hypothesis/
52 | 
53 | # Translations
54 | *.mo
55 | *.pot
56 | 
57 | # Django stuff:
58 | *.log
59 | 
60 | # Sphinx documentation
61 | docs/_build/
62 | 
63 | # PyBuilder
64 | target/
65 | 
66 | 
67 | # Created by https://www.gitignore.io/api/pycharm
68 | 
69 | ### PyCharm ###
70 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
71 | 
72 | *.iml
73 | 
74 | ## Directory-based project format:
75 | .idea/
76 | # if you remove the above rule, at least ignore the following:
77 | 
78 | # User-specific stuff:
79 | # .idea/workspace.xml
80 | # .idea/tasks.xml
81 | # .idea/dictionaries
82 | # .idea/shelf
83 | 
84 | # Sensitive or high-churn files:
85 | # .idea/dataSources.ids
86 | # .idea/dataSources.xml
87 | # .idea/sqlDataSources.xml
88 | # .idea/dynamic.xml
89 | # .idea/uiDesigner.xml
90 | 
91 | # Gradle:
92 | # .idea/gradle.xml
93 | # .idea/libraries
94 | 
95 | # Mongo Explorer plugin:
96 | # .idea/mongoSettings.xml
97 | 
98 | ## File-based project format:
99 | *.ipr
100 | *.iws
101 | 
102 | ## Plugin-specific files:
103 | 
104 | # IntelliJ
105 | /out/
106 | 
107 | # mpeltonen/sbt-idea plugin
108 | .idea_modules/
109 | 
110 | # JIRA plugin
111 | atlassian-ide-plugin.xml
112 | 
113 | # Crashlytics plugin (for Android Studio and IntelliJ)
114 | com_crashlytics_export_strings.xml
115 | crashlytics.properties
116 | crashlytics-build.properties
117 | fabric.properties
118 | 
119 | 
120 | # Created by https://www.gitignore.io/api/ipythonnotebook
121 | 
122 | ### IPythonNotebook ###
123 | # Temporary data
124 | .ipynb_checkpoints/
125 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Filip Korzeniowski
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spaghetti
2 | 
3 | Spaghetti is an implementation of **Linear-Chain Conditional Random Fields**
4 | (CRFs) as a [Lasagne](https://github.com/Lasagne/Lasagne) layer. It facilitates
5 | integrating CRFs with neural networks.
6 | 
7 | ## Installation
8 | 
9 | To install Spaghetti, follow these steps. Adapt as necessary.
10 | 
11 | 1. `git clone https://github.com/fdlm/Spaghetti.git`
12 | 2. `cd Spaghetti`
13 | 3. `python setup.py install`
14 | 
15 | ## Examples
16 | 
17 | ### Decoding with fixed parameters
18 | 
19 | ```python
20 | import numpy as np
21 | import theano
22 | import theano.tensor as T
23 | import spaghetti as spg
24 | import lasagne
25 | 
26 | # invent parameters for the CRF
27 | 
28 | eta = 1e-15  # small constant for numerical stability (avoids log(0))
29 | pi = np.log(np.array([0.6, 0.2, 0.1, 0.1], dtype=np.float32))
30 | tau = np.log(np.ones(4, dtype=np.float32))
31 | c = np.log(np.ones(4, dtype=np.float32))
32 | 
33 | A = np.log(np.array([[0.8, 0.2, 0.0, 0.0],
34 |                      [0.1, 0.6, 0.3, 0.0],
35 |                      [0.0, 0.2, 0.7, 0.1],
36 |                      [0.0, 0.0, 0.4, 0.6]]) + eta).astype(np.float32)
37 | 
38 | W = np.log(np.array([[0.7, 0.1, 0.2, 0.3],
39 |                      [0.15, 0.4, 0.7, 0.1],
40 |                      [0.15, 0.5, 0.1, 0.6]]) + eta).astype(np.float32)
41 | 
42 | # create observation sequence in one-hot encoding
43 | 
44 | def to_onehot(seq, num_states=3):
45 |     seq_oh = np.zeros(seq.shape + (num_states,), dtype=np.float32)
46 |     seq_oh[range(len(seq)), seq] = 1.
47 |     return seq_oh
48 | 
49 | x = to_onehot(np.array([0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 0, 1, 1, 1, 0, 2,
50 |                         0, 2, 0, 1, 1, 2, 0, 0, 0, 1]))[np.newaxis, ...]
51 | 
52 | # create a simple CRF model
53 | 
54 | x_var = T.ftensor3(name='x')
55 | l_in = lasagne.layers.InputLayer(name='input', shape=(None, x.shape[1], 3),
56 |                                  input_var=x_var)
57 | l_crf = spg.layers.CrfLayer(incoming=l_in, num_states=4, name='crf',
58 |                             pi=pi, tau=tau, c=c, A=A, W=W)
59 | path = lasagne.layers.get_output(l_crf, mode='decoding')
60 | decode = theano.function([x_var], path)
61 | 
62 | # decode the state sequence, convert it from one-hot to state ids
63 | print(decode(x).argmax(axis=2))
64 | ```
65 | 
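66 | ### Filtering and the partition function
67 | 
68 | Besides decoding, the same layer can output the filtering distribution
69 | `P(y_t | x_{1:t})` and the log partition function `log Z(x)` by passing a
70 | different `mode` to `lasagne.layers.get_output`. A minimal sketch, reusing
71 | `x_var`, `l_crf` and `x` from the decoding example above:
72 | 
73 | ```python
74 | # filtering distribution, shape (num_batches, seq_len, num_states)
75 | filt = lasagne.layers.get_output(l_crf, mode='filtering')
76 | filter_fn = theano.function([x_var], filt)
77 | print(filter_fn(x))
78 | 
79 | # log partition function, one value per sequence in the batch
80 | log_z = lasagne.layers.get_output(l_crf, mode='partition')
81 | partition_fn = theano.function([x_var], log_z)
82 | print(partition_fn(x))
83 | ```
84 | 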
85 | ### Training
86 | 
87 | ```python
88 | import spaghetti as spg
89 | import lasagne as lnn
90 | import numpy as np
91 | import theano.tensor as tt
92 | import theano
93 | 
94 | # one-hot encoding of sequences
95 | 
96 | def to_onehot(seq, num_states=4):
97 |     seq_oh = np.zeros(seq.shape + (num_states,), dtype=np.float32)
98 |     seq_oh[range(len(seq)), seq] = 1.
99 |     return seq_oh
100 | 
101 | x = np.stack((to_onehot(np.array([0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 0, 1, 1, 1, 0, 2, 0, 2, 0, 1, 1, 2, 0, 0, 0, 1]), 3),
102 |               to_onehot(np.array([2, 2, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 1, 2, 0, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1]), 3)))
103 | 
104 | y = np.stack((to_onehot(np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 1, 0, 0, 0, 0])),
105 |               to_onehot(np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2]))))
106 | 
107 | # create model
108 | x_var = tt.ftensor3(name='x')  # observation sequence variable
109 | y_var = tt.ftensor3(name='y')  # state sequence variable
110 | 
111 | l_in = lnn.layers.InputLayer(name='input', shape=(2, x.shape[1], 3),
112 |                              input_var=x_var)
113 | 
114 | l_crf = spg.layers.CrfLayer(incoming=l_in, num_states=4, name='crf')
115 | 
116 | # create train function
117 | objective = spg.objectives.neg_log_likelihood(l_crf, y_var)
118 | params = lnn.layers.get_all_params(l_crf, trainable=True)
119 | loss = objective.mean()
120 | updates = lnn.updates.sgd(loss, params, learning_rate=0.01)
121 | train = theano.function([y_var, x_var], loss, updates=updates)
122 | 
123 | for i in range(100):
124 |     cur_loss = train(y, x)
125 |     if i % 10 == 0:
126 |         print(cur_loss)
127 | ```
128 | 
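129 | ### Combining with recurrent layers
130 | 
131 | `CrfLayer` expects input of shape `(batch_size, sequence_length,
132 | num_features)`, the same convention Lasagne's recurrent layers follow, so it
133 | can be stacked directly on top of them. A minimal sketch; the hidden size and
134 | the use of a plain `RecurrentLayer` here are illustrative assumptions:
135 | 
136 | ```python
137 | import lasagne as lnn
138 | import theano.tensor as tt
139 | import spaghetti as spg
140 | 
141 | x_var = tt.ftensor3(name='x')
142 | l_in = lnn.layers.InputLayer(shape=(None, None, 3), input_var=x_var)
143 | # any layer with output shape (batch_size, seq_len, num_features) fits here
144 | l_rnn = lnn.layers.RecurrentLayer(l_in, num_units=16)
145 | l_crf = spg.layers.CrfLayer(incoming=l_rnn, num_states=4)
146 | path = lnn.layers.get_output(l_crf, mode='decoding')
147 | ```
148 | 
149 | When training such a stack, `lasagne.layers.get_all_params(l_crf,
150 | trainable=True)` collects the parameters of both the recurrent layer and the
151 | CRF, so they are trained jointly.
152 | 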
153 | ## TODO
154 | 
155 | - Add unit tests
156 | - Implement smoothing
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(
4 |     name='Spaghetti',
5 |     version='0.1dev',
6 |     description='A Lasagne-compatible conditional random field implementation',
7 |     classifiers=[
8 |         "Development Status :: 3 - Alpha",
9 |         "Intended Audience :: Developers",
10 |         "Intended Audience :: Science/Research",
11 |         "License :: OSI Approved :: MIT License",
12 |         "Programming Language :: Python :: 2.7",
13 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
14 |     ],
15 |     license='MIT',
16 |     author='Filip Korzeniowski',
17 |     author_email='filip.korzeniowski@jku.at',
18 |     install_requires=['numpy', 'Lasagne', 'Theano']
19 | )
20 | 
--------------------------------------------------------------------------------
/spaghetti/__init__.py:
--------------------------------------------------------------------------------
1 | from . import layers
2 | from . import objectives
3 | 
4 | __version__ = '0.1dev'
5 | 
--------------------------------------------------------------------------------
/spaghetti/layers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Layers that construct linear-chain conditional random fields (CRFs). Similar to
4 | Lasagne's recurrent layers, CRF layers expect the input shape to be
5 | ``(batch_size, sequence_length, num_inputs)``. The input is allowed to have
6 | more than three dimensions, in which case dimensions trailing the third
7 | dimension are flattened. CRF layers can be combined directly with recurrent
8 | layers and, using reshape operations, with feed-forward layers (see the
9 | Lasagne docs for recurrent layers).
10 | 
11 | .. currentmodule:: spaghetti.layers
12 | 
13 | .. autosummary::
14 |     :nosignatures:
15 | 
16 |     CrfLayer
17 | """
18 | import lasagne as lnn
19 | import theano
20 | import theano.tensor as tt
21 | import numpy as np
22 | 
23 | STATE_ID_DTYPE = 'uint16'
24 | 
25 | 
26 | class CrfLayer(lnn.layers.MergeLayer):
27 |     """
28 |     spaghetti.layers.CrfLayer(incoming, num_states, pi, tau, c, A, W,
29 |     mask_input=None, **kwargs)
30 | 
31 |     A layer which implements a linear-chain conditional random field.
32 | 
33 |     It assumes only pairwise potentials between labels and pairwise
34 |     label-input potentials. Different outputs can be computed by
35 |     passing a `mode` parameter to `get_output`.
36 | 
37 |     .. math::
38 |         P(y \mid x) = \frac{1}{Z(x)}\exp(y_0^T \pi +
39 |         \sum_{n=1}^N[y_{n-1}^T A y_n + y_n^T c + x_n^T W y_n] +
40 |         y_N^T \tau)
41 | 
42 |     Parameters
43 |     ----------
44 |     incoming : a :class:`lasagne.layers.Layer` instance or a tuple
45 |         The layer feeding into this layer, or the expected input shape
46 |     num_states : int
47 |         Number of hidden states in the CRF
48 |     pi : callable, np.ndarray or theano.shared
49 |         Initializer for the initial potential (:math:`\pi`)
50 |     tau : callable, np.ndarray or theano.shared
51 |         Initializer for the final potential (:math:`\tau`)
52 |     c : callable, np.ndarray or theano.shared
53 |         Initializer for the label bias potential (:math:`c`)
54 |     A : callable, np.ndarray or theano.shared
55 |         Initializer for the pairwise label potential (:math:`A`)
56 |     W : callable, np.ndarray or theano.shared
57 |         Initializer for the pairwise label-input potential (:math:`W`)
58 |     mask_input : :class:`lasagne.layers.Layer`
59 |         Layer which allows for a sequence mask to be input, for when sequences
60 |         are of variable length. Default `None`, which means no mask will be
61 |         supplied (i.e. all sequences are of the same length).
62 |     """
63 | 
64 |     def __init__(self, incoming, num_states, pi=lnn.init.Constant(0.),
65 |                  tau=lnn.init.Constant(0.), c=lnn.init.Constant(0.),
66 |                  A=lnn.init.GlorotUniform(), W=lnn.init.GlorotUniform(),
67 |                  mask_input=None, **kwargs):
68 | 
69 |         incomings = [incoming]
70 |         if mask_input is not None:
71 |             incomings.append(mask_input)
72 | 
73 |         super(CrfLayer, self).__init__(incomings, **kwargs)
74 | 
75 |         self.num_states = num_states
76 |         num_inputs = int(np.prod(self.input_shapes[0][2:]))
77 | 
78 |         self.pi = self.add_param(pi, (num_states,), name='pi')
79 |         self.tau = self.add_param(tau, (num_states,), name='tau')
80 |         self.c = self.add_param(c, (num_states,), name='c')
81 |         self.A = self.add_param(A, (num_states, num_states), name='A')
82 |         self.W = self.add_param(W, (num_inputs, num_states), name='W')
83 | 
84 |     def get_output_shape_for(self, input_shapes):
85 |         # TODO: check how this works if we want to have a 'partition' output
86 |         # The shape of the input to this layer will be the first element
87 |         # of input_shapes, whether or not a mask input is being used.
88 |         input_shape = input_shapes[0]
89 |         return input_shape[0], input_shape[1], self.num_states
90 | 
91 |     def _get_viterbi_output_for(self, sequences, num_batches):
92 | 
93 |         def vit_step(x_i, delta_p, A, W, c):
94 |             all_trans = A + tt.shape_padright(delta_p)
95 |             best_trans = tt.max(all_trans, axis=1)
96 |             best_trans_id = tt.cast(tt.argmax(all_trans, axis=1),
97 |                                     dtype=STATE_ID_DTYPE)
98 |             return c.T + x_i.dot(W) + best_trans, best_trans_id
99 | 
100 |         def vit_step_masked(x_i, mask_i, delta_p, A, W, c, masked_bck_ptrs):
101 |             all_trans = A + tt.shape_padright(delta_p)
102 |             best_trans = tt.max(all_trans, axis=1)
103 |             best_trans_id = tt.cast(tt.argmax(all_trans, axis=1),
104 |                                     dtype=STATE_ID_DTYPE)
105 |             delta_c = c.T + x_i.dot(W) + best_trans
106 | 
107 |             return (delta_c * mask_i + delta_p * (1 - mask_i),
108 |                     tt.cast(best_trans_id * mask_i +
109 |                             masked_bck_ptrs * (1 - mask_i),
110 |                             dtype=STATE_ID_DTYPE))
111 | 
112 |         # prepare initial values
113 |         delta_0 = tt.repeat(tt.shape_padleft(self.pi), num_batches, axis=0)
114 | 
115 |         # choose step function
116 |         if len(sequences) == 1:
117 |             step_fun = vit_step
118 |             non_sequences = [self.A, self.W, self.c]
119 |         else:
120 |             step_fun = vit_step_masked
121 |             # We need backtracking pointers for masked steps. They just point
122 |             # to the state itself, effectively just copying the decoded step
123 |             non_sequences = [self.A, self.W, self.c,
124 |                              tt.shape_padleft(tt.arange(0, self.num_states,
125 |                                                         dtype=STATE_ID_DTYPE))]
126 | 
127 |         # loop over the observation sequence
128 |         ([deltas, back_ptrs], _) = theano.scan(
129 |             fn=step_fun,
130 |             outputs_info=[delta_0, None],
131 |             sequences=sequences,
132 |             non_sequences=non_sequences,
133 |             strict=True)
134 | 
135 |         # don't forget tau for the last step
136 |         deltas_N = deltas[-1] + self.tau
137 | 
138 |         # noinspection PyShadowingNames
139 |         def bcktr_step(back_ptrs, next_state, num_batches):
140 |             return back_ptrs[tt.arange(num_batches), next_state]
141 | 
142 |         # y_star is the most probable state sequence
143 |         y_star, _ = theano.scan(
144 |             fn=bcktr_step,
145 |             outputs_info=tt.cast(deltas_N.argmax(axis=1),
146 |                                  dtype=STATE_ID_DTYPE),
147 |             sequences=back_ptrs[1:],  # don't report the initial state y_0
148 |             non_sequences=[num_batches],
149 |             go_backwards=True,
150 |             strict=True)
151 | 
152 |         # add y_star_N, reverse to bring path in correct order and shape
153 |         y_star = tt.concatenate([y_star[::-1],
154 |                                  tt.shape_padleft(deltas_N.argmax(axis=1))
155 |                                  ]).T
156 | 
157 |         # create one-hot encoding of state sequence. since theano's
158 |         # "to_one_hot" function only takes vectors and converts them to
159 |         # matrices, we have to reshape back and forth
160 |         y_star_oh = tt.extra_ops.to_one_hot(
161 |             y_star.flatten(),
162 |             self.num_states).reshape((num_batches, -1, self.num_states))
163 | 
164 |         return y_star_oh
165 | 
166 |     def _get_forward_output_for(self, sequences, num_batches):
167 | 
168 |         # define loop functions for theano scan, one for unmasked input,
169 |         # one for masked input
170 |         def fwd_step(x_i, alpha_p, Z_p, A, W, c):
171 |             alpha_c = tt.exp(c.T + x_i.dot(W)) * alpha_p.dot(tt.exp(A))
172 |             return (alpha_c / tt.shape_padright(alpha_c.sum(axis=1)),
173 |                     Z_p + tt.log(alpha_c.sum(axis=1)))
174 | 
175 |         def fwd_step_masked(x_i, mask_i, alpha_p, Z_p, A, W, c):
176 |             alpha_c = tt.exp(c.T + x_i.dot(W)) * alpha_p.dot(tt.exp(A))
177 |             norm = alpha_c.sum(axis=1)
178 |             alpha_c /= tt.shape_padright(norm)
179 | 
180 |             # use .squeeze() to remove the last broadcastable dimension
181 |             return (alpha_c * mask_i + alpha_p * (1 - mask_i),
182 |                     Z_p + tt.log(norm) * mask_i.squeeze())
183 | 
184 |         # prepare initial values
185 |         alpha_0 = tt.repeat(tt.shape_padleft(tt.exp(self.pi)),
186 |                             num_batches, axis=0)
187 |         Z_0 = tt.log(alpha_0.sum(axis=1))
188 |         alpha_0 /= tt.shape_padright(alpha_0.sum(axis=1))
189 | 
190 |         # loop over the observation sequence
191 |         ([alphas, log_zs], upd) = theano.scan(
192 |             fn=fwd_step if len(sequences) == 1 else fwd_step_masked,
193 |             outputs_info=[alpha_0, Z_0],
194 |             sequences=sequences,
195 |             non_sequences=[self.A, self.W, self.c],
196 |             strict=True)
197 | 
198 |         # don't forget tau for the last step, recompute the log probability
199 |         alphas_N = alphas[-1] * tt.exp(self.tau)
200 |         norm = alphas_N.sum(axis=1)
201 |         log_z = log_zs[-1] + tt.log(norm)
202 |         alphas_N /= tt.shape_padright(norm)
203 | 
204 |         # add corrected alpha_N
205 |         alphas = tt.concatenate([alphas[:-1], tt.shape_padleft(alphas_N)])
206 | 
207 |         # bring to (num_batches, seq_len, features) shape and return
208 |         alphas = alphas.dimshuffle(1, 0, 2)
209 |         return alphas, log_z
210 | 
211 |     def get_output_for(self, inputs, mode='decoding', **kwargs):
212 |         """
213 |         Compute this layer's output function given a symbolic input variable.
214 | 
215 |         Parameters
216 |         ----------
217 |         inputs : list of theano.TensorType
218 |             `inputs[0]` should always be the symbolic input variable. When
219 |             this layer has a mask input (i.e. was instantiated with
220 |             `mask_input != None`, indicating that the lengths of sequences in
221 |             each batch vary), `inputs` should have length 2, where `inputs[1]`
222 |             is the `mask`. The `mask` should be supplied as a Theano variable
223 |             denoting whether each time step in each sequence in the batch is
224 |             part of the sequence or not. `mask` should be a matrix of shape
225 |             ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
226 |             (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
227 |             of sequence i)``. Masked steps neither contribute to the
228 |             partition function nor change the filtering distribution or the
229 |             decoded path, so sequences that are padded at the end are
230 |             handled correctly.
231 | 
232 |         mode : string
233 |             Indicates the type of the output of the layer. If 'decoding',
234 |             the globally optimal state sequence for each input sequence is
235 |             returned, using one-hot encoding. If 'filtering', the filtering
236 |             distribution :math:`P(y_t | x_{1:t})` is returned. If 'partition',
237 |             the log partition function :math:`\log Z(x)` is returned.
238 | 
239 |         Returns
240 |         -------
241 |         layer_output : theano.TensorType
242 |             Symbolic output variable.
243 |         """
244 |         # Retrieve the layer input
245 |         data = inputs[0]
246 |         # Treat all dimensions after the second as flattened feature dimensions
247 |         if data.ndim > 3:
248 |             data = tt.flatten(data, 3)
249 |         # Input should be provided as (n_batch, n_time_steps, n_features)
250 |         # but scan requires the iterable dimension to be first
251 |         # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
252 |         data = data.dimshuffle(1, 0, 2)
253 |         seq_len, num_batches, _ = data.shape
254 |         sequences = [data]
255 | 
256 |         # Retrieve the mask when it is supplied
257 |         if len(inputs) > 1:
258 |             mask = inputs[1]
259 |             mask = mask.dimshuffle(1, 0, 'x')
260 |             sequences.append(mask)
261 | 
262 |         if mode == 'decoding':
263 |             return self._get_viterbi_output_for(sequences, num_batches)
264 |         elif mode == 'filtering':
265 |             return self._get_forward_output_for(sequences, num_batches)[0]
266 |         elif mode == 'partition':
267 |             return self._get_forward_output_for(sequences, num_batches)[1]
268 |         else:
269 |             raise NotImplementedError('Invalid mode "{}"'.format(mode))
270 | 
--------------------------------------------------------------------------------
/spaghetti/objectives.py:
--------------------------------------------------------------------------------
1 | """
2 | Provides objective functions for training CRF layers. Currently, only
3 | one objective is implemented:
4 | 
5 | .. autosummary::
6 |     :nosignatures:
7 | 
8 |     neg_log_likelihood
9 | """
10 | import theano
11 | import theano.tensor as tt
12 | import lasagne as lnn
13 | 
14 | 
15 | # this is taken from pylearn2 (https://github.com/lisa-lab/pylearn2)
16 | def _log_sum_exp(x=None, axis=None):
17 |     """
18 |     A numerically stable expression for
19 |     `T.log(T.exp(x).sum(axis=axis))`
20 | 
21 |     Parameters
22 |     ----------
23 |     x : theano.gof.Variable
24 |         A tensor we want to compute the log sum exp of
25 |     axis : int, optional
26 |         Axis along which to sum
27 | 
28 |     Returns
29 |     -------
30 |     log_sum_exp : theano.gof.Variable
31 |         The log sum exp of `x`
32 |     """
33 |     x_max = tt.max(x, axis=axis, keepdims=True)
34 |     y = (
35 |         tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) +
36 |         x_max
37 |     )
38 | 
39 |     if axis is None:
40 |         return y.dimshuffle(())
41 |     else:
42 |         if type(axis) is int:
43 |             axis = [axis]
44 |         return y.dimshuffle([i for i in range(y.ndim) if
45 |                              i % y.ndim not in axis])
46 | 
47 | 
48 | def neg_log_likelihood(crf, target, mask=None):
49 |     """
50 |     Computes the negative log-likelihood of the target sequences
51 |     given the inputs.
52 | 
53 |     .. math:: L = -\log P(t \mid x)
54 | 
55 |     Parameters
56 |     ----------
57 |     crf : :class:`CrfLayer` instance
58 |         CRF layer to compute the negative log-likelihood for
59 |     target : Theano 3D tensor
60 |         One-hot encoded target sequences
61 |     mask : Theano 2D tensor
62 |         Matrix indicating for each sequence which elements are used (value 1)
63 |         and which ignored (value 0). Default `None`, which means no mask
64 |         will be used and thus all elements of all sequences used (i.e. all
65 |         sequences are of the same length)
66 | 
67 |     Returns
68 |     -------
69 |     Theano scalar
70 |         An expression for the negative log-likelihood
71 |     """
72 | 
73 |     # get output and compute partition function
74 |     x = lnn.layers.get_output(crf.input_layers[0])
75 |     log_z = lnn.layers.get_output(crf, mode='partition')
76 | 
77 |     # noinspection PyPep8Naming
78 |     def seq_step(y_prev, y_cur, x_cur, lp, A, W, c):
79 |         return lp + c.dot(y_cur.T) + (y_prev.dot(A) * y_cur).sum(axis=1) + \
80 |             (x_cur.dot(W) * y_cur).sum(axis=1)
81 | 
82 |     # noinspection PyPep8Naming
83 |     def seq_step_masked(y_prev, y_cur, x_cur, mask_i, lp, A, W, c):
84 |         lp_cur = c.dot(y_cur.T) + (y_prev.dot(A) * y_cur).sum(axis=1) + \
85 |             (x_cur.dot(W) * y_cur).sum(axis=1)
86 |         return lp + lp_cur * mask_i
87 | 
88 |     # treat all dimensions after the second as flattened feature dimensions
89 |     if x.ndim > 3:
90 |         x = tt.flatten(x, 3)
91 |     # make time the first dimension
92 |     y = target.dimshuffle(1, 0, 2)
93 |     x = x.dimshuffle(1, 0, 2)
94 | 
95 |     # create sequences - since we use x[0] already
96 |     # for computing the initial value, we start from x[1]
97 |     sequences = [dict(input=y, taps=[-1, 0]), x[1:]]
98 |     if mask is not None:
99 |         # unlike the forward and viterbi computations, do not attach a
100 |         # broadcastable dimension to the mask
101 |         sequences.append(mask.dimshuffle(1, 0)[1:])
102 | 
103 |     # sum out all possibilities of y_0
104 |     # assumes that:
105 |     # - for masked values the last valid y value is repeated!
106 |     # - y_1 is never masked
107 |     # this should work in the most common case where you mask at the
108 |     # end of a sequence.
109 |     # tricky: y_1 corresponds to y[0], while y_0 is a
110 |     # non-existing 'virtual state'
111 |     init_lp = \
112 |         _log_sum_exp(crf.pi + crf.A.dot(y[0].T).T, axis=1) + \
113 |         y[0].dot(crf.c) + (x[0].dot(crf.W) * y[0]).sum(axis=1) + \
114 |         y[-1].dot(crf.tau) - log_z
115 | 
116 |     # process the sequence
117 |     seq_lp, _ = theano.scan(
118 |         fn=seq_step if mask is None else seq_step_masked,
119 |         outputs_info=init_lp,
120 |         sequences=sequences,
121 |         non_sequences=[crf.A, crf.W, crf.c])
122 | 
123 |     # negate the log-likelihood because we are minimizing
124 |     return -seq_lp[-1]
125 | 
--------------------------------------------------------------------------------