├── .gitignore
├── LICENSE
├── README.md
├── setup.py
└── spaghetti
    ├── __init__.py
    ├── layers.py
    └── objectives.py
/.gitignore:
--------------------------------------------------------------------------------
1 | scratch
2 | 
3 | # Created by https://www.gitignore.io/api/python
4 | 
5 | ### Python ###
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *,cover
51 | .hypothesis/
52 | 
53 | # Translations
54 | *.mo
55 | *.pot
56 | 
57 | # Django stuff:
58 | *.log
59 | 
60 | # Sphinx documentation
61 | docs/_build/
62 | 
63 | # PyBuilder
64 | target/
65 | 
66 | 
67 | # Created by https://www.gitignore.io/api/pycharm
68 | 
69 | ### PyCharm ###
70 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
71 | 
72 | *.iml
73 | 
74 | ## Directory-based project format:
75 | .idea/
76 | # if you remove the above rule, at least ignore the following:
77 | 
78 | # User-specific stuff:
79 | # .idea/workspace.xml
80 | # .idea/tasks.xml
81 | # .idea/dictionaries
82 | # .idea/shelf
83 | 
84 | # Sensitive or high-churn files:
85 | # .idea/dataSources.ids
86 | # .idea/dataSources.xml
87 | # .idea/sqlDataSources.xml
88 | # .idea/dynamic.xml
89 | # .idea/uiDesigner.xml
90 | 
91 | # Gradle:
92 | # .idea/gradle.xml
93 | # .idea/libraries
94 | 
95 | # Mongo Explorer plugin:
96 | # .idea/mongoSettings.xml
97 | 
98 | ## File-based project format:
99 | *.ipr
100 | *.iws
101 | 
102 | ## Plugin-specific files:
103 | 
104 | # IntelliJ
105 | /out/
106 | 
107 | # mpeltonen/sbt-idea plugin
108 | .idea_modules/
109 | 
110 | # JIRA plugin
111 | atlassian-ide-plugin.xml
112 | 
113 | # Crashlytics plugin (for Android Studio and IntelliJ)
114 | com_crashlytics_export_strings.xml
115 | crashlytics.properties
116 | crashlytics-build.properties
117 | fabric.properties
118 | 
119 | 
120 | # Created by https://www.gitignore.io/api/ipythonnotebook
121 | 
122 | ### IPythonNotebook ###
123 | # Temporary data
124 | .ipynb_checkpoints/
125 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Filip Korzeniowski
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spaghetti
2 | 
3 | Spaghetti is an implementation of **Linear-Chain Conditional Random Fields**
4 | (CRFs) as a [Lasagne](https://github.com/Lasagne/Lasagne) layer. It facilitates
5 | integrating CRFs with neural networks.
6 | 
7 | ## Installation
8 | 
9 | To install Spaghetti, follow these steps. Adapt as necessary.
10 | 
11 | 1. `git clone https://github.com/fdlm/Spaghetti.git`
12 | 2. `cd Spaghetti`
13 | 3. `python setup.py install`
14 | 
15 | ## Examples
16 | 
17 | ### Decoding with fixed parameters
18 | 
19 | ```python
20 | import numpy as np
21 | import theano
22 | import theano.tensor as T
23 | import spaghetti as spg
24 | import lasagne
25 | 
26 | # invent parameters for the CRF
27 | 
28 | eta = 1e-15  # small constant for numerical stability (avoids log(0))
29 | pi = np.log(np.array([0.6, 0.2, 0.1, 0.1], dtype=np.float32))
30 | tau = np.log(np.ones(4, dtype=np.float32))
31 | c = np.log(np.ones(4, dtype=np.float32))
32 | 
33 | A = np.log(np.array([[0.8, 0.2, 0.0, 0.0],
34 |                      [0.1, 0.6, 0.3, 0.0],
35 |                      [0.0, 0.2, 0.7, 0.1],
36 |                      [0.0, 0.0, 0.4, 0.6]]) + eta).astype(np.float32)
37 | 
38 | W = np.log(np.array([[0.7, 0.1, 0.2, 0.3],
39 |                      [0.15, 0.4, 0.7, 0.1],
40 |                      [0.15, 0.5, 0.1, 0.6]]) + eta).astype(np.float32)
41 | 
42 | # create observation sequence in one-hot encoding
43 | 
44 | def to_onehot(seq, num_states=3):
45 |     seq_oh = np.zeros(seq.shape + (num_states,), dtype=np.float32)
46 |     seq_oh[range(len(seq)), seq] = 1.
47 |     return seq_oh
48 | 
49 | x = to_onehot(np.array([0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 0, 1, 1, 1, 0, 2,
50 |                         0, 2, 0, 1, 1, 2, 0, 0, 0, 1]))[np.newaxis, ...]
51 | 
52 | # create a simple CRF model
53 | 
54 | x_var = T.ftensor3(name='x')
55 | l_in = lasagne.layers.InputLayer(name='input', shape=(None, x.shape[1], 3),
56 |                                  input_var=x_var)
57 | l_crf = spg.layers.CrfLayer(incoming=l_in, num_states=4, name='crf',
58 |                             pi=pi, tau=tau, c=c, A=A, W=W)
59 | path = lasagne.layers.get_output(l_crf, mode='decoding')
60 | decode = theano.function([x_var], path)
61 | 
62 | # decode the state sequence, convert it from one-hot to state ids
63 | print(decode(x).argmax(axis=2))
64 | ```
65 | 
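66 | ### Filtering and the partition function
67 | 
68 | Besides decoding, the same layer can output the filtering distribution
69 | `P(y_t | x_{1:t})` and the log partition function `log Z(x)` by passing a
70 | different `mode` to `lasagne.layers.get_output`. A minimal sketch, reusing
71 | `x_var`, `l_crf` and `x` from the decoding example above:
72 | 
73 | ```python
74 | # filtering distribution, shape (num_batches, seq_len, num_states)
75 | filt = lasagne.layers.get_output(l_crf, mode='filtering')
76 | filter_fn = theano.function([x_var], filt)
77 | print(filter_fn(x))
78 | 
79 | # log partition function, one value per sequence in the batch
80 | log_z = lasagne.layers.get_output(l_crf, mode='partition')
81 | partition_fn = theano.function([x_var], log_z)
82 | print(partition_fn(x))
83 | ```
84 | 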
85 | ### Training
86 | 
87 | ```python
88 | import spaghetti as spg
89 | import lasagne as lnn
90 | import numpy as np
91 | import theano.tensor as tt
92 | import theano
93 | 
94 | # one-hot encoding of sequences
95 | 
96 | def to_onehot(seq, num_states=4):
97 |     seq_oh = np.zeros(seq.shape + (num_states,), dtype=np.float32)
98 |     seq_oh[range(len(seq)), seq] = 1.
99 |     return seq_oh
100 | 
101 | x = np.stack((to_onehot(np.array([0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 0, 1, 1, 1, 0, 2, 0, 2, 0, 1, 1, 2, 0, 0, 0, 1]), 3),
102 |               to_onehot(np.array([2, 2, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 1, 2, 0, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1]), 3)))
103 | 
104 | y = np.stack((to_onehot(np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 1, 0, 0, 0, 0])),
105 |               to_onehot(np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2]))))
106 | 
107 | # create model
108 | x_var = tt.ftensor3(name='x')  # observation sequence variable
109 | y_var = tt.ftensor3(name='y')  # state sequence variable
110 | 
111 | l_in = lnn.layers.InputLayer(name='input', shape=(2, x.shape[1], 3),
112 |                              input_var=x_var)
113 | 
114 | l_crf = spg.layers.CrfLayer(incoming=l_in, num_states=4, name='crf')
115 | 
116 | # create train function
117 | objective = spg.objectives.neg_log_likelihood(l_crf, y_var)
118 | params = lnn.layers.get_all_params(l_crf, trainable=True)
119 | loss = objective.mean()
120 | updates = lnn.updates.sgd(loss, params, learning_rate=0.01)
121 | train = theano.function([y_var, x_var], loss, updates=updates)
122 | 
123 | for i in range(100):
124 |     cur_loss = train(y, x)
125 |     if i % 10 == 0:
126 |         print(cur_loss)
127 | ```
128 | 
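129 | ### Combining with recurrent layers
130 | 
131 | `CrfLayer` expects input of shape `(batch_size, sequence_length,
132 | num_features)`, the same convention Lasagne's recurrent layers follow, so it
133 | can be stacked directly on top of them. A minimal sketch; the hidden size and
134 | the use of a plain `RecurrentLayer` here are illustrative assumptions:
135 | 
136 | ```python
137 | import lasagne as lnn
138 | import theano.tensor as tt
139 | import spaghetti as spg
140 | 
141 | x_var = tt.ftensor3(name='x')
142 | l_in = lnn.layers.InputLayer(shape=(None, None, 3), input_var=x_var)
143 | # any layer with output shape (batch_size, seq_len, num_features) fits here
144 | l_rnn = lnn.layers.RecurrentLayer(l_in, num_units=16)
145 | l_crf = spg.layers.CrfLayer(incoming=l_rnn, num_states=4)
146 | path = lnn.layers.get_output(l_crf, mode='decoding')
147 | ```
148 | 
149 | When training such a stack, `lasagne.layers.get_all_params(l_crf,
150 | trainable=True)` collects the parameters of both the recurrent layer and the
151 | CRF, so they are trained jointly.
152 | 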
153 | ## TODO
154 | 
155 | - Add unit tests
156 | - Implement smoothing
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(
4 |     name='Spaghetti',
5 |     version='0.1dev',
6 |     description='A Lasagne-compatible conditional random field implementation',
7 |     classifiers=[
8 |         "Development Status :: 3 - Alpha",
9 |         "Intended Audience :: Developers",
10 |         "Intended Audience :: Science/Research",
11 |         "License :: OSI Approved :: MIT License",
12 |         "Programming Language :: Python :: 2.7",
13 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
14 |     ],
15 |     license='MIT',
16 |     author='Filip Korzeniowski',
17 |     author_email='filip.korzeniowski@jku.at',
18 |     install_requires=['numpy', 'Lasagne', 'Theano']
19 | )
20 | 
--------------------------------------------------------------------------------
/spaghetti/__init__.py:
--------------------------------------------------------------------------------
1 | from . import layers
2 | from . import objectives
3 | 
4 | __version__ = '0.1dev'
5 | 
--------------------------------------------------------------------------------
/spaghetti/layers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Layers that construct linear-chain conditional random fields (CRFs). Similar to
4 | Lasagne's recurrent layers, CRF layers expect the input shape to be
5 | ``(batch_size, sequence_length, num_inputs)``. The input is allowed to have
6 | more than three dimensions, in which case dimensions trailing the third
7 | dimension are flattened. CRF layers can be combined directly with recurrent
8 | layers and, using reshape operations, with feed-forward layers (see the
9 | Lasagne docs for recurrent layers).
10 | 
11 | .. currentmodule:: spaghetti.layers
12 | 
13 | .. autosummary::
14 |     :nosignatures:
15 | 
16 |     CrfLayer
17 | """
18 | import lasagne as lnn
19 | import theano
20 | import theano.tensor as tt
21 | import numpy as np
22 | 
23 | STATE_ID_DTYPE = 'uint16'
24 | 
25 | 
26 | class CrfLayer(lnn.layers.MergeLayer):
27 |     """
28 |     spaghetti.layers.CrfLayer(incoming, num_states, pi, tau, c, A, W,
29 |     mask_input=None, **kwargs)
30 | 
31 |     A layer which implements a linear-chain conditional random field.
32 | 
33 |     It assumes only pairwise potentials between labels and pairwise
34 |     label-input potentials. Different outputs can be computed by
35 |     passing a `mode` parameter to `get_output`.
36 | 
37 |     .. math::
38 |         P(y \mid x) = \frac{1}{Z(x)}\exp(y_0^T \pi +
39 |         \sum_{n=1}^N[y_{n-1}^T A y_n + y_n^T c + x_n^T W y_n] +
40 |         y_N^T \tau)
41 | 
42 |     Parameters
43 |     ----------
44 |     incoming : a :class:`lasagne.layers.Layer` instance or a tuple
45 |         The layer feeding into this layer, or the expected input shape
46 |     num_states : int
47 |         Number of hidden states in the CRF
48 |     pi : callable, np.ndarray or theano.shared
49 |         Initializer for the initial potential (:math:`\pi`)
50 |     tau : callable, np.ndarray or theano.shared
51 |         Initializer for the final potential (:math:`\tau`)
52 |     c : callable, np.ndarray or theano.shared
53 |         Initializer for the label bias potential (:math:`c`)
54 |     A : callable, np.ndarray or theano.shared
55 |         Initializer for the pairwise label potential (:math:`A`)
56 |     W : callable, np.ndarray or theano.shared
57 |         Initializer for the pairwise label-input potential (:math:`W`)
58 |     mask_input : :class:`lasagne.layers.Layer`
59 |         Layer which allows for a sequence mask to be input, for when sequences
60 |         are of variable length. Default `None`, which means no mask will be
61 |         supplied (i.e. all sequences are of the same length).
62 |     """
63 | 
64 |     def __init__(self, incoming, num_states, pi=lnn.init.Constant(0.),
65 |                  tau=lnn.init.Constant(0.), c=lnn.init.Constant(0.),
66 |                  A=lnn.init.GlorotUniform(), W=lnn.init.GlorotUniform(),
67 |                  mask_input=None, **kwargs):
68 | 
69 |         incomings = [incoming]
70 |         if mask_input is not None:
71 |             incomings.append(mask_input)
72 | 
73 |         super(CrfLayer, self).__init__(incomings, **kwargs)
74 | 
75 |         self.num_states = num_states
76 |         num_inputs = int(np.prod(self.input_shapes[0][2:]))
77 | 
78 |         self.pi = self.add_param(pi, (num_states,), name='pi')
79 |         self.tau = self.add_param(tau, (num_states,), name='tau')
80 |         self.c = self.add_param(c, (num_states,), name='c')
81 |         self.A = self.add_param(A, (num_states, num_states), name='A')
82 |         self.W = self.add_param(W, (num_inputs, num_states), name='W')
83 | 
84 |     def get_output_shape_for(self, input_shapes):
85 |         # TODO: check how this works if we want to have a 'partition' output
86 |         # The shape of the input to this layer will be the first element
87 |         # of input_shapes, whether or not a mask input is being used.
88 |         input_shape = input_shapes[0]
89 |         return input_shape[0], input_shape[1], self.num_states
90 | 
91 |     def _get_viterbi_output_for(self, sequences, num_batches):
92 | 
93 |         def vit_step(x_i, delta_p, A, W, c):
94 |             all_trans = A + tt.shape_padright(delta_p)
95 |             best_trans = tt.max(all_trans, axis=1)
96 |             best_trans_id = tt.cast(tt.argmax(all_trans, axis=1),
97 |                                     dtype=STATE_ID_DTYPE)
98 |             return c.T + x_i.dot(W) + best_trans, best_trans_id
99 | 
100 |         def vit_step_masked(x_i, mask_i, delta_p, A, W, c, masked_bck_ptrs):
101 |             all_trans = A + tt.shape_padright(delta_p)
102 |             best_trans = tt.max(all_trans, axis=1)
103 |             best_trans_id = tt.cast(tt.argmax(all_trans, axis=1),
104 |                                     dtype=STATE_ID_DTYPE)
105 |             delta_c = c.T + x_i.dot(W) + best_trans
106 | 
107 |             return (delta_c * mask_i + delta_p * (1 - mask_i),
108 |                     tt.cast(best_trans_id * mask_i +
109 |                             masked_bck_ptrs * (1 - mask_i),
110 |                             dtype=STATE_ID_DTYPE))
111 | 
112 |         # prepare initial values
113 |         delta_0 = tt.repeat(tt.shape_padleft(self.pi), num_batches, axis=0)
114 | 
115 |         # choose step function
116 |         if len(sequences) == 1:
117 |             step_fun = vit_step
118 |             non_sequences = [self.A, self.W, self.c]
119 |         else:
120 |             step_fun = vit_step_masked
121 |             # We need backtracking pointers for masked steps. They just point
122 |             # to the state itself, effectively just copying the decoded step
123 |             non_sequences = [self.A, self.W, self.c,
124 |                              tt.shape_padleft(tt.arange(0, self.num_states,
125 |                                                         dtype=STATE_ID_DTYPE))]
126 | 
127 |         # loop over the observation sequence
128 |         ([deltas, back_ptrs], _) = theano.scan(
129 |             fn=step_fun,
130 |             outputs_info=[delta_0, None],
131 |             sequences=sequences,
132 |             non_sequences=non_sequences,
133 |             strict=True)
134 | 
135 |         # don't forget tau for the last step
136 |         deltas_N = deltas[-1] + self.tau
137 | 
138 |         # noinspection PyShadowingNames
139 |         def bcktr_step(back_ptrs, next_state, num_batches):
140 |             return back_ptrs[tt.arange(num_batches), next_state]
141 | 
142 |         # y_star is the most probable state sequence
143 |         y_star, _ = theano.scan(
144 |             fn=bcktr_step,
145 |             outputs_info=tt.cast(deltas_N.argmax(axis=1),
146 |                                  dtype=STATE_ID_DTYPE),
147 |             sequences=back_ptrs[1:],  # don't report the initial state y_0
148 |             non_sequences=[num_batches],
149 |             go_backwards=True,
150 |             strict=True)
151 | 
152 |         # add y_star_N, reverse to bring path in correct order and shape
153 |         y_star = tt.concatenate([y_star[::-1],
154 |                                  tt.shape_padleft(deltas_N.argmax(axis=1))
155 |                                  ]).T
156 | 
157 |         # create one-hot encoding of state sequence. since theano's
158 |         # "to_one_hot" function only takes vectors and converts them to
159 |         # matrices, we have to reshape back and forth
160 |         y_star_oh = tt.extra_ops.to_one_hot(
161 |             y_star.flatten(),
162 |             self.num_states).reshape((num_batches, -1, self.num_states))
163 | 
164 |         return y_star_oh
165 | 
166 |     def _get_forward_output_for(self, sequences, num_batches):
167 | 
168 |         # define loop functions for theano scan, one for unmasked input,
169 |         # one for masked input
170 |         def fwd_step(x_i, alpha_p, Z_p, A, W, c):
171 |             alpha_c = tt.exp(c.T + x_i.dot(W)) * alpha_p.dot(tt.exp(A))
172 |             return (alpha_c / tt.shape_padright(alpha_c.sum(axis=1)),
173 |                     Z_p + tt.log(alpha_c.sum(axis=1)))
174 | 
175 |         def fwd_step_masked(x_i, mask_i, alpha_p, Z_p, A, W, c):
176 |             alpha_c = tt.exp(c.T + x_i.dot(W)) * alpha_p.dot(tt.exp(A))
177 |             norm = alpha_c.sum(axis=1)
178 |             alpha_c /= tt.shape_padright(norm)
179 | 
180 |             # use .squeeze() to remove the last broadcastable dimension
181 |             return (alpha_c * mask_i + alpha_p * (1 - mask_i),
182 |                     Z_p + tt.log(norm) * mask_i.squeeze())
183 | 
184 |         # prepare initial values
185 |         alpha_0 = tt.repeat(tt.shape_padleft(tt.exp(self.pi)),
186 |                             num_batches, axis=0)
187 |         Z_0 = tt.log(alpha_0.sum(axis=1))
188 |         alpha_0 /= tt.shape_padright(alpha_0.sum(axis=1))
189 | 
190 |         # loop over the observation sequence
191 |         ([alphas, log_zs], upd) = theano.scan(
192 |             fn=fwd_step if len(sequences) == 1 else fwd_step_masked,
193 |             outputs_info=[alpha_0, Z_0],
194 |             sequences=sequences,
195 |             non_sequences=[self.A, self.W, self.c],
196 |             strict=True)
197 | 
198 |         # don't forget tau for the last step, recompute the log probability
199 |         alphas_N = alphas[-1] * tt.exp(self.tau)
200 |         norm = alphas_N.sum(axis=1)
201 |         log_z = log_zs[-1] + tt.log(norm)
202 |         alphas_N /= tt.shape_padright(norm)
203 | 
204 |         # add corrected alpha_N
205 |         alphas = tt.concatenate([alphas[:-1], tt.shape_padleft(alphas_N)])
206 | 
207 |         # bring to (num_batches, seq_len, features) shape and return
208 |         alphas = alphas.dimshuffle(1, 0, 2)
209 |         return alphas, log_z
210 | 
211 |     def get_output_for(self, inputs, mode='decoding', **kwargs):
212 |         """
213 |         Compute this layer's output function given a symbolic input variable.
214 | 
215 |         Parameters
216 |         ----------
217 |         inputs : list of theano.TensorType
218 |             `inputs[0]` should always be the symbolic input variable. When
219 |             this layer has a mask input (i.e. was instantiated with
220 |             `mask_input != None`, indicating that the lengths of sequences in
221 |             each batch vary), `inputs` should have length 2, where `inputs[1]`
222 |             is the `mask`. The `mask` should be supplied as a Theano variable
223 |             denoting whether each time step in each sequence in the batch is
224 |             part of the sequence or not. `mask` should be a matrix of shape
225 |             ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
226 |             (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
227 |             of sequence i)``. Masked steps neither contribute to the
228 |             partition function nor change the filtering distribution or the
229 |             decoded path, so sequences that are padded at the end are
230 |             handled correctly.
231 | 
232 |         mode : string
233 |             Indicates the type of the output of the layer. If 'decoding',
234 |             the globally optimal state sequence for each input sequence is
235 |             returned, using one-hot encoding. If 'filtering', the filtering
236 |             distribution :math:`P(y_t | x_{1:t})` is returned. If 'partition',
237 |             the log partition function :math:`\log Z(x)` is returned.
238 | 
239 |         Returns
240 |         -------
241 |         layer_output : theano.TensorType
242 |             Symbolic output variable.
243 |         """
244 |         # Retrieve the layer input
245 |         data = inputs[0]
246 |         # Treat all dimensions after the second as flattened feature dimensions
247 |         if data.ndim > 3:
248 |             data = tt.flatten(data, 3)
249 |         # Input should be provided as (n_batch, n_time_steps, n_features)
250 |         # but scan requires the iterable dimension to be first
251 |         # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
252 |         data = data.dimshuffle(1, 0, 2)
253 |         seq_len, num_batches, _ = data.shape
254 |         sequences = [data]
255 | 
256 |         # Retrieve the mask when it is supplied
257 |         if len(inputs) > 1:
258 |             mask = inputs[1]
259 |             mask = mask.dimshuffle(1, 0, 'x')
260 |             sequences.append(mask)
261 | 
262 |         if mode == 'decoding':
263 |             return self._get_viterbi_output_for(sequences, num_batches)
264 |         elif mode == 'filtering':
265 |             return self._get_forward_output_for(sequences, num_batches)[0]
266 |         elif mode == 'partition':
267 |             return self._get_forward_output_for(sequences, num_batches)[1]
268 |         else:
269 |             raise NotImplementedError('Invalid mode "{}"'.format(mode))
270 | 
--------------------------------------------------------------------------------
/spaghetti/objectives.py:
--------------------------------------------------------------------------------
1 | """
2 | Provides objective functions for training CRF layers. Currently, only
3 | one objective is implemented:
4 | 
5 | .. autosummary::
6 |     :nosignatures:
7 | 
8 |     neg_log_likelihood
9 | """
10 | import theano
11 | import theano.tensor as tt
12 | import lasagne as lnn
13 | 
14 | 
15 | # this is taken from pylearn2 (https://github.com/lisa-lab/pylearn2)
16 | def _log_sum_exp(x=None, axis=None):
17 |     """
18 |     A numerically stable expression for
19 |     `T.log(T.exp(x).sum(axis=axis))`
20 | 
21 |     Parameters
22 |     ----------
23 |     x : theano.gof.Variable
24 |         A tensor we want to compute the log sum exp of
25 |     axis : int, optional
26 |         Axis along which to sum
27 | 
28 |     Returns
29 |     -------
30 |     log_sum_exp : theano.gof.Variable
31 |         The log sum exp of `x`
32 |     """
33 |     x_max = tt.max(x, axis=axis, keepdims=True)
34 |     y = (
35 |         tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) +
36 |         x_max
37 |     )
38 | 
39 |     if axis is None:
40 |         return y.dimshuffle(())
41 |     else:
42 |         if type(axis) is int:
43 |             axis = [axis]
44 |         return y.dimshuffle([i for i in range(y.ndim) if
45 |                              i % y.ndim not in axis])
46 | 
47 | 
48 | def neg_log_likelihood(crf, target, mask=None):
49 |     """
50 |     Computes the negative log-likelihood of the target sequences
51 |     given the inputs.
52 | 
53 |     .. math:: L = -\log P(t \mid x)
54 | 
55 |     Parameters
56 |     ----------
57 |     crf : :class:`CrfLayer` instance
58 |         CRF layer to compute the negative log-likelihood for
59 |     target : Theano 3D tensor
60 |         One-hot encoded target sequences
61 |     mask : Theano 2D tensor
62 |         Matrix indicating for each sequence which elements are used (value 1)
63 |         and which ignored (value 0). Default `None`, which means no mask
64 |         will be used and thus all elements of all sequences used (i.e. all
65 |         sequences are of the same length)
66 | 
67 |     Returns
68 |     -------
69 |     Theano scalar
70 |         An expression for the negative log-likelihood
71 |     """
72 | 
73 |     # get output and compute partition function
74 |     x = lnn.layers.get_output(crf.input_layers[0])
75 |     log_z = lnn.layers.get_output(crf, mode='partition')
76 | 
77 |     # noinspection PyPep8Naming
78 |     def seq_step(y_prev, y_cur, x_cur, lp, A, W, c):
79 |         return lp + c.dot(y_cur.T) + (y_prev.dot(A) * y_cur).sum(axis=1) + \
80 |             (x_cur.dot(W) * y_cur).sum(axis=1)
81 | 
82 |     # noinspection PyPep8Naming
83 |     def seq_step_masked(y_prev, y_cur, x_cur, mask_i, lp, A, W, c):
84 |         lp_cur = c.dot(y_cur.T) + (y_prev.dot(A) * y_cur).sum(axis=1) + \
85 |             (x_cur.dot(W) * y_cur).sum(axis=1)
86 |         return lp + lp_cur * mask_i
87 | 
88 |     # treat all dimensions after the second as flattened feature dimensions
89 |     if x.ndim > 3:
90 |         x = tt.flatten(x, 3)
91 |     # make time the first dimension
92 |     y = target.dimshuffle(1, 0, 2)
93 |     x = x.dimshuffle(1, 0, 2)
94 | 
95 |     # create sequences - since we use x[0] already
96 |     # for computing the initial value, we start from x[1]
97 |     sequences = [dict(input=y, taps=[-1, 0]), x[1:]]
98 |     if mask is not None:
99 |         # unlike the forward and viterbi computations, do not attach a
100 |         # broadcastable dimension to the mask
101 |         sequences.append(mask.dimshuffle(1, 0)[1:])
102 | 
103 |     # sum out all possibilities of y_0
104 |     # assumes that:
105 |     # - for masked values the last valid y value is repeated!
106 |     # - y_1 is never masked
107 |     # this should work in the most common case where you mask at the
108 |     # end of a sequence.
109 |     # tricky: y_1 corresponds to y[0], while y_0 is a
110 |     # non-existing 'virtual state'
111 |     init_lp = \
112 |         _log_sum_exp(crf.pi + crf.A.dot(y[0].T).T, axis=1) + \
113 |         y[0].dot(crf.c) + (x[0].dot(crf.W) * y[0]).sum(axis=1) + \
114 |         y[-1].dot(crf.tau) - log_z
115 | 
116 |     # process the sequence
117 |     seq_lp, _ = theano.scan(
118 |         fn=seq_step if mask is None else seq_step_masked,
119 |         outputs_info=init_lp,
120 |         sequences=sequences,
121 |         non_sequences=[crf.A, crf.W, crf.c])
122 | 
123 |     # negate the log-likelihood because we are minimizing
124 |     return -seq_lp[-1]
125 | 
--------------------------------------------------------------------------------