├── pyhacrf
│   ├── tests
│   │   ├── __init__.py
│   │   ├── profile.py
│   │   ├── test_features.py
│   │   └── test_model.py
│   ├── __init__.py
│   ├── state_machine.py
│   ├── feature_extraction.py
│   ├── algorithms.pyx
│   └── pyhacrf.py
├── requirements-dev.txt
├── setup.cfg
├── MANIFEST.in
├── .gitignore
├── LICENSE
├── setup.py
├── README.rst
└── examples
    └── Highered dataset.ipynb

/pyhacrf/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
cython>=0.22
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.rst
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include README.rst
--------------------------------------------------------------------------------
/pyhacrf/__init__.py:
--------------------------------------------------------------------------------
""" Implements a Hidden Alignment Conditional Random Field (HACRF). """

from .pyhacrf import Hacrf
from .feature_extraction import StringPairFeatureExtractor, PairFeatureExtractor

__all__ = ['Hacrf', 'StringPairFeatureExtractor', 'PairFeatureExtractor']
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
--------------------------------------------------------------------------------
/pyhacrf/tests/profile.py:
--------------------------------------------------------------------------------
""" A slow test for profiling """

from numpy.testing import assert_array_almost_equal
import numpy as np
from numpy import random
from pyhacrf import Hacrf
from pyhacrf.pyhacrf import _Model
from pyhacrf.state_machine import DefaultStateMachine


def test_derivate_large():
    classes = ['a', 'b', 'c']
    y = 'b'
    x = random.randn(20, 3, 10) * 5 + 3
    state_machine = DefaultStateMachine(classes)
    parameters = Hacrf._initialize_parameters(state_machine, x.shape[2])
    parameters = random.randn(*parameters.shape) * 10 - 2

    test_model = _Model(state_machine, x, y)
    expected_dll = np.zeros(parameters.shape)

    # Finite difference gradient approximation
    delta = 10.0**-7
    S, D = expected_dll.shape
    for s in range(S):
        for d in range(D):
            dg = np.zeros(parameters.shape)
            dg[s, d] = delta
            y0, _ = test_model.forward_backward(parameters)
            y1, _ = test_model.forward_backward(parameters + dg)
            expected_dll[s, d] = (y1 - y0) / delta

    actual_ll, actual_dll = test_model.forward_backward(parameters)

    print((abs(actual_dll) - abs(expected_dll)).sum())
    assert_array_almost_equal(actual_dll, expected_dll, decimal=4)

if __name__ == '__main__':
    test_derivate_large()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015, Dirko Coetsee
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of pyhacrf nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, Extension
from codecs import open
from os import path


# from Michael Hoffman's http://www.ebi.ac.uk/~hoffman/software/sunflower/
class NumpyExtension(Extension):

    def __init__(self, *args, **kwargs):
        from numpy import get_include
        from numpy.distutils.misc_util import get_info
        kwargs.update(get_info('npymath'))
        kwargs['include_dirs'] += [get_include()]

        Extension.__init__(self, *args, **kwargs)


here = path.abspath(path.dirname(__file__))

# Get the long description from the relevant file
with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()


setup(
    name='pyhacrf',
    version='0.1.2',
    packages=['pyhacrf'],
    install_requires=['numpy>=1.9', 'PyLBFGS>=0.1.3'],
    ext_modules=[NumpyExtension('pyhacrf.algorithms',
                                ['pyhacrf/algorithms.c'])],
    url='https://github.com/dirko/pyhacrf',
    download_url='https://github.com/dirko/pyhacrf/tarball/0.1.2',
    license='BSD',
    author='Dirko Coetsee',
    author_email='dpcoetsee@gmail.com',
    description='Hidden alignment conditional random field, a discriminative string edit distance',
    long_description=long_description,
    classifiers=[
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Scientific/Engineering',
    ],
)
--------------------------------------------------------------------------------
/pyhacrf/tests/test_features.py:
--------------------------------------------------------------------------------
""" Tests for the feature extraction. """

import unittest

from numpy.testing import assert_array_almost_equal
import numpy as np
from pyhacrf import StringPairFeatureExtractor


class TestStringPairFeatureExtractor(unittest.TestCase):
    def test_transform_binary(self):
        s1 = "kat1"
        s2 = "cat2"
        # 1 . . . n
        # t . . m .
        # a . m . .
        # k . . . .
        #   c a t 2
        expected_x = np.zeros((4, 4, 4))
        expected_x[:, :, 0] = 2.0
        expected_x[:, 0, 1] = 1.0
        expected_x[0, :, 1] = 1.0
        expected_x[1, 1, 2] = 1.0
        expected_x[2, 2, 2] = 1.0
        expected_x[3, 3, 3] = 1.0

        test_extractor = StringPairFeatureExtractor(bias=2.0, start=True, match=True, numeric=True)
        actual_X = test_extractor.fit_transform([(s1, s2)])

        assert_array_almost_equal(expected_x, actual_X[0])

    def test_transform_transition(self):
        s1 = "ba"
        s2 = "ca"
        # a . .
        # b . .
        #   c a
        chars = StringPairFeatureExtractor.CHARACTERS
        nchars = len(chars)
        print(nchars)
        expected_x = np.zeros((2, 2, len(chars)**2 + 1))
        expected_x[:, :, 0] = 1.0
        expected_x[0, 0, 2 + nchars * 1 + 1] = 1.0  # b->c
        expected_x[0, 1, 0 + nchars * 1 + 1] = 1.0  # b->a
        expected_x[1, 0, 2 + nchars * 0 + 1] = 1.0  # a->c
        expected_x[1, 1, 0 + nchars * 0 + 1] = 1.0  # a->a

        test_extractor = StringPairFeatureExtractor(transition=True)
        actual_X = test_extractor.fit_transform([(s1, s2)])

        assert_array_almost_equal(expected_x, actual_X[0])

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
pyhacrf
=======

Hidden alignment conditional random field for classifying string pairs -
a learnable edit distance.

This package aims to implement the HACRF machine learning model with a
``sklearn``-like interface. It includes ways to fit a model to training
examples and to score new examples.

The model takes string pairs as input and classifies them into any number
of classes. In McCallum's original paper the model was applied to the
database deduplication problem. Each database entry was paired with
every other entry and the model then classified whether the pair was a
'match' or a 'mismatch' based on training examples of matches and
mismatches.

I also tried to use it as a learnable string edit distance for normalizing
noisy text. See *A Conditional Random Field for Discriminatively-trained
Finite-state String Edit Distance* by McCallum, Bellare, and Pereira,
and the report *Conditional Random Fields for Noisy text normalisation*
by Dirko Coetsee.

Example
-------

.. code:: python

    from pyhacrf import StringPairFeatureExtractor, Hacrf

    training_X = [('helloooo', 'hello'),  # Matching examples
                  ('h0me', 'home'),
                  ('krazii', 'crazy'),
                  ('non matching string example', 'no really'),  # Non-matching examples
                  ('and another one', 'yep')]
    training_y = ['match',
                  'match',
                  'match',
                  'non-match',
                  'non-match']

    # Extract features
    feature_extractor = StringPairFeatureExtractor(match=True, numeric=True)
    training_X_extracted = feature_extractor.fit_transform(training_X)

    # Train model
    model = Hacrf(l2_regularization=1.0)
    model.fit(training_X_extracted, training_y)

    # Evaluate
    from sklearn.metrics import confusion_matrix
    predictions = model.predict(training_X_extracted)

    print(confusion_matrix(training_y, predictions))
    > [[0 3]
    >  [2 0]]

    print(model.predict_proba(training_X_extracted))
    > [[ 0.94914812  0.05085188]
    >  [ 0.92397711  0.07602289]
    >  [ 0.86756034  0.13243966]
    >  [ 0.05438812  0.94561188]
    >  [ 0.02641275  0.97358725]]
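
By default ``Hacrf`` builds a simple state machine with one state per
class, but a different machine can be supplied through its
``state_machine`` argument. The following is a minimal, illustrative
sketch (it mirrors the machines the unit tests construct, and the class
labels are only placeholders):

.. code:: python

    from pyhacrf import Hacrf
    from pyhacrf.state_machine import GeneralStateMachine

    # One state per class. Every transition is
    # (from_state, to_state, (di, dj)): each state here allows
    # match/substitution (1, 1), insertion (0, 1), and deletion (1, 0)
    # moves, which is exactly the shape of the built-in
    # DefaultStateMachine.
    state_machine = GeneralStateMachine(
        start_states=[0, 1],
        transitions=[(0, 0, (1, 1)), (1, 1, (1, 1)),
                     (0, 0, (0, 1)), (1, 1, (0, 1)),
                     (0, 0, (1, 0)), (1, 1, (1, 0))],
        states_to_classes={0: 'match', 1: 'non-match'})

    model = Hacrf(state_machine=state_machine)

After this, ``fit`` and ``predict`` are used exactly as above.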

Dependencies
------------

This package depends on ``numpy``. The LBFGS optimizer in ``pylbfgs`` is
used, but alternative optimizers can be passed.
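
For example (a sketch, assuming ``scipy`` is available; any
``fmin(obj, x0, **kwargs)`` whose first return value is the minimizing
parameter vector will work, and the ``maxfun`` value below is only an
illustration):

.. code:: python

    from scipy.optimize import fmin_l_bfgs_b

    # fmin_l_bfgs_b(func, x0, ...) returns (x, f, d), so its first
    # return value is the minimizing parameter vector, as required.
    model = Hacrf(l2_regularization=1.0,
                  optimizer=fmin_l_bfgs_b,
                  optimizer_kwargs={'maxfun': 45})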

Install
-------

Install by running:

::

    python setup.py install

or from pypi:

::

    pip install pyhacrf

Developing
----------
Clone from repository, then

::

    pip install -r requirements-dev.txt
    cython pyhacrf/*.pyx
    python setup.py install

To deploy to pypi, make sure you have compiled the \*.pyx files to \*.c.
--------------------------------------------------------------------------------
/pyhacrf/state_machine.py:
--------------------------------------------------------------------------------
import numpy as np
from collections import defaultdict, deque


class GeneralStateMachine(object):
    """ State machine which, together with two input sequences, is used to build the lattice.

    Each state and each transition is labelled by a different integer.

    Parameters
    ----------
    start_states : list of ints
        The states that the state machine can start in.

    transitions : list of tuples
        The start state, end state, and number of positions to move in each sequence. For example,
        [(0, 0, (0, 1)),  # insertion into the first sequence, while going from state 0 to state 0.
         (1, 0, (1, 0)),  # deletion from the first sequence, while moving from state 1 to state 0.
         (2, 1, (1, 1)),  # match/substitution - move from state 2 to state 1.
         ...
        ]

    states_to_classes : dictionary
        Dictionary where each state is mapped to a class.
    """

    def __init__(self, start_states, transitions, states_to_classes):
        self._start_states = start_states
        self._transitions = transitions

        max_state = max(max(s for s, _, _ in transitions), max(s for _, s, _ in transitions)) + 1
        self.n_states = max_state
        self.n_transitions = len(transitions)
        self.states_to_classes = states_to_classes

    def build_lattice(self, x):
        """ Construct the list of nodes and edges for input features. """
        I, J, _ = x.shape
        start_states, transitions = self._start_states, self._transitions

        lattice = []
        transitions_d = defaultdict(list)
        for transition_index, (s0, s1, delta) in enumerate(transitions):
            transitions_d[s0].append((s1, delta, transition_index))
        # Add start states
        unvisited_nodes = deque([(0, 0, s) for s in start_states])
        visited_nodes = set()
        n_states = self.n_states

        while unvisited_nodes:
            node = unvisited_nodes.popleft()
            lattice.append(node)
            i, j, s0 = node
            for s1, delta, transition_index in transitions_d[s0]:
                try:
                    di, dj = delta
                except TypeError:
                    di, dj = delta(i, j, x)

                if i + di < I and j + dj < J:
                    edge = (i, j, s0, i + di, j + dj, s1, transition_index + n_states)
                    lattice.append(edge)
                    dest_node = (i + di, j + dj, s1)
                    if dest_node not in visited_nodes:
                        unvisited_nodes.append(dest_node)
                        visited_nodes.add(dest_node)

        lattice.sort()

        # Step backwards through the lattice and add visitable nodes to the set of
        # nodes to keep. The rest are discarded.
        final_lattice = []
        visited_nodes = set((I - 1, J - 1, s) for s in range(n_states))

        for node in lattice[::-1]:
            if node in visited_nodes:
                final_lattice.append(node)
            elif len(node) > 3:
                source_node, dest_node = node[0:3], node[3:6]
                if dest_node in visited_nodes:
                    visited_nodes.add(source_node)
                    final_lattice.append(node)

        reversed_list = list(reversed(final_lattice))

        # Squash list
        lattice = [edge for edge in reversed_list if len(edge) > 3]
        return np.array(lattice, dtype='int64')


class DefaultStateMachine(object):
    """ State machine which, together with two input sequences, is used to build the lattice.

    Simple and fast state machine with a single state for each class.
    Allows for character match/substitution, deletion, and insertion.

    Parameters
    ----------
    classes : list
        The set of labels.
    """
    BASE_LENGTH = 60

    def __init__(self, classes):
        n_classes = len(classes)
        deltas = ((1, 1),  # Match
                  (0, 1),  # Insertion
                  (1, 0))  # Deletion
        self._start_states = [i for i in range(n_classes)]
        self._transitions = [(i, i, delta)
                             for delta in deltas
                             for i in range(n_classes)]
        self._base_shape = (self.BASE_LENGTH, self.BASE_LENGTH)
        self.states_to_classes = {i: c for i, c in enumerate(classes)}
        self.n_transitions = len(self._transitions)
        self.n_states = len(classes)
        self._base_lattice = self._independent_lattice(self._base_shape)

        self._lattice_limits = self._lattice_ends()

    def _subset_independent_lattice(self, shape):
        I, J = shape

        if I < self.BASE_LENGTH and J < self.BASE_LENGTH:
            lattice = self._base_lattice.take(
                self._lattice_limits[I, J],
                axis=0)

        elif I < self.BASE_LENGTH:
            lattice = self._base_lattice.take(
                self._lattice_limits[I, None],
                axis=0)
            lattice = self._independent_lattice((I, J), lattice)
        elif J < self.BASE_LENGTH:
            lattice = self._base_lattice.take(
                self._lattice_limits[None, J],
                axis=0)
            lattice = self._independent_lattice((I, J), lattice)
        else:
            lattice = self._independent_lattice((I, J), self._base_lattice)

        return lattice

    def _independent_lattice(self, shape, lattice=None):
        """ Helper to construct the list of nodes and edges. """
        I, J = shape

        if lattice is not None:
            end_I = min(I, max(lattice[..., 3])) - 1
            end_J = min(J, max(lattice[..., 4])) - 1
            unvisited_nodes = deque([(i, j, s)
                                     for i in range(end_I)
                                     for j in range(end_J)
                                     for s in self._start_states])
            lattice = lattice.tolist()
        else:
            lattice = []
            unvisited_nodes = deque([(0, 0, s) for s in self._start_states])
        lattice += _grow_independent_lattice(self._transitions,
                                             self.n_states, (I, J),
                                             unvisited_nodes)
        lattice = np.array(sorted(lattice), dtype='int64')
        return lattice

    def build_lattice(self, x):
        """ Construct the list of nodes and edges for input features. """
""" 166 | I, J, _ = x.shape 167 | lattice = self._subset_independent_lattice((I, J)) 168 | return lattice 169 | 170 | def _lattice_ends(self) : 171 | 172 | lattice_limits = {} 173 | 174 | lengths = np.arange(self.BASE_LENGTH) 175 | lengths.reshape(1, -1) 176 | 177 | I = self._base_lattice[..., 3:4] < lengths 178 | for i in range(self.BASE_LENGTH) : 179 | lattice_limits[i, None] = I[..., i].nonzero()[0] 180 | 181 | J = self._base_lattice[..., 4:5] < lengths 182 | for j in range(self.BASE_LENGTH) : 183 | lattice_limits[None, j] = J[..., j].nonzero()[0] 184 | 185 | IJ = np.expand_dims(I, axis=0).T & J 186 | 187 | for i in range(self.BASE_LENGTH) : 188 | for j in range(self.BASE_LENGTH) : 189 | lattice_limits[i,j] = IJ[i, ..., j].nonzero()[0] 190 | 191 | return lattice_limits 192 | 193 | 194 | 195 | def _grow_independent_lattice(transitions, n_states, shape, unvisited_nodes): 196 | I, J = shape 197 | visited_nodes = set() 198 | lattice = [] 199 | 200 | transitions_d = defaultdict(list) 201 | for transition_index, (s0, s1, delta) in enumerate(transitions): 202 | if not callable(delta): 203 | di, dj = delta 204 | transitions_d[s0].append((s1, di, dj, 205 | transition_index + n_states)) 206 | 207 | while unvisited_nodes: 208 | i, j, s0 = unvisited_nodes.popleft() 209 | for s1, di, dj, edge_parameter_index in transitions_d[s0]: 210 | if i + di < I and j + dj < J: 211 | dest_node = (i + di, j + dj, s1) 212 | edge = (i, j, s0) + dest_node + (edge_parameter_index,) 213 | lattice.append(list(edge)) 214 | if dest_node not in visited_nodes: 215 | unvisited_nodes.append(dest_node) 216 | visited_nodes.add(dest_node) 217 | 218 | return lattice 219 | 220 | -------------------------------------------------------------------------------- /pyhacrf/feature_extraction.py: -------------------------------------------------------------------------------- 1 | # Authors: Dirko Coetsee 2 | # License: 3-clause BSD 3 | 4 | """ Implements feature extraction methods to use with HACRF models. """ 5 | 6 | import numpy as np 7 | import functools 8 | import itertools 9 | 10 | class PairFeatureExtractor(object): 11 | """Extract features from sequence pairs. 12 | 13 | For each feature, a grid is constructed for a sequency pair. The 14 | features are stacked, producing a 3 dimensional matrix of 15 | dimensions: 16 | 17 | (length of sequence 1) X (length of sequence 2) X (number of features) 18 | 19 | For example, a 'beginning' character feature grid for the sequences, 20 | 'kaas' and 'cheese' could look like this. 21 | 22 | c h e e s e 23 | k 1 1 1 1 1 1 24 | a 1 0 0 0 0 0 25 | a 1 0 0 0 0 0 26 | s 1 0 0 0 0 0 27 | 28 | These grids are made from two different types of feature 29 | functions: real and sparse. 30 | 31 | Real features are functions of the form: 32 | 33 | def some_feature_function(array1, array2): 34 | ... 35 | return feature_grid 36 | 37 | Given two sequences, s1 and s1, return a numpy.array with dimensions 38 | (length of array1) X (length of array2). 39 | 40 | For performance reasons, we take advantage of numpy broadcasting, and 41 | array1 is a column array and array2 is a row array. 

    For a 'matching character' feature between 'kaas' and 'cheese', the
    sequences are transformed and then we use broadcasting:

        > array1 = numpy.array([['k'],
                                ['a'],
                                ['a'],
                                ['s']])
        > array2 = numpy.array([['c', 'h', 'e', 'e', 's', 'e']])
        > array1 == array2
        numpy.array([[0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 1, 0]])

    When writing your own real feature functions, you can assume that
    the arrays will come in with the right shape.

    Sparse feature functions look similar:

        def some_feature_function(i, j, s1, s2):
            ...
            return some_index

    but they are registered together with the total length of the
    feature vector, as (function, total_vector_length) pairs. The
    function returns the index of the element that should be 1. So,
    for example, if the function returns 4 and the registered length
    is 5, the feature vector [0, 0, 0, 0, 1] is constructed.


    Parameters
    ----------
    real: list: optional (default=[])
        List of functions of the form
            def some_feature_function(array1, array2):
                ...
                return feature_grid

    sparse: list: optional (default=[])
        List of (some_feature_function, total_vector_length) pairs, where
        each function has the form
            def some_feature_function(i, j, s1, s2):
                ...
                return some_index
    """

    def __init__(self, real=None, sparse=None):
        self._binary_features = []
        if real:
            self._binary_features = real
        self._sparse_features = []
        if sparse:
            self._sparse_features = sparse

    def fit_transform(self, raw_X, y=None):
        """Like transform. Transform sequence pairs to feature arrays that can be used as input to `Hacrf` models.

        Parameters
        ----------
        raw_X : List of (sequence1_n, sequence2_n) pairs, one for each training example n.
        y : (ignored)

        Returns
        -------
        X : List of numpy ndarrays, each with shape = (I_n, J_n, K), where I_n is the length of sequence1_n, J_n is the
            length of sequence2_n, and K is the number of features.
            Feature matrix list, for use with estimators or further transformers.
        """
        return self.transform(raw_X)

    def transform(self, raw_X, y=None):
        """Transform sequence pairs to feature arrays that can be used as input to `Hacrf` models.

        Parameters
        ----------
        raw_X : List of (sequence1_n, sequence2_n) pairs, one for each training example n.
        y : (ignored)

        Returns
        -------
        X : List of numpy ndarrays, each with shape = (I_n, J_n, K), where I_n is the length of sequence1_n, J_n is the
            length of sequence2_n, and K is the number of features.
            Feature matrix list, for use with estimators or further transformers.
        """
        return [self._extract_features(sequence1, sequence2) for sequence1, sequence2 in raw_X]

    def _extract_features(self, sequence1, sequence2):
        """ Helper to extract features for one data point. """
""" 130 | 131 | array1 = np.array(tuple(sequence1), ndmin=2).T 132 | array2 = np.array(tuple(sequence2), ndmin=2) 133 | 134 | K = (len(self._binary_features) 135 | + sum(num_feats for _, num_feats in self._sparse_features)) 136 | 137 | feature_array = np.zeros((array1.size, array2.size, K), dtype='float64') 138 | 139 | for k, feature_function in enumerate(self._binary_features): 140 | feature_array[..., k] = feature_function(array1, array2) 141 | 142 | if self._sparse_features: 143 | n_binary_features = len(self._binary_features) 144 | 145 | for i, j in np.ndindex(len(sequence1), len(sequence2)): 146 | k = n_binary_features 147 | 148 | for feature_function, num_features in self._sparse_features: 149 | 150 | feature_array[i, j, k + feature_function(i, j, sequence1, sequence2)] = 1.0 151 | k += num_features 152 | 153 | return feature_array 154 | 155 | 156 | class StringPairFeatureExtractor(PairFeatureExtractor): 157 | """ Extract features from sequence pairs. 158 | 159 | A grid is constructed for each sequence pair, for example for ("kaas", "cheese"): 160 | 161 | s * . . . @ . 162 | a * . . . . . 163 | a * . . . . . 164 | k * * * * * * 165 | c h e e s e 166 | 167 | For each element in the grid, a feature vector is constructed. The elements in the feature 168 | vector are determined by which features are active at that position in the grid. So for the 169 | example above, the 'match' feature will be 0 in every vector in every position except the 170 | position indicated with '@', where it will be 1. The 'start' feature will be 1 in all the 171 | positions with '*' and 0 everywhere else. 172 | 173 | 174 | Parameters 175 | ---------- 176 | bias: float: optional (default=1.0) 177 | A bias term that is always added to every position in the lattice. 178 | 179 | start: boolean: optional 180 | Binary feature that activates at the start of either sequence. 181 | 182 | end: boolean: optional 183 | Binary feature that activates at the end of either sequence. 184 | 185 | match: boolean: optional 186 | Binary feature that activates when elements at a position are equal. 187 | 188 | numeric: boolean, optional 189 | Binary feature that activates when all elements at a position are numerical. 190 | 191 | transition: boolean, optional 192 | Adds binary features for pairs of (lower case) input characters. 193 | """ 194 | 195 | # Constants 196 | CHARACTERS = 'abcdefghijklmnopqrstuvwxyz0123456789,./;\'\-=<>?:"|_+!@#$%^&*() ' 197 | 198 | 199 | 200 | def __init__(self, bias=1.0, start=False, end=False, match=False, numeric=False, transition=False): 201 | # TODO: For longer strings, tokenize and use Levenshtein 202 | # distance up until a lattice position. Other (possibly) 203 | # useful features might be whether characters are consonant or 204 | # vowel, punctuation, case. 
        binary_features_active = [True, start, end, match, numeric]
        binary_features = [functools.partial(biases, bias=bias),
                           starts,
                           ends,
                           matches,
                           digits]

        self._binary_features = [feature
                                 for feature, active
                                 in zip(binary_features,
                                        binary_features_active)
                                 if active]
        self._sparse_features = []
        if transition:
            characters_to_index = {character: index for index, character in enumerate(self.CHARACTERS)}
            curried_charIndex = functools.partial(charIndex,
                                                  char2index=characters_to_index)
            self._sparse_features.append((curried_charIndex,
                                          len(characters_to_index) ** 2))


def charIndex(i, j, s1, s2, char2index=None):
    char_i, char_j = s1[i].lower(), s2[j].lower()
    index = char2index[char_j] + char2index[char_i] * len(char2index)
    return index


def biases(s1, s2, bias=1.0):
    return np.full((s1.size, s2.size), bias)


def starts(s1, s2):
    M = np.zeros((s1.size, s2.size))
    M[0, ...] = 1
    M[..., 0] = 1
    return M


def ends(s1, s2):
    M = np.zeros((s1.size, s2.size))
    M[(s1.size - 1), ...] = 1
    M[..., (s2.size - 1)] = 1
    return M


def matches(s1, s2):
    return (s1 == s2)


def digits(s1, s2):
    return np.char.isdigit(s1) & np.char.isdigit(s2)
--------------------------------------------------------------------------------
/pyhacrf/algorithms.pyx:
--------------------------------------------------------------------------------
#cython: boundscheck=False, wraparound=False, initializedcheck=False

import numpy as np
cimport numpy as np
from numpy import ndarray
from numpy cimport ndarray
from numpy.math cimport logaddexp, INFINITY as inf

cdef extern from "math.h" nogil:
    np.float64_t exp(np.float64_t x)


cpdef dict forward(np.ndarray[np.int64_t, ndim=2] lattice, np.ndarray[np.float64_t, ndim=3] x_dot_parameters, long S):
    """ Helper to calculate the forward weights. """
""" 13 | cdef dict alpha = {} 14 | 15 | cdef unsigned int r 16 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 17 | cdef unsigned int I, J, s 18 | 19 | cdef unsigned int old_i0, old_j0, old_s0 20 | cdef np.float64_t edge_potential 21 | 22 | old_i0, old_j0, old_s0 = -1, -1, -1 23 | 24 | for r in range(lattice.shape[0]): 25 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2] 26 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 27 | edge_parameter_index = lattice[r, 6] 28 | 29 | if i0 != old_i0 or j0 != old_j0 or s0 != old_s0: 30 | if i0 == 0 and j0 == 0: 31 | alpha[(i0, j0, s0)] = x_dot_parameters[i0, j0, s0] 32 | else: 33 | alpha[(i0, j0, s0)] += x_dot_parameters[i0, j0, s0] 34 | 35 | old_i0, old_j0, old_s0 = i0, j0, s0 36 | 37 | edge_potential = (x_dot_parameters[i1, j1, edge_parameter_index] 38 | + alpha[(i0, j0, s0)]) 39 | alpha[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] = edge_potential 40 | alpha[(i1, j1, s1)] = logaddexp( alpha.get((i1, j1, s1), -inf), 41 | edge_potential) 42 | 43 | I = x_dot_parameters.shape[0] - 1 44 | J = x_dot_parameters.shape[1] - 1 45 | 46 | for s in range(S): 47 | if I == J == 0: 48 | alpha[(I, J, s)] = x_dot_parameters[I, J, s] 49 | else: 50 | alpha[(I, J, s)] = alpha.get((I, J, s), -inf) + x_dot_parameters[I, J, s] 51 | 52 | return alpha 53 | 54 | cpdef np.float64_t[:, :, ::1] forward_predict(np.int64_t[:, ::1] lattice, 55 | np.float64_t[:, :, ::1] x_dot_parameters, 56 | long S) : 57 | """ Helper to calculate the forward weights for prediction. """ 58 | 59 | cdef np.float64_t[:, :, ::1] alpha = x_dot_parameters.copy() 60 | alpha[:] = -inf 61 | 62 | cdef unsigned int r 63 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 64 | 65 | cdef int old_s0 = -1 66 | 67 | cdef np.float64_t edge_potential, source_node_potential 68 | 69 | for r in range(lattice.shape[0]): 70 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2] 71 | 72 | if s0 != old_s0 : 73 | if i0 == 0 and j0 == 0: 74 | source_node_potential = x_dot_parameters[i0, j0, s0] 75 | else: 76 | source_node_potential = (alpha[i0,j0,s0] 77 | + x_dot_parameters[i0,j0,s0]) 78 | old_s0 = s0 79 | 80 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 81 | edge_parameter_index = lattice[r, 6] 82 | 83 | edge_potential = (x_dot_parameters[i1, j1, edge_parameter_index] 84 | + source_node_potential) 85 | 86 | alpha[i1, j1, s1] = logaddexp(alpha[i1, j1, s1], edge_potential) 87 | 88 | cdef int I = alpha.shape[0] - 1 89 | cdef int J = alpha.shape[1] - 1 90 | 91 | for s in range(S): 92 | if I == J == 0 : 93 | alpha[I, J, s] = x_dot_parameters[I, J, s] 94 | else: 95 | alpha[I, J, s] += x_dot_parameters[I, J, s] 96 | 97 | return alpha 98 | 99 | 100 | cpdef np.float64_t[:, :, ::1] forward_max_predict(np.int64_t[:, ::1] lattice, 101 | np.float64_t[:, :, ::1] x_dot_parameters, 102 | long S) : 103 | """ Helper to calculate the forward max-sum weights for prediction. 
""" 104 | 105 | cdef np.float64_t[:, :, ::1] alpha = x_dot_parameters.copy() 106 | alpha[:] = -inf 107 | 108 | cdef unsigned int r 109 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 110 | 111 | cdef int old_s0 = -1 112 | 113 | cdef np.float64_t edge_potential, source_node_potential 114 | 115 | for r in range(lattice.shape[0]): 116 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2] 117 | 118 | if s0 != old_s0 : 119 | if i0 == 0 and j0 == 0: 120 | source_node_potential = x_dot_parameters[i0, j0, s0] 121 | else: 122 | source_node_potential = (alpha[i0,j0,s0] 123 | + x_dot_parameters[i0,j0,s0]) 124 | old_s0 = s0 125 | 126 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 127 | edge_parameter_index = lattice[r, 6] 128 | 129 | edge_potential = (x_dot_parameters[i1, j1, edge_parameter_index] 130 | + source_node_potential) 131 | 132 | alpha[i1, j1, s1] = max(alpha[i1, j1, s1], edge_potential) 133 | 134 | cdef int I = alpha.shape[0] - 1 135 | cdef int J = alpha.shape[1] - 1 136 | 137 | for s in range(S): 138 | if I == J == 0 : 139 | alpha[I, J, s] = x_dot_parameters[I, J, s] 140 | else: 141 | alpha[I, J, s] += x_dot_parameters[I, J, s] 142 | 143 | return alpha 144 | 145 | 146 | cpdef dict backward(ndarray[np.int64_t, ndim=2] lattice, 147 | ndarray[np.float64_t, ndim=3] x_dot_parameters, 148 | long I, long J, long S): 149 | """ Helper to calculate the backward weights. """ 150 | cdef dict beta = {} 151 | 152 | cdef unsigned int r 153 | cdef unsigned int s 154 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 155 | 156 | cdef np.float64_t edge_potential 157 | 158 | for s in range(S): 159 | beta[(I-1, J-1, s)] = 0.0 160 | 161 | for r in range((lattice.shape[0] - 1), -1, -1): 162 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2], 163 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 164 | edge_parameter_index = lattice[r, 6] 165 | 166 | edge_potential = beta[(i1, j1, s1)] + x_dot_parameters[i1, j1, s1] 167 | beta[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] = edge_potential 168 | beta[(i0, j0, s0)] = logaddexp( beta.get((i0, j0, s0), -inf), 169 | (edge_potential 170 | + x_dot_parameters[i1, 171 | j1, 172 | edge_parameter_index])) 173 | return beta 174 | 175 | 176 | def gradient(dict alpha, 177 | dict beta, 178 | ndarray[np.float64_t, ndim=2] parameters, 179 | ndarray[np.int64_t] states_to_classes, 180 | ndarray[np.float64_t, ndim=3] x, 181 | long y, 182 | long I, long J, long K): 183 | """ Helper to calculate the marginals and from that the gradient given the forward and backward weights. 
""" 184 | cdef unsigned int n_classes = max(states_to_classes) + 1 185 | cdef ndarray[np.float64_t] class_Z = np.zeros((n_classes,)) 186 | cdef np.float64_t Z = -inf 187 | cdef np.float64_t weight 188 | cdef unsigned int k 189 | 190 | for state, clas in enumerate(states_to_classes): 191 | weight = alpha[(I - 1, J - 1, state)] 192 | class_Z[clas] = weight 193 | Z = logaddexp(Z, weight) 194 | 195 | cdef ndarray[np.float64_t, ndim=2] derivative = np.full_like(parameters, 0.0) 196 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 197 | cdef np.float64_t alphabeta 198 | 199 | for node in alpha.viewkeys() | beta.viewkeys(): 200 | if len(node) == 3: 201 | i0, j0, s0 = node 202 | alphabeta = alpha[(i0, j0, s0)] + beta[(i0, j0, s0)] 203 | 204 | for k in range(K): 205 | if states_to_classes[s0] == y: 206 | derivative[s0, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x[i0, j0, k] 207 | else: 208 | derivative[s0, k] -= exp(alphabeta - Z) * x[i0, j0, k] 209 | 210 | else: 211 | i0, j0, s0, i1, j1, s1, edge_parameter_index = node 212 | alphabeta = alpha[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] \ 213 | + beta[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] 214 | 215 | for k in xrange(K): 216 | if states_to_classes[s1] == y: 217 | derivative[edge_parameter_index, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x[i1, j1, k] 218 | else: 219 | derivative[edge_parameter_index, k] -= exp(alphabeta - Z) * x[i1, j1, k] 220 | 221 | return (class_Z[y]) - (Z), derivative 222 | 223 | 224 | def gradient_sparse(dict alpha, 225 | dict beta, 226 | ndarray[np.float64_t, ndim=2] parameters, 227 | ndarray[np.int64_t] states_to_classes, 228 | ndarray[np.int64_t, ndim=3] x_index, 229 | ndarray[np.float64_t, ndim=3] x_value, 230 | long y, 231 | long I, long J, long K): 232 | """ 233 | Helper to calculate the marginals and from that the gradient given the forward and backward weights, for 234 | sparse input features. 
    cdef unsigned int n_classes = max(states_to_classes) + 1
    cdef ndarray[np.float64_t] class_Z = np.zeros((n_classes,))
    cdef np.float64_t Z = -inf
    cdef np.float64_t weight
    cdef unsigned int C = K
    cdef unsigned int c
    cdef int k

    for state, clas in enumerate(states_to_classes):
        weight = alpha[(I - 1, J - 1, state)]
        class_Z[clas] = weight
        Z = logaddexp(Z, weight)

    cdef ndarray[np.float64_t, ndim=2] derivative = np.full_like(parameters, 0.0)
    cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index
    cdef np.float64_t alphabeta

    # Union of node (3-tuple) and edge (7-tuple) keys.
    for node in set(alpha) | set(beta):
        if len(node) == 3:
            i0, j0, s0 = node
            alphabeta = alpha[(i0, j0, s0)] + beta[(i0, j0, s0)]

            for c in range(C):
                k = x_index[i0, j0, c]
                if k < 0:
                    break
                if states_to_classes[s0] == y:
                    derivative[s0, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x_value[i0, j0, c]
                else:
                    derivative[s0, k] -= exp(alphabeta - Z) * x_value[i0, j0, c]

        else:
            i0, j0, s0, i1, j1, s1, edge_parameter_index = node
            alphabeta = alpha[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] \
                        + beta[(i0, j0, s0, i1, j1, s1, edge_parameter_index)]

            for c in range(C):
                k = x_index[i1, j1, c]
                if k < 0:
                    break
                if states_to_classes[s1] == y:
                    derivative[edge_parameter_index, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x_value[i1, j1, c]
                else:
                    derivative[edge_parameter_index, k] -= exp(alphabeta - Z) * x_value[i1, j1, c]

    return class_Z[y] - Z, derivative


def populate_sparse_features(ndarray[np.float64_t, ndim=3] x,
                             ndarray[np.int64_t, ndim=3] index_array,
                             ndarray[np.float64_t, ndim=3] value_array,
                             long I, long J, long K):
    """ Helper to fill in sparse feature arrays. """
    cdef unsigned int i, j, c, k
    for i in range(I):
        for j in range(J):
            c = 0
            for k in range(K):
                if x[i, j, k] != 0:
                    value_array[i, j, c] = x[i, j, k]
                    index_array[i, j, c] = k
                    c += 1


def sparse_multiply(ndarray[np.float64_t, ndim=3] answer,
                    ndarray[np.int64_t, ndim=3] index_array,
                    ndarray[np.float64_t, ndim=3] value_array,
                    ndarray[np.float64_t, ndim=2] dense_array,
                    long I, long J, long K, long C, long S):
    """ Multiply a sparse three dimensional numpy array (using our own scheme) with a two dimensional array. """
    cdef unsigned int i, j, s, c
    cdef int k
    for i in range(I):
        for j in range(J):
            for s in range(S):
                for c in range(C):
                    k = index_array[i, j, c]
                    if k < 0:
                        break
                    answer[i, j, s] += value_array[i, j, c] * dense_array[k, s]
--------------------------------------------------------------------------------
/pyhacrf/pyhacrf.py:
--------------------------------------------------------------------------------
# Authors: Dirko Coetsee
# License: 3-clause BSD

""" Implements a Hidden Alignment Conditional Random Field (HACRF). """
""" 5 | 6 | import numpy as np 7 | import lbfgs 8 | from .algorithms import forward, backward 9 | from .algorithms import forward_predict, forward_max_predict 10 | from .algorithms import gradient, gradient_sparse, populate_sparse_features, sparse_multiply 11 | from .state_machine import DefaultStateMachine 12 | 13 | 14 | class Hacrf(object): 15 | """ Hidden Alignment Conditional Random Field with L2 regularizer. 16 | 17 | Parameters 18 | ---------- 19 | l2_regularization : float, optional (default=0.0) 20 | The regularization parameter. 21 | 22 | optimizer : function, optional (default=None) 23 | The optimizing function that should be used minimize the negative log posterior. 24 | The function should have the signature: 25 | min_objective, argmin_objective, ... = fmin(obj, x0, **optimizer_kwargs), 26 | where obj is a function that returns 27 | the objective function and its gradient given a parameter vector; and x0 is the initial parameter vector. 28 | 29 | optimizer_kwargs : dictionary, optional (default=None) 30 | The keyword arguments to pass to the optimizing function. Only used when `optimizer` is also specified. 31 | 32 | state_machine : Instance of `GeneralStateMachine` or `DefaultStateMachine`, optional (default=`DefaultStateMachine`) 33 | The state machine to use to generate the lattice. 34 | 35 | viterbi : Boolean, optional (default=False). 36 | Whether to use Viterbi (max-sum) decoding for predictions (not training) 37 | instead of the default sum-product algorithm. 38 | 39 | References 40 | ---------- 41 | See *A Conditional Random Field for Discriminatively-trained Finite-state String Edit Distance* 42 | by McCallum, Bellare, and Pereira, and the report *Conditional Random Fields for Noisy text normalisation* 43 | by Dirko Coetsee. 44 | """ 45 | 46 | def __init__(self, 47 | l2_regularization=0.0, 48 | optimizer=None, 49 | optimizer_kwargs=None, 50 | state_machine=None, 51 | viterbi=False): 52 | self.parameters = None 53 | self.classes = None 54 | self.l2_regularization = l2_regularization 55 | self._optimizer = optimizer 56 | self._optimizer_kwargs = optimizer_kwargs 57 | self.viterbi = viterbi 58 | 59 | self._optimizer_result = None 60 | self._state_machine = state_machine 61 | self._states_to_classes = None 62 | self._evaluation_count = None 63 | 64 | def fit(self, X, y, verbosity=0): 65 | """Fit the model according to the given training data. 66 | 67 | Parameters 68 | ---------- 69 | X : List of ndarrays, one for each training example. 70 | Each training example's shape is (string1_len, string2_len, n_features), where 71 | string1_len and string2_len are the length of the two training strings and n_features the 72 | number of features. 73 | 74 | y : array-like, shape (n_samples,) 75 | Target vector relative to X. 76 | 77 | Returns 78 | ------- 79 | self : object 80 | Returns self. 81 | """ 82 | self.classes = list(set(y)) 83 | n_points = len(y) 84 | if len(X) != n_points: 85 | raise Exception('Number of training points should be the same as training labels.') 86 | 87 | if not self._state_machine: 88 | self._state_machine = DefaultStateMachine(self.classes) 89 | 90 | # Initialize the parameters given the state machine, features, and target classes. 
        self.parameters = self._initialize_parameters(self._state_machine, X[0].shape[2])

        # Create a new model object for each training example
        models = [_Model(self._state_machine, x, ty) for x, ty in zip(X, y)]

        self._evaluation_count = 0

        def _objective(parameters):
            gradient = np.zeros(self.parameters.shape)
            ll = 0.0  # Log likelihood
            # TODO: Embarrassingly parallel
            for model in models:
                dll, dgradient = model.forward_backward(parameters.reshape(self.parameters.shape))
                ll += dll
                gradient += dgradient

            parameters_without_bias = np.array(parameters, dtype='float64')  # exclude the bias parameters from being regularized
            parameters_without_bias[0] = 0
            ll -= self.l2_regularization * np.dot(parameters_without_bias.T, parameters_without_bias)
            gradient = gradient.flatten() - 2.0 * self.l2_regularization * parameters_without_bias

            if verbosity > 0:
                if self._evaluation_count == 0:
                    print('{:10} {:10} {:10}'.format('Iteration', 'Log-likelihood', '|gradient|'))
                if self._evaluation_count % verbosity == 0:
                    print('{:10} {:10.4} {:10.4}'.format(self._evaluation_count, ll, (abs(gradient).sum())))
            self._evaluation_count += 1

            # TODO: Allow some of the parameters to be frozen. ie. not trained. Can later also completely remove
            # TODO: the computation associated with these parameters.
            return -ll, -gradient

        def _objective_copy_gradient(parameters, g):
            nll, ngradient = _objective(parameters)
            g[:] = ngradient
            return nll

        if self._optimizer:
            self.optimizer_result = self._optimizer(_objective, self.parameters.flatten(), **self._optimizer_kwargs)
            self.parameters = self.optimizer_result[0].reshape(self.parameters.shape)
        else:
            optimizer = lbfgs.LBFGS()
            final_betas = optimizer.minimize(_objective_copy_gradient,
                                             x0=self.parameters.flatten(),
                                             progress=None)
            self.optimizer_result = final_betas
            self.parameters = final_betas.reshape(self.parameters.shape)
        return self

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Parameters
        ----------
        X : List of ndarrays, one for each training example.
            Each training example's shape is (string1_len, string2_len, n_features), where
            string1_len and string2_len are the length of the two training strings and n_features the
            number of features.

        Returns
        -------
        T : array-like, shape = [n_samples, n_classes]
            Returns the probability of the sample for each class in the model,
            where classes are ordered as they are in ``self.classes``.
        """

        parameters = np.ascontiguousarray(self.parameters.T)

        predictions = [_Model(self._state_machine, x).predict(parameters, self.viterbi)
                       for x in X]
        predictions = np.array([[probability
                                 for _, probability
                                 in sorted(prediction.items())]
                                for prediction in predictions])
        return predictions

    def predict(self, X):
        """Predict the class for X.

        The predicted class for each sample in X is returned.

        Parameters
        ----------
        X : List of ndarrays, one for each training example.
            Each training example's shape is (string1_len,
            string2_len, n_features), where string1_len and
            string2_len are the length of the two training strings and
            n_features the number of features.

        Returns
        -------
        y : iterable of shape = [n_samples]
            The predicted classes.

        """
        return [self.classes[prediction.argmax()] for prediction in self.predict_proba(X)]

    @staticmethod
    def _initialize_parameters(state_machine, n_features):
        """ Helper to create initial parameter vector with the correct shape. """
        return np.zeros((state_machine.n_states
                         + state_machine.n_transitions,
                         n_features))

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep: boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return {'l2_regularization': self.l2_regularization,
                'optimizer': self._optimizer,
                'optimizer_kwargs': self._optimizer_kwargs}

    def set_params(self, l2_regularization=0.0, optimizer=None, optimizer_kwargs=None):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        self.l2_regularization = l2_regularization
        self._optimizer = optimizer
        self._optimizer_kwargs = optimizer_kwargs
        return self


class _Model(object):
    """ The actual model that implements the inference routines. """
    def __init__(self, state_machine, x, y=None):
        self.state_machine = state_machine
        self.states_to_classes = state_machine.states_to_classes
        self.x = x
        self.sparse_x = 'uninitialized'
        self.y = y
        self._lattice = self.state_machine.build_lattice(self.x)

    def forward_backward(self, parameters):
        """ Run the forward backward algorithm with the given parameters. """
        # If the features are sparse, we can use an optimization.
        # I'm not using scipy.sparse here because we want to avoid a scipy dependency, and scipy.sparse doesn't seem
        # to handle arrays of shape higher than 2.
        if isinstance(self.sparse_x, str) and self.sparse_x == 'uninitialized':
            if (self.x == 0).sum() * 1.0 / self.x.size > 0.6:
                self.sparse_x = self._construct_sparse_features(self.x)
            else:
                self.sparse_x = 'not sparse'

        I, J, K = self.x.shape
        if not isinstance(self.sparse_x, str):
            C = self.sparse_x[0].shape[2]
            S, _ = parameters.shape
            x_dot_parameters = np.zeros((I, J, S))
            sparse_multiply(x_dot_parameters, self.sparse_x[0], self.sparse_x[1], parameters.T, I, J, K, C, S)
        else:
            x_dot_parameters = np.dot(self.x, parameters.T)  # Pre-compute the dot product
        alpha = self._forward(x_dot_parameters)
        beta = self._backward(x_dot_parameters)
        classes_to_ints = {k: i for i, k in enumerate(set(self.states_to_classes.values()))}
        states_to_classes = np.array([classes_to_ints[self.states_to_classes[state]]
                                      for state in range(max(self.states_to_classes.keys()) + 1)], dtype='int64')
        if not isinstance(self.sparse_x, str):
            ll, deriv = gradient_sparse(alpha, beta, parameters, states_to_classes,
                                        self.sparse_x[0], self.sparse_x[1], classes_to_ints[self.y],
                                        I, J, self.sparse_x[0].shape[2])
        else:
            ll, deriv = gradient(alpha, beta, parameters, states_to_classes,
                                 self.x, classes_to_ints[self.y], I, J, K)
        return ll, deriv

    def predict(self, parameters, viterbi):
        """ Run the forward algorithm to find the predicted distribution over classes. """
        x_dot_parameters = np.einsum('ijk,kl->ijl', self.x, parameters)

        if not viterbi:
            alpha = forward_predict(self._lattice, x_dot_parameters,
                                    self.state_machine.n_states)
        else:
            alpha = forward_max_predict(self._lattice, x_dot_parameters,
                                        self.state_machine.n_states)

        I, J, _ = self.x.shape

        class_Z = {}
        Z = -np.inf

        for state, predicted_class in self.states_to_classes.items():
            weight = alpha[I - 1, J - 1, state]
            class_Z[self.states_to_classes[state]] = weight
            Z = np.logaddexp(Z, weight)

        return {label: np.exp(class_z - Z) for label, class_z in class_Z.items()}

    def _forward(self, x_dot_parameters):
        """ Helper to calculate the forward weights. """
        return forward(self._lattice, x_dot_parameters,
                       self.state_machine.n_states)

    def _backward(self, x_dot_parameters):
        """ Helper to calculate the backward weights. """
        I, J, _ = self.x.shape
        return backward(self._lattice, x_dot_parameters, I, J,
                        self.state_machine.n_states)

    def _construct_sparse_features(self, x):
        """ Helper to construct a sparse representation of the features. """
        I, J, K = x.shape
        new_array_height = (x != 0).sum(axis=2).max()
        index_array = -np.ones((I, J, new_array_height), dtype='int64')
        value_array = -np.ones((I, J, new_array_height), dtype='float64')
        populate_sparse_features(x, index_array, value_array, I, J, K)
        return index_array, value_array
--------------------------------------------------------------------------------
/pyhacrf/tests/test_model.py:
--------------------------------------------------------------------------------
""" Tests for the model. """
""" 2 | 3 | import unittest 4 | 5 | from numpy.testing import assert_array_almost_equal, assert_array_equal 6 | import numpy as np 7 | from numpy import random 8 | from pyhacrf import Hacrf 9 | from pyhacrf.state_machine import GeneralStateMachine, DefaultStateMachine 10 | from pyhacrf.pyhacrf import _Model 11 | from pyhacrf import StringPairFeatureExtractor 12 | 13 | TEST_PRECISION = 3 14 | 15 | 16 | class TestHacrf(unittest.TestCase): 17 | def test_initialize_parameters(self): 18 | start_states = [0] 19 | transitions = [(0, 0, (1, 1)), 20 | (0, 1, (0, 1)), 21 | (0, 0, (1, 0))] 22 | states_to_classes = {0: 'a'} 23 | state_machine = GeneralStateMachine(start_states=start_states, 24 | transitions=transitions, 25 | states_to_classes=states_to_classes) 26 | 27 | n_features = 3 28 | 29 | actual_parameters = Hacrf._initialize_parameters(state_machine, n_features) 30 | expected_parameter_shape = (5, 3) 31 | self.assertEqual(actual_parameters.shape, expected_parameter_shape) 32 | 33 | def test_default_state_machine(self): 34 | classes = ['a', 'b'] 35 | expected_start_states, expected_transitions =\ 36 | ([0, 1], 37 | [(0, 0, (1, 1)), 38 | (1, 1, (1, 1)), 39 | (0, 0, (0, 1)), 40 | (1, 1, (0, 1)), 41 | (0, 0, (1, 0)), 42 | (1, 1, (1, 0))]) 43 | expected_states_to_classes = {0: 'a', 1: 'b'} 44 | state_machine = DefaultStateMachine(classes) 45 | self.assertEqual(state_machine._start_states, 46 | expected_start_states) 47 | self.assertEqual(state_machine._transitions, 48 | expected_transitions) 49 | self.assertEqual(state_machine.states_to_classes, 50 | expected_states_to_classes) 51 | 52 | def test_fit_predict(self): 53 | incorrect = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 'hubby', 'krazii', 'mite', 'tropic'] 54 | correct = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic'] 55 | training = zip(incorrect, correct) 56 | 57 | fe = StringPairFeatureExtractor(match=True, numeric=True) 58 | xf = fe.fit_transform(training) 59 | 60 | model = Hacrf() 61 | model.fit(xf, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) 62 | 63 | expected_parameters = np.array([[-10.76945326, 144.03414923, 0.], 64 | [31.84369748, -106.41885651, 0.], 65 | [-52.08919467, 4.56943665, 0.], 66 | [31.01495044, -13.0593297, 0.], 67 | [49.77302218, -6.42566204, 0.], 68 | [-28.69877796, 24.47127009, 0.], 69 | [-85.34524911, 21.87370646, 0.], 70 | [106.41949333, 6.18587125, 0.]]) 71 | print(model.parameters) 72 | assert_array_almost_equal(model.parameters, expected_parameters, 73 | decimal=TEST_PRECISION) 74 | 75 | expected_probas = np.array([[1.00000000e+000, 3.51235685e-039], 76 | [1.00000000e+000, 4.79716208e-039], 77 | [1.00000000e+000, 2.82744641e-139], 78 | [1.00000000e+000, 6.49580729e-012], 79 | [9.99933798e-001, 6.62022561e-005], 80 | [8.78935957e-005, 9.99912106e-001], 81 | [4.84538335e-009, 9.99999995e-001], 82 | [1.25170233e-250, 1.00000000e+000], 83 | [2.46673086e-010, 1.00000000e+000], 84 | [1.03521293e-033, 1.00000000e+000]]) 85 | actual_predict_probas = model.predict_proba(xf) 86 | print(actual_predict_probas) 87 | assert_array_almost_equal(actual_predict_probas, expected_probas, 88 | decimal=TEST_PRECISION) 89 | 90 | expected_predictions = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) 91 | actual_predictions = model.predict(xf) 92 | assert_array_almost_equal(actual_predictions, expected_predictions, 93 | decimal=TEST_PRECISION) 94 | 95 | def test_fit_predict_regularized(self): 96 | incorrect = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 
                     'hubby', 'krazii', 'mite', 'tropic']
        correct = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic']
        training = zip(incorrect, correct)

        fe = StringPairFeatureExtractor(match=True, numeric=True)
        xf = fe.fit_transform(training)

        model = Hacrf(l2_regularization=10.0)
        model.fit(xf, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        print(model.parameters)

        expected_parameters = np.array([[-0.0569188, 0.07413339, 0.],
                                        [0.00187709, -0.06377866, 0.],
                                        [-0.01908823, 0.00586189, 0.],
                                        [0.01721114, -0.00636556, 0.],
                                        [0.01578279, 0.0078614, 0.],
                                        [-0.0139057, -0.00862948, 0.],
                                        [-0.00623241, 0.02937325, 0.],
                                        [0.00810951, -0.01774676, 0.]])
        assert_array_almost_equal(model.parameters, expected_parameters,
                                  decimal=TEST_PRECISION)

        expected_probas = np.array([[0.5227226, 0.4772774],
                                    [0.52568993, 0.47431007],
                                    [0.4547091, 0.5452909],
                                    [0.51179222, 0.48820778],
                                    [0.46347576, 0.53652424],
                                    [0.45710098, 0.54289902],
                                    [0.46159657, 0.53840343],
                                    [0.42997978, 0.57002022],
                                    [0.47419724, 0.52580276],
                                    [0.50797852, 0.49202148]])
        actual_predict_probas = model.predict_proba(xf)
        print(actual_predict_probas)
        assert_array_almost_equal(actual_predict_probas, expected_probas,
                                  decimal=TEST_PRECISION)

        expected_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        actual_predictions = model.predict(xf)
        assert_array_almost_equal(actual_predictions, expected_predictions,
                                  decimal=TEST_PRECISION)

    def test_fit_predict_regularized_viterbi(self):
        incorrect = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        correct = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic']
        training = zip(incorrect, correct)

        fe = StringPairFeatureExtractor(match=True, numeric=True)
        xf = fe.fit_transform(training)

        model = Hacrf(l2_regularization=10.0, viterbi=True)
        model.fit(xf, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        print(model.parameters)

        expected_parameters = np.array([[-0.0569188, 0.07413339, 0.],
                                        [0.00187709, -0.06377866, 0.],
                                        [-0.01908823, 0.00586189, 0.],
                                        [0.01721114, -0.00636556, 0.],
                                        [0.01578279, 0.0078614, 0.],
                                        [-0.0139057, -0.00862948, 0.],
                                        [-0.00623241, 0.02937325, 0.],
                                        [0.00810951, -0.01774676, 0.]])
        assert_array_almost_equal(model.parameters, expected_parameters,
                                  decimal=TEST_PRECISION)

        expected_probas = np.array([[0.56394611, 0.43605389],
                                    [0.52977205, 0.47022795],
                                    [0.4751729, 0.5248271],
                                    [0.51183761, 0.48816239],
                                    [0.48608081, 0.51391919],
                                    [0.4986367, 0.5013633],
                                    [0.46947222, 0.53052778],
                                    [0.43233544, 0.56766456],
                                    [0.47463002, 0.52536998],
                                    [0.51265109, 0.48734891]])
        actual_predict_probas = model.predict_proba(xf)
        print(actual_predict_probas)
        assert_array_almost_equal(actual_predict_probas, expected_probas,
                                  decimal=TEST_PRECISION)

        expected_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        actual_predictions = model.predict(xf)
        assert_array_almost_equal(actual_predictions, expected_predictions,
                                  decimal=TEST_PRECISION)


class TestModel(unittest.TestCase):
    def test_build_lattice(self):
is the maximum state index
185 | 
186 |         start_states = [0, 1]
187 |         transitions = [(0, 0, (1, 1)),
188 |                        (0, 1, (0, 1)),
189 |                        (0, 0, (1, 0)),
190 |                        (0, 3, lambda i, j, k: (0, 2))]
191 |         states_to_classes = {0: 0, 1: 1, 3: 3}
192 | 
193 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
194 |         x = np.zeros((2, 3, 9))
195 |         #               #      ________
196 |         # 1.  .  .      # 1   0 - 10 - 31
197 |         #               #     | /_______
198 |         # 0.  .  .      # 0   10 --  1     3
199 |         #   0  1  2     #      0     1     2
200 |         #
201 |         # 1(0, 1), 3(0, 2), 1(1, 1), 1(0, 0) should be pruned because they represent partial alignments.
202 |         # Only nodes that are reachable by stepping back from (1, 2) must be included in the lattice.
203 |         actual_lattice = state_machine.build_lattice(x)
204 |         expected_lattice = np.array([(0, 0, 0, 1, 0, 0, 2 + n_states),
205 |                                      (0, 0, 0, 1, 1, 0, 0 + n_states),
206 |                                      (1, 0, 0, 1, 2, 3, 3 + n_states),
207 |                                      (1, 1, 0, 1, 2, 1, 1 + n_states)])
208 |         assert_array_equal(actual_lattice, expected_lattice)
209 | 
210 |     def test_build_lattice_jumps(self):
211 |         n_states = 2  # Because 1 is the maximum state index
212 | 
213 |         start_states = [0, 1]
214 |         transitions = [(0, 0, (1, 1)),
215 |                        (0, 1, (0, 2)),
216 |                        (0, 0, (1, 0))]
217 |         states_to_classes = {0: 0, 1: 1}
218 | 
219 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
220 |         x = np.zeros((2, 3, 9))
221 |         #               #      ________
222 |         # 1.  .  .      # 1   0   .    1
223 |         #               #     | _______
224 |         # 0.  .  .      # 0   10 /  .    1
225 |         #   0  1  2     #      0    1    2
226 |         #
227 |         # 1(0, 2) should be pruned because it represents a partial alignment.
228 |         # Only nodes that are reachable by stepping back from (1, 2) must be included in the lattice.
229 |         actual_lattice = state_machine.build_lattice(x)
230 |         expected_lattice = np.array([(0, 0, 0, 1, 0, 0, 2 + n_states),
231 |                                      (1, 0, 0, 1, 2, 1, 1 + n_states)])
232 |         assert_array_equal(actual_lattice, expected_lattice)
233 | 
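    # Key format shared by build_lattice output and the alpha/beta dicts in
    # the tests below (as implied by the expected values):
    #   (i, j, s)                   -- a lattice node: position i in the
    #                                  first sequence, j in the second,
    #                                  while in state s.
    #   (i0, j0, s0, i1, j1, s1, p) -- a lattice edge from node (i0, j0, s0)
    #                                  to node (i1, j1, s1); p is the row of
    #                                  the parameter matrix scored on that
    #                                  edge (state rows come first, so
    #                                  transition rows start at n_states).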
234 |     def test_forward_single(self):
235 |         start_states = [0, 1]
236 |         transitions = [(0, 0, (1, 1)),
237 |                        (0, 1, (0, 1)),
238 |                        (0, 0, (1, 0)),
239 |                        (0, 2, lambda i, j, k: (0, 2))]
240 |         states_to_classes = {0: 'a', 1: 'a', 2: 'b'}  # Dummy
241 | 
242 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
243 | 
244 |         parameters = np.array(range(-7, 7), dtype='float64').reshape((7, 2))
245 |         # parameters =
246 |         # 0([[-7, -6],
247 |         # 1  [-5, -4],
248 |         # 2  [-3, -2],
249 |         # 3  [-1,  0],
250 |         # 4  [ 1,  2],
251 |         # 5  [ 3,  4],
252 |         # 6  [ 5,  6]])
253 |         x = np.array([[[0, 1],
254 |                        [1, 0],
255 |                        [2, 1]],
256 |                       [[0, 1],
257 |                        [1, 0],
258 |                        [1, 0]]])
259 |         y = 'a'
260 |         # Expected lattice:
261 |         #               #      ________
262 |         # 1.  .  .      # 1   0 __0 - 21
263 |         #               #     | /
264 |         # 0.  .  .      # 0   0
265 |         #   0  1  2     #      0    1    2
266 |         expected_alpha = {
267 |             (0, 0, 0): np.exp(-6),
268 |             (0, 0, 0, 1, 0, 0, 5): np.exp(-6) * np.exp(4),
269 |             (0, 0, 0, 1, 1, 0, 3): np.exp(-6) * np.exp(-1),
270 |             (1, 0, 0): np.exp(-6) * np.exp(4) * np.exp(-6),
271 |             (1, 0, 0, 1, 2, 2, 6): np.exp(-6) * np.exp(4) * np.exp(-6) * np.exp(5),
272 |             (1, 1, 0): np.exp(-6) * np.exp(-1) * np.exp(-7),
273 |             (1, 1, 0, 1, 2, 1, 4): np.exp(-6) * np.exp(-1) * np.exp(-7) * np.exp(1),
274 |             (1, 2, 1): np.exp(-6) * np.exp(-1) * np.exp(-7) * np.exp(1) * np.exp(-5),
275 |             (1, 2, 2): np.exp(-6) * np.exp(4) * np.exp(-6) * np.exp(5) * np.exp(-3)
276 |         }
277 |         expected_alpha = {k: np.emath.log(v) for k, v in expected_alpha.items()}
278 |         test_model = _Model(state_machine, x, y)
279 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
280 |         actual_alpha = test_model._forward(x_dot_parameters)
281 | 
282 |         actual_alpha = {k: v for k, v in actual_alpha.items()
283 |                         if not np.isneginf(v)}
284 |         print(actual_alpha)
285 | 
286 |         self.assertEqual(len(actual_alpha), len(expected_alpha))
287 |         print()
288 |         for key in sorted(expected_alpha.keys()):
289 |             print(key, expected_alpha[key], actual_alpha[key])
290 |             self.assertAlmostEqual(actual_alpha[key], expected_alpha[key])
291 | 
292 |     def test_forward_connected(self):
293 |         classes = ['a', 'b']
294 |         parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2))
295 |         # parameters =
296 |         # 0([[-8, -7],
297 |         # 1  [-6, -5],
298 |         # 2  [-4, -3],
299 |         # 3  [-2, -1],
300 |         # 4  [ 0,  1],
301 |         # 5  [ 2,  3],
302 |         # 6  [ 4,  5],
303 |         # 7  [ 6,  7]])
304 |         x = np.array([[[0, 1],
305 |                        [2, 1]],
306 |                       [[0, 1],
307 |                        [1, 0]]])
308 |         y = 'a'
309 |         expected_alpha = {
310 |             (0, 0, 0): np.exp(-7),
311 |             (0, 0, 0, 0, 1, 0, 4): np.exp(-7) * np.exp(1),
312 |             (0, 0, 0, 1, 0, 0, 6): np.exp(-7) * np.exp(5),
313 |             (0, 0, 0, 1, 1, 0, 2): np.exp(-7) * np.exp(-4),
314 |             (0, 0, 1): np.exp(-5),
315 |             (0, 0, 1, 0, 1, 1, 5): np.exp(-5) * np.exp(7),
316 |             (0, 0, 1, 1, 0, 1, 7): np.exp(-5) * np.exp(7),
317 |             (0, 0, 1, 1, 1, 1, 3): np.exp(-5) * np.exp(-2),
318 |             (0, 1, 0): np.exp(-7) * np.exp(1) * np.exp(-23),
319 |             (0, 1, 0, 1, 1, 0, 6): np.exp(-7) * np.exp(1) * np.exp(-23) * np.exp(4),
320 |             (0, 1, 1): np.exp(-5) * np.exp(7) * np.exp(-17),
321 |             (0, 1, 1, 1, 1, 1, 7): np.exp(-5) * np.exp(7) * np.exp(-17) * np.exp(6),
322 |             (1, 0, 0): np.exp(-7) * np.exp(5) * np.exp(-7),
323 |             (1, 0, 0, 1, 1, 0, 4): np.exp(-7) * np.exp(5) * np.exp(-7) * np.exp(0),
324 |             (1, 0, 1): np.exp(-5) * np.exp(7) * np.exp(-5),
325 |             (1, 0, 1, 1, 1, 1, 5): np.exp(-5) * np.exp(7) * np.exp(-5) * np.exp(2),
326 |             (1, 1, 0): (np.exp(-11) + np.exp(-25) + np.exp(-9)) * np.exp(-8),
327 |             (1, 1, 1): (np.exp(-1) + np.exp(-9) + np.exp(-7)) * np.exp(-6)
328 |         }
329 |         expected_alpha = {k: np.emath.log(v) for k, v in expected_alpha.items()}
330 | 
331 |         state_machine = DefaultStateMachine(classes)
332 |         print()
333 |         test_model = _Model(state_machine, x, y)
334 |         for s in test_model._lattice:
335 |             print(s)
336 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
337 |         actual_alpha = test_model._forward(x_dot_parameters)
338 | 
339 |         self.assertEqual(len(actual_alpha), len(expected_alpha))
340 |         for key in sorted(expected_alpha.keys()):
341 |             print(key, expected_alpha[key], actual_alpha[key])
342 |             self.assertAlmostEqual(actual_alpha[key], expected_alpha[key])
343 | 
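    # The recursion checked above: a node's alpha is the sum of its incoming
    # edge alphas times exp(its own potential) -- see (1, 1, 0) in
    # test_forward_connected, which adds three incoming edge alphas before
    # multiplying in exp(-8) -- and an edge's alpha is its source node's
    # alpha times exp(the edge potential). The betas below run the
    # mirror-image recursion from the far corner of the lattice.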
344 |     def test_backward_connected(self):
345 |         parameters = np.array(range(-3, 3), dtype='float64').reshape((3, 2))
346 |         # parameters =
347 |         # 0([[-3, -2],
348 |         # 1  [-1,  0],
349 |         # 2  [ 1,  2]])
350 |         x = np.array([[[0, 1],
351 |                        [2, 1]],
352 |                       [[0, 1],
353 |                        [1, 0]]])
354 |         y = 'a'
355 |         expected_beta = {
356 |             (0, 0, 0): (np.exp(-4) + np.exp(-12)),  # * np.exp(-2),
357 |             (0, 0, 0, 0, 1, 0, 1): np.exp(-3) * np.exp(1) * np.exp(-8),  # * np.exp(-2),
358 |             (0, 0, 0, 1, 0, 0, 2): np.exp(-3) * np.exp(-1) * np.exp(-2),  # * np.exp(2),
359 |             (0, 1, 0): np.exp(-3) * np.exp(1),  # * np.exp(-8),
360 |             (0, 1, 0, 1, 1, 0, 2): np.exp(-3),  # * np.exp(1),
361 |             (1, 0, 0): np.exp(-3) * np.exp(-1),  # * np.exp(-2),
362 |             (1, 0, 0, 1, 1, 0, 1): np.exp(-3),  # * np.exp(-1),
363 |             (1, 1, 0): 1.0  # np.exp(-3)
364 |         }
365 |         expected_beta = {k: np.emath.log(v) for k, v in expected_beta.items()}
366 | 
367 |         start_states = [0]
368 |         transitions = [(0, 0, (0, 1)),
369 |                        (0, 0, (1, 0))]
370 |         states_to_classes = {0: 'a'}
371 |         n_states = 1
372 | 
373 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
374 | 
375 |         test_model = _Model(state_machine, x, y)
376 |         for s in test_model._lattice:
377 |             print(s)
378 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
379 |         actual_beta = test_model._backward(x_dot_parameters)
380 |         print(actual_beta)
381 | 
382 |         print()
383 |         self.assertEqual(len(actual_beta), len(expected_beta))
384 |         for key in sorted(expected_beta.keys(), reverse=True):
385 |             print(key, expected_beta[key], actual_beta[key])
386 |             self.assertAlmostEqual(actual_beta[key], expected_beta[key])
387 | 
388 |     def test_forward_backward_same_partition_value(self):
389 |         classes = ['a', 'b']
390 |         parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2))
391 |         x = np.array([[[0, 1],
392 |                        [2, 1]],
393 |                       [[0, 1],
394 |                        [1, 0]]])
395 |         y = 'a'
396 |         state_machine = DefaultStateMachine(classes)
397 |         test_model = _Model(state_machine, x, y)
398 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
399 |         actual_alpha = test_model._forward(x_dot_parameters)
400 |         actual_beta = test_model._backward(x_dot_parameters)
401 | 
402 |         print(actual_alpha[(1, 1, 0)], actual_beta[(0, 0, 0)])
403 |         print(actual_alpha[(1, 1, 1)], actual_beta[(0, 0, 1)])
404 |         self.assertAlmostEqual(actual_alpha[(1, 1, 0)], actual_beta[(0, 0, 0)] + (np.dot(x[0, 0, :], parameters[0, :])))
405 |         self.assertAlmostEqual(actual_alpha[(1, 1, 1)], actual_beta[(0, 0, 1)] + (np.dot(x[0, 0, :], parameters[1, :])))
406 | 
407 |     def test_derivate_chain(self):
408 |         classes = ['a', 'b']
409 |         parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2))
410 |         # parameters =
411 |         # 0([[-8, -7],
412 |         # 1  [-6, -5],
413 |         # 2  [-4, -3],
414 |         # 3  [-2, -1],
415 |         # 4  [ 0,  1],
416 |         # 5  [ 2,  3],
417 |         # 6  [ 4,  5],
418 |         # 7  [ 6,  7]])
419 |         x = np.array([[[0, 1],
420 |                        [1, 2]]], dtype='float64')
421 |         y = 'a'
422 |         state_machine = DefaultStateMachine(classes)
423 |         test_model = _Model(state_machine, x, y)
424 |         print(test_model._lattice)
425 |         #
426 |         # 0  01 --- 01
427 |         #    0      1
428 |         # states_to_classes = {0: 'a', 1: 'b'}
429 |         # (0, 0, 0) : exp(-7)
430 |         # (0, 0, 0, 0, 1, 0, 4) : exp(-7) * exp(2)
431 |         # (0, 0, 1) : exp(-5)
432 |         # (0, 0, 1, 0, 1, 1, 5) : exp(-5) * exp(8)
433 |         # (0, 1, 0) : exp(-7) * exp(2) * exp(-8 - 14) = exp(-27)
434 |         # (0, 1, 1) : exp(-5) * exp(8) * exp(-6 - 10) = exp(-13)
435 |         # p(y|G,X) = f0(g00,g01,x00,x01,y) f1(g40,g41,x10,x11,y) f2(g00,g01,x00,x01,y) +
436 |         #            f0(g10,g11,x00,x01,y) f1(g50,g51,x10,x11,y) f2(g10,g11,x00,x01,y)
437 |         #          = exp(-27) / (exp(-27) + exp(-13))
438 |         expected_ll = np.emath.log(np.exp(-27) / (np.exp(-27) +
np.exp(-13))) 439 | expected_dll = np.zeros(parameters.shape) 440 | 441 | # Finite difference gradient approximation 442 | delta = 10.0**-7 443 | S, D = expected_dll.shape 444 | for s in range(S): 445 | for d in range(D): 446 | dg = np.zeros(parameters.shape) 447 | dg[s, d] = delta 448 | y0, _ = test_model.forward_backward(parameters) 449 | y1, _ = test_model.forward_backward(parameters + dg) 450 | print(s, d, y0, y1) 451 | expected_dll[s, d] = (y1 - y0) / delta 452 | 453 | actual_ll, actual_dll = test_model.forward_backward(parameters) 454 | 455 | print(expected_ll, actual_ll) 456 | print(expected_dll) 457 | print(actual_dll) 458 | self.assertAlmostEqual(actual_ll, expected_ll) 459 | assert_array_almost_equal(actual_dll, expected_dll, decimal=TEST_PRECISION) 460 | 461 | def test_derivate_medium(self): 462 | classes = ['a', 'b'] 463 | parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2)) 464 | x = np.array([[[0, 1], 465 | [2, 1]], 466 | [[0, 1], 467 | [1, 0.0]]]) 468 | y = 'a' 469 | state_machine = DefaultStateMachine(classes) 470 | test_model = _Model(state_machine, x, y) 471 | print(test_model._lattice) 472 | 473 | expected_dll = np.zeros(parameters.shape) 474 | 475 | # Finite difference gradient approximation 476 | delta = 10.0**-7 477 | S, D = expected_dll.shape 478 | for s in range(S): 479 | for d in range(D): 480 | dg = np.zeros(parameters.shape) 481 | dg[s, d] = delta 482 | y0, _ = test_model.forward_backward(parameters) 483 | y1, _ = test_model.forward_backward(parameters + dg) 484 | print(s, d, y0, y1) 485 | expected_dll[s, d] = (y1 - y0) / delta 486 | 487 | actual_ll, actual_dll = test_model.forward_backward(parameters) 488 | 489 | print(expected_dll) 490 | print(actual_dll) 491 | assert_array_almost_equal(actual_dll, expected_dll, decimal=TEST_PRECISION) 492 | 493 | def test_derivate_large(self): 494 | classes = ['a', 'b', 'c'] 495 | y = 'b' 496 | x = random.randn(8, 3, 10) * 5 + 3 497 | state_machine = DefaultStateMachine(classes) 498 | parameters = Hacrf._initialize_parameters(state_machine, x.shape[2]) 499 | parameters = random.randn(*parameters.shape) * 10 - 2 500 | 501 | test_model = _Model(state_machine, x, y) 502 | print(test_model._lattice) 503 | 504 | expected_dll = np.zeros(parameters.shape) 505 | 506 | # Finite difference gradient approximation 507 | delta = 10.0**-7 508 | S, D = expected_dll.shape 509 | for s in range(S): 510 | for d in range(D): 511 | dg = np.zeros(parameters.shape) 512 | dg[s, d] = delta 513 | y0, _ = test_model.forward_backward(parameters) 514 | y1, _ = test_model.forward_backward(parameters + dg) 515 | print(s, d, y0, y1) 516 | expected_dll[s, d] = (y1 - y0) / delta 517 | 518 | actual_ll, actual_dll = test_model.forward_backward(parameters) 519 | 520 | print(expected_dll) 521 | print(actual_dll) 522 | self.assertEqual((np.isnan(actual_dll)).any(), False) 523 | assert_array_almost_equal(actual_dll, expected_dll, decimal=TEST_PRECISION) 524 | 525 | if __name__ == '__main__': 526 | unittest.main() 527 | -------------------------------------------------------------------------------- /examples/Highered dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:abf0ae00f3fa6aac52649ff752418b6a4d46aeb22019b490dbb11e667c93d006" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "%load_ext vimception" 16 | ], 17 | 
"language": "python", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "javascript": [ 22 | "\n", 23 | "var cmd = IPython.keyboard_manager.command_shortcuts;\n", 24 | "var edit = IPython.keyboard_manager.edit_shortcuts;\n", 25 | "var def_cmd = IPython.default_command_shortcuts;\n", 26 | "var def_edit = IPython.default_edit_shortcuts;\n", 27 | "\n", 28 | "// get the code mirror editor of a curently selected cell\n", 29 | "function C() { return IPython.notebook.get_selected_cell().code_mirror; };\n", 30 | "\n", 31 | "// Change the mode of all current and future CodeMirror instances\n", 32 | "// Emacs users can use this function as just to('emacs') so long as they've\n", 33 | "// required/loaded emacs.js from CodeMirror\n", 34 | "function to(mode) {\n", 35 | " var mode = mode || 'vim'\n", 36 | " // first let's apply vim mode to all current cells\n", 37 | " function to_mode(c) { return c.code_mirror.setOption('keyMap', mode);};\n", 38 | " IPython.notebook.get_cells().map(to_mode);\n", 39 | " // apply the mode to future cells created\n", 40 | " IPython.Cell.options_default.cm_config.keyMap = mode;\n", 41 | "}\n", 42 | "\n", 43 | "function getCSS(path) {\n", 44 | " $('', {\n", 45 | " rel: 'stylesheet',\n", 46 | " type: 'text/css',\n", 47 | " href: path,\n", 48 | " }).appendTo('head');\n", 49 | "}\n", 50 | "\n", 51 | "// I messed around with trying to get requireJS going here, but gave up and\n", 52 | "// just using this answer from SO \n", 53 | "// http://stackoverflow.com/questions/11803215/how-to-include-multiple-js-files-using-jquery-getscript-method\n", 54 | "\n", 55 | "var p = \"/static/components/codemirror/addon/\";\n", 56 | "\n", 57 | "$.when(\n", 58 | "// Grab the CodeMirror vim keymap\n", 59 | "$.getScript(p + \"../keymap/vim.js\"),\n", 60 | "// also make search work via /\n", 61 | "$.getScript(p + \"search/search.js\"),\n", 62 | "$.getScript(p + \"search/searchcursor.js\"),\n", 63 | "\n", 64 | "// TODO: hook-up gq to perform a harwrap\n", 65 | "$.getScript(p + \"wrap/hardwrap.js\"),\n", 66 | "$.getScript(p + \"selection/active-line.js\"),\n", 67 | "\n", 68 | "$.getScript(p + \"display/fullscreen.js\"),\n", 69 | "getCSS(p + \"display/fullscreen.css\"),\n", 70 | "getCSS(p + \"dialog/dialog.css\"),\n", 71 | "$.getScript(p + \"dialog/dialog.js\"),\n", 72 | "\n", 73 | "\n", 74 | " $.Deferred(function( deferred ){\n", 75 | " $( deferred.resolve );\n", 76 | " })\n", 77 | ").then(function success(){\n", 78 | "\n", 79 | "console.log('Great success');\n", 80 | "\n", 81 | "IPython.CodeCell.options_default.cm_config.foldGutter = true;\n", 82 | "IPython.CodeCell.options_default.cm_config.gutters = [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"];\n", 83 | "\n", 84 | "IPython.Cell.prototype.at_top = function () {\n", 85 | " var cm = this.code_mirror;\n", 86 | " var cursor = cm.getCursor();\n", 87 | " if (cursor.line === 0) {\n", 88 | " return true;\n", 89 | " }\n", 90 | " return false;\n", 91 | " };\n", 92 | "\n", 93 | "\n", 94 | "IPython.Cell.prototype.at_bottom = function () {\n", 95 | " var cm = this.code_mirror;\n", 96 | " var cursor = cm.getCursor();\n", 97 | " if (cursor.line === (cm.lineCount()-1)) {\n", 98 | " return true;\n", 99 | " }\n", 100 | " return false;\n", 101 | "};\n", 102 | "// on all code mirror instances on this page, apply the function f\n", 103 | "function all_cm(f) {\n", 104 | " // apply f to every code mirror instance. 
f takes one parameter\n", 105 | " IPython.notebook.get_cells().map(function (c) { f(c.code_mirror); } );\n", 106 | "}\n", 107 | "\n", 108 | "\n", 109 | "to('vim');\n", 110 | "function vim_up(event) {\n", 111 | " var cell = IPython.notebook.get_selected_cell();\n", 112 | " if (cell && cell.at_top() && cell.code_mirror.options.keyMap === 'vim') {\n", 113 | " console.log('inside the business logic k');\n", 114 | " event.preventDefault();\n", 115 | " IPython.notebook.command_mode()\n", 116 | " IPython.notebook.select_prev();\n", 117 | " IPython.notebook.edit_mode();\n", 118 | " return false;\n", 119 | " };\n", 120 | "}\n", 121 | "\n", 122 | "function vim_down(event) {\n", 123 | " var cell = IPython.notebook.get_selected_cell();\n", 124 | " if (cell && cell.at_bottom() && cell.code_mirror.options.keyMap === 'vim') {\n", 125 | " event.preventDefault();\n", 126 | " IPython.notebook.command_mode()\n", 127 | " IPython.notebook.select_next();\n", 128 | " IPython.notebook.edit_mode();\n", 129 | " return false;\n", 130 | " };\n", 131 | " }\n", 132 | "\n", 133 | "var m = '(vim) '\n", 134 | "var edit_shortcuts = {\n", 135 | " 'k' : {\n", 136 | " help : m + 'up a line, even across cells',\n", 137 | " help_index : 'AA',\n", 138 | " handler : vim_up\n", 139 | " },\n", 140 | " 'j' : {\n", 141 | " help : m + 'down a line, even across cells',\n", 142 | " help_index : 'AA',\n", 143 | " handler : vim_down\n", 144 | " },\n", 145 | "\n", 146 | "};\n", 147 | "\n", 148 | "var command_shortcuts = {\n", 149 | " 'c' : {\n", 150 | " help : m + def_cmd['y'].help,\n", 151 | " help_index : 'AA',\n", 152 | " handler : def_cmd['y'].handler\n", 153 | " }\n", 154 | "\n", 155 | "\n", 156 | "};\n", 157 | "\n", 158 | "edit.add_shortcuts(edit_shortcuts);\n", 159 | "cmd.add_shortcuts(command_shortcuts);\n", 160 | "//edit.add_shortcuts('k', def_edit['up'].handler);\n", 161 | "//edit.add_shortcut('j', def_edit['down'].handler);\n", 162 | "\n", 163 | "// N.B. 
This code looks fairly simple, but it took me forever to \n", 164 | "// figure out how to do this, \n", 165 | "// \n", 166 | "// there's a problem here, Ctrl-[ is already handled by CodeMirror by the time we \n", 167 | "// (IPython.keyboard_manager) get it CodeMirror issues signals on mode change, \n", 168 | "// so we have to hook into that to get Ctrl-[\n", 169 | "edit.remove_shortcut('Ctrl-[');\n", 170 | "edit.remove_shortcut('Esc');\n", 171 | "\n", 172 | "CodeMirror.commands.leaveInsertOrEdit = function (cm) {\n", 173 | " if ( cm.state.vim.insertMode ) {\n", 174 | " // do magic here to get out of insert mode\n", 175 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 176 | " } else {\n", 177 | " IPython.notebook.command_mode();\n", 178 | " IPython.notebook.focus_cell();\n", 179 | " }\n", 180 | "};\n", 181 | " \n", 182 | "//C().options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 183 | "all_cm( function (cm) {\n", 184 | " cm.options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 185 | " cm.options.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 186 | " if ( CodeMirror.defaults.extraKeys === null ) { \n", 187 | " CodeMirror.defaults.extraKeys = {};\n", 188 | " }\n", 189 | " // TODO: make this change permanent\n", 190 | " // this part seems to be ignore when adding a new cell\n", 191 | " // - alternative solution would be to listen for NewCell events and rerun the CM function on it\n", 192 | " // - it could also be the case that when we instatiate CodeMirror, we somehow leave out CM.defaults.extraKeys\n", 193 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 194 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 195 | " IPython.CodeCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 196 | " IPython.TextCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 197 | "})\n", 198 | "\n", 199 | "// On blur, make sure we go back to command mode for CodeMirror (in case user clicked away)\n", 200 | "// TODO: Make this permanent - how to get CodeMirror to do this for new cells created after\n", 201 | "all_cm( function (cm) {\n", 202 | " cm.on('blur', function(cm) {\n", 203 | " // TODO: I wish I understood a better way to do this, but fake pressing Escape work\n", 204 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 205 | " CodeMirror.keyMap['vim']['Esc'](cm);\n", 206 | " cm.setOption('styleActiveLine', false);\n", 207 | " if (cm.getOption(\"fullScreen\")) {\n", 208 | " cm.setOption('fullScreen', false); \n", 209 | " // fullScreen the newly selected code mirror (doesn't work)\n", 210 | " //setTimeout(100, function() {\n", 211 | " // console.log(IPython.notebook.get_selected_cell().code_mirror);\n", 212 | " // IPython.notebook.get_selected_cell().code_mirror.setOption('fullScreen', true); \n", 213 | " //});\n", 214 | " }\n", 215 | " });\n", 216 | " cm.on('focus', function(cm) {\n", 217 | " cm.setOption('styleActiveLine', true);\n", 218 | " });\n", 219 | "});\n", 220 | "\n", 221 | "// 'i' by default interrupts the kernel (what Ctrl-C does at the terminal)\n", 222 | "cmd.remove_shortcut('i');\n", 223 | "cmd.add_shortcut('i', def_cmd.enter);\n", 224 | "\n", 225 | "// not quite what we want - 'i' requires a double-tap\n", 226 | "// add documentation for this.\n", 227 | "cmd.add_shortcut('ctrl-c', function(e) { IPython.notebook.kernel.interrupt(); return false});\n", 228 | "\n", 229 | "\n", 230 | "function focus_last(e) {\n", 231 | " var cells = IPython.notebook.get_cells();\n", 
232 | " cells[cells.length-1].focus_cell();\n", 233 | "};\n", 234 | "\n", 235 | "function focus_first(e) {\n", 236 | " var cells = IPython.notebook.get_cells();\n", 237 | " cells[0].focus_cell();\n", 238 | "};\n", 239 | "\n", 240 | "function combo_tap(combo, action) {\n", 241 | " var that = this;\n", 242 | " var timeout;\n", 243 | " function f() {\n", 244 | " console.log('f called once');\n", 245 | " \n", 246 | " // redo this so that when an action is performed, we restore the original combo\n", 247 | " cmd.add_shortcut(combo[1], \n", 248 | " function() { console.log(\"doing action\", combo); reset(); action(); timeout.clear();} );\n", 249 | " timeout = setTimeout(function () {\n", 250 | " console.log('resetting f');\n", 251 | " reset();\n", 252 | " //cmd.add_shortcut(combo[0], reset)\n", 253 | " }, 800);\n", 254 | " }\n", 255 | " function reset(e) {\n", 256 | " //cmd.remove_shortcut(combo[0]);\n", 257 | " console.log('reset called');\n", 258 | " //if (timeout) {\n", 259 | " // console.log('resetting aborted');\n", 260 | " // clearTimeout(timeout);\n", 261 | " // timeout = null;\n", 262 | " //}\n", 263 | " //that(combo, action); \n", 264 | " cmd.add_shortcut(combo[0], f);\n", 265 | " }\n", 266 | " console.log(\"combo tap for\", combo);\n", 267 | " \n", 268 | " reset();\n", 269 | "};\n", 270 | "cmd.add_shortcut('shift-g', focus_last);\n", 271 | "combo_tap('gg', focus_first);\n", 272 | "\n", 273 | "// XXX: the way combo tap is currently implemented, this won't work\n", 274 | "// need a more generic mechanism for combo-taps with common prefixes\n", 275 | "// combo_tap('gq', f();\n", 276 | "//cmd.remove_shortcut('d');\n", 277 | "// cut\n", 278 | "combo_tap('dd', def_cmd['x'].handler);\n", 279 | "\n", 280 | "// copy\n", 281 | "combo_tap('yy', def_cmd['c'].handler);\n", 282 | "\n", 283 | "// paste\n", 284 | "cmd.add_shortcut('p', def_cmd['v']);\n", 285 | "\n", 286 | "// undo\n", 287 | "cmd.add_shortcut('u', def_cmd['z']);\n", 288 | "\n", 289 | "// Join (merge down with cell below)\n", 290 | "cmd.add_shortcut('shift-j', def_cmd['shift-m'])\n", 291 | "\n", 292 | "//edit.add_shortcut('k', def_edit['up'].handler);\n", 293 | "//[edit.add_shortcut('j', def_edit['down'].handler);\n", 294 | "\n", 295 | "CodeMirror.prototype.save = function() { \n", 296 | " IPython.notebook.save_checkpoint()\n", 297 | "}\n", 298 | "\n", 299 | "function focus_last(e) {\n", 300 | " var cells = IPython.notebook.get_cells();\n", 301 | " cells[cells.length-1].focus_cell();\n", 302 | "};\n", 303 | "\n", 304 | "function focus_first(e) {\n", 305 | " console.log('focus first called');\n", 306 | " var cells = IPython.notebook.get_cells();\n", 307 | " cells[0].focus_cell();\n", 308 | "};\n", 309 | "\n", 310 | "\n", 311 | "cmd.add_shortcut('shift-g', focus_last);\n", 312 | "combo_tap('gg', focus_first);\n", 313 | "\n", 314 | "// get rid of the default Ctrl-W binding\n", 315 | "// this only works for Firefox\n", 316 | "$(document).ready(function() {\n", 317 | "\t$(this).bind('keypress', function(e) {\n", 318 | "\t\tvar key = (e.keyCode ? 
e.keyCode : e.charCode);\n", 319 | "\t\tif (key == '119' && e.ctrlKey) {\n", 320 | "\t\t\treturn false;\n", 321 | "\t\t}\n", 322 | "\t});\n", 323 | "});\n", 324 | "\n", 325 | "window.addEventListener(\"beforeunload\", function( event ) {\n", 326 | " var press = jQuery.Event(\"keypress\");\n", 327 | " press.ctrlKey = false;\n", 328 | " press.which = 27; // escape\n", 329 | " $(document).trigger(press);\n", 330 | " event.returnValue = \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 331 | " event.returnValue +=\"\\nX Chrome sucks at captruring Ctrl-W, sorry X\";\n", 332 | " event.returnValue += \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 333 | "});\n", 334 | "\n", 335 | "// update the keyboard shortcuts\n", 336 | "IPython.quick_help = new IPython.QuickHelp();\n", 337 | "\n", 338 | "//IPython.CodeCell.options_default.cm_config.styleActiveLine = true;\n", 339 | "\n", 340 | "all_cm( function (cm) {\n", 341 | " cm.setOption('foldGutter', true);\n", 342 | " cm.setOption('gutters', [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"]);\n", 343 | " cm.options.extraKeys[\"Ctrl-F\"] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 344 | " var wrapOptions = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/ };\n", 345 | " // XXX: add a hardwrap-range to this as well\n", 346 | " cm.options.extraKeys[\"F2\"] = function(cm) { cm.wrapParagraph(cm.getCursor(), wrapOptions); };\n", 347 | " //cm.options.extraKeys[\"[\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 348 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 349 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 350 | "\n", 351 | " // todo - do this for new cells as well\n", 352 | " // support this a :only? 
turn off full screen on blur\n", 353 | " cm.options.extraKeys[\"F11\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 354 | " cm.options.extraKeys[\"Ctrl-A\"] = function(cm) {\n", 355 | " if (cm.getOption(\"fullScreen\")) cm.setOption(\"fullScreen\", false);\n", 356 | " };\n", 357 | " //all_cm( function (cm) {\n", 358 | "});\n", 359 | "\n", 360 | "//setTimeout(function() {IPython.notebook.get_selected_cell().set_input_prompt('vim');}, 200)\n", 361 | "\n", 362 | "$(\"#ipython_notebook\").find('img').remove('#vim');\n", 363 | "$(\"#ipython_notebook\").append('')\n", 368 | "$(\"#vim\").click( function () {$(this).hide()});\n", 369 | "\n", 370 | "\n", 371 | "// XXX: Autowrapping is kind of broken - you can write a line that will have\n", 372 | "// its last word (if it's 1 or 2 characters just go back and forth between the\n", 373 | "// current and the next lines)\n", 374 | "//all_cm(function (cm) {\n", 375 | "// var wait, options = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/};\n", 376 | "// cm.on(\"change\", function(cm, change) {\n", 377 | "// clearTimeout(wait);\n", 378 | "// wait = setTimeout(function() {\n", 379 | "// console.log(cm.wrapParagraphsInRange(change.from, CodeMirror.changeEnd(change), options));\n", 380 | "// }, 300);\n", 381 | "// });\n", 382 | "//});\n", 383 | "\n", 384 | "}, function failure() { \n", 385 | " alert('le sucks, something went wrong');\n", 386 | "\n", 387 | "});\n", 388 | "\n", 389 | "\n", 390 | "// at_top and at_bottom methods for ipython-vimception\n", 391 | " /**\n", 392 | " * @method at_top\n", 393 | " * @return {Boolean}\n", 394 | " */\n", 395 | " Cell.prototype.at_top = function () {\n", 396 | " var cm = this.code_mirror;\n", 397 | " var cursor = cm.getCursor();\n", 398 | " if (cursor.line === 0 && cm.findPosV(cursor, -1, 'line').hitSide) {\n", 399 | " return true;\n", 400 | " } else {\n", 401 | " return false;\n", 402 | " }\n", 403 | " };\n", 404 | "\n", 405 | " /**\n", 406 | " * @method at_bottom\n", 407 | " * @return {Boolean}\n", 408 | " * */\n", 409 | " Cell.prototype.at_bottom = function () {\n", 410 | " var cm = this.code_mirror;\n", 411 | " var cursor = cm.getCursor();\n", 412 | " if (cursor.line === (cm.lineCount()-1) && cm.findPosV(cursor, 1, 'line').hitSide) {\n", 413 | " return true;\n", 414 | " } else {\n", 415 | " return false;\n", 416 | " }\n", 417 | " };\n" 418 | ], 419 | "metadata": {}, 420 | "output_type": "display_data", 421 | "text": [ 422 | "" 423 | ] 424 | }, 425 | { 426 | "javascript": [ 427 | "\n", 428 | "var cmd = IPython.keyboard_manager.command_shortcuts;\n", 429 | "var edit = IPython.keyboard_manager.edit_shortcuts;\n", 430 | "var def_cmd = IPython.default_command_shortcuts;\n", 431 | "var def_edit = IPython.default_edit_shortcuts;\n", 432 | "\n", 433 | "// get the code mirror editor of a curently selected cell\n", 434 | "function C() { return IPython.notebook.get_selected_cell().code_mirror; };\n", 435 | "\n", 436 | "// Change the mode of all current and future CodeMirror instances\n", 437 | "// Emacs users can use this function as just to('emacs') so long as they've\n", 438 | "// required/loaded emacs.js from CodeMirror\n", 439 | "function to(mode) {\n", 440 | " var mode = mode || 'vim'\n", 441 | " // first let's apply vim mode to all current cells\n", 442 | " function to_mode(c) { return c.code_mirror.setOption('keyMap', mode);};\n", 443 | " IPython.notebook.get_cells().map(to_mode);\n", 444 | " // apply the mode to future cells created\n", 445 | " 
IPython.Cell.options_default.cm_config.keyMap = mode;\n", 446 | "}\n", 447 | "\n", 448 | "function getCSS(path) {\n", 449 | " $('', {\n", 450 | " rel: 'stylesheet',\n", 451 | " type: 'text/css',\n", 452 | " href: path,\n", 453 | " }).appendTo('head');\n", 454 | "}\n", 455 | "\n", 456 | "// I messed around with trying to get requireJS going here, but gave up and\n", 457 | "// just using this answer from SO \n", 458 | "// http://stackoverflow.com/questions/11803215/how-to-include-multiple-js-files-using-jquery-getscript-method\n", 459 | "\n", 460 | "var p = \"/static/components/codemirror/addon/\";\n", 461 | "\n", 462 | "$.when(\n", 463 | "// Grab the CodeMirror vim keymap\n", 464 | "$.getScript(p + \"../keymap/vim.js\"),\n", 465 | "// also make search work via /\n", 466 | "$.getScript(p + \"search/search.js\"),\n", 467 | "$.getScript(p + \"search/searchcursor.js\"),\n", 468 | "\n", 469 | "// TODO: hook-up gq to perform a harwrap\n", 470 | "$.getScript(p + \"wrap/hardwrap.js\"),\n", 471 | "$.getScript(p + \"selection/active-line.js\"),\n", 472 | "\n", 473 | "$.getScript(p + \"display/fullscreen.js\"),\n", 474 | "getCSS(p + \"display/fullscreen.css\"),\n", 475 | "getCSS(p + \"dialog/dialog.css\"),\n", 476 | "$.getScript(p + \"dialog/dialog.js\"),\n", 477 | "\n", 478 | "\n", 479 | " $.Deferred(function( deferred ){\n", 480 | " $( deferred.resolve );\n", 481 | " })\n", 482 | ").then(function success(){\n", 483 | "\n", 484 | "console.log('Great success');\n", 485 | "\n", 486 | "IPython.CodeCell.options_default.cm_config.foldGutter = true;\n", 487 | "IPython.CodeCell.options_default.cm_config.gutters = [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"];\n", 488 | "\n", 489 | "IPython.Cell.prototype.at_top = function () {\n", 490 | " var cm = this.code_mirror;\n", 491 | " var cursor = cm.getCursor();\n", 492 | " if (cursor.line === 0) {\n", 493 | " return true;\n", 494 | " }\n", 495 | " return false;\n", 496 | " };\n", 497 | "\n", 498 | "\n", 499 | "IPython.Cell.prototype.at_bottom = function () {\n", 500 | " var cm = this.code_mirror;\n", 501 | " var cursor = cm.getCursor();\n", 502 | " if (cursor.line === (cm.lineCount()-1)) {\n", 503 | " return true;\n", 504 | " }\n", 505 | " return false;\n", 506 | "};\n", 507 | "// on all code mirror instances on this page, apply the function f\n", 508 | "function all_cm(f) {\n", 509 | " // apply f to every code mirror instance. 
f takes one parameter\n", 510 | " IPython.notebook.get_cells().map(function (c) { f(c.code_mirror); } );\n", 511 | "}\n", 512 | "\n", 513 | "\n", 514 | "to('vim');\n", 515 | "function vim_up(event) {\n", 516 | " var cell = IPython.notebook.get_selected_cell();\n", 517 | " if (cell && cell.at_top() && cell.code_mirror.options.keyMap === 'vim') {\n", 518 | " console.log('inside the business logic k');\n", 519 | " event.preventDefault();\n", 520 | " IPython.notebook.command_mode()\n", 521 | " IPython.notebook.select_prev();\n", 522 | " IPython.notebook.edit_mode();\n", 523 | " return false;\n", 524 | " };\n", 525 | "}\n", 526 | "\n", 527 | "function vim_down(event) {\n", 528 | " var cell = IPython.notebook.get_selected_cell();\n", 529 | " if (cell && cell.at_bottom() && cell.code_mirror.options.keyMap === 'vim') {\n", 530 | " event.preventDefault();\n", 531 | " IPython.notebook.command_mode()\n", 532 | " IPython.notebook.select_next();\n", 533 | " IPython.notebook.edit_mode();\n", 534 | " return false;\n", 535 | " };\n", 536 | " }\n", 537 | "\n", 538 | "var m = '(vim) '\n", 539 | "var edit_shortcuts = {\n", 540 | " 'k' : {\n", 541 | " help : m + 'up a line, even across cells',\n", 542 | " help_index : 'AA',\n", 543 | " handler : vim_up\n", 544 | " },\n", 545 | " 'j' : {\n", 546 | " help : m + 'down a line, even across cells',\n", 547 | " help_index : 'AA',\n", 548 | " handler : vim_down\n", 549 | " },\n", 550 | "\n", 551 | "};\n", 552 | "\n", 553 | "var command_shortcuts = {\n", 554 | " 'c' : {\n", 555 | " help : m + def_cmd['y'].help,\n", 556 | " help_index : 'AA',\n", 557 | " handler : def_cmd['y'].handler\n", 558 | " }\n", 559 | "\n", 560 | "\n", 561 | "};\n", 562 | "\n", 563 | "edit.add_shortcuts(edit_shortcuts);\n", 564 | "cmd.add_shortcuts(command_shortcuts);\n", 565 | "//edit.add_shortcuts('k', def_edit['up'].handler);\n", 566 | "//edit.add_shortcut('j', def_edit['down'].handler);\n", 567 | "\n", 568 | "// N.B. 
This code looks fairly simple, but it took me forever to \n", 569 | "// figure out how to do this, \n", 570 | "// \n", 571 | "// there's a problem here, Ctrl-[ is already handled by CodeMirror by the time we \n", 572 | "// (IPython.keyboard_manager) get it CodeMirror issues signals on mode change, \n", 573 | "// so we have to hook into that to get Ctrl-[\n", 574 | "edit.remove_shortcut('Ctrl-[');\n", 575 | "edit.remove_shortcut('Esc');\n", 576 | "\n", 577 | "CodeMirror.commands.leaveInsertOrEdit = function (cm) {\n", 578 | " if ( cm.state.vim.insertMode ) {\n", 579 | " // do magic here to get out of insert mode\n", 580 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 581 | " } else {\n", 582 | " IPython.notebook.command_mode();\n", 583 | " IPython.notebook.focus_cell();\n", 584 | " }\n", 585 | "};\n", 586 | " \n", 587 | "//C().options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 588 | "all_cm( function (cm) {\n", 589 | " cm.options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 590 | " cm.options.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 591 | " if ( CodeMirror.defaults.extraKeys === null ) { \n", 592 | " CodeMirror.defaults.extraKeys = {};\n", 593 | " }\n", 594 | " // TODO: make this change permanent\n", 595 | " // this part seems to be ignore when adding a new cell\n", 596 | " // - alternative solution would be to listen for NewCell events and rerun the CM function on it\n", 597 | " // - it could also be the case that when we instatiate CodeMirror, we somehow leave out CM.defaults.extraKeys\n", 598 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 599 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 600 | " IPython.CodeCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 601 | " IPython.TextCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 602 | "})\n", 603 | "\n", 604 | "// On blur, make sure we go back to command mode for CodeMirror (in case user clicked away)\n", 605 | "// TODO: Make this permanent - how to get CodeMirror to do this for new cells created after\n", 606 | "all_cm( function (cm) {\n", 607 | " cm.on('blur', function(cm) {\n", 608 | " // TODO: I wish I understood a better way to do this, but fake pressing Escape work\n", 609 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 610 | " CodeMirror.keyMap['vim']['Esc'](cm);\n", 611 | " cm.setOption('styleActiveLine', false);\n", 612 | " if (cm.getOption(\"fullScreen\")) {\n", 613 | " cm.setOption('fullScreen', false); \n", 614 | " // fullScreen the newly selected code mirror (doesn't work)\n", 615 | " //setTimeout(100, function() {\n", 616 | " // console.log(IPython.notebook.get_selected_cell().code_mirror);\n", 617 | " // IPython.notebook.get_selected_cell().code_mirror.setOption('fullScreen', true); \n", 618 | " //});\n", 619 | " }\n", 620 | " });\n", 621 | " cm.on('focus', function(cm) {\n", 622 | " cm.setOption('styleActiveLine', true);\n", 623 | " });\n", 624 | "});\n", 625 | "\n", 626 | "// 'i' by default interrupts the kernel (what Ctrl-C does at the terminal)\n", 627 | "cmd.remove_shortcut('i');\n", 628 | "cmd.add_shortcut('i', def_cmd.enter);\n", 629 | "\n", 630 | "// not quite what we want - 'i' requires a double-tap\n", 631 | "// add documentation for this.\n", 632 | "cmd.add_shortcut('ctrl-c', function(e) { IPython.notebook.kernel.interrupt(); return false});\n", 633 | "\n", 634 | "\n", 635 | "function focus_last(e) {\n", 636 | " var cells = IPython.notebook.get_cells();\n", 
637 | " cells[cells.length-1].focus_cell();\n", 638 | "};\n", 639 | "\n", 640 | "function focus_first(e) {\n", 641 | " var cells = IPython.notebook.get_cells();\n", 642 | " cells[0].focus_cell();\n", 643 | "};\n", 644 | "\n", 645 | "function combo_tap(combo, action) {\n", 646 | " var that = this;\n", 647 | " var timeout;\n", 648 | " function f() {\n", 649 | " console.log('f called once');\n", 650 | " \n", 651 | " // redo this so that when an action is performed, we restore the original combo\n", 652 | " cmd.add_shortcut(combo[1], \n", 653 | " function() { console.log(\"doing action\", combo); reset(); action(); timeout.clear();} );\n", 654 | " timeout = setTimeout(function () {\n", 655 | " console.log('resetting f');\n", 656 | " reset();\n", 657 | " //cmd.add_shortcut(combo[0], reset)\n", 658 | " }, 800);\n", 659 | " }\n", 660 | " function reset(e) {\n", 661 | " //cmd.remove_shortcut(combo[0]);\n", 662 | " console.log('reset called');\n", 663 | " //if (timeout) {\n", 664 | " // console.log('resetting aborted');\n", 665 | " // clearTimeout(timeout);\n", 666 | " // timeout = null;\n", 667 | " //}\n", 668 | " //that(combo, action); \n", 669 | " cmd.add_shortcut(combo[0], f);\n", 670 | " }\n", 671 | " console.log(\"combo tap for\", combo);\n", 672 | " \n", 673 | " reset();\n", 674 | "};\n", 675 | "cmd.add_shortcut('shift-g', focus_last);\n", 676 | "combo_tap('gg', focus_first);\n", 677 | "\n", 678 | "// XXX: the way combo tap is currently implemented, this won't work\n", 679 | "// need a more generic mechanism for combo-taps with common prefixes\n", 680 | "// combo_tap('gq', f();\n", 681 | "//cmd.remove_shortcut('d');\n", 682 | "// cut\n", 683 | "combo_tap('dd', def_cmd['x'].handler);\n", 684 | "\n", 685 | "// copy\n", 686 | "combo_tap('yy', def_cmd['c'].handler);\n", 687 | "\n", 688 | "// paste\n", 689 | "cmd.add_shortcut('p', def_cmd['v']);\n", 690 | "\n", 691 | "// undo\n", 692 | "cmd.add_shortcut('u', def_cmd['z']);\n", 693 | "\n", 694 | "// Join (merge down with cell below)\n", 695 | "cmd.add_shortcut('shift-j', def_cmd['shift-m'])\n", 696 | "\n", 697 | "//edit.add_shortcut('k', def_edit['up'].handler);\n", 698 | "//[edit.add_shortcut('j', def_edit['down'].handler);\n", 699 | "\n", 700 | "CodeMirror.prototype.save = function() { \n", 701 | " IPython.notebook.save_checkpoint()\n", 702 | "}\n", 703 | "\n", 704 | "function focus_last(e) {\n", 705 | " var cells = IPython.notebook.get_cells();\n", 706 | " cells[cells.length-1].focus_cell();\n", 707 | "};\n", 708 | "\n", 709 | "function focus_first(e) {\n", 710 | " console.log('focus first called');\n", 711 | " var cells = IPython.notebook.get_cells();\n", 712 | " cells[0].focus_cell();\n", 713 | "};\n", 714 | "\n", 715 | "\n", 716 | "cmd.add_shortcut('shift-g', focus_last);\n", 717 | "combo_tap('gg', focus_first);\n", 718 | "\n", 719 | "// get rid of the default Ctrl-W binding\n", 720 | "// this only works for Firefox\n", 721 | "$(document).ready(function() {\n", 722 | "\t$(this).bind('keypress', function(e) {\n", 723 | "\t\tvar key = (e.keyCode ? 
e.keyCode : e.charCode);\n", 724 | "\t\tif (key == '119' && e.ctrlKey) {\n", 725 | "\t\t\treturn false;\n", 726 | "\t\t}\n", 727 | "\t});\n", 728 | "});\n", 729 | "\n", 730 | "window.addEventListener(\"beforeunload\", function( event ) {\n", 731 | " var press = jQuery.Event(\"keypress\");\n", 732 | " press.ctrlKey = false;\n", 733 | " press.which = 27; // escape\n", 734 | " $(document).trigger(press);\n", 735 | " event.returnValue = \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 736 | " event.returnValue +=\"\\nX Chrome sucks at captruring Ctrl-W, sorry X\";\n", 737 | " event.returnValue += \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 738 | "});\n", 739 | "\n", 740 | "// update the keyboard shortcuts\n", 741 | "IPython.quick_help = new IPython.QuickHelp();\n", 742 | "\n", 743 | "//IPython.CodeCell.options_default.cm_config.styleActiveLine = true;\n", 744 | "\n", 745 | "all_cm( function (cm) {\n", 746 | " cm.setOption('foldGutter', true);\n", 747 | " cm.setOption('gutters', [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"]);\n", 748 | " cm.options.extraKeys[\"Ctrl-F\"] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 749 | " var wrapOptions = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/ };\n", 750 | " // XXX: add a hardwrap-range to this as well\n", 751 | " cm.options.extraKeys[\"F2\"] = function(cm) { cm.wrapParagraph(cm.getCursor(), wrapOptions); };\n", 752 | " //cm.options.extraKeys[\"[\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 753 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 754 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 755 | "\n", 756 | " // todo - do this for new cells as well\n", 757 | " // support this a :only? 
turn off full screen on blur\n", 758 | " cm.options.extraKeys[\"F11\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 759 | " cm.options.extraKeys[\"Ctrl-A\"] = function(cm) {\n", 760 | " if (cm.getOption(\"fullScreen\")) cm.setOption(\"fullScreen\", false);\n", 761 | " };\n", 762 | " //all_cm( function (cm) {\n", 763 | "});\n", 764 | "\n", 765 | "//setTimeout(function() {IPython.notebook.get_selected_cell().set_input_prompt('vim');}, 200)\n", 766 | "\n", 767 | "$(\"#ipython_notebook\").find('img').remove('#vim');\n", 768 | "$(\"#ipython_notebook\").append('')\n", 773 | "$(\"#vim\").click( function () {$(this).hide()});\n", 774 | "\n", 775 | "\n", 776 | "// XXX: Autowrapping is kind of broken - you can write a line that will have\n", 777 | "// its last word (if it's 1 or 2 characters just go back and forth between the\n", 778 | "// current and the next lines)\n", 779 | "//all_cm(function (cm) {\n", 780 | "// var wait, options = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/};\n", 781 | "// cm.on(\"change\", function(cm, change) {\n", 782 | "// clearTimeout(wait);\n", 783 | "// wait = setTimeout(function() {\n", 784 | "// console.log(cm.wrapParagraphsInRange(change.from, CodeMirror.changeEnd(change), options));\n", 785 | "// }, 300);\n", 786 | "// });\n", 787 | "//});\n", 788 | "\n", 789 | "}, function failure() { \n", 790 | " alert('le sucks, something went wrong');\n", 791 | "\n", 792 | "});\n", 793 | "\n", 794 | "\n", 795 | "// at_top and at_bottom methods for ipython-vimception\n", 796 | " /**\n", 797 | " * @method at_top\n", 798 | " * @return {Boolean}\n", 799 | " */\n", 800 | " Cell.prototype.at_top = function () {\n", 801 | " var cm = this.code_mirror;\n", 802 | " var cursor = cm.getCursor();\n", 803 | " if (cursor.line === 0 && cm.findPosV(cursor, -1, 'line').hitSide) {\n", 804 | " return true;\n", 805 | " } else {\n", 806 | " return false;\n", 807 | " }\n", 808 | " };\n", 809 | "\n", 810 | " /**\n", 811 | " * @method at_bottom\n", 812 | " * @return {Boolean}\n", 813 | " * */\n", 814 | " Cell.prototype.at_bottom = function () {\n", 815 | " var cm = this.code_mirror;\n", 816 | " var cursor = cm.getCursor();\n", 817 | " if (cursor.line === (cm.lineCount()-1) && cm.findPosV(cursor, 1, 'line').hitSide) {\n", 818 | " return true;\n", 819 | " } else {\n", 820 | " return false;\n", 821 | " }\n", 822 | " };\n" 823 | ], 824 | "metadata": {}, 825 | "output_type": "display_data", 826 | "text": [ 827 | "" 828 | ] 829 | } 830 | ], 831 | "prompt_number": 1 832 | }, 833 | { 834 | "cell_type": "code", 835 | "collapsed": false, 836 | "input": [ 837 | "%load_ext autoreload\n", 838 | "%autoreload 2" 839 | ], 840 | "language": "python", 841 | "metadata": {}, 842 | "outputs": [], 843 | "prompt_number": 2 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "# Long input strings\n", 850 | "\n", 851 | "For certain tasks it might make more sense to tokenize input strings first and then extract features on these string lists rather than on the original character lists.\n", 852 | "\n", 853 | "To demonstrate this I'll take some example strings from [highered](https://github.com/datamade/highered/) and learn models using these two feature extraction techniques." 
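,
    "\n",
    "\n",
    "The token-level model should also be quicker: the lattice that a pair is aligned over grows with the product of the two input lengths, so a pair of short token lists is much cheaper to align than a pair of long character strings -- compare the prediction timings below."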
854 | ] 855 | }, 856 | { 857 | "cell_type": "markdown", 858 | "metadata": {}, 859 | "source": [ 860 | "## Training examples" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "collapsed": false, 866 | "input": [ 867 | "X = [(u'caring hands a step ahead', u'el valor little tykes ii'),\n", 868 | " (u'dulles', u\"chicago public schools o'keeffe, isabell c.\"),\n", 869 | " (u'erie neighborhood house fcch-carmen l. vega site',\n", 870 | " u'erie neighborhood house fcch-servia galva site'),\n", 871 | " (u'chicago public schools dvorak math & science tech academy, anton',\n", 872 | " u'chicago public schools perez, manuel'),\n", 873 | " (u'v & j day care center', u\"henry booth house granny's day care center\"),\n", 874 | " (u'home of life community dev. corp. - home of life just for you',\n", 875 | " u'urban family and community centers'),\n", 876 | " (u'carole robertson center for learning fcch-ileana gonzalez',\n", 877 | " u'carole robertson center for learning fcch-rhonda culverson'),\n", 878 | " (u'bethel new life bethel child development',\n", 879 | " u'mary crane league mary crane center (lake & pulaski)'),\n", 880 | " (u'easter seals society of metropolitan chicago - stepping stones early/childhood lear',\n", 881 | " u\"marcy newberry association kenyatta's day care\"),\n", 882 | " (u'westside holistic family services westside holistic family services',\n", 883 | " u'childserv lawndale'),\n", 884 | " \n", 885 | " (u'higgins', u'higgins'),\n", 886 | " (u'ymca south side', u'ymca of metropolitan chicago - south side ymca'),\n", 887 | " (u'chicago commons association paulo freire',\n", 888 | " u'chicago commons association paulo freire'),\n", 889 | " (u'fresh start daycare, inc.',\n", 890 | " u'easter seals society of metropolitan chicago fresh start day care center'),\n", 891 | " (u'el valor teddy bear 3', u'teddy bear 3'),\n", 892 | " (u'chicago child care society chicago child care society',\n", 893 | " u'chicago child care society-child and family dev center'),\n", 894 | " (u'hull house - uptown', u'uptown family care center')]\n", 895 | "Y = [u'distinct',\n", 896 | " u'distinct',\n", 897 | " u'distinct',\n", 898 | " u'distinct',\n", 899 | " u'distinct',\n", 900 | " u'distinct',\n", 901 | " u'distinct',\n", 902 | " u'distinct',\n", 903 | " u'distinct',\n", 904 | " u'distinct',\n", 905 | " u'match',\n", 906 | " u'match',\n", 907 | " u'match',\n", 908 | " u'match',\n", 909 | " u'match',\n", 910 | " u'match',\n", 911 | " u'match']" 912 | ], 913 | "language": "python", 914 | "metadata": {}, 915 | "outputs": [], 916 | "prompt_number": 5 917 | }, 918 | { 919 | "cell_type": "code", 920 | "collapsed": false, 921 | "input": [ 922 | "from pyhacrf import StringPairFeatureExtractor, Hacrf\n", 923 | "from scipy.optimize import fmin_l_bfgs_b\n", 924 | "import numpy as np" 925 | ], 926 | "language": "python", 927 | "metadata": {}, 928 | "outputs": [], 929 | "prompt_number": 6 930 | }, 931 | { 932 | "cell_type": "markdown", 933 | "metadata": {}, 934 | "source": [ 935 | "## Character level features" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "collapsed": false, 941 | "input": [ 942 | "# Extract features\n", 943 | "feature_extractor = StringPairFeatureExtractor(match=True, numeric=True)\n", 944 | "X_extracted = feature_extractor.fit_transform(X)" 945 | ], 946 | "language": "python", 947 | "metadata": {}, 948 | "outputs": [], 949 | "prompt_number": 7 950 | }, 951 | { 952 | "cell_type": "code", 953 | "collapsed": false, 954 | "input": [ 955 | "%%timeit -n1 -r1\n", 956 | "# Train 
model\n", 957 | "model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 10})\n", 958 | "model.fit(X_extracted, Y, verbosity=1)" 959 | ], 960 | "language": "python", 961 | "metadata": {}, 962 | "outputs": [ 963 | { 964 | "output_type": "stream", 965 | "stream": "stdout", 966 | "text": [ 967 | "Iteration Log-likelihood |gradient|\n", 968 | " 0 -11.78 650.6\n", 969 | " 1 -609.0 1.571e+03" 970 | ] 971 | }, 972 | { 973 | "output_type": "stream", 974 | "stream": "stdout", 975 | "text": [ 976 | "\n", 977 | " 2 -54.72 1.567e+03" 978 | ] 979 | }, 980 | { 981 | "output_type": "stream", 982 | "stream": "stdout", 983 | "text": [ 984 | "\n", 985 | " 3 -11.31 560.6" 986 | ] 987 | }, 988 | { 989 | "output_type": "stream", 990 | "stream": "stdout", 991 | "text": [ 992 | "\n", 993 | " 4 -10.83 142.5" 994 | ] 995 | }, 996 | { 997 | "output_type": "stream", 998 | "stream": "stdout", 999 | "text": [ 1000 | "\n", 1001 | " 5 -10.78 118.5" 1002 | ] 1003 | }, 1004 | { 1005 | "output_type": "stream", 1006 | "stream": "stdout", 1007 | "text": [ 1008 | "\n", 1009 | " 6 -10.7 143.8" 1010 | ] 1011 | }, 1012 | { 1013 | "output_type": "stream", 1014 | "stream": "stdout", 1015 | "text": [ 1016 | "\n", 1017 | " 7 -10.43 249.6" 1018 | ] 1019 | }, 1020 | { 1021 | "output_type": "stream", 1022 | "stream": "stdout", 1023 | "text": [ 1024 | "\n", 1025 | " 8 -10.13 328.6" 1026 | ] 1027 | }, 1028 | { 1029 | "output_type": "stream", 1030 | "stream": "stdout", 1031 | "text": [ 1032 | "\n", 1033 | " 9 -9.796 250.5" 1034 | ] 1035 | }, 1036 | { 1037 | "output_type": "stream", 1038 | "stream": "stdout", 1039 | "text": [ 1040 | "\n", 1041 | " 10 -9.573 102.2" 1042 | ] 1043 | }, 1044 | { 1045 | "output_type": "stream", 1046 | "stream": "stdout", 1047 | "text": [ 1048 | "\n", 1049 | "1 loops, best of 1: 8.73 s per loop\n" 1050 | ] 1051 | } 1052 | ], 1053 | "prompt_number": 9 1054 | }, 1055 | { 1056 | "cell_type": "code", 1057 | "collapsed": false, 1058 | "input": [ 1059 | "%%timeit -n1 -r1\n", 1060 | "# Evaluate\n", 1061 | "from sklearn.metrics import confusion_matrix\n", 1062 | "predictions = model.predict(X_extracted)\n", 1063 | "print(confusion_matrix(Y, predictions))\n", 1064 | "print(model.predict_proba(X_extracted))" 1065 | ], 1066 | "language": "python", 1067 | "metadata": {}, 1068 | "outputs": [ 1069 | { 1070 | "output_type": "stream", 1071 | "stream": "stdout", 1072 | "text": [ 1073 | "[[8 2]\n", 1074 | " [4 3]]\n", 1075 | "[[ 0.64197473 0.35802527]\n", 1076 | " [ 0.351784 0.648216 ]\n", 1077 | " [ 0.6553065 0.3446935 ]\n", 1078 | " [ 0.87671132 0.12328868]\n", 1079 | " [ 0.47772325 0.52227675]\n", 1080 | " [ 0.878586 0.121414 ]\n", 1081 | " [ 0.70987436 0.29012564]\n", 1082 | " [ 0.64765774 0.35234226]\n", 1083 | " [ 0.93360185 0.06639815]\n", 1084 | " [ 0.92714317 0.07285683]\n", 1085 | " [ 0.48782793 0.51217207]\n", 1086 | " [ 0.40930797 0.59069203]\n", 1087 | " [ 0.59444836 0.40555164]\n", 1088 | " [ 0.39622435 0.60377565]\n", 1089 | " [ 0.63782341 0.36217659]\n", 1090 | " [ 0.69982284 0.30017716]\n", 1091 | " [ 0.5777424 0.4222576 ]]" 1092 | ] 1093 | }, 1094 | { 1095 | "output_type": "stream", 1096 | "stream": "stdout", 1097 | "text": [ 1098 | "\n", 1099 | "1 loops, best of 1: 1.67 s per loop\n" 1100 | ] 1101 | } 1102 | ], 1103 | "prompt_number": 10 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "metadata": {}, 1108 | "source": [ 1109 | "## Token level features" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "collapsed": false, 1115 | "input": [ 1116 | "from 
pyhacrf import PairFeatureExtractor" 1117 | ], 1118 | "language": "python", 1119 | "metadata": {}, 1120 | "outputs": [], 1121 | "prompt_number": 14 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "collapsed": false, 1126 | "input": [ 1127 | "tokX = [[sentence.split(' ') for sentence in pair] for pair in X]" 1128 | ], 1129 | "language": "python", 1130 | "metadata": {}, 1131 | "outputs": [], 1132 | "prompt_number": 15 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "collapsed": false, 1137 | "input": [ 1138 | "real = [\n", 1139 | " lambda i, j, s1, s2: 1.0,\n", 1140 | " lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,\n", 1141 | " lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] and len(s1[i]) >= 6 else 0.0,\n", 1142 | " lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,\n", 1143 | " lambda i, j, s1, s2: 1.0 if s1[i].isalpha() and s2[j].isalpha() and s1[i] == s2[j] else 0.0,\n", 1144 | " lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0\n", 1145 | "]\n", 1146 | "# Other ideas are:\n", 1147 | "# to look up whether words are dictionary words,\n", 1148 | "# longest common subsequence,\n", 1149 | "# standard edit distance\n", 1150 | "feature_extractor = PairFeatureExtractor(real=real)\n", 1151 | "X_extracted = feature_extractor.fit_transform(tokX)" 1152 | ], 1153 | "language": "python", 1154 | "metadata": {}, 1155 | "outputs": [], 1156 | "prompt_number": 16 1157 | }, 1158 | { 1159 | "cell_type": "code", 1160 | "collapsed": false, 1161 | "input": [ 1162 | "#%%timeit -n1 -r1\n", 1163 | "# Train model\n", 1164 | "model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})\n", 1165 | "model.fit(X_extracted, Y, verbosity=10)" 1166 | ], 1167 | "language": "python", 1168 | "metadata": {}, 1169 | "outputs": [ 1170 | { 1171 | "output_type": "stream", 1172 | "stream": "stdout", 1173 | "text": [ 1174 | "Iteration Log-likelihood |gradient|\n", 1175 | " 0 -11.78 113.8\n", 1176 | " 10 -8.721 16.12" 1177 | ] 1178 | }, 1179 | { 1180 | "output_type": "stream", 1181 | "stream": "stdout", 1182 | "text": [ 1183 | "\n", 1184 | " 20 -8.366 1.147" 1185 | ] 1186 | }, 1187 | { 1188 | "output_type": "stream", 1189 | "stream": "stdout", 1190 | "text": [ 1191 | "\n", 1192 | " 30 -8.362 0.06527" 1193 | ] 1194 | }, 1195 | { 1196 | "output_type": "stream", 1197 | "stream": "stdout", 1198 | "text": [ 1199 | "\n", 1200 | " 40 -8.362 0.005777" 1201 | ] 1202 | }, 1203 | { 1204 | "output_type": "stream", 1205 | "stream": "stdout", 1206 | "text": [ 1207 | "\n" 1208 | ] 1209 | }, 1210 | { 1211 | "metadata": {}, 1212 | "output_type": "pyout", 1213 | "prompt_number": 17, 1214 | "text": [ 1215 | "" 1216 | ] 1217 | } 1218 | ], 1219 | "prompt_number": 17 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "collapsed": false, 1224 | "input": [ 1225 | "%%timeit -n1 -r1\n", 1226 | "# Evaluate\n", 1227 | "from sklearn.metrics import confusion_matrix\n", 1228 | "predictions = model.predict(X_extracted)\n", 1229 | "print(confusion_matrix(Y, predictions))\n", 1230 | "print(model.predict_proba(X_extracted))" 1231 | ], 1232 | "language": "python", 1233 | "metadata": {}, 1234 | "outputs": [ 1235 | { 1236 | "output_type": "stream", 1237 | "stream": "stdout", 1238 | "text": [ 1239 | "[[9 1]\n", 1240 | " [2 5]]\n", 1241 | "[[ 0.72215688 0.27784312]\n", 1242 | " [ 0.41200325 0.58799675]\n", 1243 | " [ 0.56910178 0.43089822]\n", 1244 | " [ 0.92672238 0.07327762]\n", 1245 | " [ 0.56921501 0.43078499]\n", 1246 | " [ 0.98737206 
0.01262794]\n", 1247 | "  [ 0.56762697  0.43237303]\n", 1248 | "  [ 0.70141322  0.29858678]\n", 1249 | "  [ 0.97308327  0.02691673]\n", 1250 | "  [ 0.94721007  0.05278993]\n", 1251 | "  [ 0.32690805  0.67309195]\n", 1252 | "  [ 0.20741219  0.79258781]\n", 1253 | "  [ 0.30060707  0.69939293]\n", 1254 | "  [ 0.47280063  0.52719937]\n", 1255 | "  [ 0.4531238   0.5468762 ]\n", 1256 | "  [ 0.59051241  0.40948759]\n", 1257 | "  [ 0.66717449  0.33282551]]\n", 1258 | "1 loops, best of 1: 30.8 ms per loop\n" 1259 | ] 1260 | } 1261 | ], 1262 | "prompt_number": 18 1263 | }, 1264 | { 1265 | "cell_type": "markdown", 1266 | "metadata": {}, 1267 | "source": [ 1268 | "## Edit distance and word frequency features\n", 1269 | "\n", 1270 | "Let's also add the Levenshtein distance as a feature.\n", 1271 | "\n", 1272 | "When we peek at the training examples, it looks as if less common words should be more informative of a match - let's add a feature for the word frequency as well (a sketch of one is given after the conclusion below)." 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "collapsed": false, 1278 | "input": [ 1279 | "import editdistance" 1280 | ], 1281 | "language": "python", 1282 | "metadata": {}, 1283 | "outputs": [], 1284 | "prompt_number": 19 1285 | }, 1286 | { 1287 | "cell_type": "code", 1288 | "collapsed": false, 1289 | "input": [ 1290 | "editdistance.eval('cheese', 'kaas')" 1291 | ], 1292 | "language": "python", 1293 | "metadata": {}, 1294 | "outputs": [ 1295 | { 1296 | "metadata": {}, 1297 | "output_type": "pyout", 1298 | "prompt_number": 20, 1299 | "text": [ 1300 | "5L" 1301 | ] 1302 | } 1303 | ], 1304 | "prompt_number": 20 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "collapsed": false, 1309 | "input": [ 1310 | "tokX = [[sentence.split(' ') for sentence in pair] for pair in X]" 1311 | ], 1312 | "language": "python", 1313 | "metadata": {}, 1314 | "outputs": [] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "collapsed": false, 1319 | "input": [ 1320 | "real = [\n", 1321 | "    lambda i, j, s1, s2: 1.0,\n", 1322 | "    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,\n", 1323 | "    lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,\n", 1324 | "    lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0,\n", 1325 | "    lambda i, j, s1, s2: editdistance.eval(s1[i], s2[j]),\n", 1326 | "    lambda i, j, s1, s2: np.log(editdistance.eval(s1[i], s2[j]) + 1),\n", 1327 | "    lambda i, j, s1, s2: float(editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j])),\n", 1328 | "    lambda i, j, s1, s2: 1.0 - float(editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j]))\n", 1329 | "]\n", 1330 | "# Other ideas are:\n", 1331 | "# to look up whether words are dictionary words,\n", 1332 | "# longest common subsequence,\n", 1333 | "# word frequency" 1334 | ], 1335 | "language": "python", 1336 | "metadata": {}, 1337 | "outputs": [], 1338 | "prompt_number": 48 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "collapsed": false, 1343 | "input": [ 1344 | "from sklearn.metrics import confusion_matrix, accuracy_score\n", 1345 | "from sklearn.cross_validation import train_test_split" 1346 | ], 1347 | "language": "python", 1348 | "metadata": {}, 1349 | "outputs": [], 1350 | "prompt_number": 46 1351 | }, 1352 | { 1353 | "cell_type": "code", 1354 | "collapsed": false, 1355 | "input": [ 1356 | "# Train model\n", 1357 | "errors_val = []\n", 1358 | "errors_train = []\n", 1359 | "for i, featureset in enumerate([[0, 1],\n", 1360 | "                       [0, 1, 2],\n", 1361 | "                       [0, 1, 2, 3],\n", 1362 | "                       [0, 4], \n", 1363 | "                       
[0, 1, 4], \n", 1364 | " [0, 1, 2, 3, 4],\n", 1365 | " [0, 5],\n", 1366 | " [0, 1, 5],\n", 1367 | " [0, 1, 2, 3, 5],\n", 1368 | " [0, 6],\n", 1369 | " [0, 1, 6],\n", 1370 | " [0, 1, 2, 3, 6],\n", 1371 | " [0, 7],\n", 1372 | " [0, 1, 7],\n", 1373 | " [0, 1, 2, 3, 7]]):\n", 1374 | " print '{:4}{:18}'.format(i, featureset),\n", 1375 | " errs_val = []\n", 1376 | " errs_train = []\n", 1377 | " for repeat in xrange(15):\n", 1378 | " x_train, x_val, y_train, y_val = train_test_split(tokX, Y, test_size=0.2)\n", 1379 | " feature_extractor = PairFeatureExtractor(real=[real[f] for f in featureset])\n", 1380 | " X_extracted = feature_extractor.fit_transform(x_train)\n", 1381 | "\n", 1382 | " model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})\n", 1383 | " model.fit(X_extracted, y_train)\n", 1384 | " \n", 1385 | " predictions = model.predict(X_extracted)\n", 1386 | " err_train = 1.0 - accuracy_score(y_train, predictions)\n", 1387 | " \n", 1388 | " X_extracted = feature_extractor.transform(x_val)\n", 1389 | " predictions = model.predict(X_extracted)\n", 1390 | " err_val = 1.0 - accuracy_score(y_val, predictions)\n", 1391 | " if repeat % 10 == 0:\n", 1392 | " print '{:.2f}'.format(err_train),\n", 1393 | " print '{:.2f}'.format(err_val),\n", 1394 | " errs_val.append(err_val)\n", 1395 | " errs_train.append(err_train)\n", 1396 | " print ' => {:.2f} +- {:.2f} | {:.2f} +- {:.2f}'.format(np.average(errs_train), \n", 1397 | " np.std(errs_train),\n", 1398 | " np.average(errs_val), \n", 1399 | " np.std(errs_val))\n", 1400 | " errors_train.append(errs_train)\n", 1401 | " errors_val.append(errs_val)" 1402 | ], 1403 | "language": "python", 1404 | "metadata": {}, 1405 | "outputs": [ 1406 | { 1407 | "output_type": "stream", 1408 | "stream": "stdout", 1409 | "text": [ 1410 | " 0[0, 1] " 1411 | ] 1412 | }, 1413 | { 1414 | "output_type": "stream", 1415 | "stream": "stdout", 1416 | "text": [ 1417 | "0.46 0.25 " 1418 | ] 1419 | }, 1420 | { 1421 | "output_type": "stream", 1422 | "stream": "stdout", 1423 | "text": [ 1424 | "0.31 0.00 " 1425 | ] 1426 | }, 1427 | { 1428 | "output_type": "stream", 1429 | "stream": "stdout", 1430 | "text": [ 1431 | " => 0.28 +- 0.11 | 0.43 +- 0.21\n", 1432 | " 1[0, 1, 2] " 1433 | ] 1434 | }, 1435 | { 1436 | "output_type": "stream", 1437 | "stream": "stdout", 1438 | "text": [ 1439 | "0.23 0.25 " 1440 | ] 1441 | }, 1442 | { 1443 | "output_type": "stream", 1444 | "stream": "stdout", 1445 | "text": [ 1446 | "0.31 0.75 " 1447 | ] 1448 | }, 1449 | { 1450 | "output_type": "stream", 1451 | "stream": "stdout", 1452 | "text": [ 1453 | " => 0.24 +- 0.09 | 0.50 +- 0.24\n", 1454 | " 2[0, 1, 2, 3] " 1455 | ] 1456 | }, 1457 | { 1458 | "output_type": "stream", 1459 | "stream": "stdout", 1460 | "text": [ 1461 | "0.23 0.50 " 1462 | ] 1463 | }, 1464 | { 1465 | "output_type": "stream", 1466 | "stream": "stdout", 1467 | "text": [ 1468 | "0.15 0.75 " 1469 | ] 1470 | }, 1471 | { 1472 | "output_type": "stream", 1473 | "stream": "stdout", 1474 | "text": [ 1475 | " => 0.21 +- 0.05 | 0.57 +- 0.19\n", 1476 | " 3[0, 4] " 1477 | ] 1478 | }, 1479 | { 1480 | "output_type": "stream", 1481 | "stream": "stdout", 1482 | "text": [ 1483 | "0.08 0.25 " 1484 | ] 1485 | }, 1486 | { 1487 | "output_type": "stream", 1488 | "stream": "stdout", 1489 | "text": [ 1490 | "0.08 0.75 " 1491 | ] 1492 | }, 1493 | { 1494 | "output_type": "stream", 1495 | "stream": "stdout", 1496 | "text": [ 1497 | " => 0.12 +- 0.04 | 0.40 +- 0.22\n", 1498 | " 4[0, 1, 4] " 1499 | ] 1500 | }, 1501 | { 1502 | "output_type": 
"stream", 1503 | "stream": "stdout", 1504 | "text": [ 1505 | "0.08 0.25 " 1506 | ] 1507 | }, 1508 | { 1509 | "output_type": "stream", 1510 | "stream": "stdout", 1511 | "text": [ 1512 | "0.23 0.25 " 1513 | ] 1514 | }, 1515 | { 1516 | "output_type": "stream", 1517 | "stream": "stdout", 1518 | "text": [ 1519 | " => 0.13 +- 0.07 | 0.42 +- 0.20\n", 1520 | " 5[0, 1, 2, 3, 4] " 1521 | ] 1522 | }, 1523 | { 1524 | "output_type": "stream", 1525 | "stream": "stdout", 1526 | "text": [ 1527 | "0.15 0.25 " 1528 | ] 1529 | }, 1530 | { 1531 | "output_type": "stream", 1532 | "stream": "stdout", 1533 | "text": [ 1534 | "0.08 0.50 " 1535 | ] 1536 | }, 1537 | { 1538 | "output_type": "stream", 1539 | "stream": "stdout", 1540 | "text": [ 1541 | " => 0.09 +- 0.07 | 0.43 +- 0.17\n", 1542 | " 6[0, 5] " 1543 | ] 1544 | }, 1545 | { 1546 | "output_type": "stream", 1547 | "stream": "stdout", 1548 | "text": [ 1549 | "0.15 0.50 " 1550 | ] 1551 | }, 1552 | { 1553 | "output_type": "stream", 1554 | "stream": "stdout", 1555 | "text": [ 1556 | "0.23 0.00 " 1557 | ] 1558 | }, 1559 | { 1560 | "output_type": "stream", 1561 | "stream": "stdout", 1562 | "text": [ 1563 | " => 0.17 +- 0.07 | 0.40 +- 0.18\n", 1564 | " 7[0, 1, 5] " 1565 | ] 1566 | }, 1567 | { 1568 | "output_type": "stream", 1569 | "stream": "stdout", 1570 | "text": [ 1571 | "0.23 0.25 " 1572 | ] 1573 | }, 1574 | { 1575 | "output_type": "stream", 1576 | "stream": "stdout", 1577 | "text": [ 1578 | "0.15 0.50 " 1579 | ] 1580 | }, 1581 | { 1582 | "output_type": "stream", 1583 | "stream": "stdout", 1584 | "text": [ 1585 | " => 0.17 +- 0.09 | 0.40 +- 0.29\n", 1586 | " 8[0, 1, 2, 3, 5] " 1587 | ] 1588 | }, 1589 | { 1590 | "output_type": "stream", 1591 | "stream": "stdout", 1592 | "text": [ 1593 | "0.23 0.25 " 1594 | ] 1595 | }, 1596 | { 1597 | "output_type": "stream", 1598 | "stream": "stdout", 1599 | "text": [ 1600 | "0.15 0.50 " 1601 | ] 1602 | }, 1603 | { 1604 | "output_type": "stream", 1605 | "stream": "stdout", 1606 | "text": [ 1607 | " => 0.16 +- 0.05 | 0.52 +- 0.17\n", 1608 | " 9[0, 6] " 1609 | ] 1610 | }, 1611 | { 1612 | "output_type": "stream", 1613 | "stream": "stdout", 1614 | "text": [ 1615 | "0.31 0.50 " 1616 | ] 1617 | }, 1618 | { 1619 | "output_type": "stream", 1620 | "stream": "stdout", 1621 | "text": [ 1622 | "0.31 0.75 " 1623 | ] 1624 | }, 1625 | { 1626 | "output_type": "stream", 1627 | "stream": "stdout", 1628 | "text": [ 1629 | " => 0.24 +- 0.05 | 0.42 +- 0.24\n", 1630 | " 10[0, 1, 6] " 1631 | ] 1632 | }, 1633 | { 1634 | "output_type": "stream", 1635 | "stream": "stdout", 1636 | "text": [ 1637 | "0.15 0.75 " 1638 | ] 1639 | }, 1640 | { 1641 | "output_type": "stream", 1642 | "stream": "stdout", 1643 | "text": [ 1644 | "0.23 0.75 " 1645 | ] 1646 | }, 1647 | { 1648 | "output_type": "stream", 1649 | "stream": "stdout", 1650 | "text": [ 1651 | " => 0.22 +- 0.09 | 0.52 +- 0.27\n", 1652 | " 11[0, 1, 2, 3, 6] " 1653 | ] 1654 | }, 1655 | { 1656 | "output_type": "stream", 1657 | "stream": "stdout", 1658 | "text": [ 1659 | "0.08 0.50 " 1660 | ] 1661 | }, 1662 | { 1663 | "output_type": "stream", 1664 | "stream": "stdout", 1665 | "text": [ 1666 | "0.00 0.50 " 1667 | ] 1668 | }, 1669 | { 1670 | "output_type": "stream", 1671 | "stream": "stdout", 1672 | "text": [ 1673 | " => 0.14 +- 0.08 | 0.53 +- 0.20\n", 1674 | " 12[0, 7] " 1675 | ] 1676 | }, 1677 | { 1678 | "output_type": "stream", 1679 | "stream": "stdout", 1680 | "text": [ 1681 | "0.23 0.75 " 1682 | ] 1683 | }, 1684 | { 1685 | "output_type": "stream", 1686 | "stream": "stdout", 1687 | "text": [ 1688 | "0.23 0.50 " 
1689 | ] 1690 | }, 1691 | { 1692 | "output_type": "stream", 1693 | "stream": "stdout", 1694 | "text": [ 1695 | " => 0.24 +- 0.07 | 0.52 +- 0.23\n", 1696 | "  13[0, 1, 7]        " 1697 | ] 1698 | }, 1699 | { 1700 | "output_type": "stream", 1701 | "stream": "stdout", 1702 | "text": [ 1703 | "0.23 0.75 " 1704 | ] 1705 | }, 1706 | { 1707 | "output_type": "stream", 1708 | "stream": "stdout", 1709 | "text": [ 1710 | "0.23 0.50 " 1711 | ] 1712 | }, 1713 | { 1714 | "output_type": "stream", 1715 | "stream": "stdout", 1716 | "text": [ 1717 | " => 0.24 +- 0.09 | 0.52 +- 0.23\n", 1718 | "  14[0, 1, 2, 3, 7]  " 1719 | ] 1720 | }, 1721 | { 1722 | "output_type": "stream", 1723 | "stream": "stdout", 1724 | "text": [ 1725 | "0.23 0.50 " 1726 | ] 1727 | }, 1728 | { 1729 | "output_type": "stream", 1730 | "stream": "stdout", 1731 | "text": [ 1732 | "0.15 0.75 " 1733 | ] 1734 | }, 1735 | { 1736 | "output_type": "stream", 1737 | "stream": "stdout", 1738 | "text": [ 1739 | " => 0.21 +- 0.03 | 0.38 +- 0.22\n" 1740 | ] 1741 | } 1742 | ], 1743 | "prompt_number": 51 1744 | }, 1745 | { 1746 | "cell_type": "markdown", 1747 | "metadata": {}, 1748 | "source": [ 1749 | "## Conclusion\n", 1750 | "\n", 1751 | "It seems that tokenising the text not only speeds up training and scoring (scoring drops from 1.67 s to 31 ms above, roughly a 50x speedup), it also improves the predictions: the confusion matrix goes from [[8 2], [4 3]] to [[9 1], [2 5]]. With only 17 labelled pairs these estimates are noisy, though - we definitely need more data to do this properly." 1752 | ] 1753 | },
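1754 | { 1755 | "cell_type": "markdown", 1756 | "metadata": {}, 1757 | "source": [ 1758 | "## Appendix: a word frequency feature (sketch)\n", 1759 | "\n", 1760 | "The word frequency feature promised above never made it into the experiments. Below is a minimal, untested sketch of one way to add it, assuming frequencies are estimated from the training pairs themselves; the helper name `word_frequency_feature` is made up for illustration, and none of the results above include this feature." 1761 | ] 1762 | }, 1763 | { 1764 | "cell_type": "code", 1765 | "collapsed": false, 1766 | "input": [ 1767 | "from collections import Counter\n", 1768 | "\n", 1769 | "# Count how often each token occurs anywhere in the training pairs.\n", 1770 | "counts = Counter(token for pair in tokX for sentence in pair for token in sentence)\n", 1771 | "total = float(sum(counts.values()))\n", 1772 | "\n", 1773 | "# A match on a rare token should count for more than a match on a\n", 1774 | "# common one, so weight matches by negative log relative frequency.\n", 1775 | "def word_frequency_feature(i, j, s1, s2):\n", 1776 | "    if s1[i] != s2[j]:\n", 1777 | "        return 0.0\n", 1778 | "    # Add-one smoothing keeps tokens unseen during fitting finite.\n", 1779 | "    return -np.log((counts[s1[i]] + 1.0) / (total + 1.0))\n", 1780 | "\n", 1781 | "# It could then be appended to the feature list and compared in the\n", 1782 | "# featureset loop above.\n", 1783 | "real.append(word_frequency_feature)" 1784 | ], 1785 | "language": "python", 1786 | "metadata": {}, 1787 | "outputs": [] 1788 | },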
| "cell_type": "code", 1849 | "collapsed": false, 1850 | "input": [ 1851 | "%pxresult" 1852 | ], 1853 | "language": "python", 1854 | "metadata": {}, 1855 | "outputs": [ 1856 | { 1857 | "output_type": "stream", 1858 | "stream": "stdout", 1859 | "text": [ 1860 | "[stdout:0] kaas\n" 1861 | ] 1862 | } 1863 | ], 1864 | "prompt_number": 39 1865 | }, 1866 | { 1867 | "cell_type": "code", 1868 | "collapsed": false, 1869 | "input": [], 1870 | "language": "python", 1871 | "metadata": {}, 1872 | "outputs": [] 1873 | } 1874 | ], 1875 | "metadata": {} 1876 | } 1877 | ] 1878 | } --------------------------------------------------------------------------------