├── pyhacrf
│   ├── tests
│   │   ├── __init__.py
│   │   ├── profile.py
│   │   ├── test_features.py
│   │   └── test_model.py
│   ├── __init__.py
│   ├── state_machine.py
│   ├── feature_extraction.py
│   ├── algorithms.pyx
│   └── pyhacrf.py
├── requirements-dev.txt
├── setup.cfg
├── MANIFEST.in
├── .gitignore
├── LICENSE
├── setup.py
├── README.rst
└── examples
    └── Highered dataset.ipynb

/pyhacrf/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
cython>=0.22
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.rst
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include README.rst
--------------------------------------------------------------------------------
/pyhacrf/__init__.py:
--------------------------------------------------------------------------------
""" Implements a Hidden Alignment Conditional Random Field (HACRF). """

from .pyhacrf import Hacrf
from .feature_extraction import StringPairFeatureExtractor, PairFeatureExtractor

__all__ = ['Hacrf', 'StringPairFeatureExtractor', 'PairFeatureExtractor']
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
--------------------------------------------------------------------------------
/pyhacrf/tests/profile.py:
--------------------------------------------------------------------------------
""" A slow test for profiling """

from numpy.testing import assert_array_almost_equal
import numpy as np
from numpy import random
from pyhacrf import Hacrf
from pyhacrf.pyhacrf import _Model
from pyhacrf.state_machine import DefaultStateMachine


def test_derivate_large():
    classes = ['a', 'b', 'c']
    y = 'b'
    x = random.randn(20, 3, 10) * 5 + 3
    state_machine = DefaultStateMachine(classes)
    parameters = Hacrf._initialize_parameters(state_machine, x.shape[2])
    parameters = random.randn(*parameters.shape) * 10 - 2

    test_model = _Model(state_machine, x, y)
    expected_dll = np.zeros(parameters.shape)

    # Finite difference gradient approximation
    delta = 10.0**-7
    S, D = expected_dll.shape
    for s in range(S):
        for d in range(D):
            dg = np.zeros(parameters.shape)
            dg[s, d] = delta
            y0, _ = test_model.forward_backward(parameters)
            y1, _ = test_model.forward_backward(parameters + dg)
            expected_dll[s, d] = (y1 - y0) / delta

    actual_ll, actual_dll = test_model.forward_backward(parameters)

    print((abs(actual_dll) - abs(expected_dll)).sum())
    assert_array_almost_equal(actual_dll, expected_dll, decimal=4)

if __name__ == '__main__':
    test_derivate_large()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015, Dirko Coetsee
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of pyhacrf nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, Extension
from codecs import open
from os import path


# from Michael Hoffman's http://www.ebi.ac.uk/~hoffman/software/sunflower/
class NumpyExtension(Extension):

    def __init__(self, *args, **kwargs):
        from numpy import get_include
        from numpy.distutils.misc_util import get_info
        kwargs.update(get_info('npymath'))
        kwargs['include_dirs'] += [get_include()]

        Extension.__init__(self, *args, **kwargs)


here = path.abspath(path.dirname(__file__))

# Get the long description from the relevant file
with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()


setup(
    name='pyhacrf',
    version='0.1.2',
    packages=['pyhacrf'],
    install_requires=['numpy>=1.9', 'PyLBFGS>=0.1.3'],
    ext_modules=[NumpyExtension('pyhacrf.algorithms',
                                ['pyhacrf/algorithms.c'])],
    url='https://github.com/dirko/pyhacrf',
    download_url='https://github.com/dirko/pyhacrf/tarball/0.1.2',
    license='BSD',
    author='Dirko Coetsee',
    author_email='dpcoetsee@gmail.com',
    description='Hidden alignment conditional random field, a discriminative string edit distance',
    long_description=long_description,
    classifiers=[
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Scientific/Engineering',
    ],
)
--------------------------------------------------------------------------------
/pyhacrf/tests/test_features.py:
--------------------------------------------------------------------------------
""" Tests for the feature extraction. """

import unittest

from numpy.testing import assert_array_almost_equal
import numpy as np
from pyhacrf import StringPairFeatureExtractor


class TestStringPairFeatureExtractor(unittest.TestCase):
    def test_transform_binary(self):
        s1 = "kat1"
        s2 = "cat2"
        # 1 . . . n
        # t . . m .
        # a . m . .
        # k . . . .
        #   c a t 2
        expected_x = np.zeros((4, 4, 4))
        expected_x[:, :, 0] = 2.0
        expected_x[:, 0, 1] = 1.0
        expected_x[0, :, 1] = 1.0
        expected_x[1, 1, 2] = 1.0
        expected_x[2, 2, 2] = 1.0
        expected_x[3, 3, 3] = 1.0

        test_extractor = StringPairFeatureExtractor(bias=2.0, start=True, match=True, numeric=True)
        actual_X = test_extractor.fit_transform([(s1, s2)])

        assert_array_almost_equal(expected_x, actual_X[0])

    def test_transform_transition(self):
        s1 = "ba"
        s2 = "ca"
        # a . .
        # b . .
        #   c a
        chars = StringPairFeatureExtractor.CHARACTERS
        nchars = len(chars)
        print(nchars)
        expected_x = np.zeros((2, 2, len(chars)**2 + 1))
        expected_x[:, :, 0] = 1.0
        expected_x[0, 0, 2 + nchars * 1 + 1] = 1.0  # b->c
        expected_x[0, 1, 0 + nchars * 1 + 1] = 1.0  # b->a
        expected_x[1, 0, 2 + nchars * 0 + 1] = 1.0  # a->c
        expected_x[1, 1, 0 + nchars * 0 + 1] = 1.0  # a->a

        test_extractor = StringPairFeatureExtractor(transition=True)
        actual_X = test_extractor.fit_transform([(s1, s2)])

        assert_array_almost_equal(expected_x, actual_X[0])

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
pyhacrf
=======

Hidden alignment conditional random field for classifying string pairs -
a learnable edit distance.

This package aims to implement the HACRF machine learning model with a
``sklearn``-like interface. It includes ways to fit a model to training
examples and to score new examples.

The model takes string pairs as input and classifies them into any number
of classes. In McCallum's original paper the model was applied to the
database deduplication problem. Each database entry was paired with
every other entry and the model then classified whether the pair was a
'match' or a 'mismatch' based on training examples of matches and
mismatches.

I also tried to use it as a learnable string edit distance for normalizing
noisy text. See *A Conditional Random Field for Discriminatively-trained
Finite-state String Edit Distance* by McCallum, Bellare, and Pereira,
and the report *Conditional Random Fields for Noisy text normalisation*
by Dirko Coetsee.

Example
-------

.. code:: python

    from pyhacrf import StringPairFeatureExtractor, Hacrf

    training_X = [('helloooo', 'hello'),  # Matching examples
                  ('h0me', 'home'),
                  ('krazii', 'crazy'),
                  ('non matching string example', 'no really'),  # Non-matching examples
                  ('and another one', 'yep')]
    training_y = ['match',
                  'match',
                  'match',
                  'non-match',
                  'non-match']

    # Extract features
    feature_extractor = StringPairFeatureExtractor(match=True, numeric=True)
    training_X_extracted = feature_extractor.fit_transform(training_X)

    # Train model
    model = Hacrf(l2_regularization=1.0)
    model.fit(training_X_extracted, training_y)

    # Evaluate
    from sklearn.metrics import confusion_matrix
    predictions = model.predict(training_X_extracted)

    print(confusion_matrix(training_y, predictions))
    > [[0 3]
    >  [2 0]]

    print(model.predict_proba(training_X_extracted))
    > [[ 0.94914812  0.05085188]
    >  [ 0.92397711  0.07602289]
    >  [ 0.86756034  0.13243966]
    >  [ 0.05438812  0.94561188]
    >  [ 0.02641275  0.97358725]]
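
By default ``Hacrf`` builds a simple state machine with one state per
class, but a different machine can be supplied through its
``state_machine`` argument. The following is a minimal, illustrative
sketch (it mirrors the machines the unit tests construct, and the class
labels are only placeholders):

.. code:: python

    from pyhacrf import Hacrf
    from pyhacrf.state_machine import GeneralStateMachine

    # One state per class. Every transition is
    # (from_state, to_state, (di, dj)): each state here allows
    # match/substitution (1, 1), insertion (0, 1), and deletion (1, 0)
    # moves, which is exactly the shape of the built-in
    # DefaultStateMachine.
    state_machine = GeneralStateMachine(
        start_states=[0, 1],
        transitions=[(0, 0, (1, 1)), (1, 1, (1, 1)),
                     (0, 0, (0, 1)), (1, 1, (0, 1)),
                     (0, 0, (1, 0)), (1, 1, (1, 0))],
        states_to_classes={0: 'match', 1: 'non-match'})

    model = Hacrf(state_machine=state_machine)

After this, ``fit`` and ``predict`` are used exactly as above.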

Dependencies
------------

This package depends on ``numpy``. The LBFGS optimizer in ``pylbfgs`` is
used, but alternative optimizers can be passed.
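
For example (a sketch, assuming ``scipy`` is available; any
``fmin(obj, x0, **kwargs)`` whose first return value is the minimizing
parameter vector will work, and the ``maxfun`` value below is only an
illustration):

.. code:: python

    from scipy.optimize import fmin_l_bfgs_b

    # fmin_l_bfgs_b(func, x0, ...) returns (x, f, d), so its first
    # return value is the minimizing parameter vector, as required.
    model = Hacrf(l2_regularization=1.0,
                  optimizer=fmin_l_bfgs_b,
                  optimizer_kwargs={'maxfun': 45})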

Install
-------

Install by running:

::

    python setup.py install

or from pypi:

::

    pip install pyhacrf

Developing
----------
Clone from repository, then

::

    pip install -r requirements-dev.txt
    cython pyhacrf/*.pyx
    python setup.py install

To deploy to pypi, make sure you have compiled the \*.pyx files to \*.c.
--------------------------------------------------------------------------------
/pyhacrf/state_machine.py:
--------------------------------------------------------------------------------
import numpy as np
from collections import defaultdict, deque


class GeneralStateMachine(object):
    """ State machine which, together with two input sequences, is used to build the lattice.

    Each state and each transition is labelled by a different integer.

    Parameters
    ----------
    start_states : list of ints
        The states that the state machine can start in.

    transitions : list of tuples
        The start state, end state, and number of positions to move in each sequence. For example,
        [(0, 0, (0, 1)),  # insertion into the first sequence, while going from state 0 to state 0.
         (1, 0, (1, 0)),  # deletion from the first sequence, while moving from state 1 to state 0.
         (2, 1, (1, 1)),  # match/substitution - move from state 2 to state 1.
         ...
        ]

    states_to_classes : dictionary
        Dictionary where each state is mapped to a class.
    """

    def __init__(self, start_states, transitions, states_to_classes):
        self._start_states = start_states
        self._transitions = transitions

        max_state = max(max(s for s, _, _ in transitions), max(s for _, s, _ in transitions)) + 1
        self.n_states = max_state
        self.n_transitions = len(transitions)
        self.states_to_classes = states_to_classes

    def build_lattice(self, x):
        """ Construct the list of nodes and edges for input features. """
        I, J, _ = x.shape
        start_states, transitions = self._start_states, self._transitions

        lattice = []
        transitions_d = defaultdict(list)
        for transition_index, (s0, s1, delta) in enumerate(transitions):
            transitions_d[s0].append((s1, delta, transition_index))
        # Add start states
        unvisited_nodes = deque([(0, 0, s) for s in start_states])
        visited_nodes = set()
        n_states = self.n_states

        while unvisited_nodes:
            node = unvisited_nodes.popleft()
            lattice.append(node)
            i, j, s0 = node
            for s1, delta, transition_index in transitions_d[s0]:
                try:
                    di, dj = delta
                except TypeError:
                    di, dj = delta(i, j, x)

                if i + di < I and j + dj < J:
                    edge = (i, j, s0, i + di, j + dj, s1, transition_index + n_states)
                    lattice.append(edge)
                    dest_node = (i + di, j + dj, s1)
                    if dest_node not in visited_nodes:
                        unvisited_nodes.append(dest_node)
                        visited_nodes.add(dest_node)

        lattice.sort()

        # Step backwards through the lattice and add visitable nodes to the set of
        # nodes to keep. The rest are discarded.
        final_lattice = []
        visited_nodes = set((I - 1, J - 1, s) for s in range(n_states))

        for node in lattice[::-1]:
            if node in visited_nodes:
                final_lattice.append(node)
            elif len(node) > 3:
                source_node, dest_node = node[0:3], node[3:6]
                if dest_node in visited_nodes:
                    visited_nodes.add(source_node)
                    final_lattice.append(node)

        reversed_list = list(reversed(final_lattice))

        # Squash list
        lattice = [edge for edge in reversed_list if len(edge) > 3]
        return np.array(lattice, dtype='int64')


class DefaultStateMachine(object):
    """ State machine which, together with two input sequences, is used to build the lattice.

    Simple and fast state machine with a single state for each class.
    Allows for character match/substitution, deletion, and insertion.

    Parameters
    ----------
    classes : list
        The set of labels.
    """
    BASE_LENGTH = 60

    def __init__(self, classes):
        n_classes = len(classes)
        deltas = ((1, 1),  # Match
                  (0, 1),  # Insertion
                  (1, 0))  # Deletion
        self._start_states = [i for i in range(n_classes)]
        self._transitions = [(i, i, delta)
                             for delta in deltas
                             for i in range(n_classes)]
        self._base_shape = (self.BASE_LENGTH, self.BASE_LENGTH)
        self.states_to_classes = {i: c for i, c in enumerate(classes)}
        self.n_transitions = len(self._transitions)
        self.n_states = len(classes)
        self._base_lattice = self._independent_lattice(self._base_shape)

        self._lattice_limits = self._lattice_ends()

    def _subset_independent_lattice(self, shape):
        I, J = shape

        if I < self.BASE_LENGTH and J < self.BASE_LENGTH:
            lattice = self._base_lattice.take(
                self._lattice_limits[I, J],
                axis=0)

        elif I < self.BASE_LENGTH:
            lattice = self._base_lattice.take(
                self._lattice_limits[I, None],
                axis=0)
            lattice = self._independent_lattice((I, J), lattice)
        elif J < self.BASE_LENGTH:
            lattice = self._base_lattice.take(
                self._lattice_limits[None, J],
                axis=0)
            lattice = self._independent_lattice((I, J), lattice)
        else:
            lattice = self._independent_lattice((I, J), self._base_lattice)

        return lattice

    def _independent_lattice(self, shape, lattice=None):
        """ Helper to construct the list of nodes and edges. """
        I, J = shape

        if lattice is not None:
            end_I = min(I, max(lattice[..., 3])) - 1
            end_J = min(J, max(lattice[..., 4])) - 1
            unvisited_nodes = deque([(i, j, s)
                                     for i in range(end_I)
                                     for j in range(end_J)
                                     for s in self._start_states])
            lattice = lattice.tolist()
        else:
            lattice = []
            unvisited_nodes = deque([(0, 0, s) for s in self._start_states])
        lattice += _grow_independent_lattice(self._transitions,
                                             self.n_states, (I, J),
                                             unvisited_nodes)
        lattice = np.array(sorted(lattice), dtype='int64')
        return lattice

    def build_lattice(self, x):
        """ Construct the list of nodes and edges for input features. """
""" 166 | I, J, _ = x.shape 167 | lattice = self._subset_independent_lattice((I, J)) 168 | return lattice 169 | 170 | def _lattice_ends(self) : 171 | 172 | lattice_limits = {} 173 | 174 | lengths = np.arange(self.BASE_LENGTH) 175 | lengths.reshape(1, -1) 176 | 177 | I = self._base_lattice[..., 3:4] < lengths 178 | for i in range(self.BASE_LENGTH) : 179 | lattice_limits[i, None] = I[..., i].nonzero()[0] 180 | 181 | J = self._base_lattice[..., 4:5] < lengths 182 | for j in range(self.BASE_LENGTH) : 183 | lattice_limits[None, j] = J[..., j].nonzero()[0] 184 | 185 | IJ = np.expand_dims(I, axis=0).T & J 186 | 187 | for i in range(self.BASE_LENGTH) : 188 | for j in range(self.BASE_LENGTH) : 189 | lattice_limits[i,j] = IJ[i, ..., j].nonzero()[0] 190 | 191 | return lattice_limits 192 | 193 | 194 | 195 | def _grow_independent_lattice(transitions, n_states, shape, unvisited_nodes): 196 | I, J = shape 197 | visited_nodes = set() 198 | lattice = [] 199 | 200 | transitions_d = defaultdict(list) 201 | for transition_index, (s0, s1, delta) in enumerate(transitions): 202 | if not callable(delta): 203 | di, dj = delta 204 | transitions_d[s0].append((s1, di, dj, 205 | transition_index + n_states)) 206 | 207 | while unvisited_nodes: 208 | i, j, s0 = unvisited_nodes.popleft() 209 | for s1, di, dj, edge_parameter_index in transitions_d[s0]: 210 | if i + di < I and j + dj < J: 211 | dest_node = (i + di, j + dj, s1) 212 | edge = (i, j, s0) + dest_node + (edge_parameter_index,) 213 | lattice.append(list(edge)) 214 | if dest_node not in visited_nodes: 215 | unvisited_nodes.append(dest_node) 216 | visited_nodes.add(dest_node) 217 | 218 | return lattice 219 | 220 | -------------------------------------------------------------------------------- /pyhacrf/feature_extraction.py: -------------------------------------------------------------------------------- 1 | # Authors: Dirko Coetsee 2 | # License: 3-clause BSD 3 | 4 | """ Implements feature extraction methods to use with HACRF models. """ 5 | 6 | import numpy as np 7 | import functools 8 | import itertools 9 | 10 | class PairFeatureExtractor(object): 11 | """Extract features from sequence pairs. 12 | 13 | For each feature, a grid is constructed for a sequency pair. The 14 | features are stacked, producing a 3 dimensional matrix of 15 | dimensions: 16 | 17 | (length of sequence 1) X (length of sequence 2) X (number of features) 18 | 19 | For example, a 'beginning' character feature grid for the sequences, 20 | 'kaas' and 'cheese' could look like this. 21 | 22 | c h e e s e 23 | k 1 1 1 1 1 1 24 | a 1 0 0 0 0 0 25 | a 1 0 0 0 0 0 26 | s 1 0 0 0 0 0 27 | 28 | These grids are made from two different types of feature 29 | functions: real and sparse. 30 | 31 | Real features are functions of the form: 32 | 33 | def some_feature_function(array1, array2): 34 | ... 35 | return feature_grid 36 | 37 | Given two sequences, s1 and s1, return a numpy.array with dimensions 38 | (length of array1) X (length of array2). 39 | 40 | For performance reasons, we take advantage of numpy broadcasting, and 41 | array1 is a column array and array2 is a row array. 

    For a 'matching character' feature between 'kaas' and 'cheese', the
    sequences are transformed and then we use broadcasting:

        > array1 = numpy.array([['k'],
                                ['a'],
                                ['a'],
                                ['s']])
        > array2 = numpy.array([['c', 'h', 'e', 'e', 's', 'e']])
        > array1 == array2
        numpy.array([[0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 1, 0]])

    When writing your own real feature functions, you can assume that
    the arrays will come in with the right shape.

    Sparse feature functions look similar:

        def some_feature_function(i, j, s1, s2):
            ...
            return some_index

    but they are registered together with the total length of the
    feature vector, as (function, total_vector_length) pairs. The
    function returns the index of the element that should be 1. So,
    for example, if the function returns 4 and the registered length
    is 5, the feature vector [0, 0, 0, 0, 1] is constructed.


    Parameters
    ----------
    real: list: optional (default=[])
        List of functions of the form
            def some_feature_function(array1, array2):
                ...
                return feature_grid

    sparse: list: optional (default=[])
        List of (some_feature_function, total_vector_length) pairs, where
        each function has the form
            def some_feature_function(i, j, s1, s2):
                ...
                return some_index
    """

    def __init__(self, real=None, sparse=None):
        self._binary_features = []
        if real:
            self._binary_features = real
        self._sparse_features = []
        if sparse:
            self._sparse_features = sparse

    def fit_transform(self, raw_X, y=None):
        """Like transform. Transform sequence pairs to feature arrays that can be used as input to `Hacrf` models.

        Parameters
        ----------
        raw_X : List of (sequence1_n, sequence2_n) pairs, one for each training example n.
        y : (ignored)

        Returns
        -------
        X : List of numpy ndarrays, each with shape = (I_n, J_n, K), where I_n is the length of sequence1_n, J_n is the
            length of sequence2_n, and K is the number of features.
            Feature matrix list, for use with estimators or further transformers.
        """
        return self.transform(raw_X)

    def transform(self, raw_X, y=None):
        """Transform sequence pairs to feature arrays that can be used as input to `Hacrf` models.

        Parameters
        ----------
        raw_X : List of (sequence1_n, sequence2_n) pairs, one for each training example n.
        y : (ignored)

        Returns
        -------
        X : List of numpy ndarrays, each with shape = (I_n, J_n, K), where I_n is the length of sequence1_n, J_n is the
            length of sequence2_n, and K is the number of features.
            Feature matrix list, for use with estimators or further transformers.
        """
        return [self._extract_features(sequence1, sequence2) for sequence1, sequence2 in raw_X]

    def _extract_features(self, sequence1, sequence2):
        """ Helper to extract features for one data point. """
""" 130 | 131 | array1 = np.array(tuple(sequence1), ndmin=2).T 132 | array2 = np.array(tuple(sequence2), ndmin=2) 133 | 134 | K = (len(self._binary_features) 135 | + sum(num_feats for _, num_feats in self._sparse_features)) 136 | 137 | feature_array = np.zeros((array1.size, array2.size, K), dtype='float64') 138 | 139 | for k, feature_function in enumerate(self._binary_features): 140 | feature_array[..., k] = feature_function(array1, array2) 141 | 142 | if self._sparse_features: 143 | n_binary_features = len(self._binary_features) 144 | 145 | for i, j in np.ndindex(len(sequence1), len(sequence2)): 146 | k = n_binary_features 147 | 148 | for feature_function, num_features in self._sparse_features: 149 | 150 | feature_array[i, j, k + feature_function(i, j, sequence1, sequence2)] = 1.0 151 | k += num_features 152 | 153 | return feature_array 154 | 155 | 156 | class StringPairFeatureExtractor(PairFeatureExtractor): 157 | """ Extract features from sequence pairs. 158 | 159 | A grid is constructed for each sequence pair, for example for ("kaas", "cheese"): 160 | 161 | s * . . . @ . 162 | a * . . . . . 163 | a * . . . . . 164 | k * * * * * * 165 | c h e e s e 166 | 167 | For each element in the grid, a feature vector is constructed. The elements in the feature 168 | vector are determined by which features are active at that position in the grid. So for the 169 | example above, the 'match' feature will be 0 in every vector in every position except the 170 | position indicated with '@', where it will be 1. The 'start' feature will be 1 in all the 171 | positions with '*' and 0 everywhere else. 172 | 173 | 174 | Parameters 175 | ---------- 176 | bias: float: optional (default=1.0) 177 | A bias term that is always added to every position in the lattice. 178 | 179 | start: boolean: optional 180 | Binary feature that activates at the start of either sequence. 181 | 182 | end: boolean: optional 183 | Binary feature that activates at the end of either sequence. 184 | 185 | match: boolean: optional 186 | Binary feature that activates when elements at a position are equal. 187 | 188 | numeric: boolean, optional 189 | Binary feature that activates when all elements at a position are numerical. 190 | 191 | transition: boolean, optional 192 | Adds binary features for pairs of (lower case) input characters. 193 | """ 194 | 195 | # Constants 196 | CHARACTERS = 'abcdefghijklmnopqrstuvwxyz0123456789,./;\'\-=<>?:"|_+!@#$%^&*() ' 197 | 198 | 199 | 200 | def __init__(self, bias=1.0, start=False, end=False, match=False, numeric=False, transition=False): 201 | # TODO: For longer strings, tokenize and use Levenshtein 202 | # distance up until a lattice position. Other (possibly) 203 | # useful features might be whether characters are consonant or 204 | # vowel, punctuation, case. 
        binary_features_active = [True, start, end, match, numeric]
        binary_features = [functools.partial(biases, bias=bias),
                           starts,
                           ends,
                           matches,
                           digits]

        self._binary_features = [feature
                                 for feature, active
                                 in zip(binary_features,
                                        binary_features_active)
                                 if active]
        self._sparse_features = []
        if transition:
            characters_to_index = {character: index for index, character in enumerate(self.CHARACTERS)}
            curried_charIndex = functools.partial(charIndex,
                                                  char2index=characters_to_index)
            self._sparse_features.append((curried_charIndex,
                                          len(characters_to_index) ** 2))


def charIndex(i, j, s1, s2, char2index=None):
    char_i, char_j = s1[i].lower(), s2[j].lower()
    index = char2index[char_j] + char2index[char_i] * len(char2index)
    return index


def biases(s1, s2, bias=1.0):
    return np.full((s1.size, s2.size), bias)


def starts(s1, s2):
    M = np.zeros((s1.size, s2.size))
    M[0, ...] = 1
    M[..., 0] = 1
    return M


def ends(s1, s2):
    M = np.zeros((s1.size, s2.size))
    M[(s1.size - 1), ...] = 1
    M[..., (s2.size - 1)] = 1
    return M


def matches(s1, s2):
    return (s1 == s2)


def digits(s1, s2):
    return np.char.isdigit(s1) & np.char.isdigit(s2)
--------------------------------------------------------------------------------
/pyhacrf/algorithms.pyx:
--------------------------------------------------------------------------------
#cython: boundscheck=False, wraparound=False, initializedcheck=False

import numpy as np
cimport numpy as np
from numpy import ndarray
from numpy cimport ndarray
from numpy.math cimport logaddexp, INFINITY as inf

cdef extern from "math.h" nogil:
    np.float64_t exp(np.float64_t x)


cpdef dict forward(np.ndarray[np.int64_t, ndim=2] lattice, np.ndarray[np.float64_t, ndim=3] x_dot_parameters, long S):
    """ Helper to calculate the forward weights. """
""" 13 | cdef dict alpha = {} 14 | 15 | cdef unsigned int r 16 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 17 | cdef unsigned int I, J, s 18 | 19 | cdef unsigned int old_i0, old_j0, old_s0 20 | cdef np.float64_t edge_potential 21 | 22 | old_i0, old_j0, old_s0 = -1, -1, -1 23 | 24 | for r in range(lattice.shape[0]): 25 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2] 26 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 27 | edge_parameter_index = lattice[r, 6] 28 | 29 | if i0 != old_i0 or j0 != old_j0 or s0 != old_s0: 30 | if i0 == 0 and j0 == 0: 31 | alpha[(i0, j0, s0)] = x_dot_parameters[i0, j0, s0] 32 | else: 33 | alpha[(i0, j0, s0)] += x_dot_parameters[i0, j0, s0] 34 | 35 | old_i0, old_j0, old_s0 = i0, j0, s0 36 | 37 | edge_potential = (x_dot_parameters[i1, j1, edge_parameter_index] 38 | + alpha[(i0, j0, s0)]) 39 | alpha[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] = edge_potential 40 | alpha[(i1, j1, s1)] = logaddexp( alpha.get((i1, j1, s1), -inf), 41 | edge_potential) 42 | 43 | I = x_dot_parameters.shape[0] - 1 44 | J = x_dot_parameters.shape[1] - 1 45 | 46 | for s in range(S): 47 | if I == J == 0: 48 | alpha[(I, J, s)] = x_dot_parameters[I, J, s] 49 | else: 50 | alpha[(I, J, s)] = alpha.get((I, J, s), -inf) + x_dot_parameters[I, J, s] 51 | 52 | return alpha 53 | 54 | cpdef np.float64_t[:, :, ::1] forward_predict(np.int64_t[:, ::1] lattice, 55 | np.float64_t[:, :, ::1] x_dot_parameters, 56 | long S) : 57 | """ Helper to calculate the forward weights for prediction. """ 58 | 59 | cdef np.float64_t[:, :, ::1] alpha = x_dot_parameters.copy() 60 | alpha[:] = -inf 61 | 62 | cdef unsigned int r 63 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 64 | 65 | cdef int old_s0 = -1 66 | 67 | cdef np.float64_t edge_potential, source_node_potential 68 | 69 | for r in range(lattice.shape[0]): 70 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2] 71 | 72 | if s0 != old_s0 : 73 | if i0 == 0 and j0 == 0: 74 | source_node_potential = x_dot_parameters[i0, j0, s0] 75 | else: 76 | source_node_potential = (alpha[i0,j0,s0] 77 | + x_dot_parameters[i0,j0,s0]) 78 | old_s0 = s0 79 | 80 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 81 | edge_parameter_index = lattice[r, 6] 82 | 83 | edge_potential = (x_dot_parameters[i1, j1, edge_parameter_index] 84 | + source_node_potential) 85 | 86 | alpha[i1, j1, s1] = logaddexp(alpha[i1, j1, s1], edge_potential) 87 | 88 | cdef int I = alpha.shape[0] - 1 89 | cdef int J = alpha.shape[1] - 1 90 | 91 | for s in range(S): 92 | if I == J == 0 : 93 | alpha[I, J, s] = x_dot_parameters[I, J, s] 94 | else: 95 | alpha[I, J, s] += x_dot_parameters[I, J, s] 96 | 97 | return alpha 98 | 99 | 100 | cpdef np.float64_t[:, :, ::1] forward_max_predict(np.int64_t[:, ::1] lattice, 101 | np.float64_t[:, :, ::1] x_dot_parameters, 102 | long S) : 103 | """ Helper to calculate the forward max-sum weights for prediction. 
""" 104 | 105 | cdef np.float64_t[:, :, ::1] alpha = x_dot_parameters.copy() 106 | alpha[:] = -inf 107 | 108 | cdef unsigned int r 109 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 110 | 111 | cdef int old_s0 = -1 112 | 113 | cdef np.float64_t edge_potential, source_node_potential 114 | 115 | for r in range(lattice.shape[0]): 116 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2] 117 | 118 | if s0 != old_s0 : 119 | if i0 == 0 and j0 == 0: 120 | source_node_potential = x_dot_parameters[i0, j0, s0] 121 | else: 122 | source_node_potential = (alpha[i0,j0,s0] 123 | + x_dot_parameters[i0,j0,s0]) 124 | old_s0 = s0 125 | 126 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 127 | edge_parameter_index = lattice[r, 6] 128 | 129 | edge_potential = (x_dot_parameters[i1, j1, edge_parameter_index] 130 | + source_node_potential) 131 | 132 | alpha[i1, j1, s1] = max(alpha[i1, j1, s1], edge_potential) 133 | 134 | cdef int I = alpha.shape[0] - 1 135 | cdef int J = alpha.shape[1] - 1 136 | 137 | for s in range(S): 138 | if I == J == 0 : 139 | alpha[I, J, s] = x_dot_parameters[I, J, s] 140 | else: 141 | alpha[I, J, s] += x_dot_parameters[I, J, s] 142 | 143 | return alpha 144 | 145 | 146 | cpdef dict backward(ndarray[np.int64_t, ndim=2] lattice, 147 | ndarray[np.float64_t, ndim=3] x_dot_parameters, 148 | long I, long J, long S): 149 | """ Helper to calculate the backward weights. """ 150 | cdef dict beta = {} 151 | 152 | cdef unsigned int r 153 | cdef unsigned int s 154 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 155 | 156 | cdef np.float64_t edge_potential 157 | 158 | for s in range(S): 159 | beta[(I-1, J-1, s)] = 0.0 160 | 161 | for r in range((lattice.shape[0] - 1), -1, -1): 162 | i0, j0, s0 = lattice[r, 0], lattice[r, 1], lattice[r, 2], 163 | i1, j1, s1 = lattice[r, 3], lattice[r, 4], lattice[r, 5] 164 | edge_parameter_index = lattice[r, 6] 165 | 166 | edge_potential = beta[(i1, j1, s1)] + x_dot_parameters[i1, j1, s1] 167 | beta[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] = edge_potential 168 | beta[(i0, j0, s0)] = logaddexp( beta.get((i0, j0, s0), -inf), 169 | (edge_potential 170 | + x_dot_parameters[i1, 171 | j1, 172 | edge_parameter_index])) 173 | return beta 174 | 175 | 176 | def gradient(dict alpha, 177 | dict beta, 178 | ndarray[np.float64_t, ndim=2] parameters, 179 | ndarray[np.int64_t] states_to_classes, 180 | ndarray[np.float64_t, ndim=3] x, 181 | long y, 182 | long I, long J, long K): 183 | """ Helper to calculate the marginals and from that the gradient given the forward and backward weights. 
""" 184 | cdef unsigned int n_classes = max(states_to_classes) + 1 185 | cdef ndarray[np.float64_t] class_Z = np.zeros((n_classes,)) 186 | cdef np.float64_t Z = -inf 187 | cdef np.float64_t weight 188 | cdef unsigned int k 189 | 190 | for state, clas in enumerate(states_to_classes): 191 | weight = alpha[(I - 1, J - 1, state)] 192 | class_Z[clas] = weight 193 | Z = logaddexp(Z, weight) 194 | 195 | cdef ndarray[np.float64_t, ndim=2] derivative = np.full_like(parameters, 0.0) 196 | cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index 197 | cdef np.float64_t alphabeta 198 | 199 | for node in alpha.viewkeys() | beta.viewkeys(): 200 | if len(node) == 3: 201 | i0, j0, s0 = node 202 | alphabeta = alpha[(i0, j0, s0)] + beta[(i0, j0, s0)] 203 | 204 | for k in range(K): 205 | if states_to_classes[s0] == y: 206 | derivative[s0, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x[i0, j0, k] 207 | else: 208 | derivative[s0, k] -= exp(alphabeta - Z) * x[i0, j0, k] 209 | 210 | else: 211 | i0, j0, s0, i1, j1, s1, edge_parameter_index = node 212 | alphabeta = alpha[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] \ 213 | + beta[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] 214 | 215 | for k in xrange(K): 216 | if states_to_classes[s1] == y: 217 | derivative[edge_parameter_index, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x[i1, j1, k] 218 | else: 219 | derivative[edge_parameter_index, k] -= exp(alphabeta - Z) * x[i1, j1, k] 220 | 221 | return (class_Z[y]) - (Z), derivative 222 | 223 | 224 | def gradient_sparse(dict alpha, 225 | dict beta, 226 | ndarray[np.float64_t, ndim=2] parameters, 227 | ndarray[np.int64_t] states_to_classes, 228 | ndarray[np.int64_t, ndim=3] x_index, 229 | ndarray[np.float64_t, ndim=3] x_value, 230 | long y, 231 | long I, long J, long K): 232 | """ 233 | Helper to calculate the marginals and from that the gradient given the forward and backward weights, for 234 | sparse input features. 
    cdef unsigned int n_classes = max(states_to_classes) + 1
    cdef ndarray[np.float64_t] class_Z = np.zeros((n_classes,))
    cdef np.float64_t Z = -inf
    cdef np.float64_t weight
    cdef unsigned int C = K
    cdef unsigned int c
    cdef int k

    for state, clas in enumerate(states_to_classes):
        weight = alpha[(I - 1, J - 1, state)]
        class_Z[clas] = weight
        Z = logaddexp(Z, weight)

    cdef ndarray[np.float64_t, ndim=2] derivative = np.full_like(parameters, 0.0)
    cdef unsigned int i0, j0, s0, i1, j1, s1, edge_parameter_index
    cdef np.float64_t alphabeta

    # Union of node (3-tuple) and edge (7-tuple) keys.
    for node in set(alpha) | set(beta):
        if len(node) == 3:
            i0, j0, s0 = node
            alphabeta = alpha[(i0, j0, s0)] + beta[(i0, j0, s0)]

            for c in range(C):
                k = x_index[i0, j0, c]
                if k < 0:
                    break
                if states_to_classes[s0] == y:
                    derivative[s0, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x_value[i0, j0, c]
                else:
                    derivative[s0, k] -= exp(alphabeta - Z) * x_value[i0, j0, c]

        else:
            i0, j0, s0, i1, j1, s1, edge_parameter_index = node
            alphabeta = alpha[(i0, j0, s0, i1, j1, s1, edge_parameter_index)] \
                        + beta[(i0, j0, s0, i1, j1, s1, edge_parameter_index)]

            for c in range(C):
                k = x_index[i1, j1, c]
                if k < 0:
                    break
                if states_to_classes[s1] == y:
                    derivative[edge_parameter_index, k] += (exp(alphabeta - class_Z[y]) - exp(alphabeta - Z)) * x_value[i1, j1, c]
                else:
                    derivative[edge_parameter_index, k] -= exp(alphabeta - Z) * x_value[i1, j1, c]

    return class_Z[y] - Z, derivative


def populate_sparse_features(ndarray[np.float64_t, ndim=3] x,
                             ndarray[np.int64_t, ndim=3] index_array,
                             ndarray[np.float64_t, ndim=3] value_array,
                             long I, long J, long K):
    """ Helper to fill in sparse feature arrays. """
    cdef unsigned int i, j, c, k
    for i in range(I):
        for j in range(J):
            c = 0
            for k in range(K):
                if x[i, j, k] != 0:
                    value_array[i, j, c] = x[i, j, k]
                    index_array[i, j, c] = k
                    c += 1


def sparse_multiply(ndarray[np.float64_t, ndim=3] answer,
                    ndarray[np.int64_t, ndim=3] index_array,
                    ndarray[np.float64_t, ndim=3] value_array,
                    ndarray[np.float64_t, ndim=2] dense_array,
                    long I, long J, long K, long C, long S):
    """ Multiply a sparse three dimensional numpy array (using our own scheme) with a two dimensional array. """
    cdef unsigned int i, j, s, c
    cdef int k
    for i in range(I):
        for j in range(J):
            for s in range(S):
                for c in range(C):
                    k = index_array[i, j, c]
                    if k < 0:
                        break
                    answer[i, j, s] += value_array[i, j, c] * dense_array[k, s]
--------------------------------------------------------------------------------
/pyhacrf/pyhacrf.py:
--------------------------------------------------------------------------------
# Authors: Dirko Coetsee
# License: 3-clause BSD

""" Implements a Hidden Alignment Conditional Random Field (HACRF). """
""" 5 | 6 | import numpy as np 7 | import lbfgs 8 | from .algorithms import forward, backward 9 | from .algorithms import forward_predict, forward_max_predict 10 | from .algorithms import gradient, gradient_sparse, populate_sparse_features, sparse_multiply 11 | from .state_machine import DefaultStateMachine 12 | 13 | 14 | class Hacrf(object): 15 | """ Hidden Alignment Conditional Random Field with L2 regularizer. 16 | 17 | Parameters 18 | ---------- 19 | l2_regularization : float, optional (default=0.0) 20 | The regularization parameter. 21 | 22 | optimizer : function, optional (default=None) 23 | The optimizing function that should be used minimize the negative log posterior. 24 | The function should have the signature: 25 | min_objective, argmin_objective, ... = fmin(obj, x0, **optimizer_kwargs), 26 | where obj is a function that returns 27 | the objective function and its gradient given a parameter vector; and x0 is the initial parameter vector. 28 | 29 | optimizer_kwargs : dictionary, optional (default=None) 30 | The keyword arguments to pass to the optimizing function. Only used when `optimizer` is also specified. 31 | 32 | state_machine : Instance of `GeneralStateMachine` or `DefaultStateMachine`, optional (default=`DefaultStateMachine`) 33 | The state machine to use to generate the lattice. 34 | 35 | viterbi : Boolean, optional (default=False). 36 | Whether to use Viterbi (max-sum) decoding for predictions (not training) 37 | instead of the default sum-product algorithm. 38 | 39 | References 40 | ---------- 41 | See *A Conditional Random Field for Discriminatively-trained Finite-state String Edit Distance* 42 | by McCallum, Bellare, and Pereira, and the report *Conditional Random Fields for Noisy text normalisation* 43 | by Dirko Coetsee. 44 | """ 45 | 46 | def __init__(self, 47 | l2_regularization=0.0, 48 | optimizer=None, 49 | optimizer_kwargs=None, 50 | state_machine=None, 51 | viterbi=False): 52 | self.parameters = None 53 | self.classes = None 54 | self.l2_regularization = l2_regularization 55 | self._optimizer = optimizer 56 | self._optimizer_kwargs = optimizer_kwargs 57 | self.viterbi = viterbi 58 | 59 | self._optimizer_result = None 60 | self._state_machine = state_machine 61 | self._states_to_classes = None 62 | self._evaluation_count = None 63 | 64 | def fit(self, X, y, verbosity=0): 65 | """Fit the model according to the given training data. 66 | 67 | Parameters 68 | ---------- 69 | X : List of ndarrays, one for each training example. 70 | Each training example's shape is (string1_len, string2_len, n_features), where 71 | string1_len and string2_len are the length of the two training strings and n_features the 72 | number of features. 73 | 74 | y : array-like, shape (n_samples,) 75 | Target vector relative to X. 76 | 77 | Returns 78 | ------- 79 | self : object 80 | Returns self. 81 | """ 82 | self.classes = list(set(y)) 83 | n_points = len(y) 84 | if len(X) != n_points: 85 | raise Exception('Number of training points should be the same as training labels.') 86 | 87 | if not self._state_machine: 88 | self._state_machine = DefaultStateMachine(self.classes) 89 | 90 | # Initialize the parameters given the state machine, features, and target classes. 
        self.parameters = self._initialize_parameters(self._state_machine, X[0].shape[2])

        # Create a new model object for each training example
        models = [_Model(self._state_machine, x, ty) for x, ty in zip(X, y)]

        self._evaluation_count = 0

        def _objective(parameters):
            gradient = np.zeros(self.parameters.shape)
            ll = 0.0  # Log likelihood
            # TODO: Embarrassingly parallel
            for model in models:
                dll, dgradient = model.forward_backward(parameters.reshape(self.parameters.shape))
                ll += dll
                gradient += dgradient

            parameters_without_bias = np.array(parameters, dtype='float64')  # exclude the bias parameters from being regularized
            parameters_without_bias[0] = 0
            ll -= self.l2_regularization * np.dot(parameters_without_bias.T, parameters_without_bias)
            gradient = gradient.flatten() - 2.0 * self.l2_regularization * parameters_without_bias

            if verbosity > 0:
                if self._evaluation_count == 0:
                    print('{:10} {:10} {:10}'.format('Iteration', 'Log-likelihood', '|gradient|'))
                if self._evaluation_count % verbosity == 0:
                    print('{:10} {:10.4} {:10.4}'.format(self._evaluation_count, ll, (abs(gradient).sum())))
            self._evaluation_count += 1

            # TODO: Allow some of the parameters to be frozen. ie. not trained. Can later also completely remove
            # TODO: the computation associated with these parameters.
            return -ll, -gradient

        def _objective_copy_gradient(parameters, g):
            nll, ngradient = _objective(parameters)
            g[:] = ngradient
            return nll

        if self._optimizer:
            self.optimizer_result = self._optimizer(_objective, self.parameters.flatten(), **self._optimizer_kwargs)
            self.parameters = self.optimizer_result[0].reshape(self.parameters.shape)
        else:
            optimizer = lbfgs.LBFGS()
            final_betas = optimizer.minimize(_objective_copy_gradient,
                                             x0=self.parameters.flatten(),
                                             progress=None)
            self.optimizer_result = final_betas
            self.parameters = final_betas.reshape(self.parameters.shape)
        return self

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Parameters
        ----------
        X : List of ndarrays, one for each training example.
            Each training example's shape is (string1_len, string2_len, n_features), where
            string1_len and string2_len are the length of the two training strings and n_features the
            number of features.

        Returns
        -------
        T : array-like, shape = [n_samples, n_classes]
            Returns the probability of the sample for each class in the model,
            where classes are ordered as they are in ``self.classes``.
        """

        parameters = np.ascontiguousarray(self.parameters.T)

        predictions = [_Model(self._state_machine, x).predict(parameters, self.viterbi)
                       for x in X]
        predictions = np.array([[probability
                                 for _, probability
                                 in sorted(prediction.items())]
                                for prediction in predictions])
        return predictions

    def predict(self, X):
        """Predict the class for X.

        The predicted class for each sample in X is returned.

        Parameters
        ----------
        X : List of ndarrays, one for each training example.
            Each training example's shape is (string1_len,
            string2_len, n_features), where string1_len and
            string2_len are the length of the two training strings and
            n_features the number of features.

        Returns
        -------
        y : iterable of shape = [n_samples]
            The predicted classes.

        """
        return [self.classes[prediction.argmax()] for prediction in self.predict_proba(X)]

    @staticmethod
    def _initialize_parameters(state_machine, n_features):
        """ Helper to create initial parameter vector with the correct shape. """
        return np.zeros((state_machine.n_states
                         + state_machine.n_transitions,
                         n_features))

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep: boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return {'l2_regularization': self.l2_regularization,
                'optimizer': self._optimizer,
                'optimizer_kwargs': self._optimizer_kwargs}

    def set_params(self, l2_regularization=0.0, optimizer=None, optimizer_kwargs=None):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        self.l2_regularization = l2_regularization
        self._optimizer = optimizer
        self._optimizer_kwargs = optimizer_kwargs
        return self


class _Model(object):
    """ The actual model that implements the inference routines. """
    def __init__(self, state_machine, x, y=None):
        self.state_machine = state_machine
        self.states_to_classes = state_machine.states_to_classes
        self.x = x
        self.sparse_x = 'uninitialized'
        self.y = y
        self._lattice = self.state_machine.build_lattice(self.x)

    def forward_backward(self, parameters):
        """ Run the forward backward algorithm with the given parameters. """
        # If the features are sparse, we can use an optimization.
        # I'm not using scipy.sparse here because we want to avoid a scipy dependency, and scipy.sparse doesn't seem
        # to handle arrays of shape higher than 2.
        if isinstance(self.sparse_x, str) and self.sparse_x == 'uninitialized':
            if (self.x == 0).sum() * 1.0 / self.x.size > 0.6:
                self.sparse_x = self._construct_sparse_features(self.x)
            else:
                self.sparse_x = 'not sparse'

        I, J, K = self.x.shape
        if not isinstance(self.sparse_x, str):
            C = self.sparse_x[0].shape[2]
            S, _ = parameters.shape
            x_dot_parameters = np.zeros((I, J, S))
            sparse_multiply(x_dot_parameters, self.sparse_x[0], self.sparse_x[1], parameters.T, I, J, K, C, S)
        else:
            x_dot_parameters = np.dot(self.x, parameters.T)  # Pre-compute the dot product
        alpha = self._forward(x_dot_parameters)
        beta = self._backward(x_dot_parameters)
        classes_to_ints = {k: i for i, k in enumerate(set(self.states_to_classes.values()))}
        states_to_classes = np.array([classes_to_ints[self.states_to_classes[state]]
                                      for state in range(max(self.states_to_classes.keys()) + 1)], dtype='int64')
        if not isinstance(self.sparse_x, str):
            ll, deriv = gradient_sparse(alpha, beta, parameters, states_to_classes,
                                        self.sparse_x[0], self.sparse_x[1], classes_to_ints[self.y],
                                        I, J, self.sparse_x[0].shape[2])
        else:
            ll, deriv = gradient(alpha, beta, parameters, states_to_classes,
                                 self.x, classes_to_ints[self.y], I, J, K)
        return ll, deriv

    def predict(self, parameters, viterbi):
        """ Run the forward algorithm to find the predicted distribution over classes. """
        x_dot_parameters = np.einsum('ijk,kl->ijl', self.x, parameters)

        if not viterbi:
            alpha = forward_predict(self._lattice, x_dot_parameters,
                                    self.state_machine.n_states)
        else:
            alpha = forward_max_predict(self._lattice, x_dot_parameters,
                                        self.state_machine.n_states)

        I, J, _ = self.x.shape

        class_Z = {}
        Z = -np.inf

        for state, predicted_class in self.states_to_classes.items():
            weight = alpha[I - 1, J - 1, state]
            class_Z[self.states_to_classes[state]] = weight
            Z = np.logaddexp(Z, weight)

        return {label: np.exp(class_z - Z) for label, class_z in class_Z.items()}

    def _forward(self, x_dot_parameters):
        """ Helper to calculate the forward weights. """
        return forward(self._lattice, x_dot_parameters,
                       self.state_machine.n_states)

    def _backward(self, x_dot_parameters):
        """ Helper to calculate the backward weights. """
        I, J, _ = self.x.shape
        return backward(self._lattice, x_dot_parameters, I, J,
                        self.state_machine.n_states)

    def _construct_sparse_features(self, x):
        """ Helper to construct a sparse representation of the features. """
        I, J, K = x.shape
        new_array_height = (x != 0).sum(axis=2).max()
        index_array = -np.ones((I, J, new_array_height), dtype='int64')
        value_array = -np.ones((I, J, new_array_height), dtype='float64')
        populate_sparse_features(x, index_array, value_array, I, J, K)
        return index_array, value_array
--------------------------------------------------------------------------------
/pyhacrf/tests/test_model.py:
--------------------------------------------------------------------------------
""" Tests for the model. """
""" 2 | 3 | import unittest 4 | 5 | from numpy.testing import assert_array_almost_equal, assert_array_equal 6 | import numpy as np 7 | from numpy import random 8 | from pyhacrf import Hacrf 9 | from pyhacrf.state_machine import GeneralStateMachine, DefaultStateMachine 10 | from pyhacrf.pyhacrf import _Model 11 | from pyhacrf import StringPairFeatureExtractor 12 | 13 | TEST_PRECISION = 3 14 | 15 | 16 | class TestHacrf(unittest.TestCase): 17 | def test_initialize_parameters(self): 18 | start_states = [0] 19 | transitions = [(0, 0, (1, 1)), 20 | (0, 1, (0, 1)), 21 | (0, 0, (1, 0))] 22 | states_to_classes = {0: 'a'} 23 | state_machine = GeneralStateMachine(start_states=start_states, 24 | transitions=transitions, 25 | states_to_classes=states_to_classes) 26 | 27 | n_features = 3 28 | 29 | actual_parameters = Hacrf._initialize_parameters(state_machine, n_features) 30 | expected_parameter_shape = (5, 3) 31 | self.assertEqual(actual_parameters.shape, expected_parameter_shape) 32 | 33 | def test_default_state_machine(self): 34 | classes = ['a', 'b'] 35 | expected_start_states, expected_transitions =\ 36 | ([0, 1], 37 | [(0, 0, (1, 1)), 38 | (1, 1, (1, 1)), 39 | (0, 0, (0, 1)), 40 | (1, 1, (0, 1)), 41 | (0, 0, (1, 0)), 42 | (1, 1, (1, 0))]) 43 | expected_states_to_classes = {0: 'a', 1: 'b'} 44 | state_machine = DefaultStateMachine(classes) 45 | self.assertEqual(state_machine._start_states, 46 | expected_start_states) 47 | self.assertEqual(state_machine._transitions, 48 | expected_transitions) 49 | self.assertEqual(state_machine.states_to_classes, 50 | expected_states_to_classes) 51 | 52 | def test_fit_predict(self): 53 | incorrect = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 'hubby', 'krazii', 'mite', 'tropic'] 54 | correct = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic'] 55 | training = zip(incorrect, correct) 56 | 57 | fe = StringPairFeatureExtractor(match=True, numeric=True) 58 | xf = fe.fit_transform(training) 59 | 60 | model = Hacrf() 61 | model.fit(xf, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) 62 | 63 | expected_parameters = np.array([[-10.76945326, 144.03414923, 0.], 64 | [31.84369748, -106.41885651, 0.], 65 | [-52.08919467, 4.56943665, 0.], 66 | [31.01495044, -13.0593297, 0.], 67 | [49.77302218, -6.42566204, 0.], 68 | [-28.69877796, 24.47127009, 0.], 69 | [-85.34524911, 21.87370646, 0.], 70 | [106.41949333, 6.18587125, 0.]]) 71 | print(model.parameters) 72 | assert_array_almost_equal(model.parameters, expected_parameters, 73 | decimal=TEST_PRECISION) 74 | 75 | expected_probas = np.array([[1.00000000e+000, 3.51235685e-039], 76 | [1.00000000e+000, 4.79716208e-039], 77 | [1.00000000e+000, 2.82744641e-139], 78 | [1.00000000e+000, 6.49580729e-012], 79 | [9.99933798e-001, 6.62022561e-005], 80 | [8.78935957e-005, 9.99912106e-001], 81 | [4.84538335e-009, 9.99999995e-001], 82 | [1.25170233e-250, 1.00000000e+000], 83 | [2.46673086e-010, 1.00000000e+000], 84 | [1.03521293e-033, 1.00000000e+000]]) 85 | actual_predict_probas = model.predict_proba(xf) 86 | print(actual_predict_probas) 87 | assert_array_almost_equal(actual_predict_probas, expected_probas, 88 | decimal=TEST_PRECISION) 89 | 90 | expected_predictions = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) 91 | actual_predictions = model.predict(xf) 92 | assert_array_almost_equal(actual_predictions, expected_predictions, 93 | decimal=TEST_PRECISION) 94 | 95 | def test_fit_predict_regularized(self): 96 | incorrect = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 
                     'hubby', 'krazii', 'mite', 'tropic']
        correct = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic']
        training = zip(incorrect, correct)

        fe = StringPairFeatureExtractor(match=True, numeric=True)
        xf = fe.fit_transform(training)

        model = Hacrf(l2_regularization=10.0)
        model.fit(xf, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        print(model.parameters)

        expected_parameters = np.array([[-0.0569188, 0.07413339, 0.],
                                        [0.00187709, -0.06377866, 0.],
                                        [-0.01908823, 0.00586189, 0.],
                                        [0.01721114, -0.00636556, 0.],
                                        [0.01578279, 0.0078614, 0.],
                                        [-0.0139057, -0.00862948, 0.],
                                        [-0.00623241, 0.02937325, 0.],
                                        [0.00810951, -0.01774676, 0.]])
        assert_array_almost_equal(model.parameters, expected_parameters,
                                  decimal=TEST_PRECISION)

        expected_probas = np.array([[0.5227226, 0.4772774],
                                    [0.52568993, 0.47431007],
                                    [0.4547091, 0.5452909],
                                    [0.51179222, 0.48820778],
                                    [0.46347576, 0.53652424],
                                    [0.45710098, 0.54289902],
                                    [0.46159657, 0.53840343],
                                    [0.42997978, 0.57002022],
                                    [0.47419724, 0.52580276],
                                    [0.50797852, 0.49202148]])
        actual_predict_probas = model.predict_proba(xf)
        print(actual_predict_probas)
        assert_array_almost_equal(actual_predict_probas, expected_probas,
                                  decimal=TEST_PRECISION)

        expected_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        actual_predictions = model.predict(xf)
        assert_array_almost_equal(actual_predictions, expected_predictions,
                                  decimal=TEST_PRECISION)

    def test_fit_predict_regularized_viterbi(self):
        incorrect = ['helloooo', 'freshh', 'ffb', 'h0me', 'wonderin', 'relaionship', 'hubby', 'krazii', 'mite', 'tropic']
        correct = ['hello', 'fresh', 'facebook', 'home', 'wondering', 'relationship', 'husband', 'crazy', 'might', 'topic']
        training = zip(incorrect, correct)

        fe = StringPairFeatureExtractor(match=True, numeric=True)
        xf = fe.fit_transform(training)

        model = Hacrf(l2_regularization=10.0, viterbi=True)
        model.fit(xf, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        print(model.parameters)

        expected_parameters = np.array([[-0.0569188, 0.07413339, 0.],
                                        [0.00187709, -0.06377866, 0.],
                                        [-0.01908823, 0.00586189, 0.],
                                        [0.01721114, -0.00636556, 0.],
                                        [0.01578279, 0.0078614, 0.],
                                        [-0.0139057, -0.00862948, 0.],
                                        [-0.00623241, 0.02937325, 0.],
                                        [0.00810951, -0.01774676, 0.]])
        assert_array_almost_equal(model.parameters, expected_parameters,
                                  decimal=TEST_PRECISION)

        expected_probas = np.array([[0.56394611, 0.43605389],
                                    [0.52977205, 0.47022795],
                                    [0.4751729, 0.5248271],
                                    [0.51183761, 0.48816239],
                                    [0.48608081, 0.51391919],
                                    [0.4986367, 0.5013633],
                                    [0.46947222, 0.53052778],
                                    [0.43233544, 0.56766456],
                                    [0.47463002, 0.52536998],
                                    [0.51265109, 0.48734891]])
        actual_predict_probas = model.predict_proba(xf)
        print(actual_predict_probas)
        assert_array_almost_equal(actual_predict_probas, expected_probas,
                                  decimal=TEST_PRECISION)

        expected_predictions = np.array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0])
        actual_predictions = model.predict(xf)
        assert_array_almost_equal(actual_predictions, expected_predictions,
                                  decimal=TEST_PRECISION)


class TestModel(unittest.TestCase):
    def test_build_lattice(self):
is the maximum state index
185 | 
186 |         start_states = [0, 1]
187 |         transitions = [(0, 0, (1, 1)),
188 |                        (0, 1, (0, 1)),
189 |                        (0, 0, (1, 0)),
190 |                        (0, 3, lambda i, j, k: (0, 2))]
191 |         states_to_classes = {0: 0, 1: 1, 3: 3}
192 | 
193 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
194 |         x = np.zeros((2, 3, 9))
195 |         #               #      ________
196 |         # 1.  .  .      # 1   0 - 10 - 31
197 |         #               #     | /_______
198 |         # 0.  .  .      # 0   10 --  1     3
199 |         #   0  1  2     #      0     1     2
200 |         #
201 |         # 1(0, 1), 3(0, 2), 1(1, 1), 1(0, 0) should be pruned because they represent partial alignments.
202 |         # Only nodes that are reachable by stepping back from (1, 2) must be included in the lattice.
203 |         actual_lattice = state_machine.build_lattice(x)
204 |         expected_lattice = np.array([(0, 0, 0, 1, 0, 0, 2 + n_states),
205 |                                      (0, 0, 0, 1, 1, 0, 0 + n_states),
206 |                                      (1, 0, 0, 1, 2, 3, 3 + n_states),
207 |                                      (1, 1, 0, 1, 2, 1, 1 + n_states)])
208 |         assert_array_equal(actual_lattice, expected_lattice)
209 | 
210 |     def test_build_lattice_jumps(self):
211 |         n_states = 2  # Because 1 is the maximum state index
212 | 
213 |         start_states = [0, 1]
214 |         transitions = [(0, 0, (1, 1)),
215 |                        (0, 1, (0, 2)),
216 |                        (0, 0, (1, 0))]
217 |         states_to_classes = {0: 0, 1: 1}
218 | 
219 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
220 |         x = np.zeros((2, 3, 9))
221 |         #               #      ________
222 |         # 1.  .  .      # 1   0   .    1
223 |         #               #     | _______
224 |         # 0.  .  .      # 0   10 /  .    1
225 |         #   0  1  2     #      0    1    2
226 |         #
227 |         # 1(0, 2) should be pruned because it represents a partial alignment.
228 |         # Only nodes that are reachable by stepping back from (1, 2) must be included in the lattice.
229 |         actual_lattice = state_machine.build_lattice(x)
230 |         expected_lattice = np.array([(0, 0, 0, 1, 0, 0, 2 + n_states),
231 |                                      (1, 0, 0, 1, 2, 1, 1 + n_states)])
232 |         assert_array_equal(actual_lattice, expected_lattice)
233 | 
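    # Key format shared by build_lattice output and the alpha/beta dicts in
    # the tests below (as implied by the expected values):
    #   (i, j, s)                   -- a lattice node: position i in the
    #                                  first sequence, j in the second,
    #                                  while in state s.
    #   (i0, j0, s0, i1, j1, s1, p) -- a lattice edge from node (i0, j0, s0)
    #                                  to node (i1, j1, s1); p is the row of
    #                                  the parameter matrix scored on that
    #                                  edge (state rows come first, so
    #                                  transition rows start at n_states).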
234 |     def test_forward_single(self):
235 |         start_states = [0, 1]
236 |         transitions = [(0, 0, (1, 1)),
237 |                        (0, 1, (0, 1)),
238 |                        (0, 0, (1, 0)),
239 |                        (0, 2, lambda i, j, k: (0, 2))]
240 |         states_to_classes = {0: 'a', 1: 'a', 2: 'b'}  # Dummy
241 | 
242 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
243 | 
244 |         parameters = np.array(range(-7, 7), dtype='float64').reshape((7, 2))
245 |         # parameters =
246 |         # 0([[-7, -6],
247 |         # 1  [-5, -4],
248 |         # 2  [-3, -2],
249 |         # 3  [-1,  0],
250 |         # 4  [ 1,  2],
251 |         # 5  [ 3,  4],
252 |         # 6  [ 5,  6]])
253 |         x = np.array([[[0, 1],
254 |                        [1, 0],
255 |                        [2, 1]],
256 |                       [[0, 1],
257 |                        [1, 0],
258 |                        [1, 0]]])
259 |         y = 'a'
260 |         # Expected lattice:
261 |         #               #      ________
262 |         # 1.  .  .      # 1   0 __0 - 21
263 |         #               #     | /
264 |         # 0.  .  .      # 0   0
265 |         #   0  1  2     #      0    1    2
266 |         expected_alpha = {
267 |             (0, 0, 0): np.exp(-6),
268 |             (0, 0, 0, 1, 0, 0, 5): np.exp(-6) * np.exp(4),
269 |             (0, 0, 0, 1, 1, 0, 3): np.exp(-6) * np.exp(-1),
270 |             (1, 0, 0): np.exp(-6) * np.exp(4) * np.exp(-6),
271 |             (1, 0, 0, 1, 2, 2, 6): np.exp(-6) * np.exp(4) * np.exp(-6) * np.exp(5),
272 |             (1, 1, 0): np.exp(-6) * np.exp(-1) * np.exp(-7),
273 |             (1, 1, 0, 1, 2, 1, 4): np.exp(-6) * np.exp(-1) * np.exp(-7) * np.exp(1),
274 |             (1, 2, 1): np.exp(-6) * np.exp(-1) * np.exp(-7) * np.exp(1) * np.exp(-5),
275 |             (1, 2, 2): np.exp(-6) * np.exp(4) * np.exp(-6) * np.exp(5) * np.exp(-3)
276 |         }
277 |         expected_alpha = {k: np.emath.log(v) for k, v in expected_alpha.items()}
278 |         test_model = _Model(state_machine, x, y)
279 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
280 |         actual_alpha = test_model._forward(x_dot_parameters)
281 | 
282 |         actual_alpha = {k: v for k, v in actual_alpha.items()
283 |                         if not np.isneginf(v)}
284 |         print(actual_alpha)
285 | 
286 |         self.assertEqual(len(actual_alpha), len(expected_alpha))
287 |         print()
288 |         for key in sorted(expected_alpha.keys()):
289 |             print(key, expected_alpha[key], actual_alpha[key])
290 |             self.assertAlmostEqual(actual_alpha[key], expected_alpha[key])
291 | 
292 |     def test_forward_connected(self):
293 |         classes = ['a', 'b']
294 |         parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2))
295 |         # parameters =
296 |         # 0([[-8, -7],
297 |         # 1  [-6, -5],
298 |         # 2  [-4, -3],
299 |         # 3  [-2, -1],
300 |         # 4  [ 0,  1],
301 |         # 5  [ 2,  3],
302 |         # 6  [ 4,  5],
303 |         # 7  [ 6,  7]])
304 |         x = np.array([[[0, 1],
305 |                        [2, 1]],
306 |                       [[0, 1],
307 |                        [1, 0]]])
308 |         y = 'a'
309 |         expected_alpha = {
310 |             (0, 0, 0): np.exp(-7),
311 |             (0, 0, 0, 0, 1, 0, 4): np.exp(-7) * np.exp(1),
312 |             (0, 0, 0, 1, 0, 0, 6): np.exp(-7) * np.exp(5),
313 |             (0, 0, 0, 1, 1, 0, 2): np.exp(-7) * np.exp(-4),
314 |             (0, 0, 1): np.exp(-5),
315 |             (0, 0, 1, 0, 1, 1, 5): np.exp(-5) * np.exp(7),
316 |             (0, 0, 1, 1, 0, 1, 7): np.exp(-5) * np.exp(7),
317 |             (0, 0, 1, 1, 1, 1, 3): np.exp(-5) * np.exp(-2),
318 |             (0, 1, 0): np.exp(-7) * np.exp(1) * np.exp(-23),
319 |             (0, 1, 0, 1, 1, 0, 6): np.exp(-7) * np.exp(1) * np.exp(-23) * np.exp(4),
320 |             (0, 1, 1): np.exp(-5) * np.exp(7) * np.exp(-17),
321 |             (0, 1, 1, 1, 1, 1, 7): np.exp(-5) * np.exp(7) * np.exp(-17) * np.exp(6),
322 |             (1, 0, 0): np.exp(-7) * np.exp(5) * np.exp(-7),
323 |             (1, 0, 0, 1, 1, 0, 4): np.exp(-7) * np.exp(5) * np.exp(-7) * np.exp(0),
324 |             (1, 0, 1): np.exp(-5) * np.exp(7) * np.exp(-5),
325 |             (1, 0, 1, 1, 1, 1, 5): np.exp(-5) * np.exp(7) * np.exp(-5) * np.exp(2),
326 |             (1, 1, 0): (np.exp(-11) + np.exp(-25) + np.exp(-9)) * np.exp(-8),
327 |             (1, 1, 1): (np.exp(-1) + np.exp(-9) + np.exp(-7)) * np.exp(-6)
328 |         }
329 |         expected_alpha = {k: np.emath.log(v) for k, v in expected_alpha.items()}
330 | 
331 |         state_machine = DefaultStateMachine(classes)
332 |         print()
333 |         test_model = _Model(state_machine, x, y)
334 |         for s in test_model._lattice:
335 |             print(s)
336 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
337 |         actual_alpha = test_model._forward(x_dot_parameters)
338 | 
339 |         self.assertEqual(len(actual_alpha), len(expected_alpha))
340 |         for key in sorted(expected_alpha.keys()):
341 |             print(key, expected_alpha[key], actual_alpha[key])
342 |             self.assertAlmostEqual(actual_alpha[key], expected_alpha[key])
343 | 
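    # The recursion checked above: a node's alpha is the sum of its incoming
    # edge alphas times exp(its own potential) -- see (1, 1, 0) in
    # test_forward_connected, which adds three incoming edge alphas before
    # multiplying in exp(-8) -- and an edge's alpha is its source node's
    # alpha times exp(the edge potential). The betas below run the
    # mirror-image recursion from the far corner of the lattice.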
344 |     def test_backward_connected(self):
345 |         parameters = np.array(range(-3, 3), dtype='float64').reshape((3, 2))
346 |         # parameters =
347 |         # 0([[-3, -2],
348 |         # 1  [-1,  0],
349 |         # 2  [ 1,  2]])
350 |         x = np.array([[[0, 1],
351 |                        [2, 1]],
352 |                       [[0, 1],
353 |                        [1, 0]]])
354 |         y = 'a'
355 |         expected_beta = {
356 |             (0, 0, 0): (np.exp(-4) + np.exp(-12)),  # * np.exp(-2),
357 |             (0, 0, 0, 0, 1, 0, 1): np.exp(-3) * np.exp(1) * np.exp(-8),  # * np.exp(-2),
358 |             (0, 0, 0, 1, 0, 0, 2): np.exp(-3) * np.exp(-1) * np.exp(-2),  # * np.exp(2),
359 |             (0, 1, 0): np.exp(-3) * np.exp(1),  # * np.exp(-8),
360 |             (0, 1, 0, 1, 1, 0, 2): np.exp(-3),  # * np.exp(1),
361 |             (1, 0, 0): np.exp(-3) * np.exp(-1),  # * np.exp(-2),
362 |             (1, 0, 0, 1, 1, 0, 1): np.exp(-3),  # * np.exp(-1),
363 |             (1, 1, 0): 1.0  # np.exp(-3)
364 |         }
365 |         expected_beta = {k: np.emath.log(v) for k, v in expected_beta.items()}
366 | 
367 |         start_states = [0]
368 |         transitions = [(0, 0, (0, 1)),
369 |                        (0, 0, (1, 0))]
370 |         states_to_classes = {0: 'a'}
371 |         n_states = 1
372 | 
373 |         state_machine = GeneralStateMachine(start_states, transitions, states_to_classes)
374 | 
375 |         test_model = _Model(state_machine, x, y)
376 |         for s in test_model._lattice:
377 |             print(s)
378 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
379 |         actual_beta = test_model._backward(x_dot_parameters)
380 |         print(actual_beta)
381 | 
382 |         print()
383 |         self.assertEqual(len(actual_beta), len(expected_beta))
384 |         for key in sorted(expected_beta.keys(), reverse=True):
385 |             print(key, expected_beta[key], actual_beta[key])
386 |             self.assertAlmostEqual(actual_beta[key], expected_beta[key])
387 | 
388 |     def test_forward_backward_same_partition_value(self):
389 |         classes = ['a', 'b']
390 |         parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2))
391 |         x = np.array([[[0, 1],
392 |                        [2, 1]],
393 |                       [[0, 1],
394 |                        [1, 0]]])
395 |         y = 'a'
396 |         state_machine = DefaultStateMachine(classes)
397 |         test_model = _Model(state_machine, x, y)
398 |         x_dot_parameters = np.dot(x, parameters.T)  # Pre-compute the dot product
399 |         actual_alpha = test_model._forward(x_dot_parameters)
400 |         actual_beta = test_model._backward(x_dot_parameters)
401 | 
402 |         print(actual_alpha[(1, 1, 0)], actual_beta[(0, 0, 0)])
403 |         print(actual_alpha[(1, 1, 1)], actual_beta[(0, 0, 1)])
404 |         self.assertAlmostEqual(actual_alpha[(1, 1, 0)], actual_beta[(0, 0, 0)] + (np.dot(x[0, 0, :], parameters[0, :])))
405 |         self.assertAlmostEqual(actual_alpha[(1, 1, 1)], actual_beta[(0, 0, 1)] + (np.dot(x[0, 0, :], parameters[1, :])))
406 | 
407 |     def test_derivate_chain(self):
408 |         classes = ['a', 'b']
409 |         parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2))
410 |         # parameters =
411 |         # 0([[-8, -7],
412 |         # 1  [-6, -5],
413 |         # 2  [-4, -3],
414 |         # 3  [-2, -1],
415 |         # 4  [ 0,  1],
416 |         # 5  [ 2,  3],
417 |         # 6  [ 4,  5],
418 |         # 7  [ 6,  7]])
419 |         x = np.array([[[0, 1],
420 |                        [1, 2]]], dtype='float64')
421 |         y = 'a'
422 |         state_machine = DefaultStateMachine(classes)
423 |         test_model = _Model(state_machine, x, y)
424 |         print(test_model._lattice)
425 |         #
426 |         # 0  01 --- 01
427 |         #    0      1
428 |         # states_to_classes = {0: 'a', 1: 'b'}
429 |         # (0, 0, 0) : exp(-7)
430 |         # (0, 0, 0, 0, 1, 0, 4) : exp(-7) * exp(2)
431 |         # (0, 0, 1) : exp(-5)
432 |         # (0, 0, 1, 0, 1, 1, 5) : exp(-5) * exp(8)
433 |         # (0, 1, 0) : exp(-7) * exp(2) * exp(-8 - 14) = exp(-27)
434 |         # (0, 1, 1) : exp(-5) * exp(8) * exp(-6 - 10) = exp(-13)
435 |         # p(y|G,X) = f0(g00,g01,x00,x01,y) f1(g40,g41,x10,x11,y) f2(g00,g01,x00,x01,y) +
436 |         #            f0(g10,g11,x00,x01,y) f1(g50,g51,x10,x11,y) f2(g10,g11,x00,x01,y)
437 |         #          = exp(-27) / (exp(-27) + exp(-13))
438 |         expected_ll = np.emath.log(np.exp(-27) / (np.exp(-27) +
np.exp(-13))) 439 | expected_dll = np.zeros(parameters.shape) 440 | 441 | # Finite difference gradient approximation 442 | delta = 10.0**-7 443 | S, D = expected_dll.shape 444 | for s in range(S): 445 | for d in range(D): 446 | dg = np.zeros(parameters.shape) 447 | dg[s, d] = delta 448 | y0, _ = test_model.forward_backward(parameters) 449 | y1, _ = test_model.forward_backward(parameters + dg) 450 | print(s, d, y0, y1) 451 | expected_dll[s, d] = (y1 - y0) / delta 452 | 453 | actual_ll, actual_dll = test_model.forward_backward(parameters) 454 | 455 | print(expected_ll, actual_ll) 456 | print(expected_dll) 457 | print(actual_dll) 458 | self.assertAlmostEqual(actual_ll, expected_ll) 459 | assert_array_almost_equal(actual_dll, expected_dll, decimal=TEST_PRECISION) 460 | 461 | def test_derivate_medium(self): 462 | classes = ['a', 'b'] 463 | parameters = np.array(range(-8, 8), dtype='float64').reshape((8, 2)) 464 | x = np.array([[[0, 1], 465 | [2, 1]], 466 | [[0, 1], 467 | [1, 0.0]]]) 468 | y = 'a' 469 | state_machine = DefaultStateMachine(classes) 470 | test_model = _Model(state_machine, x, y) 471 | print(test_model._lattice) 472 | 473 | expected_dll = np.zeros(parameters.shape) 474 | 475 | # Finite difference gradient approximation 476 | delta = 10.0**-7 477 | S, D = expected_dll.shape 478 | for s in range(S): 479 | for d in range(D): 480 | dg = np.zeros(parameters.shape) 481 | dg[s, d] = delta 482 | y0, _ = test_model.forward_backward(parameters) 483 | y1, _ = test_model.forward_backward(parameters + dg) 484 | print(s, d, y0, y1) 485 | expected_dll[s, d] = (y1 - y0) / delta 486 | 487 | actual_ll, actual_dll = test_model.forward_backward(parameters) 488 | 489 | print(expected_dll) 490 | print(actual_dll) 491 | assert_array_almost_equal(actual_dll, expected_dll, decimal=TEST_PRECISION) 492 | 493 | def test_derivate_large(self): 494 | classes = ['a', 'b', 'c'] 495 | y = 'b' 496 | x = random.randn(8, 3, 10) * 5 + 3 497 | state_machine = DefaultStateMachine(classes) 498 | parameters = Hacrf._initialize_parameters(state_machine, x.shape[2]) 499 | parameters = random.randn(*parameters.shape) * 10 - 2 500 | 501 | test_model = _Model(state_machine, x, y) 502 | print(test_model._lattice) 503 | 504 | expected_dll = np.zeros(parameters.shape) 505 | 506 | # Finite difference gradient approximation 507 | delta = 10.0**-7 508 | S, D = expected_dll.shape 509 | for s in range(S): 510 | for d in range(D): 511 | dg = np.zeros(parameters.shape) 512 | dg[s, d] = delta 513 | y0, _ = test_model.forward_backward(parameters) 514 | y1, _ = test_model.forward_backward(parameters + dg) 515 | print(s, d, y0, y1) 516 | expected_dll[s, d] = (y1 - y0) / delta 517 | 518 | actual_ll, actual_dll = test_model.forward_backward(parameters) 519 | 520 | print(expected_dll) 521 | print(actual_dll) 522 | self.assertEqual((np.isnan(actual_dll)).any(), False) 523 | assert_array_almost_equal(actual_dll, expected_dll, decimal=TEST_PRECISION) 524 | 525 | if __name__ == '__main__': 526 | unittest.main() 527 | -------------------------------------------------------------------------------- /examples/Highered dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:abf0ae00f3fa6aac52649ff752418b6a4d46aeb22019b490dbb11e667c93d006" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "%load_ext vimception" 16 | ], 17 | 
"language": "python", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "javascript": [ 22 | "\n", 23 | "var cmd = IPython.keyboard_manager.command_shortcuts;\n", 24 | "var edit = IPython.keyboard_manager.edit_shortcuts;\n", 25 | "var def_cmd = IPython.default_command_shortcuts;\n", 26 | "var def_edit = IPython.default_edit_shortcuts;\n", 27 | "\n", 28 | "// get the code mirror editor of a curently selected cell\n", 29 | "function C() { return IPython.notebook.get_selected_cell().code_mirror; };\n", 30 | "\n", 31 | "// Change the mode of all current and future CodeMirror instances\n", 32 | "// Emacs users can use this function as just to('emacs') so long as they've\n", 33 | "// required/loaded emacs.js from CodeMirror\n", 34 | "function to(mode) {\n", 35 | " var mode = mode || 'vim'\n", 36 | " // first let's apply vim mode to all current cells\n", 37 | " function to_mode(c) { return c.code_mirror.setOption('keyMap', mode);};\n", 38 | " IPython.notebook.get_cells().map(to_mode);\n", 39 | " // apply the mode to future cells created\n", 40 | " IPython.Cell.options_default.cm_config.keyMap = mode;\n", 41 | "}\n", 42 | "\n", 43 | "function getCSS(path) {\n", 44 | " $('', {\n", 45 | " rel: 'stylesheet',\n", 46 | " type: 'text/css',\n", 47 | " href: path,\n", 48 | " }).appendTo('head');\n", 49 | "}\n", 50 | "\n", 51 | "// I messed around with trying to get requireJS going here, but gave up and\n", 52 | "// just using this answer from SO \n", 53 | "// http://stackoverflow.com/questions/11803215/how-to-include-multiple-js-files-using-jquery-getscript-method\n", 54 | "\n", 55 | "var p = \"/static/components/codemirror/addon/\";\n", 56 | "\n", 57 | "$.when(\n", 58 | "// Grab the CodeMirror vim keymap\n", 59 | "$.getScript(p + \"../keymap/vim.js\"),\n", 60 | "// also make search work via /\n", 61 | "$.getScript(p + \"search/search.js\"),\n", 62 | "$.getScript(p + \"search/searchcursor.js\"),\n", 63 | "\n", 64 | "// TODO: hook-up gq to perform a harwrap\n", 65 | "$.getScript(p + \"wrap/hardwrap.js\"),\n", 66 | "$.getScript(p + \"selection/active-line.js\"),\n", 67 | "\n", 68 | "$.getScript(p + \"display/fullscreen.js\"),\n", 69 | "getCSS(p + \"display/fullscreen.css\"),\n", 70 | "getCSS(p + \"dialog/dialog.css\"),\n", 71 | "$.getScript(p + \"dialog/dialog.js\"),\n", 72 | "\n", 73 | "\n", 74 | " $.Deferred(function( deferred ){\n", 75 | " $( deferred.resolve );\n", 76 | " })\n", 77 | ").then(function success(){\n", 78 | "\n", 79 | "console.log('Great success');\n", 80 | "\n", 81 | "IPython.CodeCell.options_default.cm_config.foldGutter = true;\n", 82 | "IPython.CodeCell.options_default.cm_config.gutters = [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"];\n", 83 | "\n", 84 | "IPython.Cell.prototype.at_top = function () {\n", 85 | " var cm = this.code_mirror;\n", 86 | " var cursor = cm.getCursor();\n", 87 | " if (cursor.line === 0) {\n", 88 | " return true;\n", 89 | " }\n", 90 | " return false;\n", 91 | " };\n", 92 | "\n", 93 | "\n", 94 | "IPython.Cell.prototype.at_bottom = function () {\n", 95 | " var cm = this.code_mirror;\n", 96 | " var cursor = cm.getCursor();\n", 97 | " if (cursor.line === (cm.lineCount()-1)) {\n", 98 | " return true;\n", 99 | " }\n", 100 | " return false;\n", 101 | "};\n", 102 | "// on all code mirror instances on this page, apply the function f\n", 103 | "function all_cm(f) {\n", 104 | " // apply f to every code mirror instance. 
f takes one parameter\n", 105 | " IPython.notebook.get_cells().map(function (c) { f(c.code_mirror); } );\n", 106 | "}\n", 107 | "\n", 108 | "\n", 109 | "to('vim');\n", 110 | "function vim_up(event) {\n", 111 | " var cell = IPython.notebook.get_selected_cell();\n", 112 | " if (cell && cell.at_top() && cell.code_mirror.options.keyMap === 'vim') {\n", 113 | " console.log('inside the business logic k');\n", 114 | " event.preventDefault();\n", 115 | " IPython.notebook.command_mode()\n", 116 | " IPython.notebook.select_prev();\n", 117 | " IPython.notebook.edit_mode();\n", 118 | " return false;\n", 119 | " };\n", 120 | "}\n", 121 | "\n", 122 | "function vim_down(event) {\n", 123 | " var cell = IPython.notebook.get_selected_cell();\n", 124 | " if (cell && cell.at_bottom() && cell.code_mirror.options.keyMap === 'vim') {\n", 125 | " event.preventDefault();\n", 126 | " IPython.notebook.command_mode()\n", 127 | " IPython.notebook.select_next();\n", 128 | " IPython.notebook.edit_mode();\n", 129 | " return false;\n", 130 | " };\n", 131 | " }\n", 132 | "\n", 133 | "var m = '(vim) '\n", 134 | "var edit_shortcuts = {\n", 135 | " 'k' : {\n", 136 | " help : m + 'up a line, even across cells',\n", 137 | " help_index : 'AA',\n", 138 | " handler : vim_up\n", 139 | " },\n", 140 | " 'j' : {\n", 141 | " help : m + 'down a line, even across cells',\n", 142 | " help_index : 'AA',\n", 143 | " handler : vim_down\n", 144 | " },\n", 145 | "\n", 146 | "};\n", 147 | "\n", 148 | "var command_shortcuts = {\n", 149 | " 'c' : {\n", 150 | " help : m + def_cmd['y'].help,\n", 151 | " help_index : 'AA',\n", 152 | " handler : def_cmd['y'].handler\n", 153 | " }\n", 154 | "\n", 155 | "\n", 156 | "};\n", 157 | "\n", 158 | "edit.add_shortcuts(edit_shortcuts);\n", 159 | "cmd.add_shortcuts(command_shortcuts);\n", 160 | "//edit.add_shortcuts('k', def_edit['up'].handler);\n", 161 | "//edit.add_shortcut('j', def_edit['down'].handler);\n", 162 | "\n", 163 | "// N.B. 
This code looks fairly simple, but it took me forever to \n", 164 | "// figure out how to do this, \n", 165 | "// \n", 166 | "// there's a problem here, Ctrl-[ is already handled by CodeMirror by the time we \n", 167 | "// (IPython.keyboard_manager) get it CodeMirror issues signals on mode change, \n", 168 | "// so we have to hook into that to get Ctrl-[\n", 169 | "edit.remove_shortcut('Ctrl-[');\n", 170 | "edit.remove_shortcut('Esc');\n", 171 | "\n", 172 | "CodeMirror.commands.leaveInsertOrEdit = function (cm) {\n", 173 | " if ( cm.state.vim.insertMode ) {\n", 174 | " // do magic here to get out of insert mode\n", 175 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 176 | " } else {\n", 177 | " IPython.notebook.command_mode();\n", 178 | " IPython.notebook.focus_cell();\n", 179 | " }\n", 180 | "};\n", 181 | " \n", 182 | "//C().options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 183 | "all_cm( function (cm) {\n", 184 | " cm.options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 185 | " cm.options.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 186 | " if ( CodeMirror.defaults.extraKeys === null ) { \n", 187 | " CodeMirror.defaults.extraKeys = {};\n", 188 | " }\n", 189 | " // TODO: make this change permanent\n", 190 | " // this part seems to be ignore when adding a new cell\n", 191 | " // - alternative solution would be to listen for NewCell events and rerun the CM function on it\n", 192 | " // - it could also be the case that when we instatiate CodeMirror, we somehow leave out CM.defaults.extraKeys\n", 193 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 194 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 195 | " IPython.CodeCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 196 | " IPython.TextCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 197 | "})\n", 198 | "\n", 199 | "// On blur, make sure we go back to command mode for CodeMirror (in case user clicked away)\n", 200 | "// TODO: Make this permanent - how to get CodeMirror to do this for new cells created after\n", 201 | "all_cm( function (cm) {\n", 202 | " cm.on('blur', function(cm) {\n", 203 | " // TODO: I wish I understood a better way to do this, but fake pressing Escape work\n", 204 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 205 | " CodeMirror.keyMap['vim']['Esc'](cm);\n", 206 | " cm.setOption('styleActiveLine', false);\n", 207 | " if (cm.getOption(\"fullScreen\")) {\n", 208 | " cm.setOption('fullScreen', false); \n", 209 | " // fullScreen the newly selected code mirror (doesn't work)\n", 210 | " //setTimeout(100, function() {\n", 211 | " // console.log(IPython.notebook.get_selected_cell().code_mirror);\n", 212 | " // IPython.notebook.get_selected_cell().code_mirror.setOption('fullScreen', true); \n", 213 | " //});\n", 214 | " }\n", 215 | " });\n", 216 | " cm.on('focus', function(cm) {\n", 217 | " cm.setOption('styleActiveLine', true);\n", 218 | " });\n", 219 | "});\n", 220 | "\n", 221 | "// 'i' by default interrupts the kernel (what Ctrl-C does at the terminal)\n", 222 | "cmd.remove_shortcut('i');\n", 223 | "cmd.add_shortcut('i', def_cmd.enter);\n", 224 | "\n", 225 | "// not quite what we want - 'i' requires a double-tap\n", 226 | "// add documentation for this.\n", 227 | "cmd.add_shortcut('ctrl-c', function(e) { IPython.notebook.kernel.interrupt(); return false});\n", 228 | "\n", 229 | "\n", 230 | "function focus_last(e) {\n", 231 | " var cells = IPython.notebook.get_cells();\n", 
232 | " cells[cells.length-1].focus_cell();\n", 233 | "};\n", 234 | "\n", 235 | "function focus_first(e) {\n", 236 | " var cells = IPython.notebook.get_cells();\n", 237 | " cells[0].focus_cell();\n", 238 | "};\n", 239 | "\n", 240 | "function combo_tap(combo, action) {\n", 241 | " var that = this;\n", 242 | " var timeout;\n", 243 | " function f() {\n", 244 | " console.log('f called once');\n", 245 | " \n", 246 | " // redo this so that when an action is performed, we restore the original combo\n", 247 | " cmd.add_shortcut(combo[1], \n", 248 | " function() { console.log(\"doing action\", combo); reset(); action(); timeout.clear();} );\n", 249 | " timeout = setTimeout(function () {\n", 250 | " console.log('resetting f');\n", 251 | " reset();\n", 252 | " //cmd.add_shortcut(combo[0], reset)\n", 253 | " }, 800);\n", 254 | " }\n", 255 | " function reset(e) {\n", 256 | " //cmd.remove_shortcut(combo[0]);\n", 257 | " console.log('reset called');\n", 258 | " //if (timeout) {\n", 259 | " // console.log('resetting aborted');\n", 260 | " // clearTimeout(timeout);\n", 261 | " // timeout = null;\n", 262 | " //}\n", 263 | " //that(combo, action); \n", 264 | " cmd.add_shortcut(combo[0], f);\n", 265 | " }\n", 266 | " console.log(\"combo tap for\", combo);\n", 267 | " \n", 268 | " reset();\n", 269 | "};\n", 270 | "cmd.add_shortcut('shift-g', focus_last);\n", 271 | "combo_tap('gg', focus_first);\n", 272 | "\n", 273 | "// XXX: the way combo tap is currently implemented, this won't work\n", 274 | "// need a more generic mechanism for combo-taps with common prefixes\n", 275 | "// combo_tap('gq', f();\n", 276 | "//cmd.remove_shortcut('d');\n", 277 | "// cut\n", 278 | "combo_tap('dd', def_cmd['x'].handler);\n", 279 | "\n", 280 | "// copy\n", 281 | "combo_tap('yy', def_cmd['c'].handler);\n", 282 | "\n", 283 | "// paste\n", 284 | "cmd.add_shortcut('p', def_cmd['v']);\n", 285 | "\n", 286 | "// undo\n", 287 | "cmd.add_shortcut('u', def_cmd['z']);\n", 288 | "\n", 289 | "// Join (merge down with cell below)\n", 290 | "cmd.add_shortcut('shift-j', def_cmd['shift-m'])\n", 291 | "\n", 292 | "//edit.add_shortcut('k', def_edit['up'].handler);\n", 293 | "//[edit.add_shortcut('j', def_edit['down'].handler);\n", 294 | "\n", 295 | "CodeMirror.prototype.save = function() { \n", 296 | " IPython.notebook.save_checkpoint()\n", 297 | "}\n", 298 | "\n", 299 | "function focus_last(e) {\n", 300 | " var cells = IPython.notebook.get_cells();\n", 301 | " cells[cells.length-1].focus_cell();\n", 302 | "};\n", 303 | "\n", 304 | "function focus_first(e) {\n", 305 | " console.log('focus first called');\n", 306 | " var cells = IPython.notebook.get_cells();\n", 307 | " cells[0].focus_cell();\n", 308 | "};\n", 309 | "\n", 310 | "\n", 311 | "cmd.add_shortcut('shift-g', focus_last);\n", 312 | "combo_tap('gg', focus_first);\n", 313 | "\n", 314 | "// get rid of the default Ctrl-W binding\n", 315 | "// this only works for Firefox\n", 316 | "$(document).ready(function() {\n", 317 | "\t$(this).bind('keypress', function(e) {\n", 318 | "\t\tvar key = (e.keyCode ? 
e.keyCode : e.charCode);\n", 319 | "\t\tif (key == '119' && e.ctrlKey) {\n", 320 | "\t\t\treturn false;\n", 321 | "\t\t}\n", 322 | "\t});\n", 323 | "});\n", 324 | "\n", 325 | "window.addEventListener(\"beforeunload\", function( event ) {\n", 326 | " var press = jQuery.Event(\"keypress\");\n", 327 | " press.ctrlKey = false;\n", 328 | " press.which = 27; // escape\n", 329 | " $(document).trigger(press);\n", 330 | " event.returnValue = \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 331 | " event.returnValue +=\"\\nX Chrome sucks at captruring Ctrl-W, sorry X\";\n", 332 | " event.returnValue += \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 333 | "});\n", 334 | "\n", 335 | "// update the keyboard shortcuts\n", 336 | "IPython.quick_help = new IPython.QuickHelp();\n", 337 | "\n", 338 | "//IPython.CodeCell.options_default.cm_config.styleActiveLine = true;\n", 339 | "\n", 340 | "all_cm( function (cm) {\n", 341 | " cm.setOption('foldGutter', true);\n", 342 | " cm.setOption('gutters', [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"]);\n", 343 | " cm.options.extraKeys[\"Ctrl-F\"] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 344 | " var wrapOptions = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/ };\n", 345 | " // XXX: add a hardwrap-range to this as well\n", 346 | " cm.options.extraKeys[\"F2\"] = function(cm) { cm.wrapParagraph(cm.getCursor(), wrapOptions); };\n", 347 | " //cm.options.extraKeys[\"[\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 348 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 349 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 350 | "\n", 351 | " // todo - do this for new cells as well\n", 352 | " // support this a :only? 
turn off full screen on blur\n", 353 | " cm.options.extraKeys[\"F11\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 354 | " cm.options.extraKeys[\"Ctrl-A\"] = function(cm) {\n", 355 | " if (cm.getOption(\"fullScreen\")) cm.setOption(\"fullScreen\", false);\n", 356 | " };\n", 357 | " //all_cm( function (cm) {\n", 358 | "});\n", 359 | "\n", 360 | "//setTimeout(function() {IPython.notebook.get_selected_cell().set_input_prompt('vim');}, 200)\n", 361 | "\n", 362 | "$(\"#ipython_notebook\").find('img').remove('#vim');\n", 363 | "$(\"#ipython_notebook\").append('')\n", 368 | "$(\"#vim\").click( function () {$(this).hide()});\n", 369 | "\n", 370 | "\n", 371 | "// XXX: Autowrapping is kind of broken - you can write a line that will have\n", 372 | "// its last word (if it's 1 or 2 characters just go back and forth between the\n", 373 | "// current and the next lines)\n", 374 | "//all_cm(function (cm) {\n", 375 | "// var wait, options = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/};\n", 376 | "// cm.on(\"change\", function(cm, change) {\n", 377 | "// clearTimeout(wait);\n", 378 | "// wait = setTimeout(function() {\n", 379 | "// console.log(cm.wrapParagraphsInRange(change.from, CodeMirror.changeEnd(change), options));\n", 380 | "// }, 300);\n", 381 | "// });\n", 382 | "//});\n", 383 | "\n", 384 | "}, function failure() { \n", 385 | " alert('le sucks, something went wrong');\n", 386 | "\n", 387 | "});\n", 388 | "\n", 389 | "\n", 390 | "// at_top and at_bottom methods for ipython-vimception\n", 391 | " /**\n", 392 | " * @method at_top\n", 393 | " * @return {Boolean}\n", 394 | " */\n", 395 | " Cell.prototype.at_top = function () {\n", 396 | " var cm = this.code_mirror;\n", 397 | " var cursor = cm.getCursor();\n", 398 | " if (cursor.line === 0 && cm.findPosV(cursor, -1, 'line').hitSide) {\n", 399 | " return true;\n", 400 | " } else {\n", 401 | " return false;\n", 402 | " }\n", 403 | " };\n", 404 | "\n", 405 | " /**\n", 406 | " * @method at_bottom\n", 407 | " * @return {Boolean}\n", 408 | " * */\n", 409 | " Cell.prototype.at_bottom = function () {\n", 410 | " var cm = this.code_mirror;\n", 411 | " var cursor = cm.getCursor();\n", 412 | " if (cursor.line === (cm.lineCount()-1) && cm.findPosV(cursor, 1, 'line').hitSide) {\n", 413 | " return true;\n", 414 | " } else {\n", 415 | " return false;\n", 416 | " }\n", 417 | " };\n" 418 | ], 419 | "metadata": {}, 420 | "output_type": "display_data", 421 | "text": [ 422 | "" 423 | ] 424 | }, 425 | { 426 | "javascript": [ 427 | "\n", 428 | "var cmd = IPython.keyboard_manager.command_shortcuts;\n", 429 | "var edit = IPython.keyboard_manager.edit_shortcuts;\n", 430 | "var def_cmd = IPython.default_command_shortcuts;\n", 431 | "var def_edit = IPython.default_edit_shortcuts;\n", 432 | "\n", 433 | "// get the code mirror editor of a curently selected cell\n", 434 | "function C() { return IPython.notebook.get_selected_cell().code_mirror; };\n", 435 | "\n", 436 | "// Change the mode of all current and future CodeMirror instances\n", 437 | "// Emacs users can use this function as just to('emacs') so long as they've\n", 438 | "// required/loaded emacs.js from CodeMirror\n", 439 | "function to(mode) {\n", 440 | " var mode = mode || 'vim'\n", 441 | " // first let's apply vim mode to all current cells\n", 442 | " function to_mode(c) { return c.code_mirror.setOption('keyMap', mode);};\n", 443 | " IPython.notebook.get_cells().map(to_mode);\n", 444 | " // apply the mode to future cells created\n", 445 | " 
IPython.Cell.options_default.cm_config.keyMap = mode;\n", 446 | "}\n", 447 | "\n", 448 | "function getCSS(path) {\n", 449 | " $('', {\n", 450 | " rel: 'stylesheet',\n", 451 | " type: 'text/css',\n", 452 | " href: path,\n", 453 | " }).appendTo('head');\n", 454 | "}\n", 455 | "\n", 456 | "// I messed around with trying to get requireJS going here, but gave up and\n", 457 | "// just using this answer from SO \n", 458 | "// http://stackoverflow.com/questions/11803215/how-to-include-multiple-js-files-using-jquery-getscript-method\n", 459 | "\n", 460 | "var p = \"/static/components/codemirror/addon/\";\n", 461 | "\n", 462 | "$.when(\n", 463 | "// Grab the CodeMirror vim keymap\n", 464 | "$.getScript(p + \"../keymap/vim.js\"),\n", 465 | "// also make search work via /\n", 466 | "$.getScript(p + \"search/search.js\"),\n", 467 | "$.getScript(p + \"search/searchcursor.js\"),\n", 468 | "\n", 469 | "// TODO: hook-up gq to perform a harwrap\n", 470 | "$.getScript(p + \"wrap/hardwrap.js\"),\n", 471 | "$.getScript(p + \"selection/active-line.js\"),\n", 472 | "\n", 473 | "$.getScript(p + \"display/fullscreen.js\"),\n", 474 | "getCSS(p + \"display/fullscreen.css\"),\n", 475 | "getCSS(p + \"dialog/dialog.css\"),\n", 476 | "$.getScript(p + \"dialog/dialog.js\"),\n", 477 | "\n", 478 | "\n", 479 | " $.Deferred(function( deferred ){\n", 480 | " $( deferred.resolve );\n", 481 | " })\n", 482 | ").then(function success(){\n", 483 | "\n", 484 | "console.log('Great success');\n", 485 | "\n", 486 | "IPython.CodeCell.options_default.cm_config.foldGutter = true;\n", 487 | "IPython.CodeCell.options_default.cm_config.gutters = [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"];\n", 488 | "\n", 489 | "IPython.Cell.prototype.at_top = function () {\n", 490 | " var cm = this.code_mirror;\n", 491 | " var cursor = cm.getCursor();\n", 492 | " if (cursor.line === 0) {\n", 493 | " return true;\n", 494 | " }\n", 495 | " return false;\n", 496 | " };\n", 497 | "\n", 498 | "\n", 499 | "IPython.Cell.prototype.at_bottom = function () {\n", 500 | " var cm = this.code_mirror;\n", 501 | " var cursor = cm.getCursor();\n", 502 | " if (cursor.line === (cm.lineCount()-1)) {\n", 503 | " return true;\n", 504 | " }\n", 505 | " return false;\n", 506 | "};\n", 507 | "// on all code mirror instances on this page, apply the function f\n", 508 | "function all_cm(f) {\n", 509 | " // apply f to every code mirror instance. 
f takes one parameter\n", 510 | " IPython.notebook.get_cells().map(function (c) { f(c.code_mirror); } );\n", 511 | "}\n", 512 | "\n", 513 | "\n", 514 | "to('vim');\n", 515 | "function vim_up(event) {\n", 516 | " var cell = IPython.notebook.get_selected_cell();\n", 517 | " if (cell && cell.at_top() && cell.code_mirror.options.keyMap === 'vim') {\n", 518 | " console.log('inside the business logic k');\n", 519 | " event.preventDefault();\n", 520 | " IPython.notebook.command_mode()\n", 521 | " IPython.notebook.select_prev();\n", 522 | " IPython.notebook.edit_mode();\n", 523 | " return false;\n", 524 | " };\n", 525 | "}\n", 526 | "\n", 527 | "function vim_down(event) {\n", 528 | " var cell = IPython.notebook.get_selected_cell();\n", 529 | " if (cell && cell.at_bottom() && cell.code_mirror.options.keyMap === 'vim') {\n", 530 | " event.preventDefault();\n", 531 | " IPython.notebook.command_mode()\n", 532 | " IPython.notebook.select_next();\n", 533 | " IPython.notebook.edit_mode();\n", 534 | " return false;\n", 535 | " };\n", 536 | " }\n", 537 | "\n", 538 | "var m = '(vim) '\n", 539 | "var edit_shortcuts = {\n", 540 | " 'k' : {\n", 541 | " help : m + 'up a line, even across cells',\n", 542 | " help_index : 'AA',\n", 543 | " handler : vim_up\n", 544 | " },\n", 545 | " 'j' : {\n", 546 | " help : m + 'down a line, even across cells',\n", 547 | " help_index : 'AA',\n", 548 | " handler : vim_down\n", 549 | " },\n", 550 | "\n", 551 | "};\n", 552 | "\n", 553 | "var command_shortcuts = {\n", 554 | " 'c' : {\n", 555 | " help : m + def_cmd['y'].help,\n", 556 | " help_index : 'AA',\n", 557 | " handler : def_cmd['y'].handler\n", 558 | " }\n", 559 | "\n", 560 | "\n", 561 | "};\n", 562 | "\n", 563 | "edit.add_shortcuts(edit_shortcuts);\n", 564 | "cmd.add_shortcuts(command_shortcuts);\n", 565 | "//edit.add_shortcuts('k', def_edit['up'].handler);\n", 566 | "//edit.add_shortcut('j', def_edit['down'].handler);\n", 567 | "\n", 568 | "// N.B. 
This code looks fairly simple, but it took me forever to \n", 569 | "// figure out how to do this, \n", 570 | "// \n", 571 | "// there's a problem here, Ctrl-[ is already handled by CodeMirror by the time we \n", 572 | "// (IPython.keyboard_manager) get it CodeMirror issues signals on mode change, \n", 573 | "// so we have to hook into that to get Ctrl-[\n", 574 | "edit.remove_shortcut('Ctrl-[');\n", 575 | "edit.remove_shortcut('Esc');\n", 576 | "\n", 577 | "CodeMirror.commands.leaveInsertOrEdit = function (cm) {\n", 578 | " if ( cm.state.vim.insertMode ) {\n", 579 | " // do magic here to get out of insert mode\n", 580 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 581 | " } else {\n", 582 | " IPython.notebook.command_mode();\n", 583 | " IPython.notebook.focus_cell();\n", 584 | " }\n", 585 | "};\n", 586 | " \n", 587 | "//C().options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 588 | "all_cm( function (cm) {\n", 589 | " cm.options.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 590 | " cm.options.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 591 | " if ( CodeMirror.defaults.extraKeys === null ) { \n", 592 | " CodeMirror.defaults.extraKeys = {};\n", 593 | " }\n", 594 | " // TODO: make this change permanent\n", 595 | " // this part seems to be ignore when adding a new cell\n", 596 | " // - alternative solution would be to listen for NewCell events and rerun the CM function on it\n", 597 | " // - it could also be the case that when we instatiate CodeMirror, we somehow leave out CM.defaults.extraKeys\n", 598 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 599 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-['] = 'leaveInsertOrEdit';\n", 600 | " IPython.CodeCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 601 | " IPython.TextCell.options_default.cm_config.extraKeys['Esc'] = 'leaveInsertOrEdit';\n", 602 | "})\n", 603 | "\n", 604 | "// On blur, make sure we go back to command mode for CodeMirror (in case user clicked away)\n", 605 | "// TODO: Make this permanent - how to get CodeMirror to do this for new cells created after\n", 606 | "all_cm( function (cm) {\n", 607 | " cm.on('blur', function(cm) {\n", 608 | " // TODO: I wish I understood a better way to do this, but fake pressing Escape work\n", 609 | " CodeMirror.keyMap['vim-insert']['Esc'](cm);\n", 610 | " CodeMirror.keyMap['vim']['Esc'](cm);\n", 611 | " cm.setOption('styleActiveLine', false);\n", 612 | " if (cm.getOption(\"fullScreen\")) {\n", 613 | " cm.setOption('fullScreen', false); \n", 614 | " // fullScreen the newly selected code mirror (doesn't work)\n", 615 | " //setTimeout(100, function() {\n", 616 | " // console.log(IPython.notebook.get_selected_cell().code_mirror);\n", 617 | " // IPython.notebook.get_selected_cell().code_mirror.setOption('fullScreen', true); \n", 618 | " //});\n", 619 | " }\n", 620 | " });\n", 621 | " cm.on('focus', function(cm) {\n", 622 | " cm.setOption('styleActiveLine', true);\n", 623 | " });\n", 624 | "});\n", 625 | "\n", 626 | "// 'i' by default interrupts the kernel (what Ctrl-C does at the terminal)\n", 627 | "cmd.remove_shortcut('i');\n", 628 | "cmd.add_shortcut('i', def_cmd.enter);\n", 629 | "\n", 630 | "// not quite what we want - 'i' requires a double-tap\n", 631 | "// add documentation for this.\n", 632 | "cmd.add_shortcut('ctrl-c', function(e) { IPython.notebook.kernel.interrupt(); return false});\n", 633 | "\n", 634 | "\n", 635 | "function focus_last(e) {\n", 636 | " var cells = IPython.notebook.get_cells();\n", 
637 | " cells[cells.length-1].focus_cell();\n", 638 | "};\n", 639 | "\n", 640 | "function focus_first(e) {\n", 641 | " var cells = IPython.notebook.get_cells();\n", 642 | " cells[0].focus_cell();\n", 643 | "};\n", 644 | "\n", 645 | "function combo_tap(combo, action) {\n", 646 | " var that = this;\n", 647 | " var timeout;\n", 648 | " function f() {\n", 649 | " console.log('f called once');\n", 650 | " \n", 651 | " // redo this so that when an action is performed, we restore the original combo\n", 652 | " cmd.add_shortcut(combo[1], \n", 653 | " function() { console.log(\"doing action\", combo); reset(); action(); timeout.clear();} );\n", 654 | " timeout = setTimeout(function () {\n", 655 | " console.log('resetting f');\n", 656 | " reset();\n", 657 | " //cmd.add_shortcut(combo[0], reset)\n", 658 | " }, 800);\n", 659 | " }\n", 660 | " function reset(e) {\n", 661 | " //cmd.remove_shortcut(combo[0]);\n", 662 | " console.log('reset called');\n", 663 | " //if (timeout) {\n", 664 | " // console.log('resetting aborted');\n", 665 | " // clearTimeout(timeout);\n", 666 | " // timeout = null;\n", 667 | " //}\n", 668 | " //that(combo, action); \n", 669 | " cmd.add_shortcut(combo[0], f);\n", 670 | " }\n", 671 | " console.log(\"combo tap for\", combo);\n", 672 | " \n", 673 | " reset();\n", 674 | "};\n", 675 | "cmd.add_shortcut('shift-g', focus_last);\n", 676 | "combo_tap('gg', focus_first);\n", 677 | "\n", 678 | "// XXX: the way combo tap is currently implemented, this won't work\n", 679 | "// need a more generic mechanism for combo-taps with common prefixes\n", 680 | "// combo_tap('gq', f();\n", 681 | "//cmd.remove_shortcut('d');\n", 682 | "// cut\n", 683 | "combo_tap('dd', def_cmd['x'].handler);\n", 684 | "\n", 685 | "// copy\n", 686 | "combo_tap('yy', def_cmd['c'].handler);\n", 687 | "\n", 688 | "// paste\n", 689 | "cmd.add_shortcut('p', def_cmd['v']);\n", 690 | "\n", 691 | "// undo\n", 692 | "cmd.add_shortcut('u', def_cmd['z']);\n", 693 | "\n", 694 | "// Join (merge down with cell below)\n", 695 | "cmd.add_shortcut('shift-j', def_cmd['shift-m'])\n", 696 | "\n", 697 | "//edit.add_shortcut('k', def_edit['up'].handler);\n", 698 | "//[edit.add_shortcut('j', def_edit['down'].handler);\n", 699 | "\n", 700 | "CodeMirror.prototype.save = function() { \n", 701 | " IPython.notebook.save_checkpoint()\n", 702 | "}\n", 703 | "\n", 704 | "function focus_last(e) {\n", 705 | " var cells = IPython.notebook.get_cells();\n", 706 | " cells[cells.length-1].focus_cell();\n", 707 | "};\n", 708 | "\n", 709 | "function focus_first(e) {\n", 710 | " console.log('focus first called');\n", 711 | " var cells = IPython.notebook.get_cells();\n", 712 | " cells[0].focus_cell();\n", 713 | "};\n", 714 | "\n", 715 | "\n", 716 | "cmd.add_shortcut('shift-g', focus_last);\n", 717 | "combo_tap('gg', focus_first);\n", 718 | "\n", 719 | "// get rid of the default Ctrl-W binding\n", 720 | "// this only works for Firefox\n", 721 | "$(document).ready(function() {\n", 722 | "\t$(this).bind('keypress', function(e) {\n", 723 | "\t\tvar key = (e.keyCode ? 
e.keyCode : e.charCode);\n", 724 | "\t\tif (key == '119' && e.ctrlKey) {\n", 725 | "\t\t\treturn false;\n", 726 | "\t\t}\n", 727 | "\t});\n", 728 | "});\n", 729 | "\n", 730 | "window.addEventListener(\"beforeunload\", function( event ) {\n", 731 | " var press = jQuery.Event(\"keypress\");\n", 732 | " press.ctrlKey = false;\n", 733 | " press.which = 27; // escape\n", 734 | " $(document).trigger(press);\n", 735 | " event.returnValue = \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 736 | " event.returnValue +=\"\\nX Chrome sucks at captruring Ctrl-W, sorry X\";\n", 737 | " event.returnValue += \"\\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\";\n", 738 | "});\n", 739 | "\n", 740 | "// update the keyboard shortcuts\n", 741 | "IPython.quick_help = new IPython.QuickHelp();\n", 742 | "\n", 743 | "//IPython.CodeCell.options_default.cm_config.styleActiveLine = true;\n", 744 | "\n", 745 | "all_cm( function (cm) {\n", 746 | " cm.setOption('foldGutter', true);\n", 747 | " cm.setOption('gutters', [\"CodeMirror-linenumbers\", \"CodeMirror-foldgutter\"]);\n", 748 | " cm.options.extraKeys[\"Ctrl-F\"] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 749 | " var wrapOptions = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/ };\n", 750 | " // XXX: add a hardwrap-range to this as well\n", 751 | " cm.options.extraKeys[\"F2\"] = function(cm) { cm.wrapParagraph(cm.getCursor(), wrapOptions); };\n", 752 | " //cm.options.extraKeys[\"[\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 753 | " IPython.CodeCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 754 | " IPython.TextCell.options_default.cm_config.extraKeys['Ctrl-F'] = function(cm){ cm.foldCode(cm.getCursor()); };\n", 755 | "\n", 756 | " // todo - do this for new cells as well\n", 757 | " // support this a :only? 
turn off full screen on blur\n", 758 | " cm.options.extraKeys[\"F11\"] = function(cm) { cm.setOption(\"fullScreen\", !cm.getOption(\"fullScreen\"))};\n", 759 | " cm.options.extraKeys[\"Ctrl-A\"] = function(cm) {\n", 760 | " if (cm.getOption(\"fullScreen\")) cm.setOption(\"fullScreen\", false);\n", 761 | " };\n", 762 | " //all_cm( function (cm) {\n", 763 | "});\n", 764 | "\n", 765 | "//setTimeout(function() {IPython.notebook.get_selected_cell().set_input_prompt('vim');}, 200)\n", 766 | "\n", 767 | "$(\"#ipython_notebook\").find('img').remove('#vim');\n", 768 | "$(\"#ipython_notebook\").append('')\n", 773 | "$(\"#vim\").click( function () {$(this).hide()});\n", 774 | "\n", 775 | "\n", 776 | "// XXX: Autowrapping is kind of broken - you can write a line that will have\n", 777 | "// its last word (if it's 1 or 2 characters just go back and forth between the\n", 778 | "// current and the next lines)\n", 779 | "//all_cm(function (cm) {\n", 780 | "// var wait, options = {column: 78, killTrailingSpace: true, wrapOn: /\\s\\S|[^\\.\\d]/};\n", 781 | "// cm.on(\"change\", function(cm, change) {\n", 782 | "// clearTimeout(wait);\n", 783 | "// wait = setTimeout(function() {\n", 784 | "// console.log(cm.wrapParagraphsInRange(change.from, CodeMirror.changeEnd(change), options));\n", 785 | "// }, 300);\n", 786 | "// });\n", 787 | "//});\n", 788 | "\n", 789 | "}, function failure() { \n", 790 | " alert('le sucks, something went wrong');\n", 791 | "\n", 792 | "});\n", 793 | "\n", 794 | "\n", 795 | "// at_top and at_bottom methods for ipython-vimception\n", 796 | " /**\n", 797 | " * @method at_top\n", 798 | " * @return {Boolean}\n", 799 | " */\n", 800 | " Cell.prototype.at_top = function () {\n", 801 | " var cm = this.code_mirror;\n", 802 | " var cursor = cm.getCursor();\n", 803 | " if (cursor.line === 0 && cm.findPosV(cursor, -1, 'line').hitSide) {\n", 804 | " return true;\n", 805 | " } else {\n", 806 | " return false;\n", 807 | " }\n", 808 | " };\n", 809 | "\n", 810 | " /**\n", 811 | " * @method at_bottom\n", 812 | " * @return {Boolean}\n", 813 | " * */\n", 814 | " Cell.prototype.at_bottom = function () {\n", 815 | " var cm = this.code_mirror;\n", 816 | " var cursor = cm.getCursor();\n", 817 | " if (cursor.line === (cm.lineCount()-1) && cm.findPosV(cursor, 1, 'line').hitSide) {\n", 818 | " return true;\n", 819 | " } else {\n", 820 | " return false;\n", 821 | " }\n", 822 | " };\n" 823 | ], 824 | "metadata": {}, 825 | "output_type": "display_data", 826 | "text": [ 827 | "" 828 | ] 829 | } 830 | ], 831 | "prompt_number": 1 832 | }, 833 | { 834 | "cell_type": "code", 835 | "collapsed": false, 836 | "input": [ 837 | "%load_ext autoreload\n", 838 | "%autoreload 2" 839 | ], 840 | "language": "python", 841 | "metadata": {}, 842 | "outputs": [], 843 | "prompt_number": 2 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "# Long input strings\n", 850 | "\n", 851 | "For certain tasks it might make more sense to tokenize input strings first and then extract features on these string lists rather than on the original character lists.\n", 852 | "\n", 853 | "To demonstrate this I'll take some example strings from [highered](https://github.com/datamade/highered/) and learn models using these two feature extraction techniques." 
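,
    "\n",
    "\n",
    "The token-level model should also be quicker: the lattice that a pair is aligned over grows with the product of the two input lengths, so a pair of short token lists is much cheaper to align than a pair of long character strings -- compare the prediction timings below."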
854 | ] 855 | }, 856 | { 857 | "cell_type": "markdown", 858 | "metadata": {}, 859 | "source": [ 860 | "## Training examples" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "collapsed": false, 866 | "input": [ 867 | "X = [(u'caring hands a step ahead', u'el valor little tykes ii'),\n", 868 | " (u'dulles', u\"chicago public schools o'keeffe, isabell c.\"),\n", 869 | " (u'erie neighborhood house fcch-carmen l. vega site',\n", 870 | " u'erie neighborhood house fcch-servia galva site'),\n", 871 | " (u'chicago public schools dvorak math & science tech academy, anton',\n", 872 | " u'chicago public schools perez, manuel'),\n", 873 | " (u'v & j day care center', u\"henry booth house granny's day care center\"),\n", 874 | " (u'home of life community dev. corp. - home of life just for you',\n", 875 | " u'urban family and community centers'),\n", 876 | " (u'carole robertson center for learning fcch-ileana gonzalez',\n", 877 | " u'carole robertson center for learning fcch-rhonda culverson'),\n", 878 | " (u'bethel new life bethel child development',\n", 879 | " u'mary crane league mary crane center (lake & pulaski)'),\n", 880 | " (u'easter seals society of metropolitan chicago - stepping stones early/childhood lear',\n", 881 | " u\"marcy newberry association kenyatta's day care\"),\n", 882 | " (u'westside holistic family services westside holistic family services',\n", 883 | " u'childserv lawndale'),\n", 884 | " \n", 885 | " (u'higgins', u'higgins'),\n", 886 | " (u'ymca south side', u'ymca of metropolitan chicago - south side ymca'),\n", 887 | " (u'chicago commons association paulo freire',\n", 888 | " u'chicago commons association paulo freire'),\n", 889 | " (u'fresh start daycare, inc.',\n", 890 | " u'easter seals society of metropolitan chicago fresh start day care center'),\n", 891 | " (u'el valor teddy bear 3', u'teddy bear 3'),\n", 892 | " (u'chicago child care society chicago child care society',\n", 893 | " u'chicago child care society-child and family dev center'),\n", 894 | " (u'hull house - uptown', u'uptown family care center')]\n", 895 | "Y = [u'distinct',\n", 896 | " u'distinct',\n", 897 | " u'distinct',\n", 898 | " u'distinct',\n", 899 | " u'distinct',\n", 900 | " u'distinct',\n", 901 | " u'distinct',\n", 902 | " u'distinct',\n", 903 | " u'distinct',\n", 904 | " u'distinct',\n", 905 | " u'match',\n", 906 | " u'match',\n", 907 | " u'match',\n", 908 | " u'match',\n", 909 | " u'match',\n", 910 | " u'match',\n", 911 | " u'match']" 912 | ], 913 | "language": "python", 914 | "metadata": {}, 915 | "outputs": [], 916 | "prompt_number": 5 917 | }, 918 | { 919 | "cell_type": "code", 920 | "collapsed": false, 921 | "input": [ 922 | "from pyhacrf import StringPairFeatureExtractor, Hacrf\n", 923 | "from scipy.optimize import fmin_l_bfgs_b\n", 924 | "import numpy as np" 925 | ], 926 | "language": "python", 927 | "metadata": {}, 928 | "outputs": [], 929 | "prompt_number": 6 930 | }, 931 | { 932 | "cell_type": "markdown", 933 | "metadata": {}, 934 | "source": [ 935 | "## Character level features" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "collapsed": false, 941 | "input": [ 942 | "# Extract features\n", 943 | "feature_extractor = StringPairFeatureExtractor(match=True, numeric=True)\n", 944 | "X_extracted = feature_extractor.fit_transform(X)" 945 | ], 946 | "language": "python", 947 | "metadata": {}, 948 | "outputs": [], 949 | "prompt_number": 7 950 | }, 951 | { 952 | "cell_type": "code", 953 | "collapsed": false, 954 | "input": [ 955 | "%%timeit -n1 -r1\n", 956 | "# Train 
model\n", 957 | "model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 10})\n", 958 | "model.fit(X_extracted, Y, verbosity=1)" 959 | ], 960 | "language": "python", 961 | "metadata": {}, 962 | "outputs": [ 963 | { 964 | "output_type": "stream", 965 | "stream": "stdout", 966 | "text": [ 967 | "Iteration Log-likelihood |gradient|\n", 968 | " 0 -11.78 650.6\n", 969 | " 1 -609.0 1.571e+03" 970 | ] 971 | }, 972 | { 973 | "output_type": "stream", 974 | "stream": "stdout", 975 | "text": [ 976 | "\n", 977 | " 2 -54.72 1.567e+03" 978 | ] 979 | }, 980 | { 981 | "output_type": "stream", 982 | "stream": "stdout", 983 | "text": [ 984 | "\n", 985 | " 3 -11.31 560.6" 986 | ] 987 | }, 988 | { 989 | "output_type": "stream", 990 | "stream": "stdout", 991 | "text": [ 992 | "\n", 993 | " 4 -10.83 142.5" 994 | ] 995 | }, 996 | { 997 | "output_type": "stream", 998 | "stream": "stdout", 999 | "text": [ 1000 | "\n", 1001 | " 5 -10.78 118.5" 1002 | ] 1003 | }, 1004 | { 1005 | "output_type": "stream", 1006 | "stream": "stdout", 1007 | "text": [ 1008 | "\n", 1009 | " 6 -10.7 143.8" 1010 | ] 1011 | }, 1012 | { 1013 | "output_type": "stream", 1014 | "stream": "stdout", 1015 | "text": [ 1016 | "\n", 1017 | " 7 -10.43 249.6" 1018 | ] 1019 | }, 1020 | { 1021 | "output_type": "stream", 1022 | "stream": "stdout", 1023 | "text": [ 1024 | "\n", 1025 | " 8 -10.13 328.6" 1026 | ] 1027 | }, 1028 | { 1029 | "output_type": "stream", 1030 | "stream": "stdout", 1031 | "text": [ 1032 | "\n", 1033 | " 9 -9.796 250.5" 1034 | ] 1035 | }, 1036 | { 1037 | "output_type": "stream", 1038 | "stream": "stdout", 1039 | "text": [ 1040 | "\n", 1041 | " 10 -9.573 102.2" 1042 | ] 1043 | }, 1044 | { 1045 | "output_type": "stream", 1046 | "stream": "stdout", 1047 | "text": [ 1048 | "\n", 1049 | "1 loops, best of 1: 8.73 s per loop\n" 1050 | ] 1051 | } 1052 | ], 1053 | "prompt_number": 9 1054 | }, 1055 | { 1056 | "cell_type": "code", 1057 | "collapsed": false, 1058 | "input": [ 1059 | "%%timeit -n1 -r1\n", 1060 | "# Evaluate\n", 1061 | "from sklearn.metrics import confusion_matrix\n", 1062 | "predictions = model.predict(X_extracted)\n", 1063 | "print(confusion_matrix(Y, predictions))\n", 1064 | "print(model.predict_proba(X_extracted))" 1065 | ], 1066 | "language": "python", 1067 | "metadata": {}, 1068 | "outputs": [ 1069 | { 1070 | "output_type": "stream", 1071 | "stream": "stdout", 1072 | "text": [ 1073 | "[[8 2]\n", 1074 | " [4 3]]\n", 1075 | "[[ 0.64197473 0.35802527]\n", 1076 | " [ 0.351784 0.648216 ]\n", 1077 | " [ 0.6553065 0.3446935 ]\n", 1078 | " [ 0.87671132 0.12328868]\n", 1079 | " [ 0.47772325 0.52227675]\n", 1080 | " [ 0.878586 0.121414 ]\n", 1081 | " [ 0.70987436 0.29012564]\n", 1082 | " [ 0.64765774 0.35234226]\n", 1083 | " [ 0.93360185 0.06639815]\n", 1084 | " [ 0.92714317 0.07285683]\n", 1085 | " [ 0.48782793 0.51217207]\n", 1086 | " [ 0.40930797 0.59069203]\n", 1087 | " [ 0.59444836 0.40555164]\n", 1088 | " [ 0.39622435 0.60377565]\n", 1089 | " [ 0.63782341 0.36217659]\n", 1090 | " [ 0.69982284 0.30017716]\n", 1091 | " [ 0.5777424 0.4222576 ]]" 1092 | ] 1093 | }, 1094 | { 1095 | "output_type": "stream", 1096 | "stream": "stdout", 1097 | "text": [ 1098 | "\n", 1099 | "1 loops, best of 1: 1.67 s per loop\n" 1100 | ] 1101 | } 1102 | ], 1103 | "prompt_number": 10 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "metadata": {}, 1108 | "source": [ 1109 | "## Token level features" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "collapsed": false, 1115 | "input": [ 1116 | "from 
pyhacrf import PairFeatureExtractor" 1117 | ], 1118 | "language": "python", 1119 | "metadata": {}, 1120 | "outputs": [], 1121 | "prompt_number": 14 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "collapsed": false, 1126 | "input": [ 1127 | "tokX = [[sentence.split(' ') for sentence in pair] for pair in X]" 1128 | ], 1129 | "language": "python", 1130 | "metadata": {}, 1131 | "outputs": [], 1132 | "prompt_number": 15 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "collapsed": false, 1137 | "input": [ 1138 | "real = [\n", 1139 | " lambda i, j, s1, s2: 1.0,\n", 1140 | " lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,\n", 1141 | " lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] and len(s1[i]) >= 6 else 0.0,\n", 1142 | " lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,\n", 1143 | " lambda i, j, s1, s2: 1.0 if s1[i].isalpha() and s2[j].isalpha() and s1[i] == s2[j] else 0.0,\n", 1144 | " lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0\n", 1145 | "]\n", 1146 | "# Other ideas are:\n", 1147 | "# to look up whether words are dictionary words,\n", 1148 | "# longest common subsequence,\n", 1149 | "# standard edit distance\n", 1150 | "feature_extractor = PairFeatureExtractor(real=real)\n", 1151 | "X_extracted = feature_extractor.fit_transform(tokX)" 1152 | ], 1153 | "language": "python", 1154 | "metadata": {}, 1155 | "outputs": [], 1156 | "prompt_number": 16 1157 | }, 1158 | { 1159 | "cell_type": "code", 1160 | "collapsed": false, 1161 | "input": [ 1162 | "#%%timeit -n1 -r1\n", 1163 | "# Train model\n", 1164 | "model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})\n", 1165 | "model.fit(X_extracted, Y, verbosity=10)" 1166 | ], 1167 | "language": "python", 1168 | "metadata": {}, 1169 | "outputs": [ 1170 | { 1171 | "output_type": "stream", 1172 | "stream": "stdout", 1173 | "text": [ 1174 | "Iteration Log-likelihood |gradient|\n", 1175 | " 0 -11.78 113.8\n", 1176 | " 10 -8.721 16.12" 1177 | ] 1178 | }, 1179 | { 1180 | "output_type": "stream", 1181 | "stream": "stdout", 1182 | "text": [ 1183 | "\n", 1184 | " 20 -8.366 1.147" 1185 | ] 1186 | }, 1187 | { 1188 | "output_type": "stream", 1189 | "stream": "stdout", 1190 | "text": [ 1191 | "\n", 1192 | " 30 -8.362 0.06527" 1193 | ] 1194 | }, 1195 | { 1196 | "output_type": "stream", 1197 | "stream": "stdout", 1198 | "text": [ 1199 | "\n", 1200 | " 40 -8.362 0.005777" 1201 | ] 1202 | }, 1203 | { 1204 | "output_type": "stream", 1205 | "stream": "stdout", 1206 | "text": [ 1207 | "\n" 1208 | ] 1209 | }, 1210 | { 1211 | "metadata": {}, 1212 | "output_type": "pyout", 1213 | "prompt_number": 17, 1214 | "text": [ 1215 | "" 1216 | ] 1217 | } 1218 | ], 1219 | "prompt_number": 17 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "collapsed": false, 1224 | "input": [ 1225 | "%%timeit -n1 -r1\n", 1226 | "# Evaluate\n", 1227 | "from sklearn.metrics import confusion_matrix\n", 1228 | "predictions = model.predict(X_extracted)\n", 1229 | "print(confusion_matrix(Y, predictions))\n", 1230 | "print(model.predict_proba(X_extracted))" 1231 | ], 1232 | "language": "python", 1233 | "metadata": {}, 1234 | "outputs": [ 1235 | { 1236 | "output_type": "stream", 1237 | "stream": "stdout", 1238 | "text": [ 1239 | "[[9 1]\n", 1240 | " [2 5]]\n", 1241 | "[[ 0.72215688 0.27784312]\n", 1242 | " [ 0.41200325 0.58799675]\n", 1243 | " [ 0.56910178 0.43089822]\n", 1244 | " [ 0.92672238 0.07327762]\n", 1245 | " [ 0.56921501 0.43078499]\n", 1246 | " [ 0.98737206 
0.01262794]\n", 1247 | "  [ 0.56762697  0.43237303]\n", 1248 | "  [ 0.70141322  0.29858678]\n", 1249 | "  [ 0.97308327  0.02691673]\n", 1250 | "  [ 0.94721007  0.05278993]\n", 1251 | "  [ 0.32690805  0.67309195]\n", 1252 | "  [ 0.20741219  0.79258781]\n", 1253 | "  [ 0.30060707  0.69939293]\n", 1254 | "  [ 0.47280063  0.52719937]\n", 1255 | "  [ 0.4531238   0.5468762 ]\n", 1256 | "  [ 0.59051241  0.40948759]\n", 1257 | "  [ 0.66717449  0.33282551]]\n", 1258 | "1 loops, best of 1: 30.8 ms per loop\n" 1259 | ] 1260 | } 1261 | ], 1262 | "prompt_number": 18 1263 | }, 1264 | { 1265 | "cell_type": "markdown", 1266 | "metadata": {}, 1267 | "source": [ 1268 | "## Edit distance and word frequency features\n", 1269 | "\n", 1270 | "Let's also add the Levenshtein distance as a feature.\n", 1271 | "\n", 1272 | "When we peek at the training examples, it looks as if less common words should be more informative of a match - let's add a feature for the word frequency as well (a sketch of one is given after the conclusion below)." 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "collapsed": false, 1278 | "input": [ 1279 | "import editdistance" 1280 | ], 1281 | "language": "python", 1282 | "metadata": {}, 1283 | "outputs": [], 1284 | "prompt_number": 19 1285 | }, 1286 | { 1287 | "cell_type": "code", 1288 | "collapsed": false, 1289 | "input": [ 1290 | "editdistance.eval('cheese', 'kaas')" 1291 | ], 1292 | "language": "python", 1293 | "metadata": {}, 1294 | "outputs": [ 1295 | { 1296 | "metadata": {}, 1297 | "output_type": "pyout", 1298 | "prompt_number": 20, 1299 | "text": [ 1300 | "5L" 1301 | ] 1302 | } 1303 | ], 1304 | "prompt_number": 20 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "collapsed": false, 1309 | "input": [ 1310 | "tokX = [[sentence.split(' ') for sentence in pair] for pair in X]" 1311 | ], 1312 | "language": "python", 1313 | "metadata": {}, 1314 | "outputs": [] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "collapsed": false, 1319 | "input": [ 1320 | "real = [\n", 1321 | "    lambda i, j, s1, s2: 1.0,\n", 1322 | "    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,\n", 1323 | "    lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,\n", 1324 | "    lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0,\n", 1325 | "    lambda i, j, s1, s2: editdistance.eval(s1[i], s2[j]),\n", 1326 | "    lambda i, j, s1, s2: np.log(editdistance.eval(s1[i], s2[j]) + 1),\n", 1327 | "    lambda i, j, s1, s2: float(editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j])),\n", 1328 | "    lambda i, j, s1, s2: 1.0 - float(editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j]))\n", 1329 | "]\n", 1330 | "# Other ideas are:\n", 1331 | "# to look up whether words are dictionary words,\n", 1332 | "# longest common subsequence,\n", 1333 | "# word frequency" 1334 | ], 1335 | "language": "python", 1336 | "metadata": {}, 1337 | "outputs": [], 1338 | "prompt_number": 48 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "collapsed": false, 1343 | "input": [ 1344 | "from sklearn.metrics import confusion_matrix, accuracy_score\n", 1345 | "from sklearn.cross_validation import train_test_split" 1346 | ], 1347 | "language": "python", 1348 | "metadata": {}, 1349 | "outputs": [], 1350 | "prompt_number": 46 1351 | }, 1352 | { 1353 | "cell_type": "code", 1354 | "collapsed": false, 1355 | "input": [ 1356 | "# Train model\n", 1357 | "errors_val = []\n", 1358 | "errors_train = []\n", 1359 | "for i, featureset in enumerate([[0, 1],\n", 1360 | "                       [0, 1, 2],\n", 1361 | "                       [0, 1, 2, 3],\n", 1362 | "                       [0, 4], \n", 1363 | "                       
[0, 1, 4], \n", 1364 | " [0, 1, 2, 3, 4],\n", 1365 | " [0, 5],\n", 1366 | " [0, 1, 5],\n", 1367 | " [0, 1, 2, 3, 5],\n", 1368 | " [0, 6],\n", 1369 | " [0, 1, 6],\n", 1370 | " [0, 1, 2, 3, 6],\n", 1371 | " [0, 7],\n", 1372 | " [0, 1, 7],\n", 1373 | " [0, 1, 2, 3, 7]]):\n", 1374 | " print '{:4}{:18}'.format(i, featureset),\n", 1375 | " errs_val = []\n", 1376 | " errs_train = []\n", 1377 | " for repeat in xrange(15):\n", 1378 | " x_train, x_val, y_train, y_val = train_test_split(tokX, Y, test_size=0.2)\n", 1379 | " feature_extractor = PairFeatureExtractor(real=[real[f] for f in featureset])\n", 1380 | " X_extracted = feature_extractor.fit_transform(x_train)\n", 1381 | "\n", 1382 | " model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})\n", 1383 | " model.fit(X_extracted, y_train)\n", 1384 | " \n", 1385 | " predictions = model.predict(X_extracted)\n", 1386 | " err_train = 1.0 - accuracy_score(y_train, predictions)\n", 1387 | " \n", 1388 | " X_extracted = feature_extractor.transform(x_val)\n", 1389 | " predictions = model.predict(X_extracted)\n", 1390 | " err_val = 1.0 - accuracy_score(y_val, predictions)\n", 1391 | " if repeat % 10 == 0:\n", 1392 | " print '{:.2f}'.format(err_train),\n", 1393 | " print '{:.2f}'.format(err_val),\n", 1394 | " errs_val.append(err_val)\n", 1395 | " errs_train.append(err_train)\n", 1396 | " print ' => {:.2f} +- {:.2f} | {:.2f} +- {:.2f}'.format(np.average(errs_train), \n", 1397 | " np.std(errs_train),\n", 1398 | " np.average(errs_val), \n", 1399 | " np.std(errs_val))\n", 1400 | " errors_train.append(errs_train)\n", 1401 | " errors_val.append(errs_val)" 1402 | ], 1403 | "language": "python", 1404 | "metadata": {}, 1405 | "outputs": [ 1406 | { 1407 | "output_type": "stream", 1408 | "stream": "stdout", 1409 | "text": [ 1410 | " 0[0, 1] " 1411 | ] 1412 | }, 1413 | { 1414 | "output_type": "stream", 1415 | "stream": "stdout", 1416 | "text": [ 1417 | "0.46 0.25 " 1418 | ] 1419 | }, 1420 | { 1421 | "output_type": "stream", 1422 | "stream": "stdout", 1423 | "text": [ 1424 | "0.31 0.00 " 1425 | ] 1426 | }, 1427 | { 1428 | "output_type": "stream", 1429 | "stream": "stdout", 1430 | "text": [ 1431 | " => 0.28 +- 0.11 | 0.43 +- 0.21\n", 1432 | " 1[0, 1, 2] " 1433 | ] 1434 | }, 1435 | { 1436 | "output_type": "stream", 1437 | "stream": "stdout", 1438 | "text": [ 1439 | "0.23 0.25 " 1440 | ] 1441 | }, 1442 | { 1443 | "output_type": "stream", 1444 | "stream": "stdout", 1445 | "text": [ 1446 | "0.31 0.75 " 1447 | ] 1448 | }, 1449 | { 1450 | "output_type": "stream", 1451 | "stream": "stdout", 1452 | "text": [ 1453 | " => 0.24 +- 0.09 | 0.50 +- 0.24\n", 1454 | " 2[0, 1, 2, 3] " 1455 | ] 1456 | }, 1457 | { 1458 | "output_type": "stream", 1459 | "stream": "stdout", 1460 | "text": [ 1461 | "0.23 0.50 " 1462 | ] 1463 | }, 1464 | { 1465 | "output_type": "stream", 1466 | "stream": "stdout", 1467 | "text": [ 1468 | "0.15 0.75 " 1469 | ] 1470 | }, 1471 | { 1472 | "output_type": "stream", 1473 | "stream": "stdout", 1474 | "text": [ 1475 | " => 0.21 +- 0.05 | 0.57 +- 0.19\n", 1476 | " 3[0, 4] " 1477 | ] 1478 | }, 1479 | { 1480 | "output_type": "stream", 1481 | "stream": "stdout", 1482 | "text": [ 1483 | "0.08 0.25 " 1484 | ] 1485 | }, 1486 | { 1487 | "output_type": "stream", 1488 | "stream": "stdout", 1489 | "text": [ 1490 | "0.08 0.75 " 1491 | ] 1492 | }, 1493 | { 1494 | "output_type": "stream", 1495 | "stream": "stdout", 1496 | "text": [ 1497 | " => 0.12 +- 0.04 | 0.40 +- 0.22\n", 1498 | " 4[0, 1, 4] " 1499 | ] 1500 | }, 1501 | { 1502 | "output_type": 
"stream", 1503 | "stream": "stdout", 1504 | "text": [ 1505 | "0.08 0.25 " 1506 | ] 1507 | }, 1508 | { 1509 | "output_type": "stream", 1510 | "stream": "stdout", 1511 | "text": [ 1512 | "0.23 0.25 " 1513 | ] 1514 | }, 1515 | { 1516 | "output_type": "stream", 1517 | "stream": "stdout", 1518 | "text": [ 1519 | " => 0.13 +- 0.07 | 0.42 +- 0.20\n", 1520 | " 5[0, 1, 2, 3, 4] " 1521 | ] 1522 | }, 1523 | { 1524 | "output_type": "stream", 1525 | "stream": "stdout", 1526 | "text": [ 1527 | "0.15 0.25 " 1528 | ] 1529 | }, 1530 | { 1531 | "output_type": "stream", 1532 | "stream": "stdout", 1533 | "text": [ 1534 | "0.08 0.50 " 1535 | ] 1536 | }, 1537 | { 1538 | "output_type": "stream", 1539 | "stream": "stdout", 1540 | "text": [ 1541 | " => 0.09 +- 0.07 | 0.43 +- 0.17\n", 1542 | " 6[0, 5] " 1543 | ] 1544 | }, 1545 | { 1546 | "output_type": "stream", 1547 | "stream": "stdout", 1548 | "text": [ 1549 | "0.15 0.50 " 1550 | ] 1551 | }, 1552 | { 1553 | "output_type": "stream", 1554 | "stream": "stdout", 1555 | "text": [ 1556 | "0.23 0.00 " 1557 | ] 1558 | }, 1559 | { 1560 | "output_type": "stream", 1561 | "stream": "stdout", 1562 | "text": [ 1563 | " => 0.17 +- 0.07 | 0.40 +- 0.18\n", 1564 | " 7[0, 1, 5] " 1565 | ] 1566 | }, 1567 | { 1568 | "output_type": "stream", 1569 | "stream": "stdout", 1570 | "text": [ 1571 | "0.23 0.25 " 1572 | ] 1573 | }, 1574 | { 1575 | "output_type": "stream", 1576 | "stream": "stdout", 1577 | "text": [ 1578 | "0.15 0.50 " 1579 | ] 1580 | }, 1581 | { 1582 | "output_type": "stream", 1583 | "stream": "stdout", 1584 | "text": [ 1585 | " => 0.17 +- 0.09 | 0.40 +- 0.29\n", 1586 | " 8[0, 1, 2, 3, 5] " 1587 | ] 1588 | }, 1589 | { 1590 | "output_type": "stream", 1591 | "stream": "stdout", 1592 | "text": [ 1593 | "0.23 0.25 " 1594 | ] 1595 | }, 1596 | { 1597 | "output_type": "stream", 1598 | "stream": "stdout", 1599 | "text": [ 1600 | "0.15 0.50 " 1601 | ] 1602 | }, 1603 | { 1604 | "output_type": "stream", 1605 | "stream": "stdout", 1606 | "text": [ 1607 | " => 0.16 +- 0.05 | 0.52 +- 0.17\n", 1608 | " 9[0, 6] " 1609 | ] 1610 | }, 1611 | { 1612 | "output_type": "stream", 1613 | "stream": "stdout", 1614 | "text": [ 1615 | "0.31 0.50 " 1616 | ] 1617 | }, 1618 | { 1619 | "output_type": "stream", 1620 | "stream": "stdout", 1621 | "text": [ 1622 | "0.31 0.75 " 1623 | ] 1624 | }, 1625 | { 1626 | "output_type": "stream", 1627 | "stream": "stdout", 1628 | "text": [ 1629 | " => 0.24 +- 0.05 | 0.42 +- 0.24\n", 1630 | " 10[0, 1, 6] " 1631 | ] 1632 | }, 1633 | { 1634 | "output_type": "stream", 1635 | "stream": "stdout", 1636 | "text": [ 1637 | "0.15 0.75 " 1638 | ] 1639 | }, 1640 | { 1641 | "output_type": "stream", 1642 | "stream": "stdout", 1643 | "text": [ 1644 | "0.23 0.75 " 1645 | ] 1646 | }, 1647 | { 1648 | "output_type": "stream", 1649 | "stream": "stdout", 1650 | "text": [ 1651 | " => 0.22 +- 0.09 | 0.52 +- 0.27\n", 1652 | " 11[0, 1, 2, 3, 6] " 1653 | ] 1654 | }, 1655 | { 1656 | "output_type": "stream", 1657 | "stream": "stdout", 1658 | "text": [ 1659 | "0.08 0.50 " 1660 | ] 1661 | }, 1662 | { 1663 | "output_type": "stream", 1664 | "stream": "stdout", 1665 | "text": [ 1666 | "0.00 0.50 " 1667 | ] 1668 | }, 1669 | { 1670 | "output_type": "stream", 1671 | "stream": "stdout", 1672 | "text": [ 1673 | " => 0.14 +- 0.08 | 0.53 +- 0.20\n", 1674 | " 12[0, 7] " 1675 | ] 1676 | }, 1677 | { 1678 | "output_type": "stream", 1679 | "stream": "stdout", 1680 | "text": [ 1681 | "0.23 0.75 " 1682 | ] 1683 | }, 1684 | { 1685 | "output_type": "stream", 1686 | "stream": "stdout", 1687 | "text": [ 1688 | "0.23 0.50 " 
1689 | ] 1690 | }, 1691 | { 1692 | "output_type": "stream", 1693 | "stream": "stdout", 1694 | "text": [ 1695 | " => 0.24 +- 0.07 | 0.52 +- 0.23\n", 1696 | "  13[0, 1, 7]        " 1697 | ] 1698 | }, 1699 | { 1700 | "output_type": "stream", 1701 | "stream": "stdout", 1702 | "text": [ 1703 | "0.23 0.75 " 1704 | ] 1705 | }, 1706 | { 1707 | "output_type": "stream", 1708 | "stream": "stdout", 1709 | "text": [ 1710 | "0.23 0.50 " 1711 | ] 1712 | }, 1713 | { 1714 | "output_type": "stream", 1715 | "stream": "stdout", 1716 | "text": [ 1717 | " => 0.24 +- 0.09 | 0.52 +- 0.23\n", 1718 | "  14[0, 1, 2, 3, 7]  " 1719 | ] 1720 | }, 1721 | { 1722 | "output_type": "stream", 1723 | "stream": "stdout", 1724 | "text": [ 1725 | "0.23 0.50 " 1726 | ] 1727 | }, 1728 | { 1729 | "output_type": "stream", 1730 | "stream": "stdout", 1731 | "text": [ 1732 | "0.15 0.75 " 1733 | ] 1734 | }, 1735 | { 1736 | "output_type": "stream", 1737 | "stream": "stdout", 1738 | "text": [ 1739 | " => 0.21 +- 0.03 | 0.38 +- 0.22\n" 1740 | ] 1741 | } 1742 | ], 1743 | "prompt_number": 51 1744 | }, 1745 | { 1746 | "cell_type": "markdown", 1747 | "metadata": {}, 1748 | "source": [ 1749 | "## Conclusion\n", 1750 | "\n", 1751 | "It seems that tokenising the text not only speeds up training and scoring (scoring drops from 1.67 s to 31 ms above, roughly a 50x speedup), it also improves the predictions: the confusion matrix goes from [[8 2], [4 3]] to [[9 1], [2 5]]. With only 17 labelled pairs these estimates are noisy, though - we definitely need more data to do this properly." 1752 | ] 1753 | },
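1754 | { 1755 | "cell_type": "markdown", 1756 | "metadata": {}, 1757 | "source": [ 1758 | "## Appendix: a word frequency feature (sketch)\n", 1759 | "\n", 1760 | "The word frequency feature promised above never made it into the experiments. Below is a minimal, untested sketch of one way to add it, assuming frequencies are estimated from the training pairs themselves; the helper name `word_frequency_feature` is made up for illustration, and none of the results above include this feature." 1761 | ] 1762 | }, 1763 | { 1764 | "cell_type": "code", 1765 | "collapsed": false, 1766 | "input": [ 1767 | "from collections import Counter\n", 1768 | "\n", 1769 | "# Count how often each token occurs anywhere in the training pairs.\n", 1770 | "counts = Counter(token for pair in tokX for sentence in pair for token in sentence)\n", 1771 | "total = float(sum(counts.values()))\n", 1772 | "\n", 1773 | "# A match on a rare token should count for more than a match on a\n", 1774 | "# common one, so weight matches by negative log relative frequency.\n", 1775 | "def word_frequency_feature(i, j, s1, s2):\n", 1776 | "    if s1[i] != s2[j]:\n", 1777 | "        return 0.0\n", 1778 | "    # Add-one smoothing keeps tokens unseen during fitting finite.\n", 1779 | "    return -np.log((counts[s1[i]] + 1.0) / (total + 1.0))\n", 1780 | "\n", 1781 | "# It could then be appended to the feature list and compared in the\n", 1782 | "# featureset loop above.\n", 1783 | "real.append(word_frequency_feature)" 1784 | ], 1785 | "language": "python", 1786 | "metadata": {}, 1787 | "outputs": [] 1788 | },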
| "cell_type": "code", 1849 | "collapsed": false, 1850 | "input": [ 1851 | "%pxresult" 1852 | ], 1853 | "language": "python", 1854 | "metadata": {}, 1855 | "outputs": [ 1856 | { 1857 | "output_type": "stream", 1858 | "stream": "stdout", 1859 | "text": [ 1860 | "[stdout:0] kaas\n" 1861 | ] 1862 | } 1863 | ], 1864 | "prompt_number": 39 1865 | }, 1866 | { 1867 | "cell_type": "code", 1868 | "collapsed": false, 1869 | "input": [], 1870 | "language": "python", 1871 | "metadata": {}, 1872 | "outputs": [] 1873 | } 1874 | ], 1875 | "metadata": {} 1876 | } 1877 | ] 1878 | } --------------------------------------------------------------------------------