├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.md ├── SimpleHOHMM ├── __init__.py ├── builder.py ├── model.py ├── package_info.json └── utility.py ├── docs ├── Makefile ├── make.bat └── source │ ├── api_reference.rst │ ├── conf.py │ ├── getting_started.rst │ ├── index.rst │ ├── license.rst │ ├── references.rst │ └── tutorials.rst ├── requirements.txt ├── setup.py └── test ├── __init__.py ├── test_builder.py ├── test_hmm.py └── test_utility.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | 3 | language: python 4 | python: 5 | - "2.7" 6 | - "3.4" 7 | - "3.5" 8 | - "pypy" 9 | - "pypy3" 10 | 11 | branches: 12 | only: 13 | - master 14 | 15 | install: 16 | - pip install coveralls 17 | 18 | script: 19 | - coverage run -m unittest discover -s test 20 | 21 | after_success: 22 | - coveralls 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jacob Krantz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Simple-HOHMM
2 |
3 | [![Build Status](https://travis-ci.org/jacobkrantz/Simple-HOHMM.svg?branch=master)](https://travis-ci.org/jacobkrantz/Simple-HOHMM)
4 | [![Coverage Status](https://coveralls.io/repos/github/jacobkrantz/Simple-HOHMM/badge.svg?branch=master)](https://coveralls.io/github/jacobkrantz/Simple-HOHMM?branch=master)
5 | [![Documentation Status](https://readthedocs.org/projects/simple-hohmm/badge/?version=latest)](http://simple-hohmm.readthedocs.io/en/latest/?badge=latest)
6 |
7 | Simple-HOHMM is an end-to-end sequence classifier using Hidden Markov Models. Let the builder construct a model for you based on chosen model attributes. Now you can solve the classic problems of HMMs: evaluating, decoding, and learning. Play with different orders of history to maximize the accuracy of your model!
8 |
9 | ## General
10 |
11 | #### Solving Fundamental Problems
12 | * **Evaluation**
13 | Given an observation sequence and an HMM, determine the probability that the HMM would emit that exact observation sequence. Done with the *Forward Algorithm*.
14 | * **Decoding**
15 | Given an observation sequence and an HMM, determine the most likely hidden state sequence that would emit the observation sequence. Done with the *Viterbi Algorithm*.
16 | * **Learning**
17 | Given a set of observation sequences and an HMM, reestimate the model parameters so as to maximize the probabilities resulting from the Evaluation problem. Done with the *Baum-Welch EM Algorithm*.
18 |
19 | #### Features
20 | * Learning is done in any manner desired: **supervised**, **semi-supervised**, or **unsupervised**. Supervised learning is done with explicit counts from training examples. Semi-supervised learning is done with some training examples followed by a learning algorithm. Unsupervised learning is done by creating a model of either uniformly or randomly distributed parameters followed by a learning algorithm.
21 | * Discrete (Multinomial) emissions only.
22 | * Ergodic state transitions are assumed by the model, but setting certain probabilities to zero effectively emulates unreachable states.
23 | * Smoothing of model parameters is done with additive k-smoothing to avoid cases of zero probability, especially useful for higher order modeling.
24 | * `HiddenMarkovModel` can be trained using `HiddenMarkovModelBuilder` or by passing in explicit HMM parameter values.
25 |
26 | ## Getting Started
27 |
28 | #### Requirements
29 | This project is currently written in pure Python code with zero dependencies for installation. The code has been tested and runs with Python 2, Python 3, and [pypy](https://pypy.org/). Running with pypy offers drastic speed improvements; consider this when working with large models.
30 |
31 | #### Installing Simple-HOHMM
32 | No distribution exists on PyPI yet.
To use the code now, you can install it directly from the repository:
33 | `>>> pip install git+https://github.com/jacobkrantz/Simple-HOHMM.git`
34 | Take a look at the documentation to view all methods of installation.
35 |
36 | #### Documentation
37 | [Documentation](http://simple-hohmm.readthedocs.io/en/latest/?badge=latest) consisting of an API reference and basic tutorials is live, but the API reference has not been developed yet. The tutorials there should get you up and running in the general use cases.
38 |
39 | ## Contributions
40 | Contributions are welcome. We have not hashed out exactly what that will look like yet. For now, feel free to fork the repository and dive in as you see fit, whether that is making/improving documentation, tutorials, test cases, issues, or source code. Contributors should have all dependencies installed from `requirements.txt`. This can be done using:
41 | `>>> pip install -r requirements.txt`
42 |
43 | #### Testing
44 | Run the unit tests before opening a pull request to ensure the build does not break.
45 | * Testing is done through the Python module `unittest`.
46 | * Automated testing is performed by Travis CI.
47 | * All test cases are located in `/test`.
48 |
49 | To run the entire suite of tests locally, execute:
50 | `>>> python -m unittest discover -s test`
51 | Alternatively:
52 | `>>> python setup.py test`
53 |
54 | #### Building the Documentation
55 | Docs are built using Sphinx and hosted using ReadTheDocs. You can edit the docs by updating the `.rst` files in the `/docs` folder.
56 | Make the documentation:
57 | ```
58 | >>> cd docs
59 | >>> make html
60 | ```
61 | View in browser:
62 | `>>> xdg-open build/html/index.html` (if using Linux)
63 | `>>> open build/html/index.html` (if using macOS)
64 |
65 | #### Viewing Code Coverage
66 | View code coverage before opening a pull request to ensure coverage is maintained or improved.
67 | Run the unit tests using coverage:
68 | `>>> coverage run -m unittest discover -s test`
69 | View the coverage report:
70 | `>>> coverage report -m`
71 |
-------------------------------------------------------------------------------- /SimpleHOHMM/__init__.py: --------------------------------------------------------------------------------
1 | import json
2 | from os.path import dirname
3 |
4 | from .builder import HiddenMarkovModelBuilder
5 | from .model import HiddenMarkovModel
6 |
7 | with open(dirname(__file__) + '/package_info.json') as f:
8 | _info = json.load(f)
9 |
10 | __version__ = str(_info["version"])
11 | __author__ = str(_info["author"])
12 | __contact__ = str(_info["author_email"])
13 |
-------------------------------------------------------------------------------- /SimpleHOHMM/builder.py: --------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from itertools import product
3 | import random as ran
4 |
5 | from .model import HiddenMarkovModel as HMM
6 | from .utility import init_matrix, init_matrix_uniform, init_matrix_random
7 |
8 | class HiddenMarkovModelBuilder:
9 |
10 | def __init__(self):
11 | self._obs_sequences = list()
12 | self._state_sequences = list()
13 | self._single_states = None
14 | self._all_obs = None
15 |
16 | def add_training_example(self, o, s):
17 | """
18 | Adds a single training example to the model builder.
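Observation o[i] is treated as being emitted by hidden state s[i],
so both sequences are expected to have equal length.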
19 | Args:
20 | o (list): Observation sequence
21 | s (list): Hidden state sequence
22 | """
23 | self._obs_sequences.append(o)
24 | self._state_sequences.append(s)
25 |
26 | def add_batch_training_examples(self, o_lst, s_lst):
27 | """
28 | Adds a batch of training examples to the model builder.
29 | Args:
30 | o_lst (list<list>): Observation sequences
31 | s_lst (list<list>): Hidden state sequences
32 | """
33 | self._obs_sequences += o_lst
34 | self._state_sequences += s_lst
35 |
36 | def set_single_states(self, single_states):
37 | """
38 | Sets the singular hidden states vocabulary for the HMM. If called
39 | multiple times, the vocabulary is overwritten.
40 | Args:
41 | single_states (list): list of possible singular hidden
42 | states. These states should disregard HMM order.
43 | """
44 | self._single_states = list(single_states)
45 |
46 | def set_all_obs(self, all_obs):
47 | """
48 | Sets the observation vocabulary for the HMM. If called multiple
49 | times, the vocabulary is overwritten.
50 | Args:
51 | all_obs (list): list of possible model observations.
52 | """
53 | self._all_obs = list(all_obs)
54 |
55 | def build(self, highest_order=1, k_smoothing=0.0, synthesize_states=False, include_pi=True):
56 | """
57 | Builds a Hidden Markov Model based on the previously added
58 | training examples.
59 | Args:
60 | highest_order (int): History window of hidden states. Defaults to 1.
61 | k_smoothing (float): Parameter for add-k smoothing, a
62 | generalization of Laplace smoothing. Defaults to 0.0.
63 | synthesize_states (boolean): Generate all states from permutations
64 | of single states. Avoids OOV for higher order models
65 | and ensures the model is fully ergodic.
66 | include_pi (boolean): True if the starting probabilities should be
67 | calculated from explicit training counts. False if the starting
68 | probabilities should all be set to 1 and thus ignored.
69 | Returns:
70 | HiddenMarkovModel: capable of evaluating, decoding, and learning.
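Example (an illustrative sketch; `observations` and `states` are
    assumed to be equal-length lists of observation and state
    sequences, as in the project tutorials):
        builder = HiddenMarkovModelBuilder()
        builder.add_batch_training_examples(observations, states)
        hmm = builder.build(highest_order=2, k_smoothing=0.01)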
71 | """ 72 | if(highest_order < 1): 73 | raise ValueError("highest order must be 1 or greater.") 74 | 75 | # build state and observation sets 76 | if(self._all_obs is None): 77 | all_obs = self._get_unique_elements(self._obs_sequences) 78 | else: 79 | all_obs = self._all_obs 80 | 81 | if(self._single_states is None): 82 | single_states = self._get_higher_order_states(self._state_sequences, 1) 83 | if(synthesize_states): 84 | all_states = self._make_permutations(single_states, highest_order) 85 | else: 86 | all_states = self._get_higher_order_states(self._state_sequences, highest_order) 87 | else: 88 | synthesize_states = True 89 | single_states = self._single_states 90 | all_states = self._make_permutations(single_states, highest_order) 91 | 92 | # build probability distribution parameters 93 | start_probs = list() 94 | for i in range(highest_order): 95 | start_probs.append(self._calculate_start_probs( 96 | state_sequences = self._state_sequences, 97 | single_states = single_states, 98 | order = i+1, 99 | k_smoothing = k_smoothing, 100 | synthesize_states = synthesize_states, 101 | set_to_1 = not include_pi 102 | )) 103 | 104 | trans_probs = self._calculate_transition_probs(all_states, highest_order, k_smoothing) 105 | emission_probs = self._calculate_emission_probs(single_states, all_obs, k_smoothing) 106 | 107 | # combine all parameters to build final model 108 | return HMM( 109 | trans_probs, 110 | emission_probs, 111 | start_probs, 112 | all_obs, 113 | all_states, 114 | single_states=single_states, 115 | order=highest_order 116 | ) 117 | 118 | def build_unsupervised(self, single_states=None, all_obs=None, distribution="random", highest_order=1): 119 | """ 120 | Builds a Hidden Markov Model based on a uniform probability 121 | distribution. 122 | Args: 123 | single_states (list<>): list of unique elements detailing all 124 | possible hidden states the model should account for. If default, 125 | uses the values set previously through 'set_single_states'. 126 | all_obs (list<>): list of unique elements detailing all possible 127 | observation elements the model should account for. If default, 128 | uses the values set previously through 'set_all_obs'. 129 | distribution (string): either 'random' for a random probability 130 | distribution, or 'uniform' for a uniform probability 131 | distribution. defaults to 'random'. 132 | highest_order (int): History window of hidden states. Defaults to 1. 133 | Returns: 134 | HiddenMarkovModel: capable of evaluating, decoding, and learning. 
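Example (an illustrative sketch; the vocabularies shown are
    assumptions borrowed from the project tutorials):
        builder = HiddenMarkovModelBuilder()
        builder.set_single_states(['healthy', 'fever'])
        builder.set_all_obs(['normal', 'cold', 'dizzy'])
        hmm = builder.build_unsupervised(
            distribution='random',
            highest_order=2
        )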
135 | """ 136 | if(distribution not in ('random', 'uniform')): 137 | raise ValueError("parameter 'distribution must be either 'random' or 'uniform'") 138 | if(single_states is None): 139 | single_states = self._single_states 140 | if(all_obs is None): 141 | all_obs = self._all_obs 142 | 143 | single_states = list(set(single_states)) 144 | all_obs = list(set(all_obs)) 145 | all_states = self._make_permutations(single_states, highest_order) 146 | num_states = len(all_states) 147 | if(distribution == 'uniform'): 148 | trans_probs = init_matrix_uniform(num_states, num_states) 149 | emission_probs = init_matrix_uniform(num_states, len(all_obs)) 150 | start_probs = self._init_uniform_start_probs( 151 | single_states, 152 | highest_order 153 | ) 154 | else: # 'random' 155 | trans_probs = init_matrix_random(num_states, num_states) 156 | emission_probs = init_matrix_random(num_states, len(all_obs)) 157 | start_probs = self._init_random_start_probs( 158 | single_states, 159 | highest_order 160 | ) 161 | 162 | # combine all parameters to build final model 163 | return HMM( 164 | trans_probs, 165 | emission_probs, 166 | start_probs, 167 | all_obs, 168 | all_states, 169 | single_states=single_states, 170 | order=highest_order 171 | ) 172 | 173 | def clear_all_sets(self): 174 | """ 175 | Deletes all training examples previously in the builder. 176 | Deletes observation and hidden state vocabularies. 177 | """ 178 | self._obs_sequences = list() 179 | self._state_sequences = list() 180 | self._single_states = None 181 | self._all_obs = None 182 | 183 | # ----------------- # 184 | # Private # 185 | # ----------------- # 186 | 187 | def _get_unique_elements(self, set_of_lists): 188 | unique_set = set() 189 | for obs_lst in set_of_lists: 190 | unique_set.update(set(obs_lst)) 191 | return list(unique_set) 192 | 193 | def _calculate_transition_probs(self, all_states, order, k_smoothing): 194 | matrix_size = len(all_states) 195 | state_trans_dict = dict() 196 | 197 | # initialize matrix and normalization dict 198 | trans_probs = init_matrix(matrix_size, matrix_size, "int") 199 | for state in all_states: 200 | state_trans_dict[state] = 0 201 | 202 | # insert counts of transitions 203 | state_sequences = self._make_higher_order_states( 204 | self._state_sequences, 205 | order 206 | ) 207 | 208 | for states in state_sequences: 209 | for i in range(1, len(states)): 210 | prev_index = all_states.index(states[i - 1]) 211 | cur_index = all_states.index(states[i]) 212 | trans_probs[prev_index][cur_index] += 1 213 | state_trans_dict[all_states[prev_index]] += 1 214 | 215 | # normalize such that for all rows sum(trans_probs[state][s0...sn]) == 1 216 | for prev_index in range(matrix_size): 217 | divisor = state_trans_dict[all_states[prev_index]] 218 | if divisor == 0 and k_smoothing == 0: 219 | continue # avoid ZeroDivisionError 220 | 221 | for cur_index in range(matrix_size): 222 | trans_probs[prev_index][cur_index] += k_smoothing 223 | trans_probs[prev_index][cur_index] /= float( 224 | divisor + (matrix_size * k_smoothing) 225 | ) 226 | 227 | return trans_probs 228 | 229 | def _calculate_emission_probs(self, all_states, all_obs, k_smoothing): 230 | rows = len(all_states) 231 | columns = len(all_obs) 232 | state_emission_dict = dict() 233 | 234 | # initializate matrix and normalization dict 235 | emission_probs = init_matrix(rows, columns, "int") 236 | for state in all_states: 237 | state_emission_dict[state] = 0 + k_smoothing 238 | 239 | # insert counts of emissions 240 | for i in range(len(self._obs_sequences)): 241 | 
obs_lst = self._obs_sequences[i]
242 | states_lst = self._state_sequences[i]
243 | for j in range(len(obs_lst)):
244 | obs = obs_lst[j]
245 | obs_index = all_obs.index(obs)
246 |
247 | state = states_lst[j]
248 | state_index = all_states.index(state)
249 |
250 | emission_probs[state_index][obs_index] += 1
251 | state_emission_dict[state] += 1
252 |
253 | # normalize such that for all rows sum(emission_probs[state][o0...on]) == 1
254 | for row in range(rows):
255 | divisor = float(state_emission_dict[all_states[row]])
256 | for column in range(columns):
257 | emission_probs[row][column] += k_smoothing
258 | emission_probs[row][column] /= float(
259 | divisor + (rows * k_smoothing)
260 | )
261 |
262 | return emission_probs
263 |
264 | def _get_higher_order_states(self, state_sequences, order):
265 | if(order == 1):
266 | return self._get_unique_elements(state_sequences)
267 |
268 | all_states_set = set()
269 |
270 | for sequence in state_sequences:
271 | if(len(sequence) <= order):
272 | continue
273 |
274 | for i in range(order - 1, len(sequence)):
275 | state = ""
276 | for j in range(i-order+1, i+1):
277 | state += (sequence[j] + '-')
278 |
279 | all_states_set.add(state[:len(state)-1])
280 |
281 | return list(all_states_set)
282 |
283 | def _calculate_start_probs(self, state_sequences, single_states, order, k_smoothing, synthesize_states, set_to_1):
284 | """
285 | Calculates the starting probability distribution for a given order.
286 | Args:
287 | state_sequences (list<list>): Hidden state sequences
288 | single_states (list): list of possible singular hidden
289 | states. These states should disregard HMM order.
290 | order (int): History window of hidden states.
291 | k_smoothing (float): Parameter for add-k smoothing, a
292 | generalization of Laplace smoothing.
293 | synthesize_states (boolean): if True, creates the state set from permutations of the single states rather than from training counts.
294 | set_to_1 (boolean): set all starting probabilities to 1 if True.
295 | Otherwise, calculate and normalize from training counts.
296 | Returns: 297 | dict[state:probability] 298 | """ 299 | start_probs_dict = dict() 300 | 301 | # initialize dictionary to state:initial count 302 | if synthesize_states: 303 | states = self._make_permutations(single_states, order) 304 | else: 305 | states = self._get_higher_order_states(state_sequences, order) 306 | 307 | for state in states: 308 | start_probs_dict[state] = 1 if set_to_1 else k_smoothing 309 | 310 | if set_to_1: 311 | return start_probs_dict 312 | 313 | # insert counts 314 | start_state_emissions = 0 315 | for state_seq in state_sequences: 316 | if(len(state_seq) < order): 317 | continue 318 | 319 | state = "" 320 | for i in range(order): 321 | state += (state_seq[i] + '-') 322 | start_probs_dict[state[:len(state)-1]] += 1 323 | start_state_emissions += 1 324 | 325 | # normalize dictionary such that sum(start_probs_dict[s0...sn]) = 1 326 | for state in start_probs_dict.keys(): 327 | start_probs_dict[state] /= float( 328 | start_state_emissions 329 | + (len(states) * k_smoothing) 330 | ) 331 | 332 | return start_probs_dict 333 | 334 | def _init_uniform_start_probs(self, states, highest_order): 335 | start_probs = [] 336 | for i in range(highest_order): 337 | start_probs_dict = dict() 338 | states_of_order = self._make_permutations(states, i + 1) 339 | value = float(1.0 / len(states_of_order)) 340 | for i, state in enumerate(states_of_order): 341 | start_probs_dict[state] = value 342 | 343 | start_probs.append(start_probs_dict) 344 | 345 | return start_probs 346 | 347 | def _init_random_start_probs(self, states, highest_order): 348 | start_probs = [] 349 | for i in range(highest_order): 350 | start_probs_dict = dict() 351 | states_of_order = self._make_permutations(states, i + 1) 352 | values = [ran.random() for i in range(len(states_of_order))] 353 | for i, state in enumerate(states_of_order): 354 | start_probs_dict[state] = values[i] / sum(values) 355 | 356 | start_probs.append(start_probs_dict) 357 | 358 | return start_probs 359 | 360 | def _make_higher_order_states(self, state_sequences, order): 361 | """ 362 | Args: 363 | state_sequences (list>): states to convert to a 364 | given order. 365 | order (int): n-gram value of history. 366 | Returns: 367 | list> state_sequences mapped to n-grams. 368 | Example: 369 | state_sequences = [['a', 'b', 'c', 'd', 'e', 'f']] 370 | order = 1: [['a', 'b', 'c', 'd', 'e', 'f']] 371 | order = 2: [['a-b', 'b-c', 'c-d', 'd-e', 'e-f']] 372 | order = 3: [['a-b-c', 'b-c-d', 'c-d-e', 'd-e-f']] 373 | """ 374 | if(order == 1): 375 | return state_sequences 376 | 377 | new_sequences = [] 378 | for sequence in state_sequences: 379 | new_sequence = [] 380 | for i in range(order-1, len(sequence)): 381 | state = "" 382 | for j in range(i-order+1, i+1): 383 | state += (sequence[j] + '-') 384 | 385 | new_sequence.append(state[:len(state)-1]) 386 | 387 | new_sequences.append(new_sequence) 388 | 389 | return new_sequences 390 | 391 | def _make_permutations(self, states, highest_order): 392 | """ makes a list of all permutation states from a single state. 
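Example: states=['a', 'b'] with highest_order=2 yields
        ['a-a', 'a-b', 'b-a', 'b-b'].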
""" 393 | if(highest_order == 1): 394 | return states 395 | 396 | states_lists = product(states, repeat = highest_order) 397 | new_states = [] 398 | for states_lst in states_lists: 399 | state = "" 400 | for i in range(len(states_lst)): 401 | state += (states_lst[i] + '-') 402 | 403 | new_states.append(state[:len(state)-1]) 404 | 405 | return new_states 406 | -------------------------------------------------------------------------------- /SimpleHOHMM/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from itertools import chain 4 | from math import log 5 | 6 | from .utility import init_matrix, init_3d_matrix 7 | 8 | class HiddenMarkovModel: 9 | """ 10 | Notation used: 11 | HMM: Hidden Markov Model 12 | O: Observation sequence 13 | S: Hidden state sequence 14 | A: State transition probability distribution matrix 15 | B: Observation emission probability distribution matrix 16 | pi: Initial state probability distribution vector 17 | lambda: A HMM comprised of (A,B,pi) 18 | """ 19 | def __init__(self, A, B, pi, all_obs, all_states, single_states=None, order=1): 20 | if(single_states == None): 21 | self._single_states = all_states 22 | else: 23 | self._single_states = single_states 24 | self._all_states = all_states 25 | self._all_obs = all_obs 26 | self._A = A 27 | self._B = B 28 | self._pi = pi 29 | self._highest_order = order 30 | 31 | def evaluate(self, sequence): 32 | """ 33 | Evaluation Problem: Calculate P(O|lambda). 34 | Calculates the probability of emitting the given observation 35 | sequence based on the HMM. Uses the forward algorithm. 36 | Args: 37 | sequence (list): observation sequence O 38 | Returns: 39 | float: probability of sequence being emitted 40 | """ 41 | self._check_legal_sequence(sequence) 42 | if(len(sequence) == 0): 43 | return 0 44 | 45 | alpha = self._forward(sequence) 46 | fwd_probability = sum(map( 47 | lambda s: alpha[s][len(sequence) - 1], 48 | range(len(self._all_states))) 49 | ) 50 | return fwd_probability 51 | 52 | def decode(self, sequence): 53 | """ 54 | Decoding Problem: Given O and lambda, find S such that S 'best' 55 | describes O using lambda. Uses the Viterbi Algorithm. 56 | Args: 57 | sequence (list): observation sequence O 58 | Returns: 59 | list: hidden state sequence S 60 | """ 61 | self._check_legal_sequence(sequence) 62 | if(len(sequence) == 0): 63 | return [] 64 | return self._viterbi(sequence) 65 | 66 | def learn(self, sequences, delta=0.0001, k_smoothing=0.0, iterations=-1): 67 | """ 68 | Learning Problem: Reestimate the model parameters (A,B,pi) iteratively 69 | using the Baum-Welch Algorithm (EM). Maximize P(O|lambda). 70 | It should be known that pi is currently not fully updated for HMMs 71 | of order greater than one. 72 | Args: 73 | sequences (list): list of observations O = (O1,O2,...On) used 74 | to train the initial (A,B,pi) parameters. 75 | delta (float): log value of iterative improvement such that when 76 | evaluation probabilities improve by less than delta the 77 | learning process is complete. 78 | k_smoothing (float): Smoothing parameter for add-k smoothing to 79 | avoid zero probability. Value should be between [0.0, 1.0]. 80 | iterations (int): number of iterations to perform. Will return 81 | if convergence is found before all iterations 82 | have been performed. 83 | Returns: 84 | (int): number of iterations to achieve convergence. 
85 | """ 86 | self._check_legal_sequence(set(chain.from_iterable(sequences))) 87 | num_sequences = len(sequences) 88 | 89 | cur_iterations = 0 90 | if(num_sequences == 0): 91 | return cur_iterations 92 | 93 | prior_score = sum(map( 94 | lambda O: log(self.evaluate(O)), 95 | sequences 96 | )) / num_sequences 97 | 98 | while True: 99 | for seq in sequences: 100 | self._train(seq, k_smoothing) 101 | 102 | cur_iterations += 1 103 | new_score = sum(map( 104 | lambda O: log(self.evaluate(O)), 105 | sequences 106 | )) / num_sequences 107 | 108 | if(abs(prior_score - new_score) < delta): 109 | break 110 | if(iterations > -1 and cur_iterations >= iterations): 111 | break 112 | prior_score = new_score 113 | 114 | return cur_iterations 115 | 116 | def get_parameters(self): 117 | """ Dictionary of all model parameters. """ 118 | return { 119 | "A": self._A, 120 | "B": self._B, 121 | "pi": self._pi, 122 | "all_obs": self._all_obs, 123 | "all_states": self._all_states, 124 | "single_states": self._single_states 125 | } 126 | 127 | def display_parameters(self): 128 | """ Display the lambda parameters (A,B,pi) on the console. """ 129 | names = [ 130 | "Starting probabilities (pi):", 131 | "Transition probabilities (A):", 132 | "Emission probabilities (B):" 133 | ] 134 | for i, parameter in enumerate([self._pi, self._A, self._B]): 135 | print(names[i]) 136 | for element in parameter: 137 | print(element) 138 | 139 | # ----------------- # 140 | # Private # 141 | # ----------------- # 142 | 143 | def _check_legal_sequence(self, seq): 144 | """ Throws ValueError if an element of seq is not in self._all_obs """ 145 | illegal_obs = list([x for x in seq if x not in self._all_obs]) 146 | if(len(illegal_obs) == 0): 147 | return True 148 | 149 | if(len(illegal_obs) == 1): 150 | msg = "Observation out of vocabulary: '" 151 | else: 152 | msg = "Observations out of vocabulary: '" 153 | raise ValueError(msg + ", ".join(illegal_obs) + "'") 154 | 155 | def _forward(self, sequence): 156 | rows = len(self._all_states) 157 | columns = len(sequence) 158 | alpha = init_matrix(rows, columns, "float") 159 | 160 | # initialization step 161 | for s_index, state in enumerate(self._single_states): 162 | o_index = self._all_obs.index(sequence[0]) 163 | alpha[s_index][0] = ( 164 | self._pi[0][state] 165 | * self._B[s_index][o_index] 166 | ) 167 | 168 | # iterative step 169 | for t_index in range(columns - 1): 170 | obs = sequence[t_index + 1] 171 | for s_index, state in enumerate(self._all_states): 172 | single_state_index = self._single_states.index( 173 | self._get_state_by_order(state, 1) 174 | ) 175 | for s_prime in range(len(self._all_states)): 176 | if(t_index + 1 < self._highest_order): 177 | state_by_order = self._get_state_by_order( 178 | self._all_states[s_index], 179 | t_index + 2 180 | ) 181 | a_prob = self._pi[t_index + 1][state_by_order] 182 | else: 183 | a_prob = self._A[s_prime][s_index] 184 | 185 | alpha[s_index][t_index + 1] += ( 186 | alpha[s_prime][t_index] 187 | * a_prob 188 | * self._B[single_state_index][self._all_obs.index(obs)] 189 | ) 190 | 191 | return alpha 192 | 193 | def _backward(self, sequence): 194 | rows = len(self._all_states) 195 | columns = len(sequence) 196 | beta = init_matrix(rows, columns, "float") 197 | 198 | # initialization step 199 | for s_index, state in enumerate(self._all_states): 200 | beta[s_index][-1] = 1 201 | 202 | # iterative step 203 | for t_index in reversed(range(columns-1)): 204 | obs = sequence[t_index + 1] 205 | for s_index in range(len(self._all_states)): 206 | for s_prime, 
state in enumerate(self._all_states):
207 | single_state_index = self._single_states.index(
208 | self._get_state_by_order(state, 1)
209 | )
210 | beta[s_index][t_index] += (
211 | beta[s_prime][t_index + 1]
212 | * self._A[s_index][s_prime]
213 | * self._B[single_state_index][self._all_obs.index(obs)]
214 | )
215 |
216 | return beta
217 |
218 | def _viterbi(self, sequence):
219 | """
220 | Notation used:
221 | delta: matrix holding the highest probability state path
222 | at observation time t.
223 | psi: backpointer matrix maintaining which state maximized delta.
224 | Args:
225 | sequence (list): observation sequence O
226 | Returns:
227 | list: hidden state sequence S
228 | """
229 | delta, psi = self._viterbi_forward(sequence)
230 | return self._viterbi_backward(delta, psi, sequence)
231 |
232 | def _viterbi_forward(self, sequence):
233 | """ build probability quantities delta and backpointers psi """
234 | rows = len(self._all_states)
235 | columns = len(sequence)
236 |
237 | delta = init_matrix(rows, columns, "int")
238 | psi = init_matrix(rows, columns, 'int,int')
239 |
240 | # initialization step
241 | obs_index = self._all_obs.index(sequence[0])
242 | for s_index, state in enumerate(self._all_states):
243 | single_state = self._get_state_by_order(state, 1)
244 | single_state_index = self._single_states.index(single_state)
245 | delta[s_index][0] = (
246 | self._pi[0][single_state]
247 | * self._B[single_state_index][obs_index]
248 | )
249 |
250 | # iterative step
251 | for o_index in range(1, columns):
252 | o_master_index = self._all_obs.index(sequence[o_index])
253 | for s_index, state in enumerate(self._all_states):
254 | max_prob = 0
255 | row_back = 0
256 | col_back = 0
257 |
258 | single_state_index = self._single_states.index(self._get_state_by_order(state, 1))
259 | emission_multiplier = self._B[single_state_index][o_master_index]
260 |
261 | # a multiplier of 0.0 nullifies the following computation
262 | if emission_multiplier == 0.0:
263 | continue
264 |
265 | for prev_s_index in range(rows):
266 | transition_multiplier = 0
267 | if(o_index < self._highest_order):
268 | state_by_order = self._get_state_by_order(
269 | self._all_states[s_index],
270 | o_index + 1
271 | )
272 | transition_multiplier = self._pi[o_index][state_by_order]
273 | else:
274 | transition_multiplier = self._A[prev_s_index][s_index]
275 |
276 | cur_prob = (
277 | delta[prev_s_index][o_index - 1]
278 | * transition_multiplier
279 | * emission_multiplier
280 | )
281 | if cur_prob > max_prob:
282 | max_prob = cur_prob
283 | row_back = prev_s_index
284 | col_back = o_index - 1
285 |
286 | delta[s_index][o_index] = max_prob
287 | psi[s_index][o_index] = (row_back, col_back)
288 |
289 | return delta, psi
290 |
291 | def _viterbi_backward(self, delta, psi, sequence):
292 | """ Decode by following the backpointers of psi """
293 | rev_output = []
294 | j_max = len(sequence)
295 | max_final = 0
296 | i_final = 0
297 |
298 | # find the highest probability final state (the output is built in reverse)
299 | for i in range(len(self._all_states)):
300 | current_final = delta[i][j_max - 1]
301 | if current_final > max_final:
302 | max_final = current_final
303 | i_final = i
304 |
305 | rev_output.append(self._get_state_by_order(self._all_states[i_final], 1))
306 | i_cur = psi[i_final][j_max - 1][0]
307 | j_cur = psi[i_final][j_max - 1][1]
308 |
309 | for j in range(j_max - 2, -1, -1):
310 | rev_output.append(self._get_state_by_order(self._all_states[i_cur], 1))
311 | i_cur_old = i_cur
312 | i_cur = psi[i_cur][j_cur][0]
313 | j_cur =
psi[i_cur_old][j_cur][1]
314 |
315 | return rev_output[::-1]
316 |
317 | def _train(self, sequence, k_smoothing=0.0):
318 | """
319 | Use the Baum-Welch Algorithm which utilizes Expectation-Maximization
320 | and the Forward-Backward algorithm to find the maximum likelihood
321 | estimate for parameters (A,B,pi).
322 | Notation used:
323 | gamma: Probability of being in state i at time t
324 | given O and (A,B,pi).
325 | Row: state. Column: observation
326 | xi: Joint probability of being in state i at time t and
327 | state j at time (t + 1) given O and (A,B,pi).
328 | xi[state i][state j][time t]
329 | Args:
330 | sequence (list): Observation sequence O
331 | k_smoothing (float): Smoothing parameter for add-k smoothing to
332 | avoid zero probability. Value should be between [0.0, 1.0].
333 | """
334 | rows = len(self._all_states)
335 | columns = len(sequence)
336 |
337 | alpha = self._forward(sequence)
338 | beta = self._backward(sequence)
339 |
340 | # build gamma
341 | gamma = init_matrix(rows, columns, "float")
342 | for s_index in range(rows):
343 | for o_index in range(columns):
344 | prob = alpha[s_index][o_index] * beta[s_index][o_index]
345 | prob /= sum(map(
346 | lambda j: alpha[j][o_index] * beta[j][o_index],
347 | range(rows)
348 | ))
349 | gamma[s_index][o_index] = prob
350 |
351 | # build xi
352 | xi = init_3d_matrix(rows, rows, columns - 1)
353 | for o_index in range(columns - 1):
354 | obs = sequence[o_index]
355 | obs_next = sequence[o_index + 1]
356 |
357 | denominator = 0.0
358 | for s_from in range(rows):
359 | for s_to, state_to in enumerate(self._all_states):
360 | single_state_index = self._single_states.index(
361 | self._get_state_by_order(state_to, 1)
362 | )
363 | prob = (
364 | alpha[s_from][o_index]
365 | * beta[s_to][o_index + 1]
366 | * self._A[s_from][s_to]
367 | * self._B[single_state_index][self._all_obs.index(obs_next)]
368 | )
369 | xi[s_from][s_to][o_index] = prob
370 | denominator += prob
371 |
372 | if denominator == 0:
373 | continue
374 |
375 | for s_from in range(rows):
376 | for s_to in range(rows):
377 | xi[s_from][s_to][o_index] /= denominator
378 |
379 | # update all parameters (A,B,pi).
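# The updates below follow the standard Baum-Welch re-estimation
# formulas, with add-k smoothing applied to the numerators and
# denominators:
#   pi[i]   ~ gamma[i][0]                 (expected frequency at t = 0)
#   A[i][j] ~ sum_t xi[i][j][t] / sum_t gamma[i][t]   for t < T-1
#   B[i][k] ~ sum of gamma[i][t] over t where O_t == k,
#             divided by sum_t gamma[i][t]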
380 | for s_index, state in enumerate(self._all_states): 381 | # update pi 382 | self._pi[self._highest_order - 1][state] = ( 383 | (gamma[s_index][0] + k_smoothing) 384 | / (1 + rows * k_smoothing) 385 | ) 386 | 387 | # update A 388 | gamma_sum = sum(map( 389 | lambda o_index: gamma[s_index][o_index], 390 | range(columns - 1) 391 | )) 392 | if(gamma_sum == 0): 393 | for s_prime in range(rows): 394 | self._A[s_index][s_prime] = 0 395 | else: 396 | for s_prime in range(rows): 397 | xi_sum = sum(map( 398 | lambda o_index: xi[s_index][s_prime][o_index], 399 | range(columns - 1) 400 | )) 401 | self._A[s_index][s_prime] = ( 402 | (xi_sum + k_smoothing) 403 | / (gamma_sum + (rows * k_smoothing)) 404 | ) 405 | 406 | # update B 407 | gamma_sum += gamma[s_index][columns - 1] 408 | single_state_index = self._single_states.index( 409 | self._get_state_by_order(state, 1) 410 | ) 411 | if(gamma_sum == 0): 412 | for o_index in range(columns): 413 | self._B[single_state_index][o_index] = 0 414 | else: 415 | gamma_b_sum = list(map( 416 | lambda x: 0, 417 | range(len(self._all_obs)) 418 | )) 419 | 420 | for o_index in range(columns): 421 | full_obs_index = self._all_obs.index(sequence[o_index]) 422 | gamma_b_sum[full_obs_index] += gamma[s_index][o_index] 423 | 424 | for o_index in range(len(self._all_obs)): 425 | self._B[single_state_index][o_index] = ( 426 | (gamma_b_sum[o_index] + k_smoothing) 427 | / (gamma_sum + (columns * k_smoothing)) 428 | ) 429 | 430 | def _get_state_by_order(self, state, order): 431 | """ 432 | Gets single state for any order HMM. 433 | Examples (let order == 1): 434 | 'a1-b0-a0' => 'a0' 435 | 'a1-b0' => 'b0' 436 | 'a1' => 'a1' 437 | Args: 438 | state: '-' delimited composite state 439 | order: desired order state to return 440 | Returns: 441 | string: modified state 442 | """ 443 | if(self._highest_order == 1): 444 | return state 445 | split_state = state.split('-') 446 | l = len(split_state) 447 | if(order > l): 448 | raise ValueError("Specified order is higher than given state.") 449 | 450 | return '-'.join(split_state[l - order:l]) 451 | -------------------------------------------------------------------------------- /SimpleHOHMM/package_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "author":"Jacob Krantz", 3 | "author_email":"jkrantz@zagmail.gonzaga.edu", 4 | "version": "0.0.3" 5 | } 6 | -------------------------------------------------------------------------------- /SimpleHOHMM/utility.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import deepcopy 3 | import random as ran 4 | 5 | def init_matrix(rows, columns, data_type="float"): 6 | """ 7 | Initialize a matrix using lists with provided size: (rows,columns) 8 | Args: 9 | rows (int) 10 | columns (int) 11 | data_type (string) must be one of: 12 | 'float': 0.0 | 'int': 0 | 'int,int': tuple(0,0) 13 | Return: 14 | the zero matrix of specified size and type 15 | """ 16 | if(data_type == 'int'): 17 | item = 0 18 | elif(data_type == 'float'): 19 | item = 0.0 20 | elif(data_type == 'int,int'): 21 | item = (0,0) 22 | 23 | matrix = [] 24 | row = [] 25 | for i in range(0, columns): 26 | row.append(item) 27 | for j in range(0, rows): 28 | matrix.append(deepcopy(row)) 29 | return matrix 30 | 31 | def init_3d_matrix(x, y, z): 32 | """ 33 | Initialize a 3-dim matrix using lists with provided size: (X,Y,Z). 
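Used by the Baum-Welch trainer to hold the xi quantities,
    indexed as xi[i][j][t].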
34 | Args:
35 | x (int)
36 | y (int)
37 | z (int)
38 | Return:
39 | the zero matrix of specified size with zeroed float values.
40 | """
41 | d1 = []
42 | for i in range(z):
43 | d1.append(0.0)
44 |
45 | d2 = []
46 | for j in range(y):
47 | d2.append(deepcopy(d1))
48 |
49 | matrix = []
50 | for k in range(x):
51 | matrix.append(deepcopy(d2))
52 |
53 | return matrix
54 |
55 | def init_matrix_uniform(row_len, column_len):
56 | """
57 | Initialize a matrix such that all rows sum to 1 and
58 | all elements in a row are the same.
59 | Args:
60 | row_len (int): Number of rows the matrix will have.
61 | column_len (int): Number of columns matrix will have.
62 | Returns:
63 | list<list>: uniformly distributed matrix.
64 | """
65 | value = float(1.0 / column_len)
66 | row = list(map(lambda x : value, range(column_len)))
67 | return list(map(lambda x : deepcopy(row), range(row_len)))
68 |
69 | def init_matrix_random(row_len, column_len):
70 | """
71 | Initialize a matrix such that all rows sum to 1 and elements are
72 | generated pseudo-randomly.
73 | Args:
74 | row_len (int): Number of rows the matrix will have.
75 | column_len (int): Number of columns matrix will have.
76 | Returns:
77 | list<list>: randomly distributed matrix.
78 | """
79 | return list(map(lambda x : _make_random_row(column_len), range(row_len)))
80 |
81 | def _make_random_row(num_elements):
82 | """ Generates a list of num_elements random floats that sum to 1. """
83 | row = [ran.random() for i in range(num_elements)]
84 | s = sum(row)
85 | return [ i / s for i in row ]
86 |
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = Simple-HOHMM
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-------------------------------------------------------------------------------- /docs/make.bat: --------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=Simple-HOHMM
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/api_reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | TODO: detailed reference guide to using the API 5 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Simple-HOHMM documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Dec 29 20:17:14 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.autodoc'] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix(es) of source filenames. 39 | # You can specify multiple suffix as a list of string: 40 | # 41 | # source_suffix = ['.rst', '.md'] 42 | source_suffix = '.rst' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # General information about the project. 48 | project = u'Simple-HOHMM' 49 | copyright = u'2017, Jacob Krantz' 50 | author = u'Jacob Krantz' 51 | 52 | # The version info for the project you're documenting, acts as replacement for 53 | # |version| and |release|, also used in various other places throughout the 54 | # built documents. 55 | # 56 | # The short X.Y version. 57 | version = u'0.0.3' 58 | # The full version, including alpha/beta/rc tags. 59 | release = u'0.0.3' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This patterns also effect to html_static_path and html_extra_path 71 | exclude_patterns = [] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 
74 | pygments_style = 'sphinx' 75 | 76 | # If true, `todo` and `todoList` produce output, else they produce nothing. 77 | todo_include_todos = False 78 | 79 | 80 | # -- Options for HTML output ---------------------------------------------- 81 | 82 | # The theme to use for HTML and HTML Help pages. See the documentation for 83 | # a list of builtin themes. 84 | # 85 | html_theme = 'sphinx_rtd_theme' 86 | 87 | # Theme options are theme-specific and customize the look and feel of a theme 88 | # further. For a list of options available for each theme, see the 89 | # documentation. 90 | # 91 | # html_theme_options = {} 92 | 93 | # Add any paths that contain custom static files (such as style sheets) here, 94 | # relative to this directory. They are copied after the builtin static files, 95 | # so a file named "default.css" will overwrite the builtin "default.css". 96 | html_static_path = ['_static'] 97 | 98 | # Custom sidebar templates, must be a dictionary that maps document names 99 | # to template names. 100 | # 101 | # This is required for the alabaster theme 102 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 103 | html_sidebars = { 104 | '**': [ 105 | 'relations.html', # needs 'show_related': True theme option to display 106 | 'searchbox.html', 107 | ] 108 | } 109 | 110 | 111 | # -- Options for HTMLHelp output ------------------------------------------ 112 | 113 | # Output file base name for HTML help builder. 114 | htmlhelp_basename = 'Simple-HOHMMdoc' 115 | 116 | 117 | # -- Options for LaTeX output --------------------------------------------- 118 | 119 | latex_elements = { 120 | # The paper size ('letterpaper' or 'a4paper'). 121 | # 122 | # 'papersize': 'letterpaper', 123 | 124 | # The font size ('10pt', '11pt' or '12pt'). 125 | # 126 | # 'pointsize': '10pt', 127 | 128 | # Additional stuff for the LaTeX preamble. 129 | # 130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 134 | # 'figure_align': 'htbp', 135 | } 136 | 137 | # Grouping the document tree into LaTeX files. List of tuples 138 | # (source start file, target name, title, 139 | # author, documentclass [howto, manual, or own class]). 140 | latex_documents = [ 141 | (master_doc, 'Simple-HOHMM.tex', u'Simple-HOHMM Documentation', 142 | u'Jacob Krantz', 'manual'), 143 | ] 144 | 145 | 146 | # -- Options for manual page output --------------------------------------- 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [ 151 | (master_doc, 'simple-hohmm', u'Simple-HOHMM Documentation', 152 | [author], 1) 153 | ] 154 | 155 | 156 | # -- Options for Texinfo output ------------------------------------------- 157 | 158 | # Grouping the document tree into Texinfo files. List of tuples 159 | # (source start file, target name, title, author, 160 | # dir menu entry, description, category) 161 | texinfo_documents = [ 162 | (master_doc, 'Simple-HOHMM', u'Simple-HOHMM Documentation', 163 | author, 'Simple-HOHMM', 'One line description of project.', 164 | 'Miscellaneous'), 165 | ] 166 | -------------------------------------------------------------------------------- /docs/source/getting_started.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | =============== 3 | 4 | Installation for Python 2 or 3 5 | ------------------------------ 6 | 7 | ``Simple-HOHMM`` can be installed directly from Github using ``pip``. 
You must have ``git`` installed for this process to work. 8 | :: 9 | 10 | >>> pip install git+https://github.com/jacobkrantz/Simple-HOHMM.git 11 | 12 | If you want the most recent staging build: 13 | :: 14 | 15 | >>> pip install git+https://github.com/jacobkrantz/Simple-HOHMM.git@staging 16 | 17 | Alternative: to view the source code and run the tests before installation: 18 | :: 19 | 20 | >>> git clone https://github.com/jacobkrantz/Simple-HOHMM.git 21 | >>> cd Simple-HOHMM 22 | >>> python setup.py test 23 | >>> python setup.py install 24 | 25 | Installation for Pypy 26 | --------------------- 27 | 28 | For usage with ``pypy``, you must install with ``pip`` inside ``pypy``: 29 | :: 30 | 31 | >>> pypy -m pip install git+https://github.com/jacobkrantz/Simple-HOHMM.git 32 | 33 | If this fails, try installing ``pip`` for ``pypy`` first: 34 | :: 35 | 36 | >>> curl -O https://bootstrap.pypa.io/get-pip.py 37 | >>> pypy get-pip.py 38 | 39 | If you want the most recent staging build still with ``pypy``: 40 | :: 41 | 42 | >>> pypy -m pip install git+https://github.com/jacobkrantz/Simple-HOHMM.git@staging 43 | 44 | Alternative staging branch with ``pypy``: 45 | :: 46 | 47 | >>> sudo pypy -m pip install --upgrade https://github.com/jacobkrantz/Simple-HOHMM/archive/staging.zip 48 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Simple-HOHMM documentation 2 | ========================== 3 | 4 | Simple-HOHMM is an end-to-end sequence classifier using Hidden Markov Models. Let the builder construct a model for you based on chosen model attributes. Now you can solve the classic problems of HMMs: evaluating, decoding, and learning. Play with different orders of history to maximize the accuracy of your model. 5 | 6 | This documentation is under development, but the tutorials are a good place to start. 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :caption: Topics 11 | 12 | getting_started 13 | tutorials 14 | api_reference 15 | references 16 | license 17 | -------------------------------------------------------------------------------- /docs/source/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | The MIT License (MIT) 5 | 6 | Copyright (c) 2017 Jacob Krantz 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
13 | -------------------------------------------------------------------------------- /docs/source/references.rst: -------------------------------------------------------------------------------- 1 | Implementation References 2 | ========================= 3 | 4 | [1] L. R. Rabiner, "A tutorial on hidden Markov models and selected applications in speech recognition," in Proceedings of the IEEE, vol. 77, no. 2, pp. 257-286, Feb 1989. 5 | doi: 10.1109/5.18626 6 | URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=18626&isnumber=698 7 | 8 | [2] Daniel Jurafsky & James H. Martin. (2016). 9 | *Speech and Language Processing*. Draft of August 7, 2017. 10 | URL: https://web.stanford.edu/~jurafsky/slp3/ 11 | 12 | [3] Du Preez, J.A., *Efficient high-order hidden Markov modelling.* 13 | PhD Dissertation, University of Stellenbosch, South Africa, 1998. 14 | URL: http://www.ussigbase.org/downloads/jadp_phd.pdf 15 | 16 | Web articles 17 | ------------ 18 | 19 | * https://en.wikipedia.org/wiki/Forward_algorithm 20 | * https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm 21 | * https://en.wikipedia.org/wiki/Viterbi_algorithm 22 | * https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm 23 | -------------------------------------------------------------------------------- /docs/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | The following tutorials are meant to give you a jump start in applying the tools of Simple-HOHMM. To see what model attributes are adjustable, view the API Reference. 5 | 6 | Supervised 7 | ---------- 8 | The following example is adapted from `Wikipedia `_. 9 | 10 | Suppose villagers are either healthy or have a fever. Fevers are diagnosed by the doctor asking patients how they feel (normal, dizzy, or cold). Assuming their health can be modeled by a discrete Markov chain, the observations are ``(normal, dizzy, cold)`` and the hidden states are ``(healthy, fever)``. The doctor has seen patients in the past, and kept that data. The observations are in one list and the states are in another such that ``states[i]`` corresponds to ``observations[i]``: 11 | :: 12 | 13 | observations = [ 14 | ['normal', 'cold', 'dizzy', 'dizzy','normal','normal'], 15 | ['cold', 'cold', 'dizzy', 'normal','normal','normal'], 16 | ['dizzy', 'dizzy', 'cold', 'normal', 'dizzy', 'normal'], 17 | ['normal', 'normal', 'cold', 'dizzy', 'dizzy', 'dizzy'] 18 | ] 19 | states = [ 20 | ['healthy', 'healthy', 'fever', 'fever', 'healthy', 'healthy'], 21 | ['healthy', 'fever', 'fever', 'healthy', 'healthy', 'fever'], 22 | ['fever', 'fever', 'fever', 'healthy', 'healthy', 'healthy'], 23 | ['healthy', 'healthy', 'healthy', 'fever', 'fever', 'fever'] 24 | ] 25 | 26 | We can now build a first order Hidden Markov Model based on the observations and states above: 27 | :: 28 | 29 | from SimpleHOHMM import HiddenMarkovModelBuilder as Builder 30 | builder = Builder() 31 | builder.add_batch_training_examples(observations, states) 32 | hmm = builder.build() 33 | 34 | Now suppose a patient has been seeing the doctor for three days and felt ``(normal, cold, dizzy)``. What might the doctor guess about this patient's health? 
This is solved with Viterbi decoding:
35 | ::
36 |
37 | obs = ['normal', 'cold', 'dizzy']
38 | states = hmm.decode(obs)
39 | print(states) # prints: ['healthy', 'healthy', 'fever']
40 |
41 | We can also determine the likelihood of a patient feeling ``(normal, cold, dizzy)``:
42 | ::
43 |
44 | obs = ['normal', 'cold', 'dizzy']
45 | likelihood = hmm.evaluate(obs)
46 | print(likelihood) # prints: 0.0433770021525
47 |
48 |
49 | Semi-Supervised
50 | ---------------
51 | For this example, we will use the same ``observations`` and ``states`` as the Supervised example.
52 | Here we initialize our model just as before:
53 | ::
54 |
55 | from SimpleHOHMM import HiddenMarkovModelBuilder as Builder
56 | builder = Builder()
57 | builder.add_batch_training_examples(observations, states)
58 | hmm = builder.build()
59 |
60 | From here we can improve the model's training even further by exposing it to observations it has not seen before. Since we are using a small set, we will limit the learning process to one iteration instead of delta convergence by utilizing the ``iterations=1`` parameter. Also, we use ``k_smoothing=0.05`` to avoid cases of zero probability:
61 | ::
62 |
63 | sequences = [
64 | ['normal', 'cold', 'dizzy','normal','normal'],
65 | ['normal', 'cold', 'normal','dizzy','normal'],
66 | ['dizzy', 'dizzy', 'dizzy','cold','normal'],
67 | ['dizzy', 'dizzy', 'normal','normal','normal'],
68 | ['cold', 'cold', 'dizzy','normal','normal'],
69 | ['normal', 'dizzy', 'dizzy','normal','cold'],
70 | ['normal', 'cold', 'dizzy', 'cold'],
71 | ['normal', 'cold', 'dizzy']
72 | ]
73 | hmm.learn(sequences, k_smoothing=0.05, iterations=1)
74 |
75 | We now determine the updated likelihood and hidden state sequence. Notice that running hmm.learn() has increased the likelihood of our observation:
76 | ::
77 |
78 | obs = ['normal', 'cold', 'dizzy']
79 | print(hmm.evaluate(obs)) # prints 0.052111435936
80 | print(hmm.decode(obs)) # prints ['healthy', 'fever', 'fever']
81 |
82 | Unsupervised
83 | ------------
84 |
85 | In fully unsupervised scenarios, we build and train a model with no prior training examples to draw from. The only data we supply to our model is the set of possible observations, the set of possible hidden states, and a collection of observation sequences to optimize for.
86 |
87 | We first gather the data to supply to our model:
88 | ::
89 |
90 | possible_observations = ['normal', 'cold', 'dizzy']
91 | possible_states = ['healthy', 'fever']
92 | sequences = [
93 | ['normal', 'cold', 'dizzy','normal','normal'],
94 | ['normal', 'cold', 'normal','dizzy','normal'],
95 | ['dizzy', 'dizzy', 'dizzy','cold','normal'],
96 | ['dizzy', 'dizzy', 'normal','normal','normal'],
97 | ['cold', 'cold', 'dizzy','normal','normal'],
98 | ['normal', 'dizzy', 'dizzy','normal','cold'],
99 | ['normal', 'cold', 'dizzy', 'dizzy','normal','normal'],
100 | ['dizzy', 'cold', 'dizzy', 'normal','normal','normal'],
101 | ['dizzy', 'cold', 'dizzy', 'normal','normal','normal'],
102 | ['normal', 'cold', 'dizzy', 'dizzy','cold','normal'],
103 | ['dizzy', 'dizzy', 'dizzy', 'dizzy', 'cold', 'cold'],
104 | ['cold', 'cold', 'cold', 'normal', 'dizzy', 'normal'],
105 | ['dizzy', 'normal', 'cold', 'cold', 'dizzy', 'dizzy']
106 | ]
107 |
108 | There are two initial distributions to choose from, either ``uniform`` or ``random``. This selection applies to model parameters A, B, pi.
124 | In our case we will initialize with a random distribution:
125 | ::
126 | 
127 |     from SimpleHOHMM import HiddenMarkovModelBuilder as Builder
128 |     builder = Builder()
129 |     hmm = builder.build_unsupervised(
130 |         single_states=possible_states,
131 |         all_obs=possible_observations,
132 |         distribution="random",
133 |         highest_order=2
134 |     )
135 | 
136 | We can view the initial model parameters, train our model using the Baum-Welch EM algorithm, then view the parameters again to see how they have been modified:
137 | ::
138 | 
139 |     hmm.display_parameters()
140 |     hmm.learn(sequences, k_smoothing=0.001)
141 |     hmm.display_parameters()
142 | 
143 | Because the initial distributions are random, results may vary from run to run. You can experiment with different ``k_smoothing`` values, ``delta`` values, and sequence selections. Of course, train on prior examples where possible.
144 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | coverage==4.4.2
2 | Sphinx==1.6.5
3 | sphinx-rtd-theme==0.2.4
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import json
5 | import sys
6 | 
7 | try:
8 |     from setuptools import setup
9 | except ImportError:
10 |     from distutils.core import setup
11 | 
12 | with open('SimpleHOHMM/package_info.json') as f:
13 |     _info = json.load(f)
14 | 
15 | def setup_package():
16 |     needs_sphinx = {'build_sphinx', 'upload_docs'}.intersection(sys.argv)
17 |     sphinx = ['sphinx'] if needs_sphinx else []
18 |     setup(
19 |         setup_requires=sphinx,
20 |         name='SimpleHOHMM',
21 |         version=_info["version"],
22 |         author=_info["author"],
23 |         author_email=_info["author_email"],
24 |         packages=['SimpleHOHMM'],
25 |         package_data={'SimpleHOHMM': ['package_info.json']},
26 |         url='https://simple-hohmm.readthedocs.io',
27 |         license='LICENSE.txt',
28 |         description='High Order Hidden Markov Model for sequence classification',
29 |         test_suite='test.test_suite',
30 |     )
31 | 
32 | if __name__ == "__main__":
33 |     setup_package()
34 | 
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from .test_builder import TestHMMBuilder
3 | from .test_hmm import TestHMM
4 | 
5 | def test_suite():
6 |     loader = unittest.TestLoader()
7 | 
8 |     test_classes_to_run = [TestHMMBuilder, TestHMM]
9 |     suites_list = []
10 | 
11 |     for test_class in test_classes_to_run:
12 |         suite = loader.loadTestsFromTestCase(test_class)
13 |         suites_list.append(suite)
14 | 
15 |     return unittest.TestSuite(suites_list)
16 | 
--------------------------------------------------------------------------------
/test/test_builder.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from SimpleHOHMM import HiddenMarkovModelBuilder as Builder
4 | 
5 | class TestHMMBuilder(unittest.TestCase):
6 | 
7 |     def setUp(self):
8 |         self._obs = [
9 |             ['normal', 'cold', 'dizzy', 'dizzy','normal','normal'],
10 |             ['dizzy', 'cold', 'dizzy', 'normal','normal','normal'],
11 |             ['dizzy', 'cold', 'dizzy', 'normal','normal','normal'],
12 |             ['normal', 'cold', 'dizzy', 'dizzy','cold','normal'],
13 |             ['dizzy', 'dizzy', 'dizzy', 'dizzy', 'cold', 'cold'],
14 |             ['cold', 'cold', 'cold', 'normal', 'dizzy', 
'normal'], 15 | ['dizzy', 'normal', 'cold', 'cold', 'dizzy', 'dizzy'] 16 | ] 17 | self._states = [ 18 | ['healthy', 'healthy', 'fever', 'fever', 'healthy', 'healthy'], 19 | ['fever', 'fever', 'fever', 'healthy', 'healthy', 'fever'], 20 | ['fever', 'fever', 'fever', 'healthy', 'healthy', 'fever'], 21 | ['healthy', 'healthy', 'fever', 'fever', 'fever', 'healthy'], 22 | ['fever', 'fever', 'fever', 'fever', 'fever', 'fever'], 23 | ['fever', 'fever', 'fever', 'healthy', 'fever', 'healthy'], 24 | ['fever', 'healthy', 'fever', 'fever', 'fever', 'fever'] 25 | ] 26 | 27 | def tearDown(self): 28 | self._obs = None 29 | self._states = None 30 | 31 | def test_build(self): 32 | builder = Builder() 33 | builder.add_training_example(self._obs[0], self._states[0]) 34 | builder.add_batch_training_examples(self._obs[1:], self._states[1:]) 35 | for do_synthesize in [True, False]: 36 | for order in range(1, 5): 37 | hmm = builder.build( 38 | highest_order=order, 39 | k_smoothing=.01, 40 | synthesize_states=do_synthesize 41 | ) 42 | self._test_parameters(hmm.get_parameters(), order) 43 | 44 | def test_build_synthesize(self): 45 | builder = Builder() 46 | builder.add_batch_training_examples(self._obs, self._states) 47 | hmm_synth = builder.build( 48 | highest_order=3, 49 | k_smoothing=.01, 50 | synthesize_states=True 51 | ) 52 | hmm_no_synth = builder.build( 53 | highest_order=3, 54 | k_smoothing=.01, 55 | synthesize_states=False 56 | ) 57 | params_synth = hmm_synth.get_parameters() 58 | params = hmm_no_synth.get_parameters() 59 | # there should be more possible starting states with params_synth 60 | self.assertGreater(len(params_synth["pi"][2]), len(params["pi"][2])) 61 | # there should be more possible state transitions with params_synth 62 | self.assertGreater(len(params_synth["A"]), len(params["A"])) 63 | self.assertEqual(len(params_synth["B"]), len(params["B"])) 64 | 65 | def test_set_states_before_build(self): 66 | builder = Builder() 67 | builder.add_batch_training_examples(self._obs, self._states) 68 | builder.set_all_obs(['normal', 'cold', 'dizzy']) 69 | builder.set_single_states(['fever', 'healthy', 'blah']) 70 | hmm = builder.build( 71 | highest_order=2, 72 | k_smoothing=.01, 73 | synthesize_states=False 74 | ) 75 | hmm2 = builder.build( 76 | highest_order=2, 77 | k_smoothing=.01, 78 | synthesize_states=True 79 | ) 80 | self._test_parameters(hmm.get_parameters(), 2) 81 | self.assertEqual(hmm.get_parameters(), hmm2.get_parameters()) 82 | 83 | def test_build_uniform(self): 84 | builder = Builder() 85 | builder.set_all_obs(['normal', 'cold', 'dizzy']) 86 | builder.set_single_states(['healthy', 'fever']) 87 | uniform_hmm = builder.build_unsupervised(distribution="uniform") 88 | uniform_hmm_2 = builder.build_unsupervised(distribution="uniform") 89 | self.assertEqual( 90 | uniform_hmm.get_parameters(), 91 | uniform_hmm_2.get_parameters() 92 | ) 93 | 94 | params = uniform_hmm.get_parameters() 95 | self.assertEqual(len(set(params["pi"][0].values())), 1) 96 | for row in params["A"]: 97 | self.assertEqual(len(set(row)), 1) 98 | self.assertAlmostEqual(sum(row), 1) 99 | for row in params["B"]: 100 | self.assertEqual(len(set(row)), 1) 101 | self.assertAlmostEqual(sum(row), 1) 102 | 103 | def test_build_random(self): 104 | builder = Builder() 105 | builder.set_all_obs(['normal', 'cold', 'dizzy']) 106 | builder.set_single_states(['healthy', 'fever']) 107 | random_hmm = builder.build_unsupervised(distribution="random") 108 | random_hmm_2 = builder.build_unsupervised(distribution="random") 109 | self.assertNotEqual( 
# ignore small chance they could be the same
110 |             random_hmm.get_parameters(),
111 |             random_hmm_2.get_parameters()
112 |         )
113 | 
114 |         params = random_hmm.get_parameters()
115 |         self.assertAlmostEqual(sum(params["pi"][0].values()), 1)
116 |         self.assertGreater(len(params["A"]), 1)
117 |         for row in params["A"]:
118 |             self.assertGreater(len(row), 1)
119 |             self.assertAlmostEqual(sum(row), 1)
120 | 
121 |         self.assertGreater(len(params["B"]), 1)
122 |         for row in params["B"]:
123 |             self.assertGreater(len(row), 1)
124 |             self.assertAlmostEqual(sum(row), 1)
125 | 
126 |     def _test_parameters(self, params, order):
127 |         for value in params.values():
128 |             self.assertIsNotNone(value)
129 |         for i in range(order):
130 |             self.assertAlmostEqual(sum(params["pi"][i].values()), 1)
131 |         self.assertLessEqual(
132 |             len(params["single_states"]),
133 |             len(params["all_states"])
134 |         )
135 |         if order > 1:
136 |             return
137 | 
138 |         for i in range(2):
139 |             self.assertAlmostEqual(sum(params["A"][i]), 1)
140 |         for i in range(2):
141 |             self.assertAlmostEqual(sum(params["B"][i]), 1)
142 | 
143 |     def test_start_probs_parameters(self):
144 |         # test for when include_pi=False (all entries should be 1)
145 |         builder = Builder()
146 |         builder.add_batch_training_examples(self._obs, self._states)
147 |         for order in range(1, 3):
148 |             hmm = builder.build(
149 |                 highest_order=order,
150 |                 k_smoothing=.01,
151 |                 synthesize_states=True,
152 |                 include_pi=False
153 |             )
154 |             params = hmm.get_parameters()
155 |             pi = params["pi"]
156 |             for i in range(order):
157 |                 for v in pi[i].values():
158 |                     self.assertEqual(v, 1)
159 | 
160 |     def test_clear_all_sets(self):
161 |         builder = Builder()
162 |         builder.add_training_example(self._obs[0], self._states[0])
163 |         builder.clear_all_sets()
164 |         self.assertEqual(len(builder._obs_sequences), 0)
165 |         self.assertEqual(len(builder._state_sequences), 0)
166 |         self.assertIsNone(builder._single_states)
167 |         self.assertIsNone(builder._all_obs)
168 | 
--------------------------------------------------------------------------------
/test/test_hmm.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from SimpleHOHMM import HiddenMarkovModel as HMM
4 | 
5 | class TestHMM(unittest.TestCase):
6 | 
7 |     def setUp(self):
8 |         all_observations = ['normal', 'cold', 'dizzy']
9 |         all_states = ['healthy', 'fever']
10 |         start_probs = [{"healthy": 0.6, "fever": 0.4}]
11 | 
12 |         trans_probs = [
13 |             [0.7, 0.3],
14 |             [0.4, 0.6]
15 |         ]
16 | 
17 |         emission_probs = [
18 |             [0.5, 0.4, 0.1],
19 |             [0.1, 0.3, 0.6]
20 |         ]
21 | 
22 |         self._hmm = HMM(
23 |             A=trans_probs,
24 |             B=emission_probs,
25 |             pi=start_probs,
26 |             all_obs=all_observations,
27 |             all_states=all_states
28 |         )
29 |         self._sequence = ['normal', 'cold', 'dizzy', 'dizzy','cold','normal']
30 | 
31 |     def tearDown(self):
32 |         self._hmm = None
33 |         self._sequence = None
34 | 
35 |     def test_hmm_evaluate(self):
36 |         likelihood = self._hmm.evaluate(self._sequence)
37 |         self.assertGreater(likelihood, 0)
38 |         self.assertLess(likelihood, 1)
39 | 
40 |     def test_hmm_decode(self):
41 |         decoded = self._hmm.decode(self._sequence)
42 |         self.assertEqual(len(decoded), len(self._sequence))
43 |         for state in decoded:
44 |             self.assertNotIn(state, self._sequence)
45 | 
46 |     def test_hmm_high_order(self):
47 |         pi = [{
48 |             'healthy': 0.2863247863247863,
49 |             'fever': 0.7136752136752137
50 |         }, {
51 |             'healthy-healthy': 0.2855113636363636,
52 |             'fever-fever': 0.5696022727272727,
53 |             'healthy-fever': 0.0014204545454545455,
54 |             'fever-healthy': 
0.1434659090909091 55 | }] 56 | A = [[ 57 | 0.0024752475247524753, 0.9925742574257426, 58 | 0.0024752475247524753, 0.0024752475247524753 59 | ],[ 60 | 0.0024752475247524753, 0.0024752475247524753, 61 | 0.25, 0.745049504950495 62 | ],[ 63 | 0.5972222222222222, 0.3988095238095238, 64 | 0.001984126984126984, 0.001984126984126984 65 | ],[ 66 | 0.0006648936170212767, 0.0006648936170212767, 67 | 0.33311170212765956, 0.6655585106382979 68 | ]] 69 | B = [ 70 | [0.0007127583749109052, 0.8560228082679971, 0.14326443335709194], 71 | [0.5711737424188371, 0.07170888333927934, 0.3571173742418837] 72 | ] 73 | all_states = [ 74 | 'healthy-healthy', 'healthy-fever', 75 | 'fever-healthy', 'fever-fever' 76 | ] 77 | hmm = HMM( 78 | A=A, 79 | B=B, 80 | pi=pi, 81 | all_obs=['normal', 'cold', 'dizzy'], 82 | all_states=all_states, 83 | single_states=['healthy', 'fever'], 84 | order=2 85 | ) 86 | self.assertEqual(len(hmm.decode(self._sequence)), len(self._sequence)) 87 | 88 | def test_hmm_learn(self): 89 | sequences = [ 90 | ['normal', 'cold', 'dizzy','normal','normal'], 91 | ['normal', 'cold', 'normal','dizzy','normal'], 92 | ['dizzy', 'dizzy', 'dizzy','cold','normal'], 93 | ['dizzy', 'dizzy', 'normal','normal','normal'], 94 | ['cold', 'cold', 'dizzy','normal','normal'], 95 | ['normal', 'dizzy', 'dizzy','normal','cold'], 96 | ] 97 | num_iterations = self._hmm.learn(sequences, k_smoothing=0.005) 98 | self.assertGreater(num_iterations, 0) 99 | self.test_hmm_evaluate() 100 | self.test_hmm_decode() 101 | -------------------------------------------------------------------------------- /test/test_utility.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobkrantz/Simple-HOHMM/73d0da85e2e06c7ec7683b2e28079fbf6991580e/test/test_utility.py --------------------------------------------------------------------------------