├── CHANGES.rst ├── AUTHORS.rst ├── examples ├── __pycache__ │ └── utils.cpython-37.pyc ├── sim_funcs.py └── run_tebm_sim.py ├── lib └── tebm │ ├── __init__.py │ ├── utils.py │ ├── _utils.py │ ├── _tebm_fix.pyx │ ├── _tebmc_fix.pyx │ ├── stats.py │ ├── _tebmc_var.pyx │ ├── base_fix.py │ ├── tebm_fix.py │ ├── cthmm_fix.py │ └── cthmm_var.py ├── README.md ├── LICENSE.txt └── setup.py /CHANGES.rst: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Peter A. Wijeratne (p.wijeratne@pm.me) 2 | -------------------------------------------------------------------------------- /examples/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pawij/tebm/HEAD/examples/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /lib/tebm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | tebm 3 | ======== 4 | 5 | ``tebm`` is a set of algorithms for learning and inference of 6 | Temporal Event-Based Models. 7 | """ 8 | 9 | try: 10 | import setuptools_scm 11 | __version__ = setuptools_scm.get_version( # xref setup.py 12 | root="../..", relative_to=__file__, 13 | version_scheme="post-release", local_scheme="node-and-date") 14 | except (ImportError, LookupError): 15 | try: 16 | from ._version import version as __version__ 17 | except ImportError: 18 | pass 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Temporal Event-Based Model (TEBM) 2 | The TEBM is a generative model that can estimate the timing and uncertainty of events from semi-longitudinal datasets with irregularly sampled and missing data. 3 | 4 | If you use the TEBM, please cite this paper: 5 | 6 | Wijeratne, P.A., Eshaghi, A., Scotton, W.J., et al. The temporal event-based model: learning event timelines in progressive diseases. Imaging Neuroscience 2023. doi: https://doi.org/10.1162/imag_a_00010 7 | 8 | # TEBM install requirements 9 | Linux OS (Ubuntu 16.04.1, or greater) 10 | g++>=7.5.0 11 | c++>=3.8.0 12 | python>=3.7 13 | numpy>=1.19.5 14 | scipy>=1.7.3 15 | pandas 16 | pickle 17 | pathos 18 | matplotlib 19 | 20 | Install and link "kde_ebm" package, available here: 21 | 22 | https://github.com/ucl-pond/kde_ebm 23 | 24 | Navigate to top directory and issue the following command: 25 | 26 | CC=g++ CFLAGS=-lstdc++ python setup.py install 27 | 28 | # Running the code 29 | Navigate to examples/ and issue the following command: 30 | 31 | python run_tebm_sim.py 32 | 33 | # Worked example 34 | To follow... 
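Until then, the snippet below is a condensed sketch of `examples/run_tebm_sim.py` using the fixed-interval TEBM with a GMM likelihood; the class, function and argument names follow that script and `examples/sim_funcs.py`, and the settings shown are illustrative only:

```python
import numpy as np
from tebm import tebm_fix          # requires the install steps above (incl. kde_ebm)
from sim_funcs import gen_data     # run from within examples/

np.random.seed(42)
n_ppl, n_bms, n_obs = 100, 5, 2    # people, biomarkers, observations per person
n_stages = n_bms + 1               # one stage per biomarker event, plus baseline

# simulate a single-subtype dataset with GMM-style biomarker observations
X, lengths, jumps, labels, X0, stages_true, times, seq_true, Q, pi0, _ = gen_data(
    1, n_ppl, n_bms, n_obs, n_stages, model_type='GMM',
    sigma_noise=0.1, fwd_only=True, order=1, time_mean=[1])

# fit the fixed-interval mixture TEBM and recover the event sequence
model = tebm_fix.MixtureTEBM(X=X, lengths=lengths, n_stages=n_stages,
                             time_mean=1, n_iter=1, fwd_only=True, order=1,
                             algo='viterbi')
seq_model, mixtures = model.fit_tebm(labels, n_start=4, n_iter=100, n_cores=1,
                                     model_type='GMM', cut_controls=False)
print('Estimated event sequence:', seq_model)
```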
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023 Peter Wijeratne 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Author: Peter Wijeratne (p.wijeratne@pm.me) 2 | 3 | import setuptools 4 | from setuptools import Extension, find_packages, setup 5 | from setuptools.command.build_ext import build_ext 6 | 7 | class build_ext(build_ext): 8 | 9 | def finalize_options(self): 10 | from Cython.Build import cythonize 11 | import numpy as np 12 | import numpy.distutils 13 | 14 | self.distribution.ext_modules[:] = cythonize("**/*.pyx") 15 | for ext in self.distribution.ext_modules: 16 | for k, v in np.distutils.misc_util.get_info("npymath").items(): 17 | setattr(ext, k, v) 18 | ext.include_dirs = [np.get_include()] 19 | 20 | super().finalize_options() 21 | 22 | def build_extensions(self): 23 | try: 24 | self.compiler.compiler_so.remove("-Wstrict-prototypes") 25 | except (AttributeError, ValueError): 26 | pass 27 | super().build_extensions() 28 | 29 | 30 | setup( 31 | name="tebm", 32 | description="Temporal Event-Based Models in Python with scikit-learn like API", 33 | maintainer="Peter Wijeratne", 34 | url="https://github.com/pawij/tebm", 35 | license="Academic Use License (TBC)", 36 | cmdclass={"build_ext": build_ext}, 37 | py_modules=[], 38 | packages=find_packages("lib"), 39 | package_dir={"": "lib"}, 40 | ext_modules=[Extension("", [])], 41 | package_data={}, 42 | python_requires=">=3.5", 43 | setup_requires=[ 44 | "Cython", 45 | "numpy>=1.10", 46 | "setuptools_scm>=3.3", 47 | ], 48 | install_requires=[ 49 | "numpy>=1.10", 50 | "scipy>=0.15", 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /lib/tebm/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import logsumexp 3 | 4 | 5 | def normalize(a, axis=None): 6 | """ 7 | Normalizes the input array so that it sums to 1. 8 | 9 | Parameters 10 | ---------- 11 | a : array 12 | Non-normalized input data. 13 | 14 | axis : int 15 | Dimension along which normalization is performed. 16 | 17 | Notes 18 | ----- 19 | Modifies the input **inplace**. 
20 | """ 21 | a_sum = a.sum(axis) 22 | if axis and a.ndim > 1: 23 | # Make sure we don't divide by zero. 24 | a_sum[a_sum == 0] = 1 25 | shape = list(a.shape) 26 | shape[axis] = 1 27 | a_sum.shape = shape 28 | 29 | a /= a_sum 30 | 31 | 32 | def log_normalize(a, axis=None): 33 | """ 34 | Normalizes the input array so that ``sum(exp(a)) == 1``. 35 | 36 | Parameters 37 | ---------- 38 | a : array 39 | Non-normalized input data. 40 | 41 | axis : int 42 | Dimension along which normalization is performed. 43 | 44 | Notes 45 | ----- 46 | Modifies the input **inplace**. 47 | """ 48 | if axis is not None and a.shape[axis] == 1: 49 | # Handle single-state GMMHMM in the degenerate case normalizing a 50 | # single -inf to zero. 51 | a[:] = 0 52 | else: 53 | with np.errstate(under="ignore"): 54 | a_lse = logsumexp(a, axis, keepdims=True) 55 | # a_lse = np.log(np.sum(np.exp(a), axis=1)) 56 | # a_lse = a_lse.reshape(len(a_lse),1) 57 | a -= a_lse 58 | 59 | 60 | def iter_from_X_lengths(X, lengths): 61 | if lengths is None: 62 | yield 0, len(X) 63 | else: 64 | n_samples = X.shape[0] 65 | end = np.cumsum(lengths).astype(np.int32) 66 | start = end - lengths 67 | if end[-1] > n_samples: 68 | raise ValueError("more than {:d} samples in lengths array {!s}" 69 | .format(n_samples, lengths)) 70 | 71 | for i in range(len(lengths)): 72 | yield start[i], end[i] 73 | 74 | 75 | def log_mask_zero(a): 76 | """Computes the log of input probabilities masking divide by zero in log. 77 | 78 | Notes 79 | ----- 80 | During the M-step of EM-algorithm, very small intermediate start 81 | or transition probabilities could be normalized to zero, causing a 82 | *RuntimeWarning: divide by zero encountered in log*. 83 | 84 | This function masks this unharmful warning. 85 | """ 86 | a = np.asarray(a) 87 | with np.errstate(divide="ignore"): 88 | return np.log(a) 89 | 90 | 91 | def fill_covars(covars, covariance_type='full', n_components=1, n_features=1): 92 | if covariance_type == 'full': 93 | return covars 94 | elif covariance_type == 'diag': 95 | return np.array(list(map(np.diag, covars))) 96 | elif covariance_type == 'tied': 97 | return np.tile(covars, (n_components, 1, 1)) 98 | elif covariance_type == 'spherical': 99 | eye = np.eye(n_features)[np.newaxis, :, :] 100 | covars = covars[:, np.newaxis, np.newaxis] 101 | return eye * covars 102 | -------------------------------------------------------------------------------- /lib/tebm/_utils.py: -------------------------------------------------------------------------------- 1 | """Private utilities.""" 2 | 3 | import numpy as np 4 | from sklearn.utils.validation import NotFittedError 5 | 6 | 7 | # Copied from scikit-learn 0.19. 
8 | def _validate_covars(covars, covariance_type, n_components): 9 | """Do basic checks on matrix covariance sizes and values.""" 10 | from scipy import linalg 11 | if covariance_type == 'spherical': 12 | if len(covars) != n_components: 13 | raise ValueError("'spherical' covars have length n_components") 14 | elif np.any(covars <= 0): 15 | raise ValueError("'spherical' covars must be non-negative") 16 | elif covariance_type == 'tied': 17 | if covars.shape[0] != covars.shape[1]: 18 | raise ValueError("'tied' covars must have shape (n_dim, n_dim)") 19 | elif (not np.allclose(covars, covars.T) 20 | or np.any(linalg.eigvalsh(covars) <= 0)): 21 | raise ValueError("'tied' covars must be symmetric, " 22 | "positive-definite") 23 | elif covariance_type == 'diag': 24 | if len(covars.shape) != 2: 25 | raise ValueError("'diag' covars must have shape " 26 | "(n_components, n_dim)") 27 | elif np.any(covars <= 0): 28 | # raise ValueError("'diag' covars must be non-negative") 29 | print("'diag' covars must be non-negative") 30 | elif covariance_type == 'full': 31 | if len(covars.shape) != 3: 32 | raise ValueError("'full' covars must have shape " 33 | "(n_components, n_dim, n_dim)") 34 | elif covars.shape[1] != covars.shape[2]: 35 | raise ValueError("'full' covars must have shape " 36 | "(n_components, n_dim, n_dim)") 37 | for n, cv in enumerate(covars): 38 | if (not np.allclose(cv, cv.T) 39 | or np.any(linalg.eigvalsh(cv) <= 0)): 40 | raise ValueError("component %d of 'full' covars must be " 41 | "symmetric, positive-definite" % n) 42 | else: 43 | raise ValueError("covariance_type must be one of " + 44 | "'spherical', 'tied', 'diag', 'full'") 45 | 46 | 47 | # Copied from scikit-learn 0.19. 48 | def distribute_covar_matrix_to_match_covariance_type( 49 | tied_cv, covariance_type, n_components): 50 | """Create all the covariance matrices from a given template.""" 51 | if covariance_type == 'spherical': 52 | cv = np.tile(tied_cv.mean() * np.ones(tied_cv.shape[1]), 53 | (n_components, 1)) 54 | elif covariance_type == 'tied': 55 | cv = tied_cv 56 | elif covariance_type == 'diag': 57 | cv = np.tile(np.diag(tied_cv), (n_components, 1)) 58 | elif covariance_type == 'full': 59 | cv = np.tile(tied_cv, (n_components, 1, 1)) 60 | else: 61 | raise ValueError("covariance_type must be one of " + 62 | "'spherical', 'tied', 'diag', 'full'") 63 | return cv 64 | 65 | 66 | # Adapted from scikit-learn 0.21. 67 | def check_is_fitted(estimator, attribute): 68 | if not hasattr(estimator, attribute): 69 | raise NotFittedError( 70 | "This %s instance is not fitted yet. Call 'fit' with " 71 | "appropriate arguments before using this method." 
72 | % type(estimator).__name__) 73 | -------------------------------------------------------------------------------- /lib/tebm/_tebm_fix.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False 2 | 3 | from cython cimport view 4 | from numpy.math cimport expl, logl, log1pl, isinf, fabsl, INFINITY 5 | 6 | import numpy as np 7 | 8 | ctypedef double dtype_t 9 | 10 | 11 | cdef inline int _argmax(dtype_t[:] X) nogil: 12 | cdef dtype_t X_max = -INFINITY 13 | cdef int pos = 0 14 | cdef int i 15 | for i in range(X.shape[0]): 16 | if X[i] > X_max: 17 | X_max = X[i] 18 | pos = i 19 | return pos 20 | 21 | 22 | cdef inline dtype_t _max(dtype_t[:] X) nogil: 23 | return X[_argmax(X)] 24 | 25 | 26 | cdef inline dtype_t _logsumexp(dtype_t[:] X) nogil: 27 | cdef dtype_t X_max = _max(X) 28 | if isinf(X_max): 29 | return -INFINITY 30 | 31 | cdef dtype_t acc = 0 32 | for i in range(X.shape[0]): 33 | acc += expl(X[i] - X_max) 34 | 35 | return logl(acc) + X_max 36 | 37 | 38 | cdef inline dtype_t _logaddexp(dtype_t a, dtype_t b) nogil: 39 | if isinf(a) and a < 0: 40 | return b 41 | elif isinf(b) and b < 0: 42 | return a 43 | else: 44 | return max(a, b) + log1pl(expl(-fabsl(a - b))) 45 | 46 | def _forward(int n_samples, int n_components, 47 | dtype_t[:] log_startprob, 48 | dtype_t[:, :] log_transmat, 49 | dtype_t[:, :] framelogprob, 50 | dtype_t[:, :] fwdlattice): 51 | 52 | cdef int t, i, j 53 | cdef dtype_t[::view.contiguous] work_buffer = np.zeros(n_components) 54 | 55 | with nogil: 56 | for i in range(n_components): 57 | fwdlattice[0, i] = log_startprob[i] + framelogprob[0, i] 58 | 59 | for t in range(1, n_samples): 60 | for j in range(n_components): 61 | for i in range(n_components): 62 | work_buffer[i] = fwdlattice[t - 1, i] + log_transmat[i, j] 63 | 64 | fwdlattice[t, j] = _logsumexp(work_buffer) + framelogprob[t, j] 65 | 66 | 67 | def _backward(int n_samples, int n_components, 68 | dtype_t[:] log_startprob, 69 | dtype_t[:, :] log_transmat, 70 | dtype_t[:, :] framelogprob, 71 | dtype_t[:, :] bwdlattice): 72 | 73 | cdef int t, i, j 74 | cdef dtype_t[::view.contiguous] work_buffer = np.zeros(n_components) 75 | 76 | with nogil: 77 | for i in range(n_components): 78 | bwdlattice[n_samples - 1, i] = 0.0 79 | 80 | for t in range(n_samples - 2, -1, -1): 81 | for i in range(n_components): 82 | for j in range(n_components): 83 | work_buffer[j] = (log_transmat[i, j] 84 | + framelogprob[t + 1, j] 85 | + bwdlattice[t + 1, j]) 86 | bwdlattice[t, i] = _logsumexp(work_buffer) 87 | 88 | 89 | def _compute_log_prob_tau(int n_samples, int n_components, 90 | dtype_t[:, :] fwdlattice, 91 | dtype_t[:, :] log_transmat, 92 | dtype_t[:, :] bwdlattice, 93 | dtype_t[:, :] framelogprob, 94 | dtype_t[:, :] log_xi_sum): 95 | 96 | cdef int t, i, j 97 | cdef dtype_t[:, ::view.contiguous] work_buffer = \ 98 | np.full((n_components, n_components), -INFINITY) 99 | cdef dtype_t logprob = _logsumexp(fwdlattice[n_samples - 1]) 100 | 101 | with nogil: 102 | for t in range(n_samples - 1): 103 | for i in range(n_components): 104 | for j in range(n_components): 105 | work_buffer[i, j] = (fwdlattice[t, i] 106 | + log_transmat[i, j] 107 | + framelogprob[t + 1, j] 108 | + bwdlattice[t + 1, j] 109 | - logprob) 110 | 111 | for i in range(n_components): 112 | for j in range(n_components): 113 | log_xi_sum[i, j] = _logaddexp(log_xi_sum[i, j], 114 | work_buffer[i, j]) 115 | 116 | 117 | def _viterbi(int n_samples, int n_components, 118 | dtype_t[:] 
log_startprob, 119 | dtype_t[:, :] log_transmat, 120 | dtype_t[:, :] framelogprob): 121 | 122 | cdef int i, j, t, where_from 123 | 124 | cdef int[::view.contiguous] state_sequence = \ 125 | np.empty(n_samples, dtype=np.int32) 126 | cdef dtype_t[:, ::view.contiguous] viterbi_lattice = \ 127 | np.zeros((n_samples, n_components)) 128 | cdef dtype_t[::view.contiguous] work_buffer = np.empty(n_components) 129 | 130 | with nogil: 131 | for i in range(n_components): 132 | viterbi_lattice[0, i] = log_startprob[i] + framelogprob[0, i] 133 | 134 | # Induction 135 | for t in range(1, n_samples): 136 | for i in range(n_components): 137 | for j in range(n_components): 138 | work_buffer[j] = (log_transmat[j, i] 139 | + viterbi_lattice[t - 1, j]) 140 | 141 | viterbi_lattice[t, i] = _max(work_buffer) + framelogprob[t, i] 142 | 143 | # Observation traceback 144 | state_sequence[n_samples - 1] = where_from = \ 145 | _argmax(viterbi_lattice[n_samples - 1]) 146 | 147 | for t in range(n_samples - 2, -1, -1): 148 | for i in range(n_components): 149 | work_buffer[i] = (viterbi_lattice[t, i] 150 | + log_transmat[i, where_from]) 151 | 152 | state_sequence[t] = where_from = _argmax(work_buffer) 153 | 154 | return np.asarray(state_sequence) 155 | -------------------------------------------------------------------------------- /lib/tebm/_tebmc_fix.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False 2 | 3 | from cython cimport view 4 | from numpy.math cimport expl, logl, log1pl, isinf, fabsl, INFINITY 5 | 6 | import numpy as np 7 | 8 | ctypedef double dtype_t 9 | 10 | 11 | cdef inline int _argmax(dtype_t[:] X) nogil: 12 | cdef dtype_t X_max = -INFINITY 13 | cdef int pos = 0 14 | cdef int i 15 | for i in range(X.shape[0]): 16 | if X[i] > X_max: 17 | X_max = X[i] 18 | pos = i 19 | return pos 20 | 21 | 22 | cdef inline dtype_t _max(dtype_t[:] X) nogil: 23 | return X[_argmax(X)] 24 | 25 | 26 | cdef inline dtype_t _logsumexp(dtype_t[:] X) nogil: 27 | cdef dtype_t X_max = _max(X) 28 | if isinf(X_max): 29 | return -INFINITY 30 | 31 | cdef dtype_t acc = 0 32 | for i in range(X.shape[0]): 33 | acc += expl(X[i] - X_max) 34 | 35 | return logl(acc) + X_max 36 | 37 | 38 | cdef inline dtype_t _logaddexp(dtype_t a, dtype_t b) nogil: 39 | if isinf(a) and a < 0: 40 | return b 41 | elif isinf(b) and b < 0: 42 | return a 43 | else: 44 | return max(a, b) + log1pl(expl(-fabsl(a - b))) 45 | 46 | 47 | def _forward(int n_samples, int n_components, 48 | dtype_t[:] log_startprob, 49 | dtype_t[:, :] log_transmat, 50 | dtype_t[:, :] framelogprob, 51 | dtype_t[:, :] fwdlattice): 52 | 53 | cdef int t, i, j 54 | cdef dtype_t[::view.contiguous] work_buffer = np.zeros(n_components) 55 | 56 | with nogil: 57 | for i in range(n_components): 58 | fwdlattice[0, i] = log_startprob[i] + framelogprob[0, i] 59 | 60 | for t in range(1, n_samples): 61 | for j in range(n_components): 62 | for i in range(n_components): 63 | work_buffer[i] = fwdlattice[t - 1, i] + log_transmat[i, j] 64 | 65 | fwdlattice[t, j] = _logsumexp(work_buffer) + framelogprob[t, j] 66 | 67 | 68 | def _backward(int n_samples, int n_components, 69 | dtype_t[:] log_startprob, 70 | dtype_t[:, :] log_transmat, 71 | dtype_t[:, :] framelogprob, 72 | dtype_t[:, :] bwdlattice): 73 | 74 | cdef int t, i, j 75 | cdef dtype_t[::view.contiguous] work_buffer = np.zeros(n_components) 76 | 77 | with nogil: 78 | for i in range(n_components): 79 | bwdlattice[n_samples - 1, i] = 0.0 80 | 81 | for t in 
range(n_samples - 2, -1, -1): 82 | for i in range(n_components): 83 | for j in range(n_components): 84 | work_buffer[j] = (log_transmat[i, j] 85 | + framelogprob[t + 1, j] 86 | + bwdlattice[t + 1, j]) 87 | bwdlattice[t, i] = _logsumexp(work_buffer) 88 | 89 | 90 | def _compute_log_xi_sum(int n_samples, int n_components, 91 | dtype_t[:, :] fwdlattice, 92 | dtype_t[:, :] log_transmat, 93 | dtype_t[:, :] bwdlattice, 94 | dtype_t[:, :] framelogprob, 95 | dtype_t[:, :] log_xi_sum): 96 | 97 | cdef int t, i, j 98 | cdef dtype_t[:, ::view.contiguous] work_buffer = \ 99 | np.full((n_components, n_components), -INFINITY) 100 | cdef dtype_t logprob = _logsumexp(fwdlattice[n_samples - 1]) 101 | 102 | with nogil: 103 | for t in range(n_samples - 1): 104 | for i in range(n_components): 105 | for j in range(n_components): 106 | work_buffer[i, j] = (fwdlattice[t, i] 107 | + log_transmat[i, j] 108 | + framelogprob[t + 1, j] 109 | + bwdlattice[t + 1, j] 110 | - logprob) 111 | 112 | for i in range(n_components): 113 | for j in range(n_components): 114 | log_xi_sum[i, j] = _logaddexp(log_xi_sum[i, j], 115 | work_buffer[i, j]) 116 | 117 | 118 | def _viterbi(int n_samples, int n_components, 119 | dtype_t[:] log_startprob, 120 | dtype_t[:, :] log_transmat, 121 | dtype_t[:, :] framelogprob): 122 | 123 | cdef int i, j, t, where_from 124 | cdef dtype_t logprob 125 | 126 | cdef int[::view.contiguous] state_sequence = \ 127 | np.empty(n_samples, dtype=np.int32) 128 | cdef dtype_t[:, ::view.contiguous] viterbi_lattice = \ 129 | np.zeros((n_samples, n_components)) 130 | cdef dtype_t[::view.contiguous] work_buffer = np.empty(n_components) 131 | 132 | with nogil: 133 | for i in range(n_components): 134 | viterbi_lattice[0, i] = log_startprob[i] + framelogprob[0, i] 135 | 136 | # Induction 137 | for t in range(1, n_samples): 138 | for i in range(n_components): 139 | for j in range(n_components): 140 | work_buffer[j] = (log_transmat[j, i] 141 | + viterbi_lattice[t - 1, j]) 142 | 143 | viterbi_lattice[t, i] = _max(work_buffer) + framelogprob[t, i] 144 | 145 | # Observation traceback 146 | state_sequence[n_samples - 1] = where_from = \ 147 | _argmax(viterbi_lattice[n_samples - 1]) 148 | logprob = viterbi_lattice[n_samples - 1, where_from] 149 | 150 | for t in range(n_samples - 2, -1, -1): 151 | for i in range(n_components): 152 | work_buffer[i] = (viterbi_lattice[t, i] 153 | + log_transmat[i, where_from]) 154 | 155 | state_sequence[t] = where_from = _argmax(work_buffer) 156 | 157 | return np.asarray(state_sequence), logprob 158 | -------------------------------------------------------------------------------- /lib/tebm/stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import linalg, stats 3 | 4 | def log_multivariate_normal_density(X, means, covars, covariance_type='diag'): 5 | """Compute the log probability under a multivariate Gaussian distribution. 6 | Parameters 7 | ---------- 8 | X : array_like, shape (n_samples, n_features) 9 | List of n_features-dimensional data points. Each row corresponds to a 10 | single data point. 11 | means : array_like, shape (n_components, n_features) 12 | List of n_features-dimensional mean vectors for n_components Gaussians. 13 | Each row corresponds to a single mean vector. 14 | covars : array_like 15 | List of n_components covariance parameters for each Gaussian. 
The shape 16 | depends on `covariance_type`: 17 | (n_components, n_features) if 'spherical', 18 | (n_features, n_features) if 'tied', 19 | (n_components, n_features) if 'diag', 20 | (n_components, n_features, n_features) if 'full' 21 | covariance_type : string 22 | Type of the covariance parameters. Must be one of 23 | 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'. 24 | Returns 25 | ------- 26 | lpr : array_like, shape (n_samples, n_components) 27 | Array containing the log probabilities of each data point in 28 | X under each of the n_components multivariate Gaussian distributions. 29 | """ 30 | log_multivariate_normal_density_dict = { 31 | 'spherical': _log_multivariate_normal_density_spherical, 32 | 'tied': _log_multivariate_normal_density_tied, 33 | 'diag': _log_multivariate_normal_density_diag, 34 | 'full': _log_multivariate_normal_density_full} 35 | 36 | return log_multivariate_normal_density_dict[covariance_type]( 37 | X, means, covars 38 | ) 39 | 40 | def _log_multivariate_normal_density_diag(X, means, covars): 41 | """Compute Gaussian log-density at X for a diagonal model.""" 42 | # X: (ns, nf); means: (nc, nf); covars: (nc, nf) -> (ns, nc) 43 | n_samples, n_dim = X.shape 44 | # Avoid 0 log 0 = nan in degenerate covariance case. 45 | covars = np.maximum(covars, np.finfo(float).tiny) 46 | with np.errstate(over="ignore"): 47 | return -0.5 * (n_dim * np.log(2 * np.pi) 48 | + np.log(covars).sum(axis=-1) 49 | + ((X[:, None, :] - means) ** 2 / covars).sum(axis=-1)) 50 | """ 51 | with np.errstate(over="ignore"): 52 | S = weights[1] 53 | weights = weights[0] 54 | def calc_coeff(sig): 55 | return 1./np.sqrt(np.pi*2.0)*1./sig 56 | def calc_exp(x,mu,sig): 57 | x = (x-mu)/sig 58 | return np.exp(-.5*x*x) 59 | def normPdf(x,mu,sig): 60 | return calc_coeff(sig)*calc_exp(x,mu,sig) 61 | prob = normPdf(X[:, None, :], means, np.sqrt(covars))*weights 62 | # prob = stats.norm.pdf(X[:, None, :], loc=means, scale=np.sqrt(covars))*weights 63 | # normalise 64 | for i in range(prob.shape[0]): 65 | for j in range(prob.shape[2]): 66 | bm_pos = np.where(S == j)[0][0] 67 | prob_h = prob[i,:,j][0] 68 | prob_d = prob[i,:,j][-1] 69 | if prob_h==0 and prob_d==0: 70 | print (X) 71 | print (means) 72 | print (np.sqrt(covars)) 73 | print (weights) 74 | print (prob) 75 | quit() 76 | if np.isnan(prob_h) or np.isnan(prob_d): 77 | prob_h = .5 78 | else: 79 | prob_h = prob_h / (prob_h+prob_d) 80 | prob_d = 1-prob_h 81 | prob[i,:bm_pos+1,j] = prob_h 82 | prob[i,bm_pos+1:,j] = prob_d 83 | prob[prob == 0] = np.finfo(float).eps 84 | like = np.nansum(np.log(prob),axis=-1) 85 | return like 86 | """ 87 | def _log_multivariate_normal_density_spherical(X, means, covars): 88 | """Compute Gaussian log-density at X for a spherical model.""" 89 | cv = covars.copy() 90 | if covars.ndim == 1: 91 | cv = cv[:, np.newaxis] 92 | if cv.shape[1] == 1: 93 | cv = np.tile(cv, (1, X.shape[-1])) 94 | return _log_multivariate_normal_density_diag(X, means, cv) 95 | 96 | 97 | def _log_multivariate_normal_density_tied(X, means, covars): 98 | """Compute Gaussian log-density at X for a tied model.""" 99 | cv = np.tile(covars, (means.shape[0], 1, 1)) 100 | return _log_multivariate_normal_density_full(X, means, cv) 101 | 102 | 103 | def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7): 104 | """Log probability for full covariance matrices.""" 105 | n_samples, n_dim = X.shape 106 | nmix = len(means) 107 | log_prob = np.empty((n_samples, nmix)) 108 | for c, (mu, cv) in enumerate(zip(means, covars)): 109 | try: 110 | cv_chol 
= linalg.cholesky(cv, lower=True) 111 | except linalg.LinAlgError: 112 | # The model is most probably stuck in a component with too 113 | # few observations, we need to reinitialize this components 114 | try: 115 | cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), 116 | lower=True) 117 | except linalg.LinAlgError: 118 | raise ValueError("'covars' must be symmetric, " 119 | "positive-definite") 120 | 121 | cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) 122 | cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T 123 | log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) + 124 | n_dim * np.log(2 * np.pi) + cv_log_det) 125 | 126 | return log_prob 127 | -------------------------------------------------------------------------------- /lib/tebm/_tebmc_var.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False 2 | 3 | from cython cimport view 4 | from numpy.math cimport expl, logl, log1pl, isinf, fabsl, INFINITY 5 | 6 | import numpy as np 7 | 8 | ctypedef double dtype_t 9 | 10 | 11 | cdef inline int _argmax(dtype_t[:] X) nogil: 12 | cdef dtype_t X_max = -INFINITY 13 | cdef int pos = 0 14 | cdef int i 15 | for i in range(X.shape[0]): 16 | if X[i] > X_max: 17 | X_max = X[i] 18 | pos = i 19 | return pos 20 | 21 | 22 | cdef inline dtype_t _max(dtype_t[:] X) nogil: 23 | return X[_argmax(X)] 24 | 25 | 26 | cdef inline dtype_t _logsumexp(dtype_t[:] X) nogil: 27 | cdef dtype_t X_max = _max(X) 28 | if isinf(X_max): 29 | return -INFINITY 30 | 31 | cdef dtype_t acc = 0 32 | for i in range(X.shape[0]): 33 | acc += expl(X[i] - X_max) 34 | 35 | return logl(acc) + X_max 36 | 37 | 38 | cdef inline dtype_t _logaddexp(dtype_t a, dtype_t b) nogil: 39 | if isinf(a) and a < 0: 40 | return b 41 | elif isinf(b) and b < 0: 42 | return a 43 | else: 44 | return max(a, b) + log1pl(expl(-fabsl(a - b))) 45 | 46 | 47 | def _forward(int n_samples, int n_components, 48 | dtype_t[:] log_startprob, 49 | dtype_t[:, :, :] log_transmat, 50 | dtype_t[:, :] framelogprob, 51 | dtype_t[:, :] fwdlattice): 52 | 53 | cdef int t, i, j 54 | cdef dtype_t[::view.contiguous] work_buffer = np.zeros(n_components) 55 | 56 | with nogil: 57 | for i in range(n_components): 58 | fwdlattice[0, i] = log_startprob[i] + framelogprob[0, i] 59 | 60 | for t in range(1, n_samples): 61 | for j in range(n_components): 62 | for i in range(n_components): 63 | # PW use transition matrix from this time interval 64 | # FIXME: check 65 | work_buffer[i] = fwdlattice[t - 1, i] + log_transmat[t - 1, i, j] 66 | 67 | fwdlattice[t, j] = _logsumexp(work_buffer) + framelogprob[t, j] 68 | 69 | 70 | def _backward(int n_samples, int n_components, 71 | dtype_t[:] log_startprob, 72 | dtype_t[:, :, :] log_transmat, 73 | dtype_t[:, :] framelogprob, 74 | dtype_t[:, :] bwdlattice): 75 | 76 | cdef int t, i, j 77 | cdef dtype_t[::view.contiguous] work_buffer = np.zeros(n_components) 78 | 79 | with nogil: 80 | for i in range(n_components): 81 | bwdlattice[n_samples - 1, i] = 0.0 82 | 83 | for t in range(n_samples - 2, -1, -1): 84 | for i in range(n_components): 85 | for j in range(n_components): 86 | # PW use transition matrix from this time interval 87 | # FIXME: check 88 | work_buffer[j] = (log_transmat[t, i, j] 89 | + framelogprob[t + 1, j] 90 | + bwdlattice[t + 1, j]) 91 | bwdlattice[t, i] = _logsumexp(work_buffer) 92 | 93 | 94 | def _compute_log_xi_sum(int n_samples, int n_components, 95 | dtype_t[:, :] fwdlattice, 96 | # dtype_t[:, :, :] log_transmat, 97 | 
dtype_t[:, :] log_transmat, 98 | dtype_t[:, :] bwdlattice, 99 | dtype_t[:, :] framelogprob, 100 | dtype_t[:, :] log_xi_sum): 101 | 102 | cdef int t, i, j 103 | cdef dtype_t[:, ::view.contiguous] work_buffer = \ 104 | np.full((n_components, n_components), -INFINITY) 105 | cdef dtype_t logprob = _logsumexp(fwdlattice[n_samples - 1]) 106 | 107 | with nogil: 108 | for t in range(n_samples - 1): 109 | for i in range(n_components): 110 | for j in range(n_components): 111 | work_buffer[i, j] = (fwdlattice[t, i] 112 | # PW use transition matrix from this time interval 113 | # FIXME: check 114 | # + log_transmat[t, i, j] 115 | + log_transmat[i, j] 116 | + framelogprob[t + 1, j] 117 | + bwdlattice[t + 1, j] 118 | - logprob) 119 | 120 | for i in range(n_components): 121 | for j in range(n_components): 122 | log_xi_sum[i, j] = _logaddexp(log_xi_sum[i, j], 123 | work_buffer[i, j]) 124 | 125 | 126 | def _viterbi(int n_samples, int n_components, 127 | dtype_t[:] log_startprob, 128 | dtype_t[:, :, :] log_transmat, 129 | dtype_t[:, :] framelogprob): 130 | 131 | cdef int i, j, t, where_from 132 | cdef dtype_t logprob 133 | 134 | cdef int[::view.contiguous] state_sequence = \ 135 | np.empty(n_samples, dtype=np.int32) 136 | cdef dtype_t[:, ::view.contiguous] viterbi_lattice = \ 137 | np.zeros((n_samples, n_components)) 138 | cdef dtype_t[::view.contiguous] work_buffer = np.empty(n_components) 139 | 140 | with nogil: 141 | for i in range(n_components): 142 | viterbi_lattice[0, i] = log_startprob[i] + framelogprob[0, i] 143 | 144 | # Induction 145 | for t in range(1, n_samples): 146 | for i in range(n_components): 147 | for j in range(n_components): 148 | # PW use transition matrix from this time interval 149 | # FIXME: check 150 | work_buffer[j] = (log_transmat[t - 1, j, i] 151 | + viterbi_lattice[t - 1, j]) 152 | 153 | viterbi_lattice[t, i] = _max(work_buffer) + framelogprob[t, i] 154 | 155 | # Observation traceback 156 | state_sequence[n_samples - 1] = where_from = \ 157 | _argmax(viterbi_lattice[n_samples - 1]) 158 | logprob = viterbi_lattice[n_samples - 1, where_from] 159 | 160 | for t in range(n_samples - 2, -1, -1): 161 | for i in range(n_components): 162 | # PW use transition matrix from this time interval 163 | # FIXME: check 164 | work_buffer[i] = (viterbi_lattice[t, i] 165 | + log_transmat[t, i, where_from]) 166 | 167 | state_sequence[t] = where_from = _argmax(work_buffer) 168 | 169 | return np.asarray(state_sequence), logprob 170 | -------------------------------------------------------------------------------- /lib/tebm/base_fix.py: -------------------------------------------------------------------------------- 1 | # Fixed time interval Temporal Event-Based Model 2 | # Base class 3 | # Author: Peter Wijeratne (p.wijeratne@sussex.ac.uk) 4 | 5 | import numpy as np 6 | from scipy.special import logsumexp 7 | from sklearn.base import BaseEstimator 8 | 9 | from . 
import _tebm_fix 10 | 11 | import warnings 12 | warnings.filterwarnings('ignore', message='divide by zero encountered in log') 13 | 14 | class BaseTEBM(BaseEstimator): 15 | 16 | def __init__(self, 17 | X=None, 18 | lengths=None, 19 | n_stages=None, 20 | time_mean=None, 21 | n_iter=None, 22 | fwd_only=False, 23 | order=None, 24 | algo='viterbi', 25 | verbose=False): 26 | self.X = X 27 | self.lengths = lengths 28 | self.n_stages = n_stages 29 | if time_mean: 30 | self.time_mean = time_mean 31 | else: 32 | self.time_mean = 1 33 | self.n_iter = n_iter 34 | self.n_obs = X.shape[0] 35 | self.n_features = X.shape[1] 36 | self.fwd_only = fwd_only 37 | if order: 38 | self.order = order 39 | else: 40 | self.order = self.n_stages-1 41 | self.algo = algo 42 | # currently only do a single EM iteration, but this might change in the future 43 | self.tol = 1E-3 44 | self.verbose = verbose 45 | # initialise p_vec and a_mat 46 | self.p_vec_prior = np.full(self.n_stages, 1./self.n_stages) 47 | self.p_vec = self.p_vec_prior 48 | self.a_mat_prior = np.ones((n_stages,n_stages)) 49 | if self.fwd_only: 50 | for i in range(len(self.a_mat_prior)): 51 | self.a_mat_prior[i,i] = self.time_mean 52 | self.a_mat_prior[i,:i] = 0. 53 | if (i+self.order+1) < len(self.a_mat_prior): 54 | self.a_mat_prior[i,i+self.order+1:] = 0. 55 | count_nonzero = np.count_nonzero(self.a_mat_prior[i]!=0) 56 | # distribute probability to nonzero states 57 | for j in range(n_stages): 58 | # self.a_mat_prior[i,:i] = 0. 59 | if i!=j and self.a_mat_prior[i,j]!=0.: 60 | self.a_mat_prior[i,j] = (1-self.a_mat_prior[i,i])/(count_nonzero-1) 61 | elif i==(n_stages-1) and (j==n_stages-1): 62 | self.a_mat_prior[i,j] = 1. 63 | else: 64 | self.a_mat_prior = np.full((self.n_stages, self.n_stages), 1./self.n_stages) 65 | self.a_mat = self.a_mat_prior 66 | 67 | def reinit(self): 68 | self.p_vec = self.p_vec_prior 69 | self.a_mat = self.a_mat_prior 70 | 71 | def compute_forward(self, loglike_i): 72 | n_samples, n_stages = loglike_i.shape 73 | alpha_i = np.zeros((n_samples, n_stages)) 74 | _tebm_fix._forward(n_samples, 75 | n_stages, 76 | np.log(self.p_vec), 77 | np.log(self.a_mat), 78 | loglike_i, 79 | alpha_i) 80 | return alpha_i 81 | 82 | def compute_backward(self, loglike_i): 83 | n_samples, n_stages = loglike_i.shape 84 | beta_i = np.zeros((n_samples, n_stages)) 85 | _tebm_fix._backward(n_samples, 86 | n_stages, 87 | np.log(self.p_vec), 88 | np.log(self.a_mat), 89 | loglike_i, 90 | beta_i) 91 | return beta_i 92 | 93 | def compute_posteriors(self, alpha_i, beta_i): 94 | post = alpha_i + beta_i 95 | post -= logsumexp(post, axis=1, keepdims=True) 96 | return np.exp(post) 97 | 98 | def update_params(self, 99 | p_vec, 100 | a_mat, 101 | loglike_i, 102 | post_i, 103 | alpha_i, 104 | beta_i): 105 | # initial probability 106 | p_vec += post_i[0] 107 | # transition matrix 108 | n_samples, n_stages = loglike_i.shape 109 | # skip if only one observation - no temporal info 110 | if n_samples == 1: 111 | return 112 | log_prob_tau = np.full((n_stages, n_stages), -np.inf) 113 | _tebm_fix._compute_log_prob_tau(n_samples, 114 | n_stages, 115 | alpha_i, 116 | np.log(self.a_mat), 117 | beta_i, 118 | loglike_i, 119 | log_prob_tau) 120 | a_mat += np.exp(log_prob_tau) 121 | 122 | def m_step(self, p_vec, a_mat): 123 | # update initial probability 124 | # apply prior 125 | p_vec = np.maximum(self.p_vec_prior - 1 + p_vec, 0) 126 | # prevent forbidden transitions 127 | self.p_vec = np.where(self.p_vec == 0, 0, p_vec) 128 | # normalise 129 | self.p_vec = self.p_vec / 
self.p_vec.sum() 130 | # update transition matrix 131 | # apply prior 132 | a_mat = np.maximum(self.a_mat_prior - 1 + a_mat, 0) 133 | # prevent forbidden transitions 134 | self.a_mat = np.where(self.a_mat == 0, 0, a_mat) 135 | # normalise 136 | row_sums = self.a_mat.sum(axis=1) 137 | row_sums[row_sums==0] = 1 138 | self.a_mat = self.a_mat / row_sums[:, np.newaxis] 139 | 140 | def fit(self): 141 | self.reinit() 142 | curr_loglike = -np.inf 143 | for n in range(self.n_iter): 144 | p_vec = np.zeros(self.n_stages) 145 | a_mat = np.zeros((self.n_stages, self.n_stages)) 146 | loglike = 0 147 | for i in range(len(self.lengths)): 148 | s_idx, e_idx = int(np.sum(self.lengths[:i])), int(np.sum(self.lengths[:i])+self.lengths[i]) 149 | X_i = self.X[s_idx:e_idx] 150 | loglike_i = self.compute_log_likelihood(X_i, s_idx, e_idx) 151 | alpha_i = self.compute_forward(loglike_i) 152 | beta_i = self.compute_backward(loglike_i) 153 | post_i = self.compute_posteriors(alpha_i, beta_i) 154 | self.update_params(p_vec, a_mat, loglike_i, post_i, alpha_i, beta_i) 155 | loglike += logsumexp(alpha_i[-1]) 156 | self.m_step(p_vec, a_mat) 157 | # check likelihood for convergence - currently we don't use this, as default self.n_iter = 1 158 | if self.verbose: 159 | print (n, loglike-curr_loglike) 160 | if loglike-curr_loglike < self.tol: 161 | break 162 | curr_loglike = loglike 163 | 164 | def compute_viterbi(self, X, i, j): 165 | loglike_i = self.compute_log_likelihood(X, i, j) 166 | n_samples, n_stages = loglike_i.shape 167 | stages = _tebm_fix._viterbi(n_samples, 168 | n_stages, 169 | np.log(self.p_vec), 170 | np.log(self.a_mat), 171 | loglike_i) 172 | return stages 173 | 174 | def compute_map(self, X, i, j): 175 | posteriors = self.posteriors(X) 176 | stages = np.argmax(posteriors, axis=1) 177 | return stages 178 | 179 | def posteriors_X(self, X, lengths=None): 180 | n_samples = X.shape[0] 181 | posteriors = np.zeros((n_samples, self.n_stages)) 182 | for i in range(len(lengths)): 183 | s_idx, e_idx = int(np.sum(lengths[:i])), int(np.sum(lengths[:i])+lengths[i]) 184 | X_i = X[s_idx:e_idx] 185 | loglike_i = self.compute_log_likelihood(X_i, s_idx, e_idx) 186 | alpha_i = self.compute_forward(loglike_i) 187 | beta_i = self.compute_backward(loglike_i) 188 | posteriors[s_idx:e_idx] = self.compute_posteriors(alpha_i, beta_i) 189 | return posteriors 190 | 191 | def prob_X(self, X, lengths=None): 192 | n_samples = X.shape[0] 193 | prob = np.zeros((n_samples, self.n_stages)) 194 | for i in range(len(lengths)): 195 | s_idx, e_idx = int(np.sum(lengths[:i])), int(np.sum(lengths[:i])+lengths[i]) 196 | X_i = X[s_idx:e_idx] 197 | loglike_i = self.compute_log_likelihood(X_i, s_idx, e_idx) 198 | alpha_i = self.compute_forward(loglike_i) 199 | prob[s_idx:e_idx] = alpha_i 200 | return prob 201 | 202 | def compute_model_log_likelihood(self, X, lengths=None): 203 | loglike = 0 204 | for i in range(len(lengths)): 205 | s_idx, e_idx = int(np.sum(lengths[:i])), int(np.sum(lengths[:i])+lengths[i]) 206 | X_i = X[s_idx:e_idx] 207 | loglike_i = self.compute_log_likelihood(X_i, s_idx, e_idx) 208 | alpha_i = self.compute_forward(loglike_i) 209 | loglike += logsumexp(alpha_i[-1]) 210 | return loglike 211 | 212 | def stage_X(self, X, lengths=None, algo=None): 213 | if self.algo == 'viterbi': 214 | stage_algo = self.compute_viterbi 215 | elif self.algo == 'map': 216 | stage_algo = self.compute_map 217 | n_samples = X.shape[0] 218 | stages = np.empty(n_samples, dtype=int) 219 | for i in range(len(lengths)): 220 | s_idx, e_idx = int(np.sum(lengths[:i])), 
int(np.sum(lengths[:i])+lengths[i]) 221 | X_i = X[s_idx:e_idx] 222 | stagesij = stage_algo(X_i, s_idx, e_idx) 223 | stages[s_idx:e_idx] = stagesij 224 | return stages 225 | -------------------------------------------------------------------------------- /examples/sim_funcs.py: -------------------------------------------------------------------------------- 1 | # TEBM simulation functions 2 | # Author: Peter Wijeratne (p.wijeratne@pm.me) 3 | # Functions "gen_data_zscore", "gen_model_zscore", "gen_data_mixture", "gen_model_mixture" are adapted from pySuStaIn (https://github.com/ucl-pond/pySuStaIn) 4 | 5 | import numpy as np 6 | from scipy.stats import norm 7 | 8 | def gen_data(n_subtypes, 9 | n_ppl, 10 | n_bms, 11 | n_obs, 12 | n_components, 13 | model_type='GMM', 14 | is_cut=False, 15 | n_zscores=None, 16 | z_max=None, 17 | sigma_noise=1.0, 18 | seq=[], 19 | fractions=[1], 20 | fwd_only=True, 21 | order=1, 22 | time_mean=1, 23 | verbose=False): 24 | # intialise z-score stuff 25 | if model_type=='Zscore': 26 | z_val_arr = np.array([[x+1 for x in range(n_zscores)]]*n_bms) 27 | z_max_arr = np.array([z_max]*n_bms) 28 | IX_vals = np.array([[x for x in range(n_bms)]*n_zscores]).T 29 | stage_biomarker_index = np.array([y for x in IX_vals.T for y in x]) 30 | stage_zscore = np.array([y for x in z_val_arr.T for y in x]) 31 | stage_biomarker_index = stage_biomarker_index.reshape(1,len(stage_biomarker_index)) 32 | stage_zscore = stage_zscore.reshape(1,len(stage_zscore)) 33 | min_biomarker_zscore = [0]*n_bms 34 | max_biomarker_zscore = z_max_arr 35 | # transition generator matrix and initial probability 36 | # TODO: design suitable timescale and observation times for simulations 37 | # FIXME: also allow for fixed intervals (see HACK below) 38 | Q_subtypes, pi0_subtypes = [], [] 39 | for s in range(n_subtypes): 40 | Q = np.zeros((n_components, n_components)) 41 | for i in range(n_components): 42 | vec = np.ones(n_components-1) 43 | vec /= np.sum(vec) 44 | Q[i,:i] = vec[:i] 45 | Q[i,i+1:] = vec[i:] 46 | # Q[i,i] = -time_mean[s] 47 | # Q[i,i] = -np.random.rand() 48 | # Q[i,i] = -(.25 + np.random.rand()*3.75) 49 | # Q[i,i] = -(.5 + np.random.rand()*1.5) 50 | Q[i,i] = -(.1 + np.random.rand()*3.9) 51 | # zero-out forbidden states 52 | if fwd_only: 53 | for i in range(n_components): 54 | for j in range(n_components): 55 | if j=(i-order)): 61 | Q[i,j] = 0 62 | # renormalise 63 | for i in range(n_components): 64 | scale = np.sum([x if jj!=i else 0 for jj,x in enumerate(Q[i])]) 65 | for j in range(n_components): 66 | if i!=j: 67 | if scale!=0: 68 | Q[i,j] *= -Q[i,i]/scale 69 | else: 70 | Q[i,j] = 0. 71 | elif i==(n_components-1) and j==(n_components-1) and fwd_only: 72 | Q[i,j] = 0. 
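# For a generator matrix Q with non-positive diagonal and zero row sums, as
# constructed above, the expected sojourn time in stage i is -1/Q[i,i], and the
# stage-transition probabilities over a fixed interval dt are the matrix
# exponential expm(Q*dt). A minimal sanity-check sketch, assuming a toy 2-state
# generator (not part of this function):
#
#     from scipy.linalg import expm
#     Q_toy = np.array([[-0.5, 0.5],
#                       [ 0.0, 0.0]])      # final state absorbing
#     P_dt = expm(Q_toy * 1.0)             # rows of P_dt sum to 1
#     mean_sojourn_0 = -1.0 / Q_toy[0, 0]  # = 2.0 time units in stage 0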
73 | """ 74 | rates = np.array([-1.*(.5+np.random.rand()*.5) for i in range(n_components)]) 75 | Q = np.zeros((n_components, n_components)) 76 | for i in range(n_components-1): 77 | temp = [] 78 | for j in range(i): 79 | temp.append(0) 80 | temp.append(rates[i]) 81 | temp.append(-rates[i]) 82 | for j in range(2+i,n_components): 83 | temp.append(0) 84 | Q[i] = temp 85 | """ 86 | # always set initial probability as uniform 87 | pi0 = np.ones(len(Q)) 88 | Q_subtypes.append(Q) 89 | pi0_subtypes.append(pi0) 90 | # true sojourns from generated transition rate matrix 91 | sojourns_true = [] 92 | for i in range(len(Q)-1): 93 | # temp = [(1/Q[i,i])*np.log(np.random.rand()) for x in range(1000)] 94 | # print ('True mean duration', np.mean(temp)) 95 | sojourn_i = -1/Q[i,i] 96 | if verbose: 97 | print ('Stage',i,'true sojourn', sojourn_i) 98 | sojourns_true.append(sojourn_i) 99 | if verbose: 100 | print ('Total sequence true sojourn', np.sum(sojourns_true)) 101 | # Markov sequence generation 102 | stages, times, jumps = [], [], [] 103 | # total time spent in each state 104 | sojourn = np.zeros(n_components) 105 | # number of occurrences of each state 106 | counts = np.zeros(n_components) 107 | for i in range(n_ppl): 108 | # generate full jump process 109 | # tvec is time of transition, xvec is stage at corresponding time 110 | # tvec, xvec, dt = sim_markov(Q_subtypes[subtypes[i]], pi0_subtypes[subtypes[i]]) 111 | tvec, xvec, dt = sim_markov(Q_subtypes[0], pi0_subtypes[0]) 112 | sojourn += dt 113 | for j in range(len(xvec)): 114 | counts[int(xvec[j])] += 1 115 | # each subsequent time step should be distributed around 1 unit of time 116 | # simulate 100 observation times to generate exact same dataset each run for direct comparison between models, then select number of desired observations after 117 | # first observation time = 0 to ensure process starts in first state 118 | time_steps = [0] 119 | # FIXME: change the range to allow irregular sampling 120 | for j in range(9): # can set this to whatever 121 | # time_steps.append(1 + np.random.normal(scale=.05)) 122 | #FIXME: set minmax timestep and scale externally 123 | # time_steps.append(np.random.randint(1,4)*time_mean[0]) 124 | time_steps.append(np.random.randint(1,5)) 125 | time_i = np.cumsum(time_steps) 126 | times.append(time_i) 127 | jump_i = np.diff(time_i) 128 | jump_i = np.insert(jump_i, 0, 0) 129 | ###FIXME: HACK 130 | jumps.append(jump_i) 131 | # print ('HACKING SIMULATED JUMPS!') 132 | # jumps.append(np.array([0 if ii==0 else 1 for ii in range(len(jump_i))])) 133 | ### 134 | # sample stages corresponding to these times from the full jump process 135 | stages.append(step_fun(time_i, tvec, xvec)) 136 | stages = np.array(stages) 137 | times = np.array(times) 138 | jumps = np.array(jumps) 139 | if is_cut: 140 | # for testing - cut some people at > stage_threshold at baseline 141 | if model_type=='Zscore': 142 | stage_threshold = n_components-3 143 | else: 144 | stage_threshold = n_components-1 145 | del_idxs = [] 146 | for i in range(len(stages)): 147 | if stages[i,0] >= stage_threshold:# and np.random.rand() > .5: 148 | del_idxs.append(i) 149 | stages = np.delete(stages, del_idxs, axis=0) 150 | times = np.delete(times, del_idxs, axis=0) 151 | jumps = np.delete(jumps, del_idxs, axis=0) 152 | print ('##########################################################') 153 | print ('Cut', n_ppl-stages.shape[0], 'individuals at baseline for testing') 154 | print ('##########################################################') 155 | n_ppl = len(stages) 156 
| # generate subtypes 157 | subtypes = np.random.choice(range(n_subtypes), n_ppl, replace=True, p=fractions).astype(int) 158 | # reduce to the desired number of observations 159 | if n_obs: 160 | stages = stages[:,:n_obs] 161 | times = times[:,:n_obs] 162 | jumps = jumps[:,:n_obs] 163 | lengths = np.array([n_obs for x in range(n_ppl)]) 164 | else: 165 | lengths = [] 166 | for row in stages: 167 | print (row.shape) 168 | lengths.append(row.shape[0]) 169 | lengths = np.array(lengths) 170 | """ 171 | else: 172 | lengths = [] 173 | for i in range(n_ppl): 174 | nobs_i = np.random.randint(1,3) 175 | lengths.append(nobs_i) 176 | lengths = np.array(lengths).astype(int) 177 | times_temp, stages_temp, jumps_temp = [], [], [] 178 | for i in range(n_ppl): 179 | stages_i = stages[i,:lengths[i]] 180 | times_i = times[i,:lengths[i]] 181 | jumps_i = jumps[i,:lengths[i]] 182 | stages_temp.append(stages_i) 183 | times_temp.append(times_i) 184 | jumps_temp.append(jumps_i) 185 | stages = np.array(stages_temp) 186 | times = np.array(times_temp) 187 | jumps = np.array(jumps_temp) 188 | """ 189 | # generate data 190 | if model_type=='Zscore': 191 | if len(seq)==0: 192 | seq = gen_model_zscore(stage_zscore, stage_biomarker_index, n_subtypes) 193 | X, X_denoised = gen_data_zscore(subtypes, 194 | stages, 195 | seq, 196 | min_biomarker_zscore, 197 | max_biomarker_zscore, 198 | [sigma_noise]*n_bms, 199 | stage_zscore, 200 | stage_biomarker_index) 201 | else: 202 | if len(seq)==0: 203 | seq = gen_model_mixture(n_bms) 204 | X, X_denoised = gen_data_mixture(stages, seq, 'mixture_GMM', sigma_noise) 205 | # true sojourns from generated data 206 | for s in range(n_subtypes): 207 | stages_s = stages[subtypes==s] 208 | if verbose: 209 | print ('Subtype',s) 210 | print ('n_ppl',len(stages_s)) 211 | sojourns_true = [] 212 | for i in range(len(Q)-1): 213 | mask = stages_s[:,0]==i 214 | den = np.sum(mask) 215 | num = 0 216 | for j in range(1,n_obs): 217 | num += np.sum(stages_s[mask][:,j]!=i) 218 | prob_diag_i = 1-num/den 219 | sojourn_i = 1/(1-prob_diag_i) 220 | sojourns_true.append(sojourn_i) 221 | if verbose: 222 | print ('Stage',i,'true diagonal probability',round(prob_diag_i,2)) 223 | print ('Stage',i,'true generated sojourn',round(sojourn_i,2)) 224 | sojourns_true = np.array(sojourns_true) 225 | if verbose: 226 | print ('Total sequence true generated sojourn',round(np.nansum(sojourns_true[~np.isinf(sojourns_true)]),2)) 227 | # get data in long format for TEBM 228 | X0 = [] 229 | stages_0 = [] 230 | for i in range(n_ppl): 231 | X0.append(X[i][:,0]) 232 | stages_0.append(stages[i][0]) 233 | X0 = np.array(X0) 234 | stages_0 = np.array(stages_0) 235 | X_temp, stages_temp, times_temp, jumps_temp = [], [], [], [] 236 | for i in range(n_ppl): 237 | X_i = X[i] 238 | for j in range(X_i.shape[1]): 239 | X_temp.append(X_i[:,j]) 240 | stage_i = stages[i] 241 | for j in range(stage_i.shape[0]): 242 | stages_temp.append(stage_i[j]) 243 | time_i = times[i] 244 | for j in range(time_i.shape[0]): 245 | times_temp.append(time_i[j]) 246 | jump_i = jumps[i] 247 | for j in range(jump_i.shape[0]): 248 | jumps_temp.append(jump_i[j]) 249 | X = np.array(X_temp) 250 | stages = np.array(stages_temp) 251 | times = np.array(times_temp) 252 | jumps = np.array(jumps_temp) 253 | # choose which subjects will be cases and which will be controls 254 | MIN_CASE_STAGE = np.round((n_bms + 1) * 0.8) 255 | index_case = np.where(stages_0 >= MIN_CASE_STAGE)[0] 256 | index_control = np.where(stages_0 == 0)[0] 257 | labels = 2 * np.ones(n_ppl, dtype=int) # 2 - 
intermediate value, not used in mixture model fitting 258 | labels[index_case] = 1 # 1 - cases 259 | labels[index_control] = 0 # 0 - controls 260 | return X, lengths, jumps, labels, X0, stages, times, seq, Q_subtypes, pi0_subtypes, subtypes 261 | 262 | def sim_markov(Q, 263 | pi0, 264 | n_jumps=None): 265 | """ 266 | Gillespie's direct stochastic simulation algorithm for a single Markov chain with absorbing final state 267 | """ 268 | n_s = len(pi0) 269 | xvec = np.zeros(n_s) 270 | tvec = np.zeros(n_s) 271 | x = np.random.choice(n_s, size=1, p=pi0/np.sum(pi0))[0] 272 | t = 0 273 | xvec[0] = x 274 | tvec[0] = 0. 275 | if not n_jumps: 276 | n_jumps = n_s-1 277 | for i in range(n_jumps): 278 | # final state is absorbing - don't increment time 279 | if Q[x,x] != 0: 280 | t += (1/Q[x,x])*np.log(np.random.random()) 281 | weights = Q[x].copy() 282 | weights[x] = 0 283 | # final state is absorbing - don't change state 284 | if np.sum(weights) != 0: 285 | x = np.random.choice(n_s, size=1, p=weights/np.sum(weights))[0] 286 | xvec[i+1] = x 287 | tvec[i+1] = t 288 | # time spent in each state 289 | dt = np.zeros(len(pi0)) 290 | for i in range(len(tvec)): 291 | if i < (len(tvec)-1): 292 | dt[int(xvec[i])] += tvec[i+1] - tvec[i] 293 | return tvec, xvec, dt 294 | 295 | def step_fun(x, xvec, yvec): 296 | y = [] 297 | for i in range(len(x)): 298 | for j in range(len(xvec)-1): 299 | if x[i] >= xvec[j] and x[i] < xvec[j+1]: 300 | y.append(yvec[j]) 301 | elif x[i] >= xvec[-1]: 302 | y.append(yvec[-1]) 303 | break 304 | return y 305 | 306 | def gen_data_zscore(subtypes, 307 | stages, 308 | gt_ordering, 309 | min_biomarker_zscore, 310 | max_biomarker_zscore, 311 | std_biomarker_zscore, 312 | stage_zscore, 313 | stage_biomarker_index): 314 | 315 | N = stage_biomarker_index.shape[1] 316 | N_S = gt_ordering.shape[0] 317 | possible_biomarkers = np.unique(stage_biomarker_index) 318 | B = len(possible_biomarkers) 319 | stage_value = np.zeros((B,N+2,N_S)) 320 | for s in range(N_S): 321 | S = gt_ordering[s,:] 322 | S_inv = np.array([0]*N) 323 | S_inv[S.astype(int)] = np.arange(N) 324 | for i in range(B): 325 | b = possible_biomarkers[i] 326 | event_location = np.concatenate([[0], S_inv[(stage_biomarker_index == b)[0]], [N]]) 327 | event_value = np.concatenate([[min_biomarker_zscore[i]], stage_zscore[stage_biomarker_index == b], [max_biomarker_zscore[i]]]) 328 | for j in range(len(event_location)-1): 329 | if j == 0: # FIXME: nasty hack to get Matlab indexing to match up - necessary here because indices are used for linspace limits 330 | index = np.arange(event_location[j],event_location[j+1]+2) 331 | stage_value[i,index,s] = np.linspace(event_value[j],event_value[j+1],event_location[j+1]-event_location[j]+2) 332 | else: 333 | index = np.arange(event_location[j] + 1, event_location[j + 1] + 2) 334 | stage_value[i,index,s] = np.linspace(event_value[j],event_value[j+1],event_location[j+1]-event_location[j]+1) 335 | stage_value = 0.5 * stage_value[:, :stage_value.shape[1] - 1, :] + 0.5 * stage_value[:, 1:, :] 336 | M = stages.shape[0] 337 | # initialise variable observation length arrays 338 | data = [] 339 | for i in range(len(stages)): 340 | data.append(np.zeros((B, len(stages[i])))) 341 | data_denoised = [] 342 | for i in range(len(stages)): 343 | data_denoised.append(np.zeros((B, len(stages[i])))) 344 | # set data 345 | for i in range(M): 346 | stage_i = stages[i] 347 | # assume noise homoskedastic 348 | noise = np.random.normal(np.zeros(B), std_biomarker_zscore, B) 349 | for t in range(len(stage_i)): 350 | for j in 
range(B): 351 | data_denoised[i][j][t] = stage_value[:,int(stage_i[t]),subtypes[i]][j] # last index would be "subtypes[i]" 352 | data[i][j][t] = data_denoised[i][j][t] + noise[j] 353 | return data, data_denoised 354 | 355 | def gen_model_zscore(stage_zscore, 356 | stage_biomarker_index, 357 | N_S): 358 | 359 | N = np.array(stage_zscore).shape[1] 360 | S = np.zeros((N_S,N)) 361 | for s in range(N_S): 362 | for i in range(N): 363 | IS_min_stage_zscore = np.array([False]*N) 364 | possible_biomarkers = np.unique(stage_biomarker_index) 365 | for j in range(len(possible_biomarkers)): 366 | IS_unselected = [False]*N 367 | for k in set(range(N))-set(S[s][:i]): 368 | IS_unselected[k] = True 369 | this_biomarkers = np.array([(np.array(stage_biomarker_index)[0]==possible_biomarkers[j]).astype(int)+(np.array(IS_unselected)==1).astype(int)])==2 370 | if not np.any(this_biomarkers): 371 | this_min_stage_zscore = 0 372 | else: 373 | this_min_stage_zscore = min(stage_zscore[this_biomarkers]) 374 | if(this_min_stage_zscore): 375 | temp = ((this_biomarkers.astype(int)+(stage_zscore==this_min_stage_zscore).astype(int))==2).T 376 | temp = temp.reshape(len(temp),) 377 | IS_min_stage_zscore[temp]=True 378 | events = np.array(range(N)) 379 | possible_events = np.array(events[IS_min_stage_zscore]) 380 | this_index = np.ceil(np.random.rand()*((len(possible_events))))-1 381 | S[s][i] = possible_events[int(this_index)] 382 | return S 383 | 384 | def gen_model_mixture(N_biomarkers): 385 | return np.array([np.random.permutation(N_biomarkers)]).astype(float) 386 | 387 | def gen_data_mixture(stages, 388 | gt_ordering, 389 | mixture_style, 390 | sigma_noise=1.): 391 | N_biomarkers = gt_ordering.shape[1] 392 | N_subjects = len(stages) 393 | #controls are always drawn from N(0, 1) distribution 394 | mean_controls = np.array([0] * N_biomarkers) 395 | std_controls = np.array([sigma_noise] * N_biomarkers) 396 | #mean and variance for cases 397 | #if using mixture_GMM, use normal distribution with mean 1 and std. 
devs sampled from a range 398 | if mixture_style == 'mixture_GMM': 399 | # mean_cases = np.array(np.random.uniform(size=N_biomarkers)+1.35) # PW: 1.5 to look more like ADNI SNR 400 | mean_cases = np.array(np.random.uniform(size=N_biomarkers)+1.5) # PW: 1.5 to look more like ADNI SNR 401 | std_cases = np.array([sigma_noise] * N_biomarkers) 402 | #if using mixture_KDE, use log normal with mean 0.5 and std devs sampled from a range 403 | elif mixture_style == 'mixture_KDE': 404 | mean_cases = np.array([0.5] * N_biomarkers) 405 | std_cases = np.random.uniform(0.2, 0.5, N_biomarkers) 406 | # initialise variable observation length arrays 407 | data = [] 408 | for i in range(len(stages)): 409 | data.append(np.zeros((N_biomarkers, len(stages[i])))) 410 | data_denoised = [] 411 | for i in range(len(stages)): 412 | data_denoised.append(np.zeros((N_biomarkers, len(stages[i])))) 413 | #loop over all subjects, creating measurment for each biomarker based on what subtype and stage they're in 414 | for i in range(N_subjects): 415 | stage_i = stages[i] 416 | for t in range(len(stage_i)): 417 | S_i = gt_ordering[0, :].astype(int) # first index would be subtype 418 | stage_i_t = stage_i[t].astype(int) 419 | #fill in with ABNORMAL values up to the subject's stage 420 | for j in range(stage_i_t): 421 | if mixture_style == 'mixture_KDE': 422 | sample_j = np.random.lognormal(mean_cases[S_i[j]], std_cases[S_i[j]]) 423 | elif mixture_style == 'mixture_GMM': 424 | sample_j = np.random.normal(mean_cases[S_i[j]], std_cases[S_i[j]]) 425 | data[i][S_i[j]][t] = sample_j 426 | data_denoised[i][S_i[j]][t] = mean_cases[S_i[j]] 427 | # fill in with NORMAL values from the subject's stage+1 to last stage 428 | for j in range(stage_i_t, N_biomarkers): 429 | data[i][S_i[j]][t] = np.random.normal(mean_controls[S_i[j]], std_controls[S_i[j]]) 430 | data_denoised[i][S_i[j]][t] = mean_controls[S_i[j]] 431 | return data, data_denoised 432 | -------------------------------------------------------------------------------- /examples/run_tebm_sim.py: -------------------------------------------------------------------------------- 1 | # Run TEBM simulation 2 | # Author: Peter Wijeratne (p.wijeratne@pm.me) 3 | 4 | import sys 5 | from tebm import tebm_fix, tebm_var 6 | from kde_ebm import plotting 7 | from kde_ebm.mixture_model import fit_all_gmm_models, get_prob_mat 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import scipy as sp 12 | import pickle 13 | import matplotlib.pyplot as plt 14 | import multiprocessing 15 | 16 | from sim_funcs import gen_data 17 | import warnings 18 | warnings.filterwarnings("ignore", message="Casting complex values to real discards the imaginary part") 19 | 20 | ############################################################################################################## USER INPUT START 21 | is_ebm = False # set True if you want to use the standard EBM 22 | is_cut = False # cut people starting in final state(s); for testing effect of right-censoring 23 | sigma_noise = 0.1 # simulated measurement noise 24 | n_ppl = 100 # number of people 25 | n_bms = 5 # number of biomarkers 26 | n_obs = 2 # number of observations per person. Set = None for random number of observations per person 27 | model_type = 'GMM' # type of likelihood model, can be one of: 'GMM', KDE', 'Zscore' 28 | if model_type == 'Zscore': 29 | n_zscores = 3 # number of z-score events per biomarker 30 | z_max = 5 # maximum z-score event per biomarker 31 | order = 1#int(n_bms*n_zscores) # transition order. 
For unconstrained matrix, set order = number of events-1 32 | n_components = int(n_bms*n_zscores + 1) 33 | else: 34 | order = 1 # transition order. For unconstrained matrix, set order = n_bms 35 | n_components = n_bms + 1 36 | fwd_only = True # set True to only allow forward transitions 37 | ############################################################################################################## USER INPUT END 38 | scale = 1 # mean time scale for each event 39 | time_mean = scale # used when simulating data 40 | init_params = 's' # initialise start / initial probability to uniform prior 41 | fit_params = 'st' # fit start probability and transition matrix 42 | tol = 1E-3 # tolerance for both inner and outer EM 43 | n_cores = 1 # number of cores used for parallelising start point 44 | n_start = 4 # number of outer EM start points 45 | # set random seed 46 | try: 47 | seed = int(sys.argv[1]) 48 | except IndexError: 49 | seed = 42 50 | np.random.seed(seed) 51 | if model_type=='Zscore': 52 | n_stages = int(n_zscores*n_bms + 1) 53 | else: 54 | n_stages = n_bms + 1 55 | algo = 'viterbi' # staging algorithm 56 | n_iter_outer = 100 # maximum number of outer EM iterations (for fitting sequence) 57 | # if fitting standard EBM then we don't need the inner EM loop (for fitting start probability and transition matrix) 58 | if is_ebm: 59 | n_iter_inner = 0 60 | else: 61 | n_iter_inner = 1 62 | biom_labels = [] 63 | if model_type=='Zscore': 64 | for i in range(n_bms): 65 | for j in range(n_zscores): 66 | biom_labels.append('BM'+str(i)+'Z'+str(j)) 67 | else: 68 | for i in range(n_bms): 69 | biom_labels.append('BM'+str(i)) 70 | plot_raw_data = False 71 | # generate data 72 | if model_type=='Zscore': 73 | # X, lengths, jumps, labels, X0, stages_true, times, seq_true, Q, pi0 = gen_data(n_ppl, n_bms, n_obs, n_stages, model_type=model_type, is_cut=is_cut, n_zscores=n_zscores, z_max=z_max, sigma_noise=sigma_noise) 74 | X, lengths, jumps, labels, X0, stages_true, times, seq_true, Q, pi0, _ = gen_data(1, n_ppl, n_bms, n_obs, n_components, model_type=model_type, is_cut=is_cut, n_zscores=n_zscores, z_max=z_max, sigma_noise=sigma_noise, seq=[], fractions=[1], fwd_only=fwd_only, order=order, time_mean=[1/scale]) 75 | else: 76 | X, lengths, jumps, labels, X0, stages_true, times, seq_true, Q, pi0, _ = gen_data(1, n_ppl, n_bms, n_obs, n_components, model_type=model_type, is_cut=is_cut, n_zscores=None, z_max=None, sigma_noise=sigma_noise, seq=[], fractions=[1], fwd_only=fwd_only, order=order, time_mean=[1/scale]) 77 | 78 | if True: 79 | n_x = np.round(np.sqrt(n_bms)).astype(int) 80 | n_y = np.ceil(np.sqrt(n_bms)).astype(int) 81 | fig, ax = plt.subplots(n_y, n_x, figsize=(10, 10)) 82 | for i in range(n_bms): 83 | for j in range(len(lengths)): 84 | nobs_i = lengths[j] 85 | s_idx, e_idx = int(np.sum(lengths[:j])), int(np.sum(lengths[:j])+nobs_i) 86 | ax[i // n_x, i % n_x].plot(stages_true[s_idx:e_idx],X[s_idx:e_idx,i]) 87 | ax[i // n_x, i % n_x].scatter(stages_true[s_idx:e_idx],X[s_idx:e_idx,i]) 88 | ax[i // n_x, i % n_x].set_title(biom_labels[i]) 89 | # plt.show() 90 | 91 | save_variables = {} 92 | save_variables["X"] = X 93 | save_variables["lengths"] = lengths 94 | save_variables["jumps"] = jumps 95 | save_variables["labels"] = labels 96 | save_variables["X0"] = X0 97 | save_variables["seq_true"] = seq_true 98 | save_variables["times"] = times 99 | save_variables["Q"] = Q 100 | save_variables["pi0"] = pi0 101 | pickle_file = 
open('./simdata_Nppl'+str(n_ppl)+'_Nbms'+str(n_bms)+'_Nobs'+str(n_obs)+'_Nintervals'+str(len(np.unique(jumps))-1)+'.pickle', 'wb') 102 | pickle_output = pickle.dump(save_variables, pickle_file) 103 | pickle_file.close() 104 | 105 | """ 106 | pickle_file = open('simdata_Nppl'+str(n_ppl)+'_Nbms'+str(n_bms)+'_Nobs'+str(n_obs)+'_Nintervals'+str(len(np.unique(jumps))-1)+'.pickle', 'rb') 107 | loaded_variables = pickle.load(pickle_file) 108 | X = loaded_variables["X"] 109 | lengths = loaded_variables["lengths"] 110 | jumps = loaded_variables["jumps"] 111 | labels = loaded_variables["labels"] 112 | X0 = loaded_variables["X0"] 113 | seq_true = loaded_variables["seq_true"] 114 | times = loaded_variables["times"] 115 | Q = loaded_variables["Q"] 116 | pi0 = loaded_variables["pi0"] 117 | """ 118 | # EBM treats repeated measurements as from separate individuals 119 | if is_ebm: 120 | labels0 = labels.copy() 121 | labels_temp = [] 122 | for i in range(len(lengths)): 123 | nobs_i = lengths[i] 124 | for j in range(nobs_i): 125 | labels_temp.append(labels[i]) 126 | labels = np.array(labels_temp) 127 | lengths = np.ones(X.shape[0]).astype(int) 128 | 129 | print (X.shape, lengths.shape) 130 | # 131 | 132 | labels_long = [] 133 | for i in range(len(lengths)): 134 | nobs_i = lengths[i] 135 | for j in range(nobs_i): 136 | labels_long.append(labels[i]) 137 | labels_long = np.array(labels_long) 138 | snr = np.nanmean(X[labels_long!=0], axis=0)/np.nanstd(X[labels_long==0], axis=0) 139 | #for i in range(len(biom_labels)): 140 | # print (biom_labels[i]+' SNR =', snr[i]) 141 | print ('SNR', snr) 142 | 143 | # this is currently redundant, but not for long... 144 | obs_type = 'Var' 145 | # 146 | print ('n_ppl', X0.shape[0], 'n_bms', X0.shape[1], 'n_obs', X.shape[0], 'n_intervals', len(np.unique(jumps))-1, 'n_stages', n_stages, 'order', order, 'fwd_only', fwd_only) 147 | 148 | print ('Fitting '+model_type+'-'+obs_type+'-TEBM...') 149 | if model_type == 'GMM' or model_type == 'KDE': 150 | if obs_type == 'Fix': 151 | model = tebm_fix.MixtureTEBM(X=X, 152 | lengths=lengths, 153 | n_stages=n_stages, 154 | time_mean=time_mean, 155 | n_iter=n_iter_inner, 156 | fwd_only=fwd_only, 157 | order=order, 158 | algo=algo) 159 | seq_model, mixtures = model.fit_tebm(labels, n_start=n_start, n_iter=n_iter_outer, n_cores=n_cores, model_type=model_type, cut_controls=False) 160 | else: 161 | model = tebm_var.MixtureTEBM(X=X, lengths=lengths, jumps=jumps, 162 | n_components=n_components, time_mean=time_mean, covariance_type="diag", 163 | n_iter=n_iter_inner, tol=tol, 164 | init_params=init_params, params=fit_params, 165 | algorithm=algo, verbose=False, allow_nan=True, 166 | fwd_only=fwd_only, order=order) 167 | seq_model, mixtures = model._fit_tebm(labels, n_start=n_start, n_iter=n_iter_outer, n_cores=n_cores, model_type=model_type, cut_controls=False) 168 | elif model_type == 'Zscore': 169 | model = tebm_fix.ZscoreTEBM(X=X, 170 | lengths=lengths, 171 | n_stages=n_stages, 172 | time_mean=1/n_stages, 173 | n_iter=n_iter_inner, 174 | fwd_only=fwd_only, 175 | order=order, 176 | algo=algo) 177 | seq_model = model.fit_tebm(n_zscores=n_zscores, z_max=z_max, n_start=n_start, n_iter=n_iter_outer, n_cores=n_cores, cut_controls=False) 178 | else: 179 | print ('Likelihood model not recognised! 
quit()') 180 | quit() 181 | 182 | if model_type == 'GMM': 183 | fig, ax = plotting.mixture_model_grid(X0, labels, mixtures, biom_labels) 184 | for i in range(len(ax)): 185 | for j in range(len(ax)-1): 186 | ax[i,j].set_yscale('log') 187 | 188 | print ('True seq',seq_true[0]) 189 | print ('MaxL seq',seq_model[0]) 190 | print ('Kendall tau',sp.stats.kendalltau(seq_true[0], seq_model[0])) 191 | 192 | if not is_ebm: 193 | n_iter_inner = 100 194 | 195 | # refit with 100 iterations 196 | if model_type == 'GMM': 197 | if obs_type == 'Var': 198 | model = tebm_var.MixtureTEBM(X=X, lengths=lengths, jumps=jumps, 199 | n_components=n_components, time_mean=time_mean, covariance_type="diag", 200 | n_iter=n_iter_inner, tol=tol, 201 | init_params=init_params, params=fit_params, 202 | algorithm=algo, verbose=True, allow_nan=True, 203 | fwd_only=fwd_only, order=order) 204 | else: 205 | model = tebm_fix.MixtureTEBM(X=X, 206 | lengths=lengths, 207 | n_stages=n_stages, 208 | time_mean=time_mean, 209 | n_iter=n_iter_inner, 210 | fwd_only=fwd_only, 211 | order=order, 212 | algo=algo, 213 | verbose=True) 214 | else: 215 | if obs_type == 'Var': 216 | model = tebm_var.ZscoreTEBM(X=X, 217 | lengths=lengths, 218 | jumps=jumps, 219 | n_components=n_components, 220 | # time_mean=time_mean, 221 | covariance_type="diag", 222 | n_iter=n_iter_inner, 223 | init_params=init_params, 224 | params=fit_params, 225 | fwd_only=fwd_only, 226 | order=order)#, 227 | # algo=algo) 228 | print ('!') 229 | else: 230 | model = tebm_fix.ZscoreTEBM(X=X, 231 | lengths=lengths, 232 | n_stages=n_stages, 233 | time_mean=1/n_stages, 234 | n_iter=n_iter_inner, 235 | fwd_only=fwd_only, 236 | order=order, 237 | algo=algo) 238 | 239 | model.S = seq_model[0] 240 | if model_type=='GMM': 241 | model.prob_mat = get_prob_mat(X, mixtures) 242 | model.mixtures = mixtures 243 | model.fit() 244 | else: 245 | # intialise z-score stuff 246 | z_val_arr = np.array([[x+1 for x in range(n_zscores)]]*n_bms) 247 | z_max_arr = np.array([z_max]*n_bms) 248 | IX_vals = np.array([[x for x in range(model.n_features)]*n_zscores]).T 249 | stage_biomarker_index = np.array([y for x in IX_vals.T for y in x]) 250 | stage_zscore = np.array([y for x in z_val_arr.T for y in x]) 251 | model.stage_biomarker_index = stage_biomarker_index.reshape(1,len(stage_biomarker_index)) 252 | model.stage_zscore = stage_zscore.reshape(1,len(stage_zscore)) 253 | model.min_biomarker_zscore = [0]*n_bms 254 | model.max_biomarker_zscore = z_max_arr 255 | model.covars_prior = np.tile(np.identity(1), (n_components, n_bms)) 256 | model.covars_ = model.covars_prior 257 | model.means_ = model._get_means() 258 | model.fit() 259 | 260 | if is_ebm: 261 | fout_name = 'simrun'+str(seed)+'_'+model_type+'-ebm_Nppl'+str(n_ppl)+'_Nstates'+str(n_stages)+'_Nobs'+str(n_obs)+'_Nintervals'+str(len(np.unique(jumps))-1)+'_Nstart'+str(n_start)+'_iscut_'+str(is_cut)+'_noise_'+str(sigma_noise)[0]+'p'+str(sigma_noise)[2]+'_order_'+str(order)+'_fwdonly_'+str(fwd_only)+'_Nits_'+str(n_iter_inner)+'.pickle' 262 | else: 263 | fout_name = 'simrun'+str(seed)+'_'+model_type+'-tebm_Nppl'+str(n_ppl)+'_Nstates'+str(n_stages)+'_Nobs'+str(n_obs)+'_Nintervals'+str(len(np.unique(jumps))-1)+'_Nstart'+str(n_start)+'_iscut_'+str(is_cut)+'_noise_'+str(sigma_noise)[0]+'p'+str(sigma_noise)[2]+'_order_'+str(order)+'_fwdonly_'+str(fwd_only)+'_Nits_'+str(n_iter_inner)+'.pickle' 264 | save_variables = {} 265 | save_variables["X"] = X 266 | save_variables["lengths"] = lengths 267 | save_variables["jumps"] = jumps 268 | save_variables["labels"] = 
labels 269 | save_variables["seq_true"] = seq_true 270 | save_variables["stages_true"] = stages_true 271 | save_variables["Q_true"] = Q[0] 272 | save_variables["p_vec_true"] = pi0 273 | save_variables["seq_model"] = seq_model 274 | if obs_type == 'Var': 275 | save_variables["Q_model"] = model.Q_ 276 | save_variables["p_vec_model"] = model.startprob_ 277 | 278 | pickle_file = open('./'+fout_name, 'wb') 279 | pickle_output = pickle.dump(save_variables, pickle_file) 280 | pickle_file.close() 281 | 282 | if plot_raw_data: 283 | n_x = np.round(np.sqrt(n_bms)).astype(int) 284 | n_y = np.ceil(np.sqrt(n_bms)).astype(int) 285 | fig, ax = plt.subplots(n_y, n_x, figsize=(10, 10)) 286 | for i in range(n_bms): 287 | for j in range(len(lengths)): 288 | nobs_i = lengths[j] 289 | s_idx, e_idx = int(np.sum(lengths[:j])), int(np.sum(lengths[:j])+nobs_i) 290 | ax[i // n_x, i % n_x].plot(stages_true[s_idx:e_idx],X[s_idx:e_idx,i]) 291 | ax[i // n_x, i % n_x].scatter(stages_true[s_idx:e_idx],X[s_idx:e_idx,i]) 292 | ax[i // n_x, i % n_x].set_title(biom_labels[i]) 293 | 294 | # plots 295 | if plot_raw_data: 296 | if model_type == 'GMM' or model_type == 'KDE': 297 | # biomarker distributions and mixture model fits 298 | fig, ax = plotting.mixture_model_grid(X0, labels, mixtures, biom_labels) 299 | 300 | # true transition matrix 301 | transmat = np.zeros((n_stages,n_stages)) 302 | startprob = np.zeros(n_stages) 303 | for i in range(len(lengths)): 304 | nobs_i = lengths[i] 305 | s_idx, e_idx = int(np.sum(lengths[:i])), int(np.sum(lengths[:i])+nobs_i) 306 | stages_i = stages_true[s_idx:e_idx].astype(int) 307 | startprob[stages_i[0]] += 1 308 | for j in range(1,len(stages_i)): 309 | transmat[stages_i[j-1],stages_i[j]] += 1 310 | # normalise across rows 311 | startprob /= np.sum(startprob) 312 | for i in range(transmat.shape[0]): 313 | transmat[i] /= np.sum(transmat[i]) 314 | # plot true initial probability 315 | fig, ax = plt.subplots() 316 | ax.bar(np.arange(n_stages),startprob) 317 | ax.set_xlabel('Stage', fontsize=18, labelpad=8) 318 | ax.set_ylabel('Probability', fontsize=18, labelpad=2) 319 | ax.tick_params(labelsize=18) 320 | plt.subplots_adjust(top=0.95, right=0.99, bottom=.15) 321 | ax.set_title('True pi0') 322 | # initial probability 323 | if obs_type == 'Fix': 324 | startprob = model.p_vec 325 | else: 326 | startprob = model.startprob_ 327 | fig, ax = plt.subplots() 328 | ax.bar(np.arange(n_stages),startprob) 329 | ax.set_xlabel('Stage', fontsize=18, labelpad=8) 330 | ax.set_ylabel('Probability', fontsize=18, labelpad=2) 331 | ax.tick_params(labelsize=18) 332 | plt.subplots_adjust(top=0.95, right=0.99, bottom=.15) 333 | ax.set_title('Fitted pi0') 334 | # sojourn times 335 | sojourns_reco, sojourns_true = [], [] 336 | sojourns_reco.append(0) 337 | sojourns_true.append(0) 338 | for i in range(len(transmat)-1): # skip final stage (absorbing) 339 | if obs_type == 'Fix': 340 | sojourns_reco.append(1/(1-transmat[i,i])/scale) 341 | else: 342 | sojourns_reco.append(-1/model.Q_[i,i]/scale) 343 | sojourns_true.append(-1/Q[0][i,i]/scale) 344 | # print ('Stage',i,'duration',round(sojourns_reco[i+1],2)) 345 | sojourns_reco = np.array(sojourns_reco) 346 | sojourns_true = np.array(sojourns_true) 347 | print ('np.sum(sojourns_reco)',round(np.sum(sojourns_reco),2)) 348 | print ('np.sum(sojourns_true)',round(np.sum(sojourns_true),2)) 349 | print ('np.abs(np.sum(sojourns_reco)-np.sum(sojourns_true))',np.abs(np.sum(sojourns_reco)-np.sum(sojourns_true))) 350 | print ('np.sqrt(np.sum(np.power(sojourns_reco-sojourns_true, 
2))/len(sojourns))',np.sqrt(np.sum(np.power(sojourns_reco-sojourns_true, 2))/len(sojourns_reco))) 351 | 352 | # staging 353 | if obs_type == 'Fix': 354 | stages_model = model.stages(X, lengths) 355 | else: 356 | stages_model, _ = model.predict(X, lengths, jumps) 357 | stages_true = stages_true.flatten() 358 | scale = [10.]*len(stages_true) 359 | for i in range(len(stages_true)): 360 | x0 = stages_true[i] 361 | x1 = stages_model[i] 362 | for j in range(len(stages_true)): 363 | if x0 == stages_true[j] and x1 == stages_model[j]: 364 | scale[i] += 20. 365 | fig, ax = plt.subplots() 366 | ax.scatter(stages_true.flatten(), stages_model, s=scale) 367 | ax.set_xlabel('Stage (true)') 368 | ax.set_ylabel('Stage (reco)') 369 | ax.grid() 370 | 371 | if obs_type == 'Var': 372 | # true transition rate matrix 373 | transmat = np.real(Q[0]) 374 | fig, ax = plt.subplots() 375 | ax.imshow(transmat, interpolation='nearest', cmap=plt.cm.Blues) 376 | for i in range(transmat.shape[0]): 377 | for j in range(transmat.shape[1]): 378 | if abs(round(transmat[i, j], 3)) > 1E-3: 379 | text = ax.text(j, i, round(transmat[i, j], 3), ha="center", va="center", color="black", size=10) 380 | event_labels = np.array(biom_labels)[seq_true[0].astype(int)] 381 | event_labels = np.insert(event_labels, 0, 'None') 382 | ax.set_xticks(np.arange(len(event_labels))) 383 | ax.set_yticks(np.arange(len(event_labels))) 384 | xticklabels = [] 385 | for x in event_labels: 386 | xticklabels.append(str(x)+' (t1)') 387 | ax.set_xticklabels(xticklabels, ha='right', rotation=45, rotation_mode='anchor', fontsize=12) 388 | yticklabels = [] 389 | for x in event_labels: 390 | yticklabels.append(str(x)+' (t0)') 391 | ax.set_yticklabels(yticklabels, ha='right', rotation_mode='anchor', fontsize=12) 392 | plt.subplots_adjust(bottom=.2, top=.95) 393 | bottom, top = ax.get_ylim() 394 | ax.set_ylim(bottom + 0.5, top - 0.5) 395 | ax.set_title('True Q') 396 | # prior transition rate matrix 397 | transmat = np.real(model.Q_prior) 398 | fig, ax = plt.subplots() 399 | ax.imshow(transmat, interpolation='nearest', cmap=plt.cm.Blues) 400 | for i in range(transmat.shape[0]): 401 | for j in range(transmat.shape[1]): 402 | if abs(round(transmat[i, j], 3)) > 1E-3: 403 | text = ax.text(j, i, round(transmat[i, j], 3), ha="center", va="center", color="black", size=10) 404 | event_labels = np.array(biom_labels)[seq_true[0].astype(int)] 405 | event_labels = np.insert(event_labels, 0, 'None') 406 | ax.set_xticks(np.arange(len(event_labels))) 407 | ax.set_yticks(np.arange(len(event_labels))) 408 | xticklabels = [] 409 | for x in event_labels: 410 | xticklabels.append(str(x)+' (t1)') 411 | ax.set_xticklabels(xticklabels, ha='right', rotation=45, rotation_mode='anchor', fontsize=12) 412 | yticklabels = [] 413 | for x in event_labels: 414 | yticklabels.append(str(x)+' (t0)') 415 | ax.set_yticklabels(yticklabels, ha='right', rotation_mode='anchor', fontsize=12) 416 | plt.subplots_adjust(bottom=.2, top=.95) 417 | bottom, top = ax.get_ylim() 418 | ax.set_ylim(bottom + 0.5, top - 0.5) 419 | ax.set_title('Prior Q') 420 | # fitted transition rate matrix 421 | transmat = np.real(model.Q_) 422 | fig, ax = plt.subplots() 423 | ax.imshow(transmat, interpolation='nearest', cmap=plt.cm.Blues) 424 | for i in range(transmat.shape[0]): 425 | for j in range(transmat.shape[1]): 426 | if abs(round(transmat[i, j], 3)) > 1E-3: 427 | text = ax.text(j, i, round(transmat[i, j], 3), ha="center", va="center", color="black", size=10) 428 | event_labels = np.array(biom_labels)[seq_model[0].astype(int)] 
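# --- editor's sketch (hedged; not part of the original script) ---
# model.Q_ is the fitted continuous-time transition rate matrix over the model
# stages, ordered by the maximum-likelihood sequence seq_model (stage 0 = no
# events yet, labelled 'None' on the axes below). Its diagonal encodes the mean
# sojourn time in each stage as -1/Q_[i,i], which is what the sojourn-time
# comparison above relies on. If uncommented, the illustrative lines below
# (dt=1.0 is an arbitrary choice) relate the rate matrix to a discrete-time
# transition probability matrix over an interval dt:
# from scipy.linalg import expm
# dt = 1.0
# transmat_dt = expm(np.real(model.Q_) * dt)  # row-stochastic transition probabilities over dt
# mean_sojourns = -1.0 / np.diag(np.real(model.Q_))[:-1]  # mean time in each non-absorbing stage
# --- end sketch ---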
429 | event_labels = np.insert(event_labels, 0, 'None') 430 | ax.set_xticks(np.arange(len(event_labels))) 431 | ax.set_yticks(np.arange(len(event_labels))) 432 | xticklabels = [] 433 | for x in event_labels: 434 | xticklabels.append(str(x)+' (t1)') 435 | ax.set_xticklabels(xticklabels, ha='right', rotation=45, rotation_mode='anchor', fontsize=12) 436 | yticklabels = [] 437 | for x in event_labels: 438 | yticklabels.append(str(x)+' (t0)') 439 | ax.set_yticklabels(yticklabels, ha='right', rotation_mode='anchor', fontsize=12) 440 | plt.subplots_adjust(bottom=.2, top=.95) 441 | bottom, top = ax.get_ylim() 442 | ax.set_ylim(bottom + 0.5, top - 0.5) 443 | ax.set_title('Fitted Q') 444 | # print ('sum(diag(Q_true)-diag(Q_reco))', np.abs(np.sum(np.abs(np.diag(Q[0]))-np.abs(np.diag(model.Q_))))) 445 | # print ('sum(diag(Q_true)-diag(Q_reco))', np.sum(np.abs(np.diag(Q[0])-np.diag(model.Q_)))) 446 | print ('sum(diag(Q_true)-diag(Q_reco))', np.sqrt(np.sum(np.power(np.diag(Q[0])-np.diag(model.Q_), 2))/model.Q_.shape[0])) 447 | plt.show() 448 | # write 449 | # write data 450 | if is_ebm: 451 | fout_name = 'simrun'+str(seed)+'_'+model_type+'-ebm_Nppl'+str(n_ppl)+'_Nstates'+str(n_stages)+'_Nobs'+str(n_obs)+'_Nintervals'+str(len(np.unique(jumps))-1)+'_Nstart'+str(n_start)+'_iscut_'+str(is_cut)+'_noise_'+str(sigma_noise)[0]+'p'+str(sigma_noise)[2]+'_order_'+str(order)+'_fwdonly_'+str(fwd_only)+'_Nits_'+str(n_iter_inner)+'.pickle' 452 | else: 453 | fout_name = 'simrun'+str(seed)+'_'+model_type+'-tebm_Nppl'+str(n_ppl)+'_Nstates'+str(n_stages)+'_Nobs'+str(n_obs)+'_Nintervals'+str(len(np.unique(jumps))-1)+'_Nstart'+str(n_start)+'_iscut_'+str(is_cut)+'_noise_'+str(sigma_noise)[0]+'p'+str(sigma_noise)[2]+'_order_'+str(order)+'_fwdonly_'+str(fwd_only)+'_Nits_'+str(n_iter_inner)+'.pickle' 454 | save_variables = {} 455 | save_variables["X"] = X 456 | save_variables["lengths"] = lengths 457 | save_variables["jumps"] = jumps 458 | save_variables["labels"] = labels 459 | save_variables["seq_true"] = seq_true 460 | save_variables["stages_true"] = stages_true 461 | save_variables["Q_true"] = Q[0] 462 | save_variables["p_vec_true"] = pi0 463 | save_variables["seq_model"] = seq_model 464 | save_variables["stages_model"] = stages_model 465 | save_variables["Q_model"] = model.Q_ 466 | save_variables["p_vec_model"] = startprob 467 | 468 | pickle_file = open('./'+fout_name, 'wb') 469 | pickle_output = pickle.dump(save_variables, pickle_file) 470 | pickle_file.close() 471 | 472 | plt.show() 473 | -------------------------------------------------------------------------------- /lib/tebm/tebm_fix.py: -------------------------------------------------------------------------------- 1 | # Fixed time interval Temporal Event-Based Model 2 | # Derived class from base_fix.py 3 | # Author: Peter Wijeratne (p.wijeratne@sussex.ac.uk) 4 | # Adapted from code written for 'hmmlearn' (https://github.com/hmmlearn/hmmlearn) 5 | 6 | import numpy as np 7 | from scipy.special import logsumexp 8 | from functools import partial 9 | import pathos 10 | 11 | from .base_fix import BaseTEBM 12 | from kde_ebm.mixture_model import fit_all_kde_models, fit_all_gmm_models, get_prob_mat 13 | 14 | class MixtureTEBM(BaseTEBM): 15 | 16 | def __init__(self, 17 | X=None, 18 | lengths=None, 19 | n_stages=None, 20 | time_mean=None, 21 | n_iter=None, 22 | fwd_only=False, 23 | order=None, 24 | algo='viterbi', 25 | verbose=False): 26 | 27 | BaseTEBM.__init__(self, 28 | X=X, 29 | lengths=lengths, 30 | n_stages=n_stages, 31 | time_mean=time_mean, 32 | n_iter=n_iter, 33 | 
fwd_only=fwd_only, 34 | order=order, 35 | algo=algo, 36 | verbose=verbose) 37 | 38 | def compute_log_likelihood(self, X, start_i, end_i): 39 | n_samples = end_i-start_i 40 | S_int = self.S.astype(int) 41 | arange_Np1 = np.arange(0, self.n_features+1) 42 | p_perm_k = np.zeros((n_samples, self.n_features+1)) 43 | p_yes = np.array(self.prob_mat[start_i:end_i, :, 1]) 44 | p_no = np.array(self.prob_mat[start_i:end_i, :, 0]) 45 | # Leon's clever cumulative probability code 46 | cp_yes = np.cumprod(p_yes[:, S_int], 1) 47 | cp_no = np.cumprod(p_no[:, S_int[::-1]], 1) 48 | for i in arange_Np1: 49 | if i == 0: 50 | p_perm_k[:, i] = cp_no[:,self.n_features-1] 51 | elif i == self.n_features: 52 | p_perm_k[:, i] = cp_yes[:,self.n_features-1] 53 | else: 54 | p_perm_k[:, i] = cp_yes[:,i-1] * cp_no[:,self.n_features-i-1] 55 | p_perm_k[p_perm_k==0] = np.finfo(float).eps 56 | return np.log(p_perm_k) 57 | 58 | def stages(self, X, lengths=None): 59 | ### FIXME: is there a general way of doing this? 60 | self.X = X 61 | self.lengths = lengths 62 | self.prob_mat = get_prob_mat(X, self.mixtures) 63 | ### 64 | stage_sequence = self.stage_X(X, lengths) 65 | return stage_sequence 66 | 67 | def posteriors(self, X, lengths=None): 68 | ### FIXME: is there a general way of doing this? 69 | self.X = X 70 | self.lengths = lengths 71 | self.prob_mat = get_prob_mat(X, self.mixtures) 72 | ### 73 | posteriors = self.posteriors_X(X, lengths) 74 | return posteriors 75 | 76 | def gen_sample(self, stage): 77 | ### FIXME: this won't work for KDE EBM 78 | def _get_params_ebm(means, sdevs, mixes): 79 | n_bms = self.n_features 80 | seq_means = np.tile(means.T[0], (n_bms+1, 1)).T 81 | seq_sdevs = np.tile(sdevs.T[0], (n_bms+1, 1)).T 82 | seq_mixes = np.tile(mixes.T[0], (n_bms+1, 1)).T 83 | # seq_means[0] = healthy distributions for all biomarkers 84 | for i in range(n_bms): 85 | bm_pos = np.where(self.S == i)[0][0] 86 | seq_means[i, bm_pos+1:] = means[i][1] 87 | seq_sdevs[i, bm_pos+1:] = sdevs[i][1] 88 | seq_mixes[i, bm_pos+1:] = mixes[i][1] 89 | return seq_means.T, seq_sdevs.T, seq_mixes.T 90 | def _return_gmm_fits(mixtures): 91 | n_bms = self.n_features 92 | fit_means = np.zeros((n_bms, 2)) 93 | fit_std = np.zeros((n_bms, 2)) 94 | fit_mixes = np.zeros((n_bms, 2)) 95 | for i in range(n_bms): 96 | theta_i = mixtures[i].theta 97 | fit_means[i] = theta_i[[0,2]] 98 | fit_std[i] = theta_i[[1,3]] 99 | fit_mixes[i] = [theta_i[4],1-theta_i[4]] 100 | return fit_means, fit_std, fit_mixes 101 | fit_means, fit_std, fit_mixes = _return_gmm_fits(self.mixtures) 102 | theta = _get_params_ebm(fit_means, fit_std, fit_mixes) 103 | self.means = theta[0] 104 | self.covars = np.power(theta[1],2) 105 | ### 106 | return np.random.multivariate_normal(self.means[stage], self.covars[stage]) 107 | 108 | def optimise_seq(self, S): 109 | N = self.n_features 110 | max_S = S.copy() 111 | # calculate likelihoods over permutations 112 | order_bio = np.random.permutation(N) 113 | for count,i in enumerate(order_bio): 114 | current_sequence = max_S 115 | assert(len(current_sequence)==N) 116 | current_location = np.array([0] * N) 117 | current_location[current_sequence.astype(int)] = np.arange(N) 118 | selected_event = i 119 | move_event_from = current_location[selected_event] 120 | possible_positions = np.arange(N) 121 | possible_sequences = np.zeros((len(possible_positions), N)) 122 | possible_likelihood = np.full((len(possible_positions), 1), -np.inf) 123 | for index in range(len(possible_positions)): 124 | current_sequence = max_S 125 | # choose a position in the 
sequence to move an event to 126 | move_event_to = possible_positions[index] 127 | # move this event in its new position 128 | current_sequence = np.delete(current_sequence, move_event_from, 0) 129 | new_sequence = np.concatenate([current_sequence[np.arange(move_event_to)], [selected_event], current_sequence[np.arange(move_event_to, N - 1)]]) 130 | # fit TEBM 131 | self.S = new_sequence 132 | self.fit() 133 | possible_likelihood[index] = self.compute_model_log_likelihood(self.X, self.lengths) 134 | possible_sequences[index, :] = self.S 135 | max_likelihood = max(possible_likelihood) 136 | max_S = possible_sequences[np.where(possible_likelihood == max_likelihood)[0][0]] 137 | if count<(N-1): 138 | print (str(round((count+1)/len(order_bio)*100,2))+'% complete') 139 | return max_S, max_likelihood 140 | 141 | def seq_em(self, S, n_iter, seed_num): 142 | # parse out sequences by seed number 143 | S = np.array(S[seed_num]) 144 | print ('Startpoint',seed_num) 145 | cur_seq = S 146 | cur_like = -np.inf 147 | flag = False 148 | for opt_i in range(int(n_iter)): 149 | print ('EM iteration',opt_i+1) 150 | seq, like = self.optimise_seq(cur_seq) 151 | print ('current', like, seq, 'max', cur_like, cur_seq) 152 | if like-cur_like < 1E-3: 153 | print ('EM converged in',opt_i+1,'iterations') 154 | flag = True 155 | elif like > cur_like: 156 | cur_seq = seq 157 | cur_like = like 158 | if flag: 159 | break 160 | return cur_seq, cur_like 161 | 162 | def fit_tebm(self, labels, n_start, n_iter, n_cores, model_type='GMM', constrained=False, cut_controls=False): 163 | # only use baseline data to fit mixture models 164 | X0 = [] 165 | for i in range(len(self.lengths)): 166 | X0.append(self.X[np.sum(self.lengths[:i])]) 167 | X0 = np.array(X0) 168 | if model_type == 'KDE': 169 | mixtures = fit_all_kde_models(X0, labels) 170 | else: 171 | mixtures = fit_all_gmm_models(X0, labels)#, constrained) 172 | # might want to fit sequence without controls 173 | if cut_controls: 174 | print ('Cutting controls from sequence fit!') 175 | X, lengths = [], [] 176 | for i in range(len(self.lengths)): 177 | if labels[i] != 0: 178 | nobs_i = self.lengths[i] 179 | for x in self.X[np.sum(self.lengths[:i]):np.sum(self.lengths[:i])+nobs_i]: 180 | X.append(x) 181 | lengths.append(self.lengths[i]) 182 | self.X = np.array(X) 183 | self.lengths = np.array(lengths) 184 | # calculate likelihood lookup table 185 | self.prob_mat = get_prob_mat(self.X, mixtures) 186 | # set mixture models 187 | self.mixtures = mixtures 188 | # do EM 189 | ml_seq_mat = np.zeros((1,self.X.shape[1],n_start)) 190 | ml_like_mat = np.zeros(n_start) 191 | if n_cores>1: 192 | pool = pathos.multiprocessing.ProcessingPool() 193 | pool.ncpus = n_cores 194 | else: 195 | # FIXME: serial version doesn't work 196 | # pool = pathos.serial.SerialPool() 197 | pool = pathos.multiprocessing.ProcessingPool() 198 | pool.ncpus = n_cores 199 | # instantiate function as class to pass to pool.map 200 | # first calculate array of sequences - do this first or same random number will be used simultaneously on each processor 201 | # will return shape (n_start, 1) 202 | copier = partial(self.init_seq) 203 | # will return shape (n_start, 1) 204 | seq_mat = np.array(pool.map(copier, range(n_start))) 205 | # now optimise 206 | copier = partial(self.seq_em, 207 | seq_mat[:,0], 208 | n_iter) 209 | # will return shape (n_start, 2) 210 | par_mat = np.array(pool.map(copier, range(n_start))) 211 | # distribute to local matrices 212 | for i in range(n_start): 213 | ml_seq_mat[:, :, i] = par_mat[i, 0] 214 
| ml_like_mat[i] = par_mat[i, 1] 215 | ix = np.argmax(ml_like_mat) 216 | ml_seq = ml_seq_mat[:, :, ix] 217 | ml_like = ml_like_mat[ix] 218 | # refit model on ML sequence 219 | self.S = ml_seq[0] 220 | self.fit() 221 | return ml_seq, self.mixtures 222 | 223 | def init_seq(self, seed_num): 224 | #FIXME: issue with seeding by seed_num is that every time you call fit_tebm, it will initialise the same sequences 225 | # ensure randomness across parallel processes 226 | np.random.seed(seed_num) 227 | S = np.arange(self.n_features) 228 | np.random.shuffle(S) 229 | return [S] 230 | 231 | class ZscoreTEBM(BaseTEBM): 232 | 233 | def __init__(self, 234 | X=None, 235 | lengths=None, 236 | n_stages=None, 237 | time_mean=None, 238 | n_iter=None, 239 | fwd_only=False, 240 | order=None, 241 | algo='viterbi', 242 | verbose=False): 243 | 244 | BaseTEBM.__init__(self, 245 | X=X, 246 | lengths=lengths, 247 | n_stages=n_stages, 248 | time_mean=time_mean, 249 | n_iter=n_iter, 250 | fwd_only=fwd_only, 251 | order=order, 252 | algo=algo, 253 | verbose=verbose) 254 | 255 | def compute_log_likelihood(self, X, start_i, end_i): 256 | n_samples, n_dim = X.shape 257 | return -0.5 * (n_dim * np.log(2 * np.pi) 258 | + np.log(self.covars).sum(axis=-1) 259 | + ((X[:, None, :] - self.means) ** 2 / self.covars).sum(axis=-1)) 260 | 261 | def stages(self, X, lengths=None): 262 | ### FIXME: is there a general way of doing this? 263 | self.X = X 264 | self.lengths = lengths 265 | ### 266 | stage_sequence = self.stage_X(X, lengths) 267 | return stage_sequence 268 | 269 | def posteriors(self, X, lengths=None): 270 | ### FIXME: is there a general way of doing this? 271 | self.X = X 272 | self.lengths = lengths 273 | ### 274 | posteriors = self.posteriors_X(X, lengths) 275 | return posteriors 276 | 277 | def gen_sample(self, n_samples=1): 278 | p_vec_cdf = np.cumsum(self.p_vec) 279 | a_mat_cdf = np.cumsum(self.a_mat, axis=1) 280 | X_sample, k_sample = [], [] 281 | k_i = (p_vec_cdf > np.random.rand()).argmax() 282 | for i in range(n_samples): 283 | k_i = (a_mat_cdf[k_i] > np.random.rand()).argmax() 284 | k_sample.append(k_i) 285 | # X_sample.append(np.random.multivariate_normal(self.means[k_i], self.covars[k_i])) 286 | #FIXME: make multivariate 287 | X_sample.append(np.random.normal(self.means[k_i], self.covars[k_i])) 288 | return np.array(X_sample), np.array(k_sample) 289 | 290 | def init_seq(self, seed_num): 291 | np.random.seed(seed_num) 292 | N = np.array(self.stage_zscore).shape[1] 293 | S = np.zeros(N) 294 | for i in range(N): 295 | IS_min_stage_zscore = np.array([False] * N) 296 | possible_biomarkers = np.unique(self.stage_biomarker_index) 297 | for j in range(len(possible_biomarkers)): 298 | IS_unselected = [False] * N 299 | for k in set(range(N)) - set(S[:i]): 300 | IS_unselected[k] = True 301 | this_biomarkers = np.array([(np.array(self.stage_biomarker_index)[0] == possible_biomarkers[j]).astype(int) + (np.array(IS_unselected) == 1).astype(int)]) == 2 302 | if not np.any(this_biomarkers): 303 | this_min_stage_zscore = 0 304 | else: 305 | this_min_stage_zscore = min(self.stage_zscore[this_biomarkers]) 306 | if (this_min_stage_zscore): 307 | temp = ((this_biomarkers.astype(int) + (self.stage_zscore == this_min_stage_zscore).astype(int)) == 2).T 308 | temp = temp.reshape(len(temp), ) 309 | IS_min_stage_zscore[temp] = True 310 | events = np.array(range(N)) 311 | possible_events = np.array(events[IS_min_stage_zscore]) 312 | this_index = np.ceil(np.random.rand() * ((len(possible_events)))) - 1 313 | S[i] = 
possible_events[int(this_index)] 314 | S = S.reshape(1, len(S)) 315 | return S 316 | 317 | def get_means(self): 318 | def linspace_local2(a, b, N, arange_N): 319 | return a + (b - a) / (N - 1.) * arange_N 320 | N = self.stage_biomarker_index.shape[1] 321 | S_inv = np.array([ 0 ] * N) 322 | S_inv[self.S.astype(int)] = np.arange(N) 323 | possible_biomarkers = np.unique(self.stage_biomarker_index) 324 | B = len(possible_biomarkers) 325 | # value of mean function at integral limits 326 | point_value = np.zeros((B, N + 2)) 327 | # all the arange you'll need below 328 | arange_N = np.arange(N + 2) 329 | for i in range(B): 330 | b = possible_biomarkers[i] 331 | # position of this biomarker's z-score events in the sequence 332 | event_location = np.concatenate([[0], S_inv[(self.stage_biomarker_index == b)[0]], [N]]) 333 | # z-score reached at each event 334 | event_value = np.concatenate([[self.min_biomarker_zscore[i]], self.stage_zscore[self.stage_biomarker_index == b], [self.max_biomarker_zscore[i]]]) 335 | for j in range(len(event_location) - 1): 336 | if j == 0: # FIXME: nasty hack to get Matlab indexing to match up - necessary here because indices are used for linspace limits 337 | temp = arange_N[event_location[j]:(event_location[j + 1] + 2)] 338 | N_j = event_location[j + 1] - event_location[j] + 2 339 | point_value[i, temp] = linspace_local2(event_value[j], event_value[j + 1], N_j, arange_N[0:N_j]) 340 | else: 341 | temp = arange_N[(event_location[j] + 1):(event_location[j + 1] + 2)] 342 | N_j = event_location[j + 1] - event_location[j] + 1 343 | point_value[i, temp] = linspace_local2(event_value[j], event_value[j + 1], N_j, arange_N[0:N_j]) 344 | # integrate (approximation) 345 | stage_value = 0.5 * point_value[:, :point_value.shape[1] - 1] + 0.5 * point_value[:, 1:] 346 | return stage_value.T 347 | 348 | def optimise_seq(self, S): 349 | N = self.stage_zscore.shape[1] 350 | max_S = S.copy() 351 | # calculate likelihoods over permutations 352 | order_bio = np.random.permutation(N) 353 | for count,i in enumerate(order_bio): 354 | current_sequence = max_S 355 | assert(len(current_sequence)==N) 356 | current_location = np.array([0]*len(current_sequence)) 357 | current_location[current_sequence.astype(int)] = np.arange(len(current_sequence)) 358 | selected_event = i 359 | move_event_from = current_location[selected_event] 360 | this_stage_zscore = self.stage_zscore[0, selected_event] 361 | selected_biomarker = self.stage_biomarker_index[0, selected_event] 362 | possible_zscores_biomarker = self.stage_zscore[self.stage_biomarker_index == selected_biomarker] 363 | min_filter = possible_zscores_biomarker < this_stage_zscore 364 | max_filter = possible_zscores_biomarker > this_stage_zscore 365 | events = np.array(range(N)) 366 | if np.any(min_filter): 367 | min_zscore_bound = max(possible_zscores_biomarker[min_filter]) 368 | min_zscore_bound_event = events[((self.stage_zscore[0] == min_zscore_bound).astype(int) + ( 369 | self.stage_biomarker_index[0] == selected_biomarker).astype(int)) == 2] 370 | move_event_to_lower_bound = current_location[min_zscore_bound_event] + 1 371 | else: 372 | move_event_to_lower_bound = 0 373 | if np.any(max_filter): 374 | max_zscore_bound = min(possible_zscores_biomarker[max_filter]) 375 | max_zscore_bound_event = events[((self.stage_zscore[0] == max_zscore_bound).astype(int) + ( 376 | self.stage_biomarker_index[0] == selected_biomarker).astype(int)) == 2] 377 | move_event_to_upper_bound = current_location[max_zscore_bound_event] 378 | else: 379 | 
move_event_to_upper_bound = N 380 | if move_event_to_lower_bound == move_event_to_upper_bound: 381 | possible_positions = np.array([0]) 382 | else: 383 | possible_positions = np.arange(move_event_to_lower_bound, move_event_to_upper_bound) 384 | possible_sequences = np.zeros((len(possible_positions), N)) 385 | possible_likelihood = np.full((len(possible_positions), 1), -np.inf) 386 | for index in range(len(possible_positions)): 387 | current_sequence = max_S 388 | # choose a position in the sequence to move an event to 389 | move_event_to = possible_positions[index] 390 | # move this event in its new position 391 | current_sequence = np.delete(current_sequence, move_event_from, 0) 392 | new_sequence = np.concatenate([current_sequence[np.arange(move_event_to)], [selected_event], current_sequence[np.arange(move_event_to, N - 1)]]) 393 | # fit TEBM 394 | self.S = new_sequence 395 | self.means = self.get_means() 396 | self.covars = self.covars_prior 397 | self.fit() 398 | possible_likelihood[index] = self.compute_model_log_likelihood(self.X, self.lengths) 399 | possible_sequences[index, :] = self.S 400 | max_likelihood = max(possible_likelihood) 401 | max_S = possible_sequences[np.where(possible_likelihood == max_likelihood)[0][0]] 402 | if count<(N-1): 403 | print (str(round((count+1)/len(order_bio)*100,2))+'% complete') 404 | return max_S, max_likelihood 405 | 406 | def seq_em(self, S, n_iter, seed_num): 407 | # parse out sequences by seed number 408 | S = np.array(S[seed_num]) 409 | print ('Startpoint',seed_num) 410 | cur_seq = S 411 | cur_like = -np.inf 412 | flag = False 413 | for opt_i in range(int(n_iter)): 414 | print ('EM iteration',opt_i+1) 415 | seq, like = self.optimise_seq(cur_seq) 416 | print ('current', like, seq, 'max', cur_like, cur_seq) 417 | if like-cur_like < 1E-3: 418 | print ('EM converged in',opt_i+1,'iterations') 419 | flag = True 420 | elif like > cur_like: 421 | cur_seq = seq 422 | cur_like = like 423 | if flag: 424 | break 425 | return cur_seq, cur_like 426 | 427 | def fit_tebm(self, n_zscores, z_max, n_start, n_iter, n_cores, cut_controls=False): 428 | # intialise z-score stuff 429 | z_val_arr = np.array([[x+1 for x in range(n_zscores)]]*self.n_features) 430 | z_max_arr = np.array([z_max]*self.n_features) 431 | IX_vals = np.array([[x for x in range(self.n_features)]*n_zscores]).T 432 | stage_biomarker_index = np.array([y for x in IX_vals.T for y in x]) 433 | stage_zscore = np.array([y for x in z_val_arr.T for y in x]) 434 | self.stage_biomarker_index = stage_biomarker_index.reshape(1,len(stage_biomarker_index)) 435 | self.stage_zscore = stage_zscore.reshape(1,len(stage_zscore)) 436 | self.min_biomarker_zscore = [0]*self.n_features 437 | self.max_biomarker_zscore = z_max_arr 438 | self.covars_prior = np.tile(np.identity(1), (self.n_stages, self.n_features)) 439 | # might want to fit sequence without controls 440 | if cut_controls: 441 | print ('Cutting controls from sequence fit!') 442 | X, lengths = [], [] 443 | for i in range(len(self.lengths)): 444 | if labels[i] != 0: 445 | nobs_i = self.lengths[i] 446 | for x in self.X[np.sum(self.lengths[:i]):np.sum(self.lengths[:i])+nobs_i]: 447 | X.append(x) 448 | lengths.append(self.lengths[i]) 449 | self.X = np.array(X) 450 | self.lengths = np.array(lengths) 451 | # do EM 452 | ml_seq_mat = np.zeros((1,self.stage_zscore.shape[1],n_start)) 453 | ml_like_mat = np.zeros(n_start) 454 | if n_cores>1: 455 | pool = pathos.multiprocessing.ProcessingPool() 456 | pool.ncpus = n_cores 457 | else: 458 | # FIXME: serial version doesn't 
work 459 | # pool = pathos.serial.SerialPool() 460 | pool = pathos.multiprocessing.ProcessingPool() 461 | pool.ncpus = n_cores 462 | # instantiate function as class to pass to pool.map 463 | # first calculate array of sequences - do this first or same random number will be used simultaneously on each processor 464 | # will return shape (n_start, 1) 465 | copier = partial(self.init_seq) 466 | # will return shape (n_start, 1) 467 | seq_mat = np.array(pool.map(copier, range(n_start))) 468 | # now optimise 469 | copier = partial(self.seq_em, 470 | seq_mat[:,0], 471 | n_iter) 472 | # will return shape (n_start, 2) 473 | par_mat = np.array(pool.map(copier, range(n_start))) 474 | # distribute to local matrices 475 | for i in range(n_start): 476 | ml_seq_mat[:, :, i] = par_mat[i, 0] 477 | ml_like_mat[i] = par_mat[i, 1] 478 | ix = np.argmax(ml_like_mat) 479 | ml_seq = ml_seq_mat[:, :, ix] 480 | ml_like = ml_like_mat[ix] 481 | # refit model on ML sequence 482 | self.S = ml_seq[0] 483 | self.covars = self.covars_prior 484 | self.means = self.get_means() 485 | self.fit() 486 | return ml_seq 487 | -------------------------------------------------------------------------------- /lib/tebm/cthmm_fix.py: -------------------------------------------------------------------------------- 1 | # Fixed interval CTHMM 2 | # Author: Peter Wijeratne (p.wijeratne@pm.me) 3 | # Adapted from code written for 'hmmlearn' (https://github.com/hmmlearn/hmmlearn) 4 | 5 | import logging 6 | 7 | import numpy as np 8 | from scipy.special import logsumexp 9 | from sklearn import cluster 10 | from sklearn.utils import check_random_state 11 | 12 | from . import _utils 13 | from .stats import log_multivariate_normal_density 14 | from .base_fix import BaseTEBM 15 | from .utils import ( 16 | fill_covars, iter_from_X_lengths, log_mask_zero, log_normalize, normalize) 17 | __all__ = ["MixtureCTHMM", "MultinomialCTHMM", "GMMCTHMM"] 18 | 19 | _log = logging.getLogger(__name__) 20 | COVARIANCE_TYPES = frozenset(("spherical", "diag", "full", "tied")) 21 | 22 | def _check_and_set_gaussian_n_features(model): 23 | _, n_features = model.X.shape 24 | if hasattr(model, "n_features") and model.n_features != n_features: 25 | raise ValueError("Unexpected number of dimensions, got {} but " 26 | "expected {}".format(n_features, model.n_features)) 27 | model.n_features = n_features 28 | 29 | 30 | class GaussianCTHMM(BaseTEBM): 31 | r"""Hidden Markov Model with Gaussian emissions. 32 | 33 | Parameters 34 | ---------- 35 | n_components : int 36 | Number of states. 37 | 38 | covariance_type : string, optional 39 | String describing the type of covariance parameters to 40 | use. Must be one of 41 | 42 | * "spherical" --- each state uses a single variance value that 43 | applies to all features. 44 | * "diag" --- each state uses a diagonal covariance matrix. 45 | * "full" --- each state uses a full (i.e. unrestricted) 46 | covariance matrix. 47 | * "tied" --- all states use **the same** full covariance matrix. 48 | 49 | Defaults to "diag". 50 | 51 | min_covar : float, optional 52 | Floor on the diagonal of the covariance matrix to prevent 53 | overfitting. Defaults to 1e-3. 54 | 55 | startprob_prior : array, shape (n_components, ), optional 56 | Parameters of the Dirichlet prior distribution for 57 | :attr:`startprob_`. 58 | 59 | transmat_prior : array, shape (n_components, n_components), optional 60 | Parameters of the Dirichlet prior distribution for each row 61 | of the transition probabilities :attr:`transmat_`. 
62 | 
63 | means_prior, means_weight : array, shape (n_components, ), optional
64 | Mean and precision of the Normal prior distribution for
65 | :attr:`means_`.
66 | 
67 | covars_prior, covars_weight : array, shape (n_components, ), optional
68 | Parameters of the prior distribution for the covariance matrix
69 | :attr:`covars_`.
70 | 
71 | If :attr:`covariance_type` is "spherical" or "diag" the prior is
72 | the inverse gamma distribution, otherwise --- the inverse Wishart
73 | distribution.
74 | 
75 | algorithm : string, optional
76 | Decoder algorithm. Must be one of "viterbi" or "map".
77 | Defaults to "viterbi".
78 | 
79 | random_state: RandomState or an int seed, optional
80 | A random number generator instance.
81 | 
82 | n_iter : int, optional
83 | Maximum number of iterations to perform.
84 | 
85 | tol : float, optional
86 | Convergence threshold. EM will stop if the gain in log-likelihood
87 | is below this value.
88 | 
89 | verbose : bool, optional
90 | When ``True`` per-iteration convergence reports are printed
91 | to :data:`sys.stderr`. You can diagnose convergence via the
92 | :attr:`monitor_` attribute.
93 | 
94 | params : string, optional
95 | Controls which parameters are updated in the training
96 | process. Can contain any combination of 's' for startprob,
97 | 't' for transmat, 'm' for means and 'c' for covars. Defaults
98 | to all parameters.
99 | 
100 | init_params : string, optional
101 | Controls which parameters are initialized prior to
102 | training. Can contain any combination of 's' for
103 | startprob, 't' for transmat, 'm' for means and 'c' for covars.
104 | Defaults to all parameters.
105 | 
106 | Attributes
107 | ----------
108 | n_features : int
109 | Dimensionality of the Gaussian emissions.
110 | 
111 | monitor\_ : ConvergenceMonitor
112 | Monitor object used to check the convergence of EM.
113 | 
114 | startprob\_ : array, shape (n_components, )
115 | Initial state occupation distribution.
116 | 
117 | transmat\_ : array, shape (n_components, n_components)
118 | Matrix of transition probabilities between states.
119 | 
120 | means\_ : array, shape (n_components, n_features)
121 | Mean parameters for each state.
122 | 
123 | covars\_ : array
124 | Covariance parameters for each state.
125 | 
126 | The shape depends on :attr:`covariance_type`::
127 | 
128 | (n_components, ) if "spherical",
129 | (n_components, n_features) if "diag",
130 | (n_components, n_features, n_features) if "full"
131 | (n_features, n_features) if "tied",
132 | 
133 | Examples
134 | --------
135 | >>> from tebm.cthmm_fix import GaussianCTHMM
136 | >>> GaussianCTHMM(n_components=2) #doctest: +ELLIPSIS
137 | GaussianCTHMM(algorithm='viterbi',...
138 | """ 139 | def __init__(self, X=None, lengths=None, 140 | n_components=1, startprob_prior=None, transmat_prior=None, 141 | means_prior=0, means_weight=0, covars_prior=1e-2, covars_weight=1, covariance_type='diag', min_covar=1e-3, 142 | algorithm="viterbi", random_state=None, n_iter=10, 143 | tol=1e-2, verbose=False, params="st", 144 | init_params="st", allow_nan=False): 145 | BaseTEBM.__init__(self, X=X, lengths=lengths, 146 | n_components=n_components, startprob_prior=startprob_prior, transmat_prior=transmat_prior, 147 | algorithm=algorithm, random_state=random_state, n_iter=n_iter, 148 | tol=tol, verbose=verbose, params=params, 149 | init_params=init_params, allow_nan=allow_nan) 150 | self.covariance_type = covariance_type 151 | self.min_covar = min_covar 152 | self.means_prior = means_prior 153 | self.means_weight = means_weight 154 | self.covars_prior = covars_prior 155 | self.covars_weight = covars_weight 156 | 157 | @property 158 | def covars_(self): 159 | """Return covars as a full matrix.""" 160 | return fill_covars(self._covars_, self.covariance_type, 161 | self.n_components, self.n_features) 162 | 163 | @covars_.setter 164 | def covars_(self, covars): 165 | covars = np.array(covars, copy=True) 166 | _utils._validate_covars(covars, self.covariance_type, 167 | self.n_components) 168 | self._covars_ = covars 169 | """ 170 | def _check(self): 171 | super()._check() 172 | 173 | self.means_ = np.asarray(self.means_) 174 | self.n_features = self.means_.shape[1] 175 | 176 | if self.covariance_type not in COVARIANCE_TYPES: 177 | raise ValueError('covariance_type must be one of {}' 178 | .format(COVARIANCE_TYPES)) 179 | """ 180 | def _get_n_fit_scalars_per_param(self): 181 | nc = self.n_components 182 | nf = self.n_features 183 | return { 184 | "s": nc - 1, 185 | "t": nc * (nc - 1), 186 | "m": nc * nf, 187 | "c": { 188 | "spherical": nc, 189 | "diag": nc * nf, 190 | "full": nc * nf * (nf + 1) // 2, 191 | "tied": nf * (nf + 1) // 2, 192 | }[self.covariance_type], 193 | } 194 | 195 | def _init(self, X, lengths=None): 196 | _check_and_set_gaussian_n_features(self) 197 | super()._init(X, lengths=lengths) 198 | 199 | if 'm' in self.init_params: 200 | kmeans = cluster.KMeans(n_clusters=self.n_components, 201 | random_state=self.random_state) 202 | kmeans.fit(X) 203 | self.means_ = kmeans.cluster_centers_ 204 | if 'c' in self.init_params: 205 | cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1]) 206 | if not cv.shape: 207 | cv.shape = (1, 1) 208 | self.covars_ = \ 209 | _utils.distribute_covar_matrix_to_match_covariance_type( 210 | cv, self.covariance_type, self.n_components).copy() 211 | 212 | def _compute_log_likelihood(self, X, i, j): 213 | return log_multivariate_normal_density( 214 | self.X[i:j], self.means_, self._covars_, self.covariance_type) 215 | 216 | def predict(self, X, lengths=None): 217 | # FIXME: is there a general way of doing this? 218 | self.X = X 219 | self.lengths = lengths 220 | logprob, state_sequence = self.decode(X, lengths) 221 | return state_sequence, logprob 222 | 223 | def predict_proba(self, X, lengths=None): 224 | # FIXME: is there a general way of doing this? 
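# (editor's note) The base-class scoring / E-step code reads the observations
# from self.X and self.lengths rather than taking them as arguments (see
# _compute_log_likelihood above, which slices self.X), so both are overwritten
# here before scoring -- the same pattern as in predict() above.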
225 | self.X = X 226 | self.lengths = lengths 227 | #### 228 | _, posteriors = self.score_samples(X, lengths) 229 | return posteriors 230 | 231 | def _generate_sample_from_state(self, state, random_state=None): 232 | random_state = check_random_state(random_state) 233 | return random_state.multivariate_normal( 234 | self.means_[state], self.covars_[state] 235 | ) 236 | 237 | def _initialize_sufficient_statistics(self): 238 | stats = super()._initialize_sufficient_statistics() 239 | stats['post'] = np.zeros(self.n_components) 240 | stats['obs'] = np.zeros((self.n_components, self.n_features)) 241 | stats['obs**2'] = np.zeros((self.n_components, self.n_features)) 242 | if self.covariance_type in ('tied', 'full'): 243 | stats['obs*obs.T'] = np.zeros((self.n_components, self.n_features, 244 | self.n_features)) 245 | return stats 246 | 247 | def _accumulate_sufficient_statistics(self, stats, obs, framelogprob, posteriors, 248 | fwdlattice, bwdlattice): 249 | super()._accumulate_sufficient_statistics( 250 | stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice) 251 | 252 | if 'm' in self.params or 'c' in self.params: 253 | stats['post'] += posteriors.sum(axis=0) 254 | stats['obs'] += np.dot(posteriors.T, obs) 255 | 256 | if 'c' in self.params: 257 | if self.covariance_type in ('spherical', 'diag'): 258 | stats['obs**2'] += np.dot(posteriors.T, obs ** 2) 259 | elif self.covariance_type in ('tied', 'full'): 260 | # posteriors: (nt, nc); obs: (nt, nf); obs: (nt, nf) 261 | # -> (nc, nf, nf) 262 | stats['obs*obs.T'] += np.einsum( 263 | 'ij,ik,il->jkl', posteriors, obs, obs) 264 | 265 | def _do_mstep(self, stats): 266 | super()._do_mstep(stats) 267 | 268 | means_prior = self.means_prior 269 | means_weight = self.means_weight 270 | 271 | # TODO: find a proper reference for estimates for different 272 | # covariance models. 273 | # Based on Huang, Acero, Hon, "Spoken Language Processing", 274 | # p. 
443 - 445 275 | denom = stats['post'][:, np.newaxis] 276 | if 'm' in self.params: 277 | self.means_ = ((means_weight * means_prior + stats['obs']) 278 | / (means_weight + denom)) 279 | 280 | if 'c' in self.params: 281 | covars_prior = self.covars_prior 282 | covars_weight = self.covars_weight 283 | meandiff = self.means_ - means_prior 284 | 285 | if self.covariance_type in ('spherical', 'diag'): 286 | cv_num = (means_weight * meandiff**2 287 | + stats['obs**2'] 288 | - 2 * self.means_ * stats['obs'] 289 | + self.means_**2 * denom) 290 | cv_den = max(covars_weight - 1, 0) + denom 291 | self._covars_ = \ 292 | (covars_prior + cv_num) / np.maximum(cv_den, 1e-5) 293 | if self.covariance_type == 'spherical': 294 | self._covars_ = np.tile( 295 | self._covars_.mean(1)[:, np.newaxis], 296 | (1, self._covars_.shape[1])) 297 | elif self.covariance_type in ('tied', 'full'): 298 | cv_num = np.empty((self.n_components, self.n_features, 299 | self.n_features)) 300 | for c in range(self.n_components): 301 | obsmean = np.outer(stats['obs'][c], self.means_[c]) 302 | 303 | cv_num[c] = (means_weight * np.outer(meandiff[c], 304 | meandiff[c]) 305 | + stats['obs*obs.T'][c] 306 | - obsmean - obsmean.T 307 | + np.outer(self.means_[c], self.means_[c]) 308 | * stats['post'][c]) 309 | cvweight = max(covars_weight - self.n_features, 0) 310 | if self.covariance_type == 'tied': 311 | self._covars_ = ((covars_prior + cv_num.sum(axis=0)) / 312 | (cvweight + stats['post'].sum())) 313 | elif self.covariance_type == 'full': 314 | self._covars_ = ((covars_prior + cv_num) / 315 | (cvweight + stats['post'][:, None, None])) 316 | 317 | class MultinomialCTHMM(BaseTEBM): 318 | r"""Hidden Markov Model with multinomial (discrete) emissions 319 | 320 | Parameters 321 | ---------- 322 | 323 | n_components : int 324 | Number of states. 325 | 326 | startprob_prior : array, shape (n_components, ), optional 327 | Parameters of the Dirichlet prior distribution for 328 | :attr:`startprob_`. 329 | 330 | transmat_prior : array, shape (n_components, n_components), optional 331 | Parameters of the Dirichlet prior distribution for each row 332 | of the transition probabilities :attr:`transmat_`. 333 | 334 | algorithm : string, optional 335 | Decoder algorithm. Must be one of "viterbi" or "map". 336 | Defaults to "viterbi". 337 | 338 | random_state: RandomState or an int seed, optional 339 | A random number generator instance. 340 | 341 | n_iter : int, optional 342 | Maximum number of iterations to perform. 343 | 344 | tol : float, optional 345 | Convergence threshold. EM will stop if the gain in log-likelihood 346 | is below this value. 347 | 348 | verbose : bool, optional 349 | When ``True`` per-iteration convergence reports are printed 350 | to :data:`sys.stderr`. You can diagnose convergence via the 351 | :attr:`monitor_` attribute. 352 | 353 | params : string, optional 354 | Controls which parameters are updated in the training 355 | process. Can contain any combination of 's' for startprob, 356 | 't' for transmat, 'e' for emissionprob. 357 | Defaults to all parameters. 358 | 359 | init_params : string, optional 360 | Controls which parameters are initialized prior to 361 | training. Can contain any combination of 's' for 362 | startprob, 't' for transmat, 'e' for emissionprob. 363 | Defaults to all parameters. 364 | 365 | Attributes 366 | ---------- 367 | n_features : int 368 | Number of possible symbols emitted by the model (in the samples). 
369 | 370 | monitor\_ : ConvergenceMonitor 371 | Monitor object used to check the convergence of EM. 372 | 373 | startprob\_ : array, shape (n_components, ) 374 | Initial state occupation distribution. 375 | 376 | transmat\_ : array, shape (n_components, n_components) 377 | Matrix of transition probabilities between states. 378 | 379 | emissionprob\_ : array, shape (n_components, n_features) 380 | Probability of emitting a given symbol when in each state. 381 | 382 | Examples 383 | -------- 384 | >>> from tebm.tebm import MultinomialTEBM 385 | >>> MultinomialTEBM(n_components=2) #doctest: +ELLIPSIS 386 | MultinomialTEBM(algorithm='viterbi',... 387 | """ 388 | # TODO: accept the prior on emissionprob_ for consistency. 389 | def __init__(self, n_components=1, 390 | startprob_prior=1.0, transmat_prior=1.0, 391 | algorithm="viterbi", random_state=None, 392 | n_iter=10, tol=1e-2, verbose=False, 393 | params="ste", init_params="ste"): 394 | BaseTEBM.__init__(self, n_components, 395 | startprob_prior=startprob_prior, 396 | transmat_prior=transmat_prior, 397 | algorithm=algorithm, 398 | random_state=random_state, 399 | n_iter=n_iter, tol=tol, verbose=verbose, 400 | params=params, init_params=init_params) 401 | 402 | def _get_n_fit_scalars_per_param(self): 403 | nc = self.n_components 404 | nf = self.n_features 405 | return { 406 | "s": nc - 1, 407 | "t": nc * (nc - 1), 408 | "e": nc * (nf - 1), 409 | } 410 | 411 | def _init(self, X, lengths=None): 412 | self._check_and_set_n_features(X) 413 | super()._init(X, lengths=lengths) 414 | self.random_state = check_random_state(self.random_state) 415 | 416 | if 'e' in self.init_params: 417 | self.emissionprob_ = self.random_state \ 418 | .rand(self.n_components, self.n_features) 419 | normalize(self.emissionprob_, axis=1) 420 | 421 | def _check(self): 422 | super()._check() 423 | 424 | self.emissionprob_ = np.atleast_2d(self.emissionprob_) 425 | n_features = getattr(self, "n_features", self.emissionprob_.shape[1]) 426 | if self.emissionprob_.shape != (self.n_components, n_features): 427 | raise ValueError( 428 | "emissionprob_ must have shape (n_components, n_features)") 429 | else: 430 | self.n_features = n_features 431 | 432 | def _compute_log_likelihood(self, X): 433 | return log_mask_zero(self.emissionprob_)[:, np.concatenate(X)].T 434 | 435 | def _generate_sample_from_state(self, state, random_state=None): 436 | cdf = np.cumsum(self.emissionprob_[state, :]) 437 | random_state = check_random_state(random_state) 438 | return [(cdf > random_state.rand()).argmax()] 439 | 440 | def _initialize_sufficient_statistics(self): 441 | stats = super()._initialize_sufficient_statistics() 442 | stats['obs'] = np.zeros((self.n_components, self.n_features)) 443 | return stats 444 | 445 | def _accumulate_sufficient_statistics(self, stats, X, framelogprob, 446 | posteriors, fwdlattice, bwdlattice): 447 | super()._accumulate_sufficient_statistics( 448 | stats, X, framelogprob, posteriors, fwdlattice, bwdlattice) 449 | if 'e' in self.params: 450 | for t, symbol in enumerate(np.concatenate(X)): 451 | stats['obs'][:, symbol] += posteriors[t] 452 | 453 | def _do_mstep(self, stats): 454 | super()._do_mstep(stats) 455 | if 'e' in self.params: 456 | self.emissionprob_ = (stats['obs'] 457 | / stats['obs'].sum(axis=1)[:, np.newaxis]) 458 | 459 | def _check_and_set_n_features(self, X): 460 | """ 461 | Check if ``X`` is a sample from a Multinomial distribution, i.e. an 462 | array of non-negative integers. 
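For example (editor's illustration): ``X = np.array([[0], [2], [1]])`` is a
valid input and implies ``n_features = X.max() + 1 = 3``.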
463 | """ 464 | if not np.issubdtype(X.dtype, np.integer): 465 | raise ValueError("Symbols should be integers") 466 | if X.min() < 0: 467 | raise ValueError("Symbols should be nonnegative") 468 | if hasattr(self, "n_features"): 469 | if self.n_features - 1 < X.max(): 470 | raise ValueError( 471 | "Largest symbol is {} but the model only emits " 472 | "symbols up to {}" 473 | .format(X.max(), self.n_features - 1)) 474 | self.n_features = X.max() + 1 475 | 476 | 477 | class GMMCTHMM(BaseTEBM): 478 | r"""Hidden Markov Model with Gaussian mixture emissions. 479 | 480 | Parameters 481 | ---------- 482 | n_components : int 483 | Number of states in the model. 484 | 485 | n_mix : int 486 | Number of states in the GMM. 487 | 488 | covariance_type : string, optional 489 | String describing the type of covariance parameters to 490 | use. Must be one of 491 | 492 | * "spherical" --- each state uses a single variance value that 493 | applies to all features. 494 | * "diag" --- each state uses a diagonal covariance matrix. 495 | * "full" --- each state uses a full (i.e. unrestricted) 496 | covariance matrix. 497 | * "tied" --- all mixture components of each state use **the same** full 498 | covariance matrix (note that this is not the same as for 499 | `MixtureTEBM`). 500 | 501 | Defaults to "diag". 502 | 503 | min_covar : float, optional 504 | Floor on the diagonal of the covariance matrix to prevent 505 | overfitting. Defaults to 1e-3. 506 | 507 | startprob_prior : array, shape (n_components, ), optional 508 | Parameters of the Dirichlet prior distribution for 509 | :attr:`startprob_`. 510 | 511 | transmat_prior : array, shape (n_components, n_components), optional 512 | Parameters of the Dirichlet prior distribution for each row 513 | of the transition probabilities :attr:`transmat_`. 514 | 515 | weights_prior : array, shape (n_mix, ), optional 516 | Parameters of the Dirichlet prior distribution for 517 | :attr:`weights_`. 518 | 519 | means_prior, means_weight : array, shape (n_mix, ), optional 520 | Mean and precision of the Normal prior distribtion for 521 | :attr:`means_`. 522 | 523 | covars_prior, covars_weight : array, shape (n_mix, ), optional 524 | Parameters of the prior distribution for the covariance matrix 525 | :attr:`covars_`. 526 | 527 | If :attr:`covariance_type` is "spherical" or "diag" the prior is 528 | the inverse gamma distribution, otherwise --- the inverse Wishart 529 | distribution. 530 | 531 | algorithm : string, optional 532 | Decoder algorithm. Must be one of "viterbi" or "map". 533 | Defaults to "viterbi". 534 | 535 | random_state: RandomState or an int seed, optional 536 | A random number generator instance. 537 | 538 | n_iter : int, optional 539 | Maximum number of iterations to perform. 540 | 541 | tol : float, optional 542 | Convergence threshold. EM will stop if the gain in log-likelihood 543 | is below this value. 544 | 545 | verbose : bool, optional 546 | When ``True`` per-iteration convergence reports are printed 547 | to :data:`sys.stderr`. You can diagnose convergence via the 548 | :attr:`monitor_` attribute. 549 | 550 | init_params : string, optional 551 | Controls which parameters are initialized prior to training. Can 552 | contain any combination of 's' for startprob, 't' for transmat, 'm' 553 | for means, 'c' for covars, and 'w' for GMM mixing weights. 554 | Defaults to all parameters. 555 | 556 | params : string, optional 557 | Controls which parameters are updated in the training process. 
Can 558 | contain any combination of 's' for startprob, 't' for transmat, 'm' for 559 | means, and 'c' for covars, and 'w' for GMM mixing weights. 560 | Defaults to all parameters. 561 | 562 | Attributes 563 | ---------- 564 | monitor\_ : ConvergenceMonitor 565 | Monitor object used to check the convergence of EM. 566 | 567 | startprob\_ : array, shape (n_components, ) 568 | Initial state occupation distribution. 569 | 570 | transmat\_ : array, shape (n_components, n_components) 571 | Matrix of transition probabilities between states. 572 | 573 | weights\_ : array, shape (n_components, n_mix) 574 | Mixture weights for each state. 575 | 576 | means\_ : array, shape (n_components, n_mix) 577 | Mean parameters for each mixture component in each state. 578 | 579 | covars\_ : array 580 | Covariance parameters for each mixture components in each state. 581 | 582 | The shape depends on :attr:`covariance_type`:: 583 | 584 | (n_components, n_mix) if "spherical", 585 | (n_components, n_mix, n_features) if "diag", 586 | (n_components, n_mix, n_features, n_features) if "full" 587 | (n_components, n_features, n_features) if "tied", 588 | """ 589 | 590 | def __init__(self, n_components=1, n_mix=1, 591 | min_covar=1e-3, startprob_prior=1.0, transmat_prior=1.0, 592 | weights_prior=1.0, means_prior=0.0, means_weight=0.0, 593 | covars_prior=None, covars_weight=None, 594 | algorithm="viterbi", covariance_type="diag", 595 | random_state=None, n_iter=10, tol=1e-2, 596 | verbose=False, params="stmcw", 597 | init_params="stmcw"): 598 | BaseTEBM.__init__(self, n_components, 599 | startprob_prior=startprob_prior, 600 | transmat_prior=transmat_prior, 601 | algorithm=algorithm, random_state=random_state, 602 | n_iter=n_iter, tol=tol, verbose=verbose, 603 | params=params, init_params=init_params) 604 | self.covariance_type = covariance_type 605 | self.min_covar = min_covar 606 | self.n_mix = n_mix 607 | self.weights_prior = weights_prior 608 | self.means_prior = means_prior 609 | self.means_weight = means_weight 610 | self.covars_prior = covars_prior 611 | self.covars_weight = covars_weight 612 | 613 | def _get_n_fit_scalars_per_param(self): 614 | nc = self.n_components 615 | nf = self.n_features 616 | nm = self.n_mix 617 | return { 618 | "s": nc - 1, 619 | "t": nc * (nc - 1), 620 | "m": nc * nm * nf, 621 | "c": { 622 | "spherical": nc * nm, 623 | "diag": nc * nm * nf, 624 | "full": nc * nm * nf * (nf + 1) // 2, 625 | "tied": nc * nf * (nf + 1) // 2, 626 | }[self.covariance_type], 627 | "w": nm - 1, 628 | } 629 | 630 | def _init(self, X, lengths=None): 631 | _check_and_set_gaussian_n_features(self) 632 | super()._init(X, lengths=lengths) 633 | nc = self.n_components 634 | nf = self.n_features 635 | nm = self.n_mix 636 | 637 | # Default values for covariance prior parameters 638 | self._init_covar_priors() 639 | self._fix_priors_shape() 640 | 641 | main_kmeans = cluster.KMeans(n_clusters=nc, 642 | random_state=self.random_state) 643 | labels = main_kmeans.fit_predict(X) 644 | kmeanses = [] 645 | for label in range(nc): 646 | kmeans = cluster.KMeans(n_clusters=nm, 647 | random_state=self.random_state) 648 | kmeans.fit(X[np.where(labels == label)]) 649 | kmeanses.append(kmeans) 650 | 651 | if 'w' in self.init_params or not hasattr(self, "weights_"): 652 | self.weights_ = np.ones((nc, nm)) / (np.ones((nc, 1)) * nm) 653 | 654 | if 'm' in self.init_params or not hasattr(self, "means_"): 655 | self.means_ = np.stack( 656 | [kmeans.cluster_centers_ for kmeans in kmeanses]) 657 | 658 | if 'c' in self.init_params or not 
hasattr(self, "covars_"): 659 | cv = np.cov(X.T) + self.min_covar * np.eye(nf) 660 | if not cv.shape: 661 | cv.shape = (1, 1) 662 | if self.covariance_type == 'tied': 663 | self.covars_ = np.zeros((nc, nf, nf)) 664 | self.covars_[:] = cv 665 | elif self.covariance_type == 'full': 666 | self.covars_ = np.zeros((nc, nm, nf, nf)) 667 | self.covars_[:] = cv 668 | elif self.covariance_type == 'diag': 669 | self.covars_ = np.zeros((nc, nm, nf)) 670 | self.covars_[:] = np.diag(cv) 671 | elif self.covariance_type == 'spherical': 672 | self.covars_ = np.zeros((nc, nm)) 673 | self.covars_[:] = cv.mean() 674 | 675 | def _init_covar_priors(self): 676 | if self.covariance_type == "full": 677 | if self.covars_prior is None: 678 | self.covars_prior = 0.0 679 | if self.covars_weight is None: 680 | self.covars_weight = -(1.0 + self.n_features + 1.0) 681 | elif self.covariance_type == "tied": 682 | if self.covars_prior is None: 683 | self.covars_prior = 0.0 684 | if self.covars_weight is None: 685 | self.covars_weight = -(self.n_mix + self.n_features + 1.0) 686 | elif self.covariance_type == "diag": 687 | if self.covars_prior is None: 688 | self.covars_prior = -1.5 689 | if self.covars_weight is None: 690 | self.covars_weight = 0.0 691 | elif self.covariance_type == "spherical": 692 | if self.covars_prior is None: 693 | self.covars_prior = -(self.n_mix + 2.0) / 2.0 694 | if self.covars_weight is None: 695 | self.covars_weight = 0.0 696 | 697 | def _fix_priors_shape(self): 698 | nc = self.n_components 699 | nf = self.n_features 700 | nm = self.n_mix 701 | 702 | # If priors are numbers, this function will make them into a 703 | # matrix of proper shape 704 | self.weights_prior = np.broadcast_to( 705 | self.weights_prior, (nc, nm)).copy() 706 | self.means_prior = np.broadcast_to( 707 | self.means_prior, (nc, nm, nf)).copy() 708 | self.means_weight = np.broadcast_to( 709 | self.means_weight, (nc, nm)).copy() 710 | 711 | if self.covariance_type == "full": 712 | self.covars_prior = np.broadcast_to( 713 | self.covars_prior, (nc, nm, nf, nf)).copy() 714 | self.covars_weight = np.broadcast_to( 715 | self.covars_weight, (nc, nm)).copy() 716 | elif self.covariance_type == "tied": 717 | self.covars_prior = np.broadcast_to( 718 | self.covars_prior, (nc, nf, nf)).copy() 719 | self.covars_weight = np.broadcast_to( 720 | self.covars_weight, nc).copy() 721 | elif self.covariance_type == "diag": 722 | self.covars_prior = np.broadcast_to( 723 | self.covars_prior, (nc, nm, nf)).copy() 724 | self.covars_weight = np.broadcast_to( 725 | self.covars_weight, (nc, nm, nf)).copy() 726 | elif self.covariance_type == "spherical": 727 | self.covars_prior = np.broadcast_to( 728 | self.covars_prior, (nc, nm)).copy() 729 | self.covars_weight = np.broadcast_to( 730 | self.covars_weight, (nc, nm)).copy() 731 | 732 | def _check(self): 733 | super()._check() 734 | if not hasattr(self, "n_features"): 735 | self.n_features = self.means_.shape[2] 736 | nc = self.n_components 737 | nf = self.n_features 738 | nm = self.n_mix 739 | 740 | self._init_covar_priors() 741 | self._fix_priors_shape() 742 | 743 | # Checking covariance type 744 | if self.covariance_type not in COVARIANCE_TYPES: 745 | raise ValueError("covariance_type must be one of {}" 746 | .format(COVARIANCE_TYPES)) 747 | 748 | self.weights_ = np.array(self.weights_) 749 | # Checking mixture weights' shape 750 | if self.weights_.shape != (nc, nm): 751 | raise ValueError("mixture weights must have shape " 752 | "(n_components, n_mix), actual shape: {}" 753 | .format(self.weights_.shape)) 
754 | 755 | # Checking mixture weights' mathematical correctness 756 | if not np.allclose(np.sum(self.weights_, axis=1), np.ones(nc)): 757 | raise ValueError("mixture weights must sum up to 1") 758 | 759 | # Checking means' shape 760 | self.means_ = np.array(self.means_) 761 | if self.means_.shape != (nc, nm, nf): 762 | raise ValueError("mixture means must have shape " 763 | "(n_components, n_mix, n_features), " 764 | "actual shape: {}".format(self.means_.shape)) 765 | 766 | # Checking covariances' shape 767 | self.covars_ = np.array(self.covars_) 768 | covars_shape = self.covars_.shape 769 | needed_shapes = { 770 | "spherical": (nc, nm), 771 | "tied": (nc, nf, nf), 772 | "diag": (nc, nm, nf), 773 | "full": (nc, nm, nf, nf), 774 | } 775 | needed_shape = needed_shapes[self.covariance_type] 776 | if covars_shape != needed_shape: 777 | raise ValueError("{!r} mixture covars must have shape {}, " 778 | "actual shape: {}" 779 | .format(self.covariance_type, 780 | needed_shape, covars_shape)) 781 | 782 | # Checking covariances' mathematical correctness 783 | from scipy import linalg 784 | 785 | if (self.covariance_type == "spherical" or 786 | self.covariance_type == "diag"): 787 | if np.any(self.covars_ < 0): 788 | raise ValueError("{!r} mixture covars must be non-negative" 789 | .format(self.covariance_type)) 790 | if np.any(self.covars_ == 0): 791 | _log.warning("Degenerate mixture covariance") 792 | elif self.covariance_type == "tied": 793 | for i, covar in enumerate(self.covars_): 794 | if not np.allclose(covar, covar.T): 795 | raise ValueError("Covariance of state #{} is not symmetric" 796 | .format(i)) 797 | min_eigvalsh = np.linalg.eigvalsh(covar).min() 798 | if min_eigvalsh < 0: 799 | raise ValueError("Covariance of state #{} is not positive " 800 | "definite".format(i)) 801 | if min_eigvalsh == 0: 802 | _log.warning("Covariance of state #%d has a null " 803 | "eigenvalue.", i) 804 | elif self.covariance_type == "full": 805 | for i, mix_covars in enumerate(self.covars_): 806 | for j, covar in enumerate(mix_covars): 807 | if not np.allclose(covar, covar.T): 808 | raise ValueError( 809 | "Covariance of state #{}, mixture #{} is not " 810 | "symmetric".format(i, j)) 811 | min_eigvalsh = np.linalg.eigvalsh(covar).min() 812 | if min_eigvalsh < 0: 813 | raise ValueError( 814 | "Covariance of state #{}, mixture #{} is not " 815 | "positive definite".format(i, j)) 816 | if min_eigvalsh == 0: 817 | _log.warning("Covariance of state #%d, mixture #%d " 818 | "has a null eigenvalue.", i, j) 819 | 820 | def _generate_sample_from_state(self, state, random_state=None): 821 | if random_state is None: 822 | random_state = self.random_state 823 | random_state = check_random_state(random_state) 824 | 825 | cur_weights = self.weights_[state] 826 | i_gauss = random_state.choice(self.n_mix, p=cur_weights) 827 | if self.covariance_type == 'tied': 828 | # self.covars_.shape == (n_components, n_features, n_features) 829 | # shouldn't that be (n_mix, ...)? 
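            # Note: for "tied" covariances, all mixture components of a state
            # share one full (n_features, n_features) matrix, so covars_ has
            # shape (n_components, n_features, n_features) (see _check above)
            # and covs[state] below already selects the right matrix without
            # a mixture index.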
830 | covs = self.covars_ 831 | else: 832 | covs = self.covars_[:, i_gauss] 833 | covs = fill_covars(covs, self.covariance_type, 834 | self.n_components, self.n_features) 835 | return random_state.multivariate_normal( 836 | self.means_[state, i_gauss], covs[state] 837 | ) 838 | 839 | def _compute_log_weighted_gaussian_densities(self, X, i_comp): 840 | cur_means = self.means_[i_comp] 841 | cur_covs = self.covars_[i_comp] 842 | if self.covariance_type == 'spherical': 843 | cur_covs = cur_covs[:, np.newaxis] 844 | log_cur_weights = np.log(self.weights_[i_comp]) 845 | 846 | return log_multivariate_normal_density( 847 | X, cur_means, cur_covs, self.covariance_type 848 | ) + log_cur_weights 849 | 850 | def _compute_log_likelihood(self, X): 851 | n_samples, _ = X.shape 852 | res = np.zeros((n_samples, self.n_components)) 853 | 854 | for i in range(self.n_components): 855 | log_denses = self._compute_log_weighted_gaussian_densities(X, i) 856 | with np.errstate(under="ignore"): 857 | res[:, i] = logsumexp(log_denses, axis=1) 858 | 859 | return res 860 | 861 | def _initialize_sufficient_statistics(self): 862 | stats = super()._initialize_sufficient_statistics() 863 | stats['n_samples'] = 0 864 | stats['post_comp_mix'] = None 865 | stats['post_mix_sum'] = np.zeros((self.n_components, self.n_mix)) 866 | stats['post_sum'] = np.zeros(self.n_components) 867 | stats['samples'] = None 868 | stats['centered'] = None 869 | return stats 870 | 871 | def _accumulate_sufficient_statistics(self, stats, X, framelogprob, 872 | post_comp, fwdlattice, bwdlattice): 873 | 874 | # TODO: support multiple frames 875 | 876 | super()._accumulate_sufficient_statistics( 877 | stats, X, framelogprob, post_comp, fwdlattice, bwdlattice 878 | ) 879 | 880 | n_samples, _ = X.shape 881 | 882 | stats['n_samples'] = n_samples 883 | stats['samples'] = X 884 | 885 | post_mix = np.zeros((n_samples, self.n_components, self.n_mix)) 886 | for p in range(self.n_components): 887 | log_denses = self._compute_log_weighted_gaussian_densities(X, p) 888 | log_normalize(log_denses, axis=-1) 889 | with np.errstate(under="ignore"): 890 | post_mix[:, p, :] = np.exp(log_denses) 891 | 892 | with np.errstate(under="ignore"): 893 | post_comp_mix = post_comp[:, :, np.newaxis] * post_mix 894 | stats['post_comp_mix'] = post_comp_mix 895 | 896 | stats['post_mix_sum'] = np.sum(post_comp_mix, axis=0) 897 | stats['post_sum'] = np.sum(post_comp, axis=0) 898 | 899 | stats['centered'] = X[:, np.newaxis, np.newaxis, :] - self.means_ 900 | 901 | def _do_mstep(self, stats): 902 | super()._do_mstep(stats) 903 | nc = self.n_components 904 | nf = self.n_features 905 | nm = self.n_mix 906 | 907 | n_samples = stats['n_samples'] 908 | 909 | # Maximizing weights 910 | alphas_minus_one = self.weights_prior - 1 911 | new_weights_numer = stats['post_mix_sum'] + alphas_minus_one 912 | new_weights_denom = ( 913 | stats['post_sum'] + np.sum(alphas_minus_one, axis=1) 914 | )[:, np.newaxis] 915 | new_weights = new_weights_numer / new_weights_denom 916 | 917 | # Maximizing means 918 | lambdas, mus = self.means_weight, self.means_prior 919 | new_means_numer = ( 920 | np.einsum('ijk,il->jkl', stats['post_comp_mix'], stats['samples']) 921 | + lambdas[:, :, np.newaxis] * mus 922 | ) 923 | new_means_denom = (stats['post_mix_sum'] + lambdas)[:, :, np.newaxis] 924 | new_means = new_means_numer / new_means_denom 925 | 926 | # Maximizing covariances 927 | centered_means = self.means_ - mus 928 | 929 | if self.covariance_type == 'full': 930 | centered = stats['centered'].reshape((n_samples, 
nc, nm, nf, 1)) 931 | centered_t = stats['centered'].reshape((n_samples, nc, nm, 1, nf)) 932 | centered_dots = centered * centered_t 933 | 934 | psis_t = np.transpose(self.covars_prior, axes=(0, 1, 3, 2)) 935 | nus = self.covars_weight 936 | 937 | centr_means_resh = centered_means.reshape((nc, nm, nf, 1)) 938 | centr_means_resh_t = centered_means.reshape((nc, nm, 1, nf)) 939 | centered_means_dots = centr_means_resh * centr_means_resh_t 940 | 941 | new_cov_numer = ( 942 | np.einsum( 943 | 'ijk,ijklm->jklm', stats['post_comp_mix'], centered_dots) 944 | + psis_t 945 | + lambdas[:, :, np.newaxis, np.newaxis] * centered_means_dots 946 | ) 947 | new_cov_denom = ( 948 | stats['post_mix_sum'] + 1 + nus + nf + 1 949 | )[:, :, np.newaxis, np.newaxis] 950 | new_cov = new_cov_numer / new_cov_denom 951 | 952 | elif self.covariance_type == 'diag': 953 | centered2 = stats['centered'] ** 2 954 | centered_means2 = centered_means ** 2 955 | 956 | alphas = self.covars_prior 957 | betas = self.covars_weight 958 | 959 | new_cov_numer = ( 960 | np.einsum('ijk,ijkl->jkl', stats['post_comp_mix'], centered2) 961 | + lambdas[:, :, np.newaxis] * centered_means2 962 | + 2 * betas 963 | ) 964 | new_cov_denom = ( 965 | stats['post_mix_sum'][:, :, np.newaxis] + 1 + 2 * (alphas + 1) 966 | ) 967 | new_cov = new_cov_numer / new_cov_denom 968 | 969 | elif self.covariance_type == 'spherical': 970 | centered_norm2 = np.sum(stats['centered'] ** 2, axis=-1) 971 | 972 | alphas = self.covars_prior 973 | betas = self.covars_weight 974 | 975 | centered_means_norm2 = np.sum(centered_means ** 2, axis=-1) 976 | 977 | new_cov_numer = ( 978 | np.einsum( 979 | 'ijk,ijk->jk', stats['post_comp_mix'], centered_norm2) 980 | + lambdas * centered_means_norm2 981 | + 2 * betas 982 | ) 983 | new_cov_denom = nf * (stats['post_mix_sum'] + 1) + 2 * (alphas + 1) 984 | new_cov = new_cov_numer / new_cov_denom 985 | 986 | elif self.covariance_type == 'tied': 987 | centered = stats['centered'].reshape((n_samples, nc, nm, nf, 1)) 988 | centered_t = stats['centered'].reshape((n_samples, nc, nm, 1, nf)) 989 | centered_dots = centered * centered_t 990 | 991 | psis_t = np.transpose(self.covars_prior, axes=(0, 2, 1)) 992 | nus = self.covars_weight 993 | 994 | centr_means_resh = centered_means.reshape((nc, nm, nf, 1)) 995 | centr_means_resh_t = centered_means.reshape((nc, nm, 1, nf)) 996 | centered_means_dots = centr_means_resh * centr_means_resh_t 997 | 998 | lambdas_cmdots_prod_sum = ( 999 | np.einsum('ij,ijkl->ikl', lambdas, centered_means_dots)) 1000 | 1001 | new_cov_numer = ( 1002 | np.einsum( 1003 | 'ijk,ijklm->jlm', stats['post_comp_mix'], centered_dots) 1004 | + lambdas_cmdots_prod_sum + psis_t) 1005 | new_cov_denom = ( 1006 | stats['post_sum'] + nm + nus + nf + 1 1007 | )[:, np.newaxis, np.newaxis] 1008 | new_cov = new_cov_numer / new_cov_denom 1009 | 1010 | # Assigning new values to class members 1011 | self.weights_ = new_weights 1012 | self.means_ = new_means 1013 | self.covars_ = new_cov 1014 | -------------------------------------------------------------------------------- /lib/tebm/cthmm_var.py: -------------------------------------------------------------------------------- 1 | # Variable interval CTHMM 2 | # Author: Peter Wijeratne (p.wijeratne@pm.me) 3 | 4 | import logging 5 | 6 | import numpy as np 7 | from scipy.special import logsumexp 8 | from sklearn import cluster 9 | from sklearn.utils import check_random_state 10 | 11 | from . 
import _utils
12 | from .stats import log_multivariate_normal_density
13 | from .base_var import _BaseTEBM
14 | from .utils import (
15 |     fill_covars, iter_from_X_lengths, log_mask_zero, log_normalize, normalize)
16 | __all__ = ["GaussianCTHMM", "MultinomialCTHMM", "GMMCTHMM"]
17 | 
18 | _log = logging.getLogger(__name__)
19 | COVARIANCE_TYPES = frozenset(("spherical", "diag", "full", "tied"))
20 | 
21 | def _check_and_set_gaussian_n_features(model):
22 |     _, n_features = model.X.shape
23 |     if hasattr(model, "n_features") and model.n_features != n_features:
24 |         raise ValueError("Unexpected number of dimensions, got {} but "
25 |                          "expected {}".format(n_features, model.n_features))
26 |     model.n_features = n_features
27 | 
28 | 
29 | class GaussianCTHMM(_BaseTEBM):
30 |     r"""Continuous-time hidden Markov model with Gaussian emissions.
31 | 
32 |     Parameters
33 |     ----------
34 |     n_components : int
35 |         Number of states.
36 | 
37 |     covariance_type : string, optional
38 |         String describing the type of covariance parameters to
39 |         use. Must be one of
40 | 
41 |         * "spherical" --- each state uses a single variance value that
42 |           applies to all features.
43 |         * "diag" --- each state uses a diagonal covariance matrix.
44 |         * "full" --- each state uses a full (i.e. unrestricted)
45 |           covariance matrix.
46 |         * "tied" --- all states use **the same** full covariance matrix.
47 | 
48 |         Defaults to "diag".
49 | 
50 |     min_covar : float, optional
51 |         Floor on the diagonal of the covariance matrix to prevent
52 |         overfitting. Defaults to 1e-3.
53 | 
54 |     startprob_prior : array, shape (n_components, ), optional
55 |         Parameters of the Dirichlet prior distribution for
56 |         :attr:`startprob_`.
57 | 
58 |     transmat_prior : array, shape (n_components, n_components), optional
59 |         Parameters of the Dirichlet prior distribution for each row
60 |         of the transition probabilities :attr:`transmat_`.
61 | 
62 |     means_prior, means_weight : array, shape (n_components, ), optional
63 |         Mean and precision of the Normal prior distribution for
64 |         :attr:`means_`.
65 | 
66 |     covars_prior, covars_weight : array, shape (n_components, ), optional
67 |         Parameters of the prior distribution for the covariance matrix
68 |         :attr:`covars_`.
69 | 
70 |         If :attr:`covariance_type` is "spherical" or "diag" the prior is
71 |         the inverse gamma distribution, otherwise --- the inverse Wishart
72 |         distribution.
73 | 
74 |     algorithm : string, optional
75 |         Decoder algorithm. Must be one of "viterbi" or "map".
76 |         Defaults to "viterbi".
77 | 
78 |     random_state : RandomState or an int seed, optional
79 |         A random number generator instance.
80 | 
81 |     n_iter : int, optional
82 |         Maximum number of iterations to perform.
83 | 
84 |     tol : float, optional
85 |         Convergence threshold. EM will stop if the gain in log-likelihood
86 |         is below this value.
87 | 
88 |     verbose : bool, optional
89 |         When ``True`` per-iteration convergence reports are printed
90 |         to :data:`sys.stderr`. You can diagnose convergence via the
91 |         :attr:`monitor_` attribute.
92 | 
93 |     params : string, optional
94 |         Controls which parameters are updated in the training
95 |         process. Can contain any combination of 's' for startprob,
96 |         't' for transmat, 'm' for means and 'c' for covars. Defaults
97 |         to all parameters.
98 | 
99 |     init_params : string, optional
100 |         Controls which parameters are initialized prior to
101 |         training. Can contain any combination of 's' for
102 |         startprob, 't' for transmat, 'm' for means and 'c' for covars.
103 |         Defaults to all parameters.
104 | 105 | Attributes 106 | ---------- 107 | n_features : int 108 | Dimensionality of the Gaussian emissions. 109 | 110 | monitor\_ : ConvergenceMonitor 111 | Monitor object used to check the convergence of EM. 112 | 113 | startprob\_ : array, shape (n_components, ) 114 | Initial state occupation distribution. 115 | 116 | transmat\_ : array, shape (n_components, n_components) 117 | Matrix of transition probabilities between states. 118 | 119 | means\_ : array, shape (n_components, n_features) 120 | Mean parameters for each state. 121 | 122 | covars\_ : array 123 | Covariance parameters for each state. 124 | 125 | The shape depends on :attr:`covariance_type`:: 126 | 127 | (n_components, ) if "spherical", 128 | (n_components, n_features) if "diag", 129 | (n_components, n_features, n_features) if "full" 130 | (n_features, n_features) if "tied", 131 | 132 | Examples 133 | -------- 134 | >>> from tebm.tebm import MixtureTEBM 135 | >>> MixtureTEBM(n_components=2) #doctest: +ELLIPSIS 136 | MixtureTEBM(algorithm='viterbi',... 137 | """ 138 | def __init__(self, X=None, lengths=None, jumps=None, 139 | n_components=1, startprob_prior=None, transmat_prior=None, 140 | means_prior=0, means_weight=0, covars_prior=1e-2, covars_weight=1, covariance_type='diag', min_covar=1e-3, 141 | algorithm="viterbi", random_state=None, n_iter=10, 142 | tol=1e-2, verbose=False, params="st", 143 | init_params="st", allow_nan=False): 144 | _BaseTEBM.__init__(self, X=X, lengths=lengths, jumps=jumps, 145 | n_components=n_components, startprob_prior=startprob_prior, transmat_prior=transmat_prior, 146 | algorithm=algorithm, random_state=random_state, n_iter=n_iter, 147 | tol=tol, verbose=verbose, params=params, 148 | init_params=init_params, allow_nan=allow_nan) 149 | self.covariance_type = covariance_type 150 | self.min_covar = min_covar 151 | self.means_prior = means_prior 152 | self.means_weight = means_weight 153 | self.covars_prior = covars_prior 154 | self.covars_weight = covars_weight 155 | #FIXME 156 | # self.mixtures = [] 157 | 158 | @property 159 | def covars_(self): 160 | """Return covars as a full matrix.""" 161 | return fill_covars(self._covars_, self.covariance_type, 162 | self.n_components, self.n_features) 163 | 164 | @covars_.setter 165 | def covars_(self, covars): 166 | covars = np.array(covars, copy=True) 167 | _utils._validate_covars(covars, self.covariance_type, 168 | self.n_components) 169 | self._covars_ = covars 170 | """ 171 | def _check(self): 172 | super()._check() 173 | 174 | self.means_ = np.asarray(self.means_) 175 | self.n_features = self.means_.shape[1] 176 | 177 | if self.covariance_type not in COVARIANCE_TYPES: 178 | raise ValueError('covariance_type must be one of {}' 179 | .format(COVARIANCE_TYPES)) 180 | """ 181 | def _get_n_fit_scalars_per_param(self): 182 | nc = self.n_components 183 | nf = self.n_features 184 | return { 185 | "s": nc - 1, 186 | "t": nc * (nc - 1), 187 | "m": nc * nf, 188 | "c": { 189 | "spherical": nc, 190 | "diag": nc * nf, 191 | "full": nc * nf * (nf + 1) // 2, 192 | "tied": nf * (nf + 1) // 2, 193 | }[self.covariance_type], 194 | } 195 | 196 | def _init(self, X, lengths=None): 197 | _check_and_set_gaussian_n_features(self) 198 | super()._init(X, lengths=lengths) 199 | 200 | if 'm' in self.init_params: 201 | kmeans = cluster.KMeans(n_clusters=self.n_components, 202 | random_state=self.random_state) 203 | kmeans.fit(X) 204 | self.means_ = kmeans.cluster_centers_ 205 | if 'c' in self.init_params: 206 | cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1]) 207 | if not 
cv.shape: 208 | cv.shape = (1, 1) 209 | self.covars_ = \ 210 | _utils.distribute_covar_matrix_to_match_covariance_type( 211 | cv, self.covariance_type, self.n_components).copy() 212 | 213 | def _compute_log_likelihood_S(self, i, j): 214 | return log_multivariate_normal_density( 215 | self.X[i:j], self.means_, self._covars_, self.covariance_type) 216 | 217 | def predict(self, X, lengths=None, jumps=None): 218 | # FIXME: is there a general way of doing this? 219 | self.X = X 220 | self.lengths = lengths 221 | self.jumps = jumps 222 | logprob, state_sequence = self.decode() 223 | return state_sequence, logprob 224 | 225 | def predict_proba(self, X, lengths=None, jumps=None): 226 | # FIXME: is there a general way of doing this? 227 | self.X = X 228 | self.lengths = lengths 229 | self.jumps = jumps 230 | n_samples = self.X.shape[0] 231 | logprob = 0 232 | posteriors = np.zeros((n_samples, self.n_components)) 233 | for i, j in iter_from_X_lengths(self.X, self.lengths): 234 | _, posteriors[i:j] = self.score_samples(i, j) 235 | return posteriors 236 | 237 | def _generate_sample_from_state(self, state, random_state=None): 238 | random_state = check_random_state(random_state) 239 | return random_state.multivariate_normal( 240 | self.means_[state], self.covars_[state] 241 | ) 242 | 243 | def _initialize_sufficient_statistics(self): 244 | stats = super()._initialize_sufficient_statistics() 245 | stats['post'] = np.zeros(self.n_components) 246 | stats['obs'] = np.zeros((self.n_components, self.n_features)) 247 | stats['obs**2'] = np.zeros((self.n_components, self.n_features)) 248 | if self.covariance_type in ('tied', 'full'): 249 | stats['obs*obs.T'] = np.zeros((self.n_components, self.n_features, 250 | self.n_features)) 251 | return stats 252 | 253 | def _accumulate_sufficient_statistics(self, stats, obs, framelogprob, posteriors, 254 | fwdlattice, bwdlattice, framejumps): 255 | super()._accumulate_sufficient_statistics( 256 | stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice, framejumps) 257 | 258 | if 'm' in self.params or 'c' in self.params: 259 | stats['post'] += posteriors.sum(axis=0) 260 | stats['obs'] += np.dot(posteriors.T, obs) 261 | 262 | if 'c' in self.params: 263 | if self.covariance_type in ('spherical', 'diag'): 264 | stats['obs**2'] += np.dot(posteriors.T, obs ** 2) 265 | elif self.covariance_type in ('tied', 'full'): 266 | # posteriors: (nt, nc); obs: (nt, nf); obs: (nt, nf) 267 | # -> (nc, nf, nf) 268 | stats['obs*obs.T'] += np.einsum( 269 | 'ij,ik,il->jkl', posteriors, obs, obs) 270 | 271 | def _do_mstep(self, stats): 272 | super()._do_mstep(stats) 273 | 274 | means_prior = self.means_prior 275 | means_weight = self.means_weight 276 | 277 | # TODO: find a proper reference for estimates for different 278 | # covariance models. 279 | # Based on Huang, Acero, Hon, "Spoken Language Processing", 280 | # p. 
443 - 445 281 | denom = stats['post'][:, np.newaxis] 282 | if 'm' in self.params: 283 | self.means_ = ((means_weight * means_prior + stats['obs']) 284 | / (means_weight + denom)) 285 | 286 | if 'c' in self.params: 287 | covars_prior = self.covars_prior 288 | covars_weight = self.covars_weight 289 | meandiff = self.means_ - means_prior 290 | 291 | if self.covariance_type in ('spherical', 'diag'): 292 | cv_num = (means_weight * meandiff**2 293 | + stats['obs**2'] 294 | - 2 * self.means_ * stats['obs'] 295 | + self.means_**2 * denom) 296 | cv_den = max(covars_weight - 1, 0) + denom 297 | self._covars_ = \ 298 | (covars_prior + cv_num) / np.maximum(cv_den, 1e-5) 299 | if self.covariance_type == 'spherical': 300 | self._covars_ = np.tile( 301 | self._covars_.mean(1)[:, np.newaxis], 302 | (1, self._covars_.shape[1])) 303 | elif self.covariance_type in ('tied', 'full'): 304 | cv_num = np.empty((self.n_components, self.n_features, 305 | self.n_features)) 306 | for c in range(self.n_components): 307 | obsmean = np.outer(stats['obs'][c], self.means_[c]) 308 | 309 | cv_num[c] = (means_weight * np.outer(meandiff[c], 310 | meandiff[c]) 311 | + stats['obs*obs.T'][c] 312 | - obsmean - obsmean.T 313 | + np.outer(self.means_[c], self.means_[c]) 314 | * stats['post'][c]) 315 | cvweight = max(covars_weight - self.n_features, 0) 316 | if self.covariance_type == 'tied': 317 | self._covars_ = ((covars_prior + cv_num.sum(axis=0)) / 318 | (cvweight + stats['post'].sum())) 319 | elif self.covariance_type == 'full': 320 | self._covars_ = ((covars_prior + cv_num) / 321 | (cvweight + stats['post'][:, None, None])) 322 | 323 | class MultinomialCTHMM(_BaseTEBM): 324 | r"""Hidden Markov Model with multinomial (discrete) emissions 325 | 326 | Parameters 327 | ---------- 328 | 329 | n_components : int 330 | Number of states. 331 | 332 | startprob_prior : array, shape (n_components, ), optional 333 | Parameters of the Dirichlet prior distribution for 334 | :attr:`startprob_`. 335 | 336 | transmat_prior : array, shape (n_components, n_components), optional 337 | Parameters of the Dirichlet prior distribution for each row 338 | of the transition probabilities :attr:`transmat_`. 339 | 340 | algorithm : string, optional 341 | Decoder algorithm. Must be one of "viterbi" or "map". 342 | Defaults to "viterbi". 343 | 344 | random_state: RandomState or an int seed, optional 345 | A random number generator instance. 346 | 347 | n_iter : int, optional 348 | Maximum number of iterations to perform. 349 | 350 | tol : float, optional 351 | Convergence threshold. EM will stop if the gain in log-likelihood 352 | is below this value. 353 | 354 | verbose : bool, optional 355 | When ``True`` per-iteration convergence reports are printed 356 | to :data:`sys.stderr`. You can diagnose convergence via the 357 | :attr:`monitor_` attribute. 358 | 359 | params : string, optional 360 | Controls which parameters are updated in the training 361 | process. Can contain any combination of 's' for startprob, 362 | 't' for transmat, 'e' for emissionprob. 363 | Defaults to all parameters. 364 | 365 | init_params : string, optional 366 | Controls which parameters are initialized prior to 367 | training. Can contain any combination of 's' for 368 | startprob, 't' for transmat, 'e' for emissionprob. 369 | Defaults to all parameters. 370 | 371 | Attributes 372 | ---------- 373 | n_features : int 374 | Number of possible symbols emitted by the model (in the samples). 
375 | 376 | monitor\_ : ConvergenceMonitor 377 | Monitor object used to check the convergence of EM. 378 | 379 | startprob\_ : array, shape (n_components, ) 380 | Initial state occupation distribution. 381 | 382 | transmat\_ : array, shape (n_components, n_components) 383 | Matrix of transition probabilities between states. 384 | 385 | emissionprob\_ : array, shape (n_components, n_features) 386 | Probability of emitting a given symbol when in each state. 387 | 388 | Examples 389 | -------- 390 | >>> from tebm.tebm import MultinomialTEBM 391 | >>> MultinomialTEBM(n_components=2) #doctest: +ELLIPSIS 392 | MultinomialTEBM(algorithm='viterbi',... 393 | """ 394 | # TODO: accept the prior on emissionprob_ for consistency. 395 | def __init__(self, n_components=1, 396 | startprob_prior=1.0, transmat_prior=1.0, 397 | algorithm="viterbi", random_state=None, 398 | n_iter=10, tol=1e-2, verbose=False, 399 | params="ste", init_params="ste"): 400 | _BaseTEBM.__init__(self, n_components, 401 | startprob_prior=startprob_prior, 402 | transmat_prior=transmat_prior, 403 | algorithm=algorithm, 404 | random_state=random_state, 405 | n_iter=n_iter, tol=tol, verbose=verbose, 406 | params=params, init_params=init_params) 407 | 408 | def _get_n_fit_scalars_per_param(self): 409 | nc = self.n_components 410 | nf = self.n_features 411 | return { 412 | "s": nc - 1, 413 | "t": nc * (nc - 1), 414 | "e": nc * (nf - 1), 415 | } 416 | 417 | def _init(self, X, lengths=None): 418 | self._check_and_set_n_features(X) 419 | super()._init(X, lengths=lengths) 420 | self.random_state = check_random_state(self.random_state) 421 | 422 | if 'e' in self.init_params: 423 | self.emissionprob_ = self.random_state \ 424 | .rand(self.n_components, self.n_features) 425 | normalize(self.emissionprob_, axis=1) 426 | 427 | def _check(self): 428 | super()._check() 429 | 430 | self.emissionprob_ = np.atleast_2d(self.emissionprob_) 431 | n_features = getattr(self, "n_features", self.emissionprob_.shape[1]) 432 | if self.emissionprob_.shape != (self.n_components, n_features): 433 | raise ValueError( 434 | "emissionprob_ must have shape (n_components, n_features)") 435 | else: 436 | self.n_features = n_features 437 | 438 | def _compute_log_likelihood(self, X): 439 | return log_mask_zero(self.emissionprob_)[:, np.concatenate(X)].T 440 | 441 | def _generate_sample_from_state(self, state, random_state=None): 442 | cdf = np.cumsum(self.emissionprob_[state, :]) 443 | random_state = check_random_state(random_state) 444 | return [(cdf > random_state.rand()).argmax()] 445 | 446 | def _initialize_sufficient_statistics(self): 447 | stats = super()._initialize_sufficient_statistics() 448 | stats['obs'] = np.zeros((self.n_components, self.n_features)) 449 | return stats 450 | 451 | def _accumulate_sufficient_statistics(self, stats, X, framelogprob, 452 | posteriors, fwdlattice, bwdlattice): 453 | super()._accumulate_sufficient_statistics( 454 | stats, X, framelogprob, posteriors, fwdlattice, bwdlattice) 455 | if 'e' in self.params: 456 | for t, symbol in enumerate(np.concatenate(X)): 457 | stats['obs'][:, symbol] += posteriors[t] 458 | 459 | def _do_mstep(self, stats): 460 | super()._do_mstep(stats) 461 | if 'e' in self.params: 462 | self.emissionprob_ = (stats['obs'] 463 | / stats['obs'].sum(axis=1)[:, np.newaxis]) 464 | 465 | def _check_and_set_n_features(self, X): 466 | """ 467 | Check if ``X`` is a sample from a Multinomial distribution, i.e. an 468 | array of non-negative integers. 
469 | """ 470 | if not np.issubdtype(X.dtype, np.integer): 471 | raise ValueError("Symbols should be integers") 472 | if X.min() < 0: 473 | raise ValueError("Symbols should be nonnegative") 474 | if hasattr(self, "n_features"): 475 | if self.n_features - 1 < X.max(): 476 | raise ValueError( 477 | "Largest symbol is {} but the model only emits " 478 | "symbols up to {}" 479 | .format(X.max(), self.n_features - 1)) 480 | self.n_features = X.max() + 1 481 | 482 | 483 | class GMMCTHMM(_BaseTEBM): 484 | r"""Hidden Markov Model with Gaussian mixture emissions. 485 | 486 | Parameters 487 | ---------- 488 | n_components : int 489 | Number of states in the model. 490 | 491 | n_mix : int 492 | Number of states in the GMM. 493 | 494 | covariance_type : string, optional 495 | String describing the type of covariance parameters to 496 | use. Must be one of 497 | 498 | * "spherical" --- each state uses a single variance value that 499 | applies to all features. 500 | * "diag" --- each state uses a diagonal covariance matrix. 501 | * "full" --- each state uses a full (i.e. unrestricted) 502 | covariance matrix. 503 | * "tied" --- all mixture components of each state use **the same** full 504 | covariance matrix (note that this is not the same as for 505 | `MixtureTEBM`). 506 | 507 | Defaults to "diag". 508 | 509 | min_covar : float, optional 510 | Floor on the diagonal of the covariance matrix to prevent 511 | overfitting. Defaults to 1e-3. 512 | 513 | startprob_prior : array, shape (n_components, ), optional 514 | Parameters of the Dirichlet prior distribution for 515 | :attr:`startprob_`. 516 | 517 | transmat_prior : array, shape (n_components, n_components), optional 518 | Parameters of the Dirichlet prior distribution for each row 519 | of the transition probabilities :attr:`transmat_`. 520 | 521 | weights_prior : array, shape (n_mix, ), optional 522 | Parameters of the Dirichlet prior distribution for 523 | :attr:`weights_`. 524 | 525 | means_prior, means_weight : array, shape (n_mix, ), optional 526 | Mean and precision of the Normal prior distribtion for 527 | :attr:`means_`. 528 | 529 | covars_prior, covars_weight : array, shape (n_mix, ), optional 530 | Parameters of the prior distribution for the covariance matrix 531 | :attr:`covars_`. 532 | 533 | If :attr:`covariance_type` is "spherical" or "diag" the prior is 534 | the inverse gamma distribution, otherwise --- the inverse Wishart 535 | distribution. 536 | 537 | algorithm : string, optional 538 | Decoder algorithm. Must be one of "viterbi" or "map". 539 | Defaults to "viterbi". 540 | 541 | random_state: RandomState or an int seed, optional 542 | A random number generator instance. 543 | 544 | n_iter : int, optional 545 | Maximum number of iterations to perform. 546 | 547 | tol : float, optional 548 | Convergence threshold. EM will stop if the gain in log-likelihood 549 | is below this value. 550 | 551 | verbose : bool, optional 552 | When ``True`` per-iteration convergence reports are printed 553 | to :data:`sys.stderr`. You can diagnose convergence via the 554 | :attr:`monitor_` attribute. 555 | 556 | init_params : string, optional 557 | Controls which parameters are initialized prior to training. Can 558 | contain any combination of 's' for startprob, 't' for transmat, 'm' 559 | for means, 'c' for covars, and 'w' for GMM mixing weights. 560 | Defaults to all parameters. 561 | 562 | params : string, optional 563 | Controls which parameters are updated in the training process. 
Can 564 | contain any combination of 's' for startprob, 't' for transmat, 'm' for 565 | means, and 'c' for covars, and 'w' for GMM mixing weights. 566 | Defaults to all parameters. 567 | 568 | Attributes 569 | ---------- 570 | monitor\_ : ConvergenceMonitor 571 | Monitor object used to check the convergence of EM. 572 | 573 | startprob\_ : array, shape (n_components, ) 574 | Initial state occupation distribution. 575 | 576 | transmat\_ : array, shape (n_components, n_components) 577 | Matrix of transition probabilities between states. 578 | 579 | weights\_ : array, shape (n_components, n_mix) 580 | Mixture weights for each state. 581 | 582 | means\_ : array, shape (n_components, n_mix) 583 | Mean parameters for each mixture component in each state. 584 | 585 | covars\_ : array 586 | Covariance parameters for each mixture components in each state. 587 | 588 | The shape depends on :attr:`covariance_type`:: 589 | 590 | (n_components, n_mix) if "spherical", 591 | (n_components, n_mix, n_features) if "diag", 592 | (n_components, n_mix, n_features, n_features) if "full" 593 | (n_components, n_features, n_features) if "tied", 594 | """ 595 | 596 | def __init__(self, n_components=1, n_mix=1, 597 | min_covar=1e-3, startprob_prior=1.0, transmat_prior=1.0, 598 | weights_prior=1.0, means_prior=0.0, means_weight=0.0, 599 | covars_prior=None, covars_weight=None, 600 | algorithm="viterbi", covariance_type="diag", 601 | random_state=None, n_iter=10, tol=1e-2, 602 | verbose=False, params="stmcw", 603 | init_params="stmcw"): 604 | _BaseTEBM.__init__(self, n_components, 605 | startprob_prior=startprob_prior, 606 | transmat_prior=transmat_prior, 607 | algorithm=algorithm, random_state=random_state, 608 | n_iter=n_iter, tol=tol, verbose=verbose, 609 | params=params, init_params=init_params) 610 | self.covariance_type = covariance_type 611 | self.min_covar = min_covar 612 | self.n_mix = n_mix 613 | self.weights_prior = weights_prior 614 | self.means_prior = means_prior 615 | self.means_weight = means_weight 616 | self.covars_prior = covars_prior 617 | self.covars_weight = covars_weight 618 | 619 | def _get_n_fit_scalars_per_param(self): 620 | nc = self.n_components 621 | nf = self.n_features 622 | nm = self.n_mix 623 | return { 624 | "s": nc - 1, 625 | "t": nc * (nc - 1), 626 | "m": nc * nm * nf, 627 | "c": { 628 | "spherical": nc * nm, 629 | "diag": nc * nm * nf, 630 | "full": nc * nm * nf * (nf + 1) // 2, 631 | "tied": nc * nf * (nf + 1) // 2, 632 | }[self.covariance_type], 633 | "w": nm - 1, 634 | } 635 | 636 | def _init(self, X, lengths=None): 637 | _check_and_set_gaussian_n_features(self) 638 | super()._init(X, lengths=lengths) 639 | nc = self.n_components 640 | nf = self.n_features 641 | nm = self.n_mix 642 | 643 | # Default values for covariance prior parameters 644 | self._init_covar_priors() 645 | self._fix_priors_shape() 646 | 647 | main_kmeans = cluster.KMeans(n_clusters=nc, 648 | random_state=self.random_state) 649 | labels = main_kmeans.fit_predict(X) 650 | kmeanses = [] 651 | for label in range(nc): 652 | kmeans = cluster.KMeans(n_clusters=nm, 653 | random_state=self.random_state) 654 | kmeans.fit(X[np.where(labels == label)]) 655 | kmeanses.append(kmeans) 656 | 657 | if 'w' in self.init_params or not hasattr(self, "weights_"): 658 | self.weights_ = np.ones((nc, nm)) / (np.ones((nc, 1)) * nm) 659 | 660 | if 'm' in self.init_params or not hasattr(self, "means_"): 661 | self.means_ = np.stack( 662 | [kmeans.cluster_centers_ for kmeans in kmeanses]) 663 | 664 | if 'c' in self.init_params or not 
hasattr(self, "covars_"): 665 | cv = np.cov(X.T) + self.min_covar * np.eye(nf) 666 | if not cv.shape: 667 | cv.shape = (1, 1) 668 | if self.covariance_type == 'tied': 669 | self.covars_ = np.zeros((nc, nf, nf)) 670 | self.covars_[:] = cv 671 | elif self.covariance_type == 'full': 672 | self.covars_ = np.zeros((nc, nm, nf, nf)) 673 | self.covars_[:] = cv 674 | elif self.covariance_type == 'diag': 675 | self.covars_ = np.zeros((nc, nm, nf)) 676 | self.covars_[:] = np.diag(cv) 677 | elif self.covariance_type == 'spherical': 678 | self.covars_ = np.zeros((nc, nm)) 679 | self.covars_[:] = cv.mean() 680 | 681 | def _init_covar_priors(self): 682 | if self.covariance_type == "full": 683 | if self.covars_prior is None: 684 | self.covars_prior = 0.0 685 | if self.covars_weight is None: 686 | self.covars_weight = -(1.0 + self.n_features + 1.0) 687 | elif self.covariance_type == "tied": 688 | if self.covars_prior is None: 689 | self.covars_prior = 0.0 690 | if self.covars_weight is None: 691 | self.covars_weight = -(self.n_mix + self.n_features + 1.0) 692 | elif self.covariance_type == "diag": 693 | if self.covars_prior is None: 694 | self.covars_prior = -1.5 695 | if self.covars_weight is None: 696 | self.covars_weight = 0.0 697 | elif self.covariance_type == "spherical": 698 | if self.covars_prior is None: 699 | self.covars_prior = -(self.n_mix + 2.0) / 2.0 700 | if self.covars_weight is None: 701 | self.covars_weight = 0.0 702 | 703 | def _fix_priors_shape(self): 704 | nc = self.n_components 705 | nf = self.n_features 706 | nm = self.n_mix 707 | 708 | # If priors are numbers, this function will make them into a 709 | # matrix of proper shape 710 | self.weights_prior = np.broadcast_to( 711 | self.weights_prior, (nc, nm)).copy() 712 | self.means_prior = np.broadcast_to( 713 | self.means_prior, (nc, nm, nf)).copy() 714 | self.means_weight = np.broadcast_to( 715 | self.means_weight, (nc, nm)).copy() 716 | 717 | if self.covariance_type == "full": 718 | self.covars_prior = np.broadcast_to( 719 | self.covars_prior, (nc, nm, nf, nf)).copy() 720 | self.covars_weight = np.broadcast_to( 721 | self.covars_weight, (nc, nm)).copy() 722 | elif self.covariance_type == "tied": 723 | self.covars_prior = np.broadcast_to( 724 | self.covars_prior, (nc, nf, nf)).copy() 725 | self.covars_weight = np.broadcast_to( 726 | self.covars_weight, nc).copy() 727 | elif self.covariance_type == "diag": 728 | self.covars_prior = np.broadcast_to( 729 | self.covars_prior, (nc, nm, nf)).copy() 730 | self.covars_weight = np.broadcast_to( 731 | self.covars_weight, (nc, nm, nf)).copy() 732 | elif self.covariance_type == "spherical": 733 | self.covars_prior = np.broadcast_to( 734 | self.covars_prior, (nc, nm)).copy() 735 | self.covars_weight = np.broadcast_to( 736 | self.covars_weight, (nc, nm)).copy() 737 | 738 | def _check(self): 739 | super()._check() 740 | if not hasattr(self, "n_features"): 741 | self.n_features = self.means_.shape[2] 742 | nc = self.n_components 743 | nf = self.n_features 744 | nm = self.n_mix 745 | 746 | self._init_covar_priors() 747 | self._fix_priors_shape() 748 | 749 | # Checking covariance type 750 | if self.covariance_type not in COVARIANCE_TYPES: 751 | raise ValueError("covariance_type must be one of {}" 752 | .format(COVARIANCE_TYPES)) 753 | 754 | self.weights_ = np.array(self.weights_) 755 | # Checking mixture weights' shape 756 | if self.weights_.shape != (nc, nm): 757 | raise ValueError("mixture weights must have shape " 758 | "(n_components, n_mix), actual shape: {}" 759 | .format(self.weights_.shape)) 
760 | 761 | # Checking mixture weights' mathematical correctness 762 | if not np.allclose(np.sum(self.weights_, axis=1), np.ones(nc)): 763 | raise ValueError("mixture weights must sum up to 1") 764 | 765 | # Checking means' shape 766 | self.means_ = np.array(self.means_) 767 | if self.means_.shape != (nc, nm, nf): 768 | raise ValueError("mixture means must have shape " 769 | "(n_components, n_mix, n_features), " 770 | "actual shape: {}".format(self.means_.shape)) 771 | 772 | # Checking covariances' shape 773 | self.covars_ = np.array(self.covars_) 774 | covars_shape = self.covars_.shape 775 | needed_shapes = { 776 | "spherical": (nc, nm), 777 | "tied": (nc, nf, nf), 778 | "diag": (nc, nm, nf), 779 | "full": (nc, nm, nf, nf), 780 | } 781 | needed_shape = needed_shapes[self.covariance_type] 782 | if covars_shape != needed_shape: 783 | raise ValueError("{!r} mixture covars must have shape {}, " 784 | "actual shape: {}" 785 | .format(self.covariance_type, 786 | needed_shape, covars_shape)) 787 | 788 | # Checking covariances' mathematical correctness 789 | from scipy import linalg 790 | 791 | if (self.covariance_type == "spherical" or 792 | self.covariance_type == "diag"): 793 | if np.any(self.covars_ < 0): 794 | raise ValueError("{!r} mixture covars must be non-negative" 795 | .format(self.covariance_type)) 796 | if np.any(self.covars_ == 0): 797 | _log.warning("Degenerate mixture covariance") 798 | elif self.covariance_type == "tied": 799 | for i, covar in enumerate(self.covars_): 800 | if not np.allclose(covar, covar.T): 801 | raise ValueError("Covariance of state #{} is not symmetric" 802 | .format(i)) 803 | min_eigvalsh = np.linalg.eigvalsh(covar).min() 804 | if min_eigvalsh < 0: 805 | raise ValueError("Covariance of state #{} is not positive " 806 | "definite".format(i)) 807 | if min_eigvalsh == 0: 808 | _log.warning("Covariance of state #%d has a null " 809 | "eigenvalue.", i) 810 | elif self.covariance_type == "full": 811 | for i, mix_covars in enumerate(self.covars_): 812 | for j, covar in enumerate(mix_covars): 813 | if not np.allclose(covar, covar.T): 814 | raise ValueError( 815 | "Covariance of state #{}, mixture #{} is not " 816 | "symmetric".format(i, j)) 817 | min_eigvalsh = np.linalg.eigvalsh(covar).min() 818 | if min_eigvalsh < 0: 819 | raise ValueError( 820 | "Covariance of state #{}, mixture #{} is not " 821 | "positive definite".format(i, j)) 822 | if min_eigvalsh == 0: 823 | _log.warning("Covariance of state #%d, mixture #%d " 824 | "has a null eigenvalue.", i, j) 825 | 826 | def _generate_sample_from_state(self, state, random_state=None): 827 | if random_state is None: 828 | random_state = self.random_state 829 | random_state = check_random_state(random_state) 830 | 831 | cur_weights = self.weights_[state] 832 | i_gauss = random_state.choice(self.n_mix, p=cur_weights) 833 | if self.covariance_type == 'tied': 834 | # self.covars_.shape == (n_components, n_features, n_features) 835 | # shouldn't that be (n_mix, ...)? 
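            # Note: for "tied" covariances, all mixture components of a state
            # share one full (n_features, n_features) matrix, so covars_ has
            # shape (n_components, n_features, n_features) (see _check above)
            # and covs[state] below already selects the right matrix without
            # a mixture index.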
836 | covs = self.covars_ 837 | else: 838 | covs = self.covars_[:, i_gauss] 839 | covs = fill_covars(covs, self.covariance_type, 840 | self.n_components, self.n_features) 841 | return random_state.multivariate_normal( 842 | self.means_[state, i_gauss], covs[state] 843 | ) 844 | 845 | def _compute_log_weighted_gaussian_densities(self, X, i_comp): 846 | cur_means = self.means_[i_comp] 847 | cur_covs = self.covars_[i_comp] 848 | if self.covariance_type == 'spherical': 849 | cur_covs = cur_covs[:, np.newaxis] 850 | log_cur_weights = np.log(self.weights_[i_comp]) 851 | 852 | return log_multivariate_normal_density( 853 | X, cur_means, cur_covs, self.covariance_type 854 | ) + log_cur_weights 855 | 856 | def _compute_log_likelihood(self, X): 857 | n_samples, _ = X.shape 858 | res = np.zeros((n_samples, self.n_components)) 859 | 860 | for i in range(self.n_components): 861 | log_denses = self._compute_log_weighted_gaussian_densities(X, i) 862 | with np.errstate(under="ignore"): 863 | res[:, i] = logsumexp(log_denses, axis=1) 864 | 865 | return res 866 | 867 | def _initialize_sufficient_statistics(self): 868 | stats = super()._initialize_sufficient_statistics() 869 | stats['n_samples'] = 0 870 | stats['post_comp_mix'] = None 871 | stats['post_mix_sum'] = np.zeros((self.n_components, self.n_mix)) 872 | stats['post_sum'] = np.zeros(self.n_components) 873 | stats['samples'] = None 874 | stats['centered'] = None 875 | return stats 876 | 877 | def _accumulate_sufficient_statistics(self, stats, X, framelogprob, 878 | post_comp, fwdlattice, bwdlattice): 879 | 880 | # TODO: support multiple frames 881 | 882 | super()._accumulate_sufficient_statistics( 883 | stats, X, framelogprob, post_comp, fwdlattice, bwdlattice 884 | ) 885 | 886 | n_samples, _ = X.shape 887 | 888 | stats['n_samples'] = n_samples 889 | stats['samples'] = X 890 | 891 | post_mix = np.zeros((n_samples, self.n_components, self.n_mix)) 892 | for p in range(self.n_components): 893 | log_denses = self._compute_log_weighted_gaussian_densities(X, p) 894 | log_normalize(log_denses, axis=-1) 895 | with np.errstate(under="ignore"): 896 | post_mix[:, p, :] = np.exp(log_denses) 897 | 898 | with np.errstate(under="ignore"): 899 | post_comp_mix = post_comp[:, :, np.newaxis] * post_mix 900 | stats['post_comp_mix'] = post_comp_mix 901 | 902 | stats['post_mix_sum'] = np.sum(post_comp_mix, axis=0) 903 | stats['post_sum'] = np.sum(post_comp, axis=0) 904 | 905 | stats['centered'] = X[:, np.newaxis, np.newaxis, :] - self.means_ 906 | 907 | def _do_mstep(self, stats): 908 | super()._do_mstep(stats) 909 | nc = self.n_components 910 | nf = self.n_features 911 | nm = self.n_mix 912 | 913 | n_samples = stats['n_samples'] 914 | 915 | # Maximizing weights 916 | alphas_minus_one = self.weights_prior - 1 917 | new_weights_numer = stats['post_mix_sum'] + alphas_minus_one 918 | new_weights_denom = ( 919 | stats['post_sum'] + np.sum(alphas_minus_one, axis=1) 920 | )[:, np.newaxis] 921 | new_weights = new_weights_numer / new_weights_denom 922 | 923 | # Maximizing means 924 | lambdas, mus = self.means_weight, self.means_prior 925 | new_means_numer = ( 926 | np.einsum('ijk,il->jkl', stats['post_comp_mix'], stats['samples']) 927 | + lambdas[:, :, np.newaxis] * mus 928 | ) 929 | new_means_denom = (stats['post_mix_sum'] + lambdas)[:, :, np.newaxis] 930 | new_means = new_means_numer / new_means_denom 931 | 932 | # Maximizing covariances 933 | centered_means = self.means_ - mus 934 | 935 | if self.covariance_type == 'full': 936 | centered = stats['centered'].reshape((n_samples, 
nc, nm, nf, 1)) 937 | centered_t = stats['centered'].reshape((n_samples, nc, nm, 1, nf)) 938 | centered_dots = centered * centered_t 939 | 940 | psis_t = np.transpose(self.covars_prior, axes=(0, 1, 3, 2)) 941 | nus = self.covars_weight 942 | 943 | centr_means_resh = centered_means.reshape((nc, nm, nf, 1)) 944 | centr_means_resh_t = centered_means.reshape((nc, nm, 1, nf)) 945 | centered_means_dots = centr_means_resh * centr_means_resh_t 946 | 947 | new_cov_numer = ( 948 | np.einsum( 949 | 'ijk,ijklm->jklm', stats['post_comp_mix'], centered_dots) 950 | + psis_t 951 | + lambdas[:, :, np.newaxis, np.newaxis] * centered_means_dots 952 | ) 953 | new_cov_denom = ( 954 | stats['post_mix_sum'] + 1 + nus + nf + 1 955 | )[:, :, np.newaxis, np.newaxis] 956 | new_cov = new_cov_numer / new_cov_denom 957 | 958 | elif self.covariance_type == 'diag': 959 | centered2 = stats['centered'] ** 2 960 | centered_means2 = centered_means ** 2 961 | 962 | alphas = self.covars_prior 963 | betas = self.covars_weight 964 | 965 | new_cov_numer = ( 966 | np.einsum('ijk,ijkl->jkl', stats['post_comp_mix'], centered2) 967 | + lambdas[:, :, np.newaxis] * centered_means2 968 | + 2 * betas 969 | ) 970 | new_cov_denom = ( 971 | stats['post_mix_sum'][:, :, np.newaxis] + 1 + 2 * (alphas + 1) 972 | ) 973 | new_cov = new_cov_numer / new_cov_denom 974 | 975 | elif self.covariance_type == 'spherical': 976 | centered_norm2 = np.sum(stats['centered'] ** 2, axis=-1) 977 | 978 | alphas = self.covars_prior 979 | betas = self.covars_weight 980 | 981 | centered_means_norm2 = np.sum(centered_means ** 2, axis=-1) 982 | 983 | new_cov_numer = ( 984 | np.einsum( 985 | 'ijk,ijk->jk', stats['post_comp_mix'], centered_norm2) 986 | + lambdas * centered_means_norm2 987 | + 2 * betas 988 | ) 989 | new_cov_denom = nf * (stats['post_mix_sum'] + 1) + 2 * (alphas + 1) 990 | new_cov = new_cov_numer / new_cov_denom 991 | 992 | elif self.covariance_type == 'tied': 993 | centered = stats['centered'].reshape((n_samples, nc, nm, nf, 1)) 994 | centered_t = stats['centered'].reshape((n_samples, nc, nm, 1, nf)) 995 | centered_dots = centered * centered_t 996 | 997 | psis_t = np.transpose(self.covars_prior, axes=(0, 2, 1)) 998 | nus = self.covars_weight 999 | 1000 | centr_means_resh = centered_means.reshape((nc, nm, nf, 1)) 1001 | centr_means_resh_t = centered_means.reshape((nc, nm, 1, nf)) 1002 | centered_means_dots = centr_means_resh * centr_means_resh_t 1003 | 1004 | lambdas_cmdots_prod_sum = ( 1005 | np.einsum('ij,ijkl->ikl', lambdas, centered_means_dots)) 1006 | 1007 | new_cov_numer = ( 1008 | np.einsum( 1009 | 'ijk,ijklm->jlm', stats['post_comp_mix'], centered_dots) 1010 | + lambdas_cmdots_prod_sum + psis_t) 1011 | new_cov_denom = ( 1012 | stats['post_sum'] + nm + nus + nf + 1 1013 | )[:, np.newaxis, np.newaxis] 1014 | new_cov = new_cov_numer / new_cov_denom 1015 | 1016 | # Assigning new values to class members 1017 | self.weights_ = new_weights 1018 | self.means_ = new_means 1019 | self.covars_ = new_cov 1020 | --------------------------------------------------------------------------------
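The Dirichlet-MAP mixture-weight update implemented in GMMCTHMM._do_mstep above can be checked in isolation. The following is a minimal, self-contained NumPy sketch using toy responsibility arrays (the toy shapes and random values are illustrative only); the variable names mirror the sufficient statistics accumulated in _accumulate_sufficient_statistics.

    import numpy as np

    # Toy responsibilities: post_comp_mix[t, i, m] is the joint posterior of
    # state i and mixture component m at sample t.
    rng = np.random.default_rng(0)
    n_samples, n_components, n_mix = 50, 3, 2
    post_comp_mix = rng.random((n_samples, n_components, n_mix))

    # Sufficient statistics used by _do_mstep.
    post_mix_sum = post_comp_mix.sum(axis=0)        # shape (n_components, n_mix)
    post_sum = post_comp_mix.sum(axis=(0, 2))       # shape (n_components,)

    # Dirichlet prior on the mixture weights; alpha = 1 is the flat prior
    # (the default weights_prior=1.0 in GMMCTHMM).
    weights_prior = np.ones((n_components, n_mix))

    alphas_minus_one = weights_prior - 1
    new_weights = (post_mix_sum + alphas_minus_one) / (
        post_sum + alphas_minus_one.sum(axis=1))[:, np.newaxis]

    # With the flat prior this reduces to the maximum-likelihood update, so
    # each state's mixture weights sum to one.
    print(new_weights.sum(axis=1))   # -> [1. 1. 1.]

In the model itself, post_comp comes from the forward-backward pass and post_mix from the per-state Gaussian responsibilities, but the normalisation step is the one shown here.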