├── .git_archival.txt ├── .gitattributes ├── time-lagged-autoencoder ├── tae │ ├── test │ │ ├── __init__.py │ │ ├── test_toymodels.py │ │ ├── test_models.py │ │ ├── test_api.py │ │ └── test_utils.py │ ├── __init__.py │ ├── toymodels.py │ ├── api.py │ ├── utils.py │ ├── benchmarks.py │ └── models.py ├── setup.cfg ├── setup.py └── README.md ├── README.md ├── docs └── wishlist.md ├── vampnet ├── vampnet │ ├── __init__.py │ └── data_generator.py ├── setup.py ├── README.md └── examples │ ├── 1D_double_well.ipynb │ ├── Folding.ipynb │ ├── Alanine_dipeptide.ipynb │ └── Alanine_dipeptide_multiple_files.ipynb ├── .gitignore └── LICENSE /.git_archival.txt: -------------------------------------------------------------------------------- 1 | ref-names: HEAD -> master -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | .git_archival.txt export-subst -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/setup.cfg: -------------------------------------------------------------------------------- 1 | [alias] 2 | test=pytest -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deeptime 2 | Deep learning meets molecular dynamics. 3 | 4 | ## Contents 5 | 6 | - **time-lagged-autoencoder**: a toolbox for dimension reduction of time series data with a [time-lagged autoencoder](https://aip.scitation.org/doi/full/10.1063/1.5011399)-type deep neural network. 7 | - **vampnet**: Variational Approach for Markov Processes networks, see https://www.nature.com/articles/s41467-017-02388-1 8 | -------------------------------------------------------------------------------- /docs/wishlist.md: -------------------------------------------------------------------------------- 1 | # General 2 | - want to be able to fit batch-wise, either by providing a numpy-style array (so everything fits into memory) or by providing a generator function 3 | - which framework is used should be decided based on what is in the environment, or based on a user configuration if both frameworks are available 4 | # Top level 5 | - it should be invisible to the user which NN framework is used 6 | - have Models `TAE` and `VAMPNet` which can be "trained" (layer sizes, dropout, batch size, learning rate, activation functions, etc.) 7 | - Trained models can be `fit`-ted on data 8 | # Mid level 9 | - abstraction layer between the actual NN-framework implementation and the top layer 10 | # Low level 11 | - specialization toward pytorch / TF as implementation of the abstraction layer 12 | - smaller dispatch interfaces separated through namespaces, e.g., 13 | ```python 14 | deeptime.scores.tf.vamp 15 | deeptime.scores.torch.vamp 16 | ``` -------------------------------------------------------------------------------- /vampnet/vampnet/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | from pkg_resources import get_distribution, DistributionNotFound 19 | try: 20 | __version__ = get_distribution(__name__).version 21 | except DistributionNotFound: 22 | __version__ = 'x.y.z' 23 | del get_distribution, DistributionNotFound 24 | 25 | __author__ = 'Andreas Mardt, Luca Pasquali' 26 | __email__ = 'andreas.mardt@fu-berlin.de, luca.pasquali@fu-berlin.de' 27 | 28 | from .vampnet import VampnetTools 29 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | A toolbox for dimension reduction of time series data with a 20 | time-lagged autoencoder. 21 | ''' 22 | 23 | from pkg_resources import get_distribution, DistributionNotFound 24 | try: 25 | __version__ = get_distribution(__name__).version 26 | except DistributionNotFound: 27 | __version__ = 'x.y.z' 28 | del get_distribution, DistributionNotFound 29 | 30 | __author__ = 'Christoph Wehmeyer' 31 | __email__ = 'christoph.wehmeyer@fu-berlin.de' 32 | 33 | from .api import pca, tica, ae, vae, vampnet 34 | from .models import PCA, TICA, AE, VAE, VAMPNet 35 | from . import utils 36 | from . import toymodels 37 | -------------------------------------------------------------------------------- /vampnet/setup.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | from setuptools import setup, find_packages 19 | 20 | description = ''' 21 | Collection of functions to implement neural networks based 22 | on the variational approach for Markov processes, 23 | as described in https://arxiv.org/abs/1710.06012 24 | ''' 25 | 26 | setup( 27 | use_scm_version=dict(root='..', relative_to=__file__), 28 | name='vampnet', 29 | author='Andreas Mardt, Luca Pasquali', 30 | author_email='andreas.mardt@fu-berlin.de, luca.pasquali@fu-berlin.de', 31 | url='https://github.com/markovmodel/deeptime/tree/master/vampnet', 32 | description=description, 33 | packages=find_packages(), 34 | setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'], 35 | install_requires=[ 36 | 'numpy', 37 | 'scipy', 38 | 'matplotlib'], 39 | zip_safe=False) 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | Untitled*.ipynb 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | .idea/ 104 | -------------------------------------------------------------------------------- /vampnet/README.md: -------------------------------------------------------------------------------- 1 | # VAMPnet 2 | Variational Approach for Markov Processes networks. 3 | 4 | 5 | ## What is it? 6 | VAMPnet is an open source Python package for the implementation of the VAMPnet method for dynamical systems analysis (described in https://www.nature.com/articles/s41467-017-02388-1). 
It includes loss functions, metrics, basic estimators for Koopman operators, and the most important validation tools for Koopman models. 7 | 8 | VAMPnet can be used from Jupyter (formerly IPython, recommended) or by 9 | writing Python scripts. 10 | 11 | 12 | ## Citation 13 | If you use VAMPnet in scientific work, please cite: 14 | 15 |     Mardt, A., Pasquali, L., Wu, H., & Noé, F. (2018). 16 |     VAMPnets for deep learning of molecular kinetics. 17 |     Nature communications, 9(1), 5. 18 | 19 | ## Installation 20 | 21 | IMPORTANT: Tensorflow 1.7 and 1.8 have an unresolved issue that causes the 22 | eigenvalue decomposition to fail. This issue does not occur on TF 1.4-1.6 23 | and 1.9+, so please use one of those releases instead. 24 | 25 | This package requires [Tensorflow](https://www.tensorflow.org). 26 | Please install either tensorflow or tensorflow-gpu. Installation instructions: 27 | 28 | https://www.tensorflow.org/install/ 29 | 30 | To install this package, first clone the repository: 31 | 32 |     git clone https://github.com/markovmodel/deeptime.git 33 | 34 | Then install it with: 35 | 36 | ```bash 37 | python setup.py install 38 | ``` 39 | 40 | The examples are Jupyter notebooks, so the jupyter package is needed to run them: 41 | 42 | http://jupyter.readthedocs.io/en/latest/install.html 43 | 44 | This is not needed if you only want to use the package. 45 | 46 | 47 | If you want to run the alanine dipeptide example, you'll also need to install the mdshare package (needed to download the trajectory files): 48 | 49 |     git clone https://github.com/markovmodel/mdshare.git 50 |     pip install ./mdshare 51 | 52 | or 53 | 54 |     conda install mdshare -c conda-forge 55 | 56 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/setup.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | from setuptools import setup, find_packages 19 | from setuptools.command.test import test as TestCommand 20 | import sys 21 | 22 | try: 23 |     import torch 24 | except ImportError: 25 |     # setup.py forces pytorch installation via pip and ignores an existing 26 |     # conda installation. That's why we catch this here... 
27 | print( 28 | 'Please install pytorch >=0.2.0_4 according to the instructions on ' 29 | 'http://pytorch.org before you continue!') 30 | sys.exit(1) 31 | 32 | class PyTest(TestCommand): 33 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 34 | def initialize_options(self): 35 | TestCommand.initialize_options(self) 36 | self.pytest_args = ['tae'] 37 | def run_tests(self): 38 | import pytest 39 | errno = pytest.main(self.pytest_args) 40 | sys.exit(errno) 41 | 42 | setup( 43 | cmdclass={'test': PyTest}, 44 | use_scm_version=dict(root='..', relative_to=__file__), 45 | name='tae', 46 | author='Christoph Wehmeyer', 47 | author_email='christoph.wehmeyer@fu-berlin.de', 48 | url='https://github.com/markovmodel/deeptime/tree/master/time-lagged-autoencoder', 49 | description='A toolbox for dimension reduction of time series data with a time-lagged autoencoder.', 50 | packages=find_packages(), 51 | setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'], 52 | install_requires=['numpy'], 53 | tests_require=['pytest'], 54 | zip_safe=False) 55 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_toymodels.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | import numpy as np 19 | from ..toymodels import sample_hmm 20 | from ..toymodels import sample_sqrt_model 21 | from ..toymodels import sample_swissroll_model 22 | 23 | def run_sample_hmm(ndim, nstates): 24 |     length = 10000 25 |     states = [np.random.randn(ndim) for i in range(nstates)] 26 |     cov = np.random.rand(ndim, ndim) 27 |     cov = np.matmul(cov.T, cov) 28 |     transition_matrix = np.random.rand(nstates, nstates) 29 |     transition_matrix = transition_matrix + transition_matrix.T 30 |     transition_matrix /= transition_matrix.sum() 31 |     pi = transition_matrix.sum(axis=1) 32 |     transition_matrix /= pi[:, None] 33 |     traj, dtraj = sample_hmm(length, cov, states, transition_matrix) 34 |     sets = [np.where(dtraj == state)[0] for state in range(nstates)] 35 |     np.testing.assert_allclose( 36 |         [float(len(s)) / float(length) for s in sets], 37 |         pi, atol=0.1) 38 |     for i, s in enumerate(sets): 39 |         mean = np.mean(traj[s, :], axis=0) 40 |         np.testing.assert_allclose( 41 |             mean, states[i], atol=0.2) 42 |         traj[s, :] -= mean 43 |     np.testing.assert_allclose(np.cov(traj.T), cov, atol=0.2) 44 | 45 | def test_sample_hmm_random(): 46 |     for _ in range(3): 47 |         ndim = np.random.randint(low=2, high=5) 48 |         nstates = np.random.randint(low=2, high=5) 49 |         run_sample_hmm(ndim, nstates) 50 | 51 | def test_sample_sqrt_model(): 52 |     traj, dtraj = sample_sqrt_model(20000) 53 |     np.testing.assert_allclose( 54 |         np.mean(traj, axis=0), [0.0, 1.9], atol=0.2) 55 |     np.testing.assert_allclose( 56 |         np.std(traj, axis=0, ddof=1), [5.5, 1.3], atol=0.2) 57 | 58 | def test_sample_swissroll_model(): 59 |     traj, dtraj = sample_swissroll_model(20000) 60 |     np.testing.assert_allclose( 61 |         np.mean(traj, axis=0), [-3.1, 11.2, 4.9], atol=1.0) 62 |     np.testing.assert_allclose( 63 |         np.std(traj, axis=0, ddof=1), [7.9, 3.8, 6.7], atol=0.4) 64 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/README.md: -------------------------------------------------------------------------------- 1 | # time-lagged autoencoder 2 | 3 | A toolbox for dimension reduction of time series data with a [time-lagged autoencoder](https://arxiv.org/abs/1710.11239)-type deep neural network. 4 | 5 | ## Installation 6 | Make sure to install pytorch via conda (instructions on http://pytorch.org) before you install the present module with 7 | 8 | ```bash 9 | python setup.py test 10 | python setup.py install 11 | ``` 12 | 13 | To run the included benchmarks, you also need to install the packages [pyemma](https://github.com/markovmodel/pyemma) and [mdshare](https://github.com/markovmodel/mdshare). 14 | 15 | ## Methods 16 | This package implements 17 | - principal component analysis (PCA), 18 | - time-lagged independent component analysis (TICA), 19 | - time-lagged canonical correlation analysis (via TICA), 20 | - kinetic maps (via TICA), and 21 | - an autoencoder-type neural network (AE) trained in a time-lagged manner. 22 | 23 | ## Example 24 | Assume that `data` is a single `numpy.ndarray(shape=[n_frames, n_features])` object or a list thereof, where `n_frames` refers to the number of timesteps in each trajectory and `n_features` refers to the number of features extracted from the original molecular dynamics (MD) data. 
Now choose a target dimensionality `dim` and a transformation lag time `lag`, and run: 25 | 26 | ```python 27 | import tae 28 | 29 | # run PCA 30 | pca_transformed_data, pca_train_loss, pca_val_loss = tae.pca(data, dim=dim) 31 | 32 | # run TICA 33 | tica_transformed_data, tica_train_loss, tica_val_loss = tae.tica(data, dim=dim, lag=lag) 34 | 35 | # run AE 36 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae(data, dim=dim, lag=lag) 37 | 38 | # run VAE 39 | vae_transformed_data, vae_train_loss, vae_val_loss = tae.vae(data, dim=dim, lag=lag) 40 | 41 | # run AE on a GPU 42 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae(data, dim=dim, lag=lag, cuda=True) 43 | ``` 44 | 45 | In this example, we get `*_val_loss=None` because we are training on the full data set. To exclude a randomly chosen fraction `fval` of the data from the training, add the parameter `validation_split=fval` to the function calls, e.g.: 46 | 47 | ```python 48 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae( 49 | data, dim=dim, lag=lag, validation_split=fval, cuda=True) 50 | ``` 51 | 52 | ## Citation 53 | ``` 54 | @article{time-lagged-autoencoder, 55 | Author = {Christoph Wehmeyer and Frank No{\'{e}}}, 56 | Doi = {10.1063/1.5011399}, 57 | Journal = {J. Chem. Phys.}, 58 | Month = {jun}, 59 | Number = {24}, 60 | Pages = {241703}, 61 | Publisher = {{AIP} Publishing}, 62 | Title = {Time-lagged autoencoders: Deep learning of slow collective variables for molecular kinetics}, 63 | Volume = {148}, 64 | Year = {2018}} 65 | ``` 66 | 67 | ## Development system 68 | This project was developed using the following python environment: 69 | 70 | | package | version | | channel | 71 | |:---|:---|:---|:---| 72 | | python | 3.6.1 | 2 | | 73 | | conda | 4.3.29 | py36_0 | conda-forge | 74 | | numpy | 1.13.3 | py36_blas_openblas_200 [blas_openblas] | conda-forge | 75 | | pytorch | 0.2.0 | py36_4cu75 | soumith | 76 | | pyemma | 2.4 | np113py36_1 | conda-forge | 77 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_models.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | import numpy as np 19 | from torch.utils.data import DataLoader 20 | from ..utils import create_dataset 21 | from ..utils import whiten_data 22 | from ..models import PCA 23 | from ..models import TICA 24 | from ..models import AE 25 | 26 | def generate_data_2state_hmm(length=10000, lag=0, batch_size=100): 27 | transition_matrix = np.asarray([[0.9, 0.1], [0.1, 0.9]]) 28 | dtraj = np.zeros(shape=(length,), dtype=np.intc) 29 | for i in range(1, length): 30 | dtraj[i] = np.random.choice( 31 | 2, size=1, p=transition_matrix[dtraj[i - 1], :]) 32 | traj = np.random.randn(len(dtraj)) 33 | traj[np.where(dtraj == 1)[0]] += 2.0 34 | traj_stacked = np.vstack((traj, np.zeros(len(traj)))) 35 | phi = np.random.rand() * 2.0 * np.pi 36 | rot = np.asarray([ 37 | [np.cos(phi), -np.sin(phi)], 38 | [np.sin(phi), np.cos(phi)]]) 39 | traj_rot = np.dot(rot, traj_stacked).T 40 | return traj, \ 41 | DataLoader( 42 | create_dataset(traj_rot, lag=lag), 43 | batch_size=batch_size, 44 | shuffle=True), \ 45 | DataLoader( 46 | create_dataset(traj_rot, lag=0), 47 | batch_size=batch_size) 48 | 49 | ################################################################################ 50 | # 51 | # PCA 52 | # 53 | ################################################################################ 54 | 55 | def test_pca_2state_hmm(): 56 | traj, train_loader, transform_loader = generate_data_2state_hmm() 57 | pca = PCA() 58 | pca.fit(train_loader, dim=1) 59 | out = whiten_data(pca.transform(transform_loader)).numpy().reshape((-1,)) 60 | traj -= np.mean(traj) 61 | traj /= np.std(traj, ddof=1) 62 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001) 63 | 64 | ################################################################################ 65 | # 66 | # TICA 67 | # 68 | ################################################################################ 69 | 70 | def test_tica_2state_hmm(): 71 | traj, train_loader, transform_loader = generate_data_2state_hmm(lag=1) 72 | tica = TICA() 73 | tica.fit(train_loader, dim=1) 74 | out = whiten_data(tica.transform(transform_loader)).numpy().reshape((-1,)) 75 | traj -= np.mean(traj) 76 | traj /= np.std(traj, ddof=1) 77 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001) 78 | 79 | ################################################################################ 80 | # 81 | # AUTOENCODER 82 | # 83 | ################################################################################ 84 | 85 | def test_ae_2state_hmm(): 86 | traj, train_loader, transform_loader = generate_data_2state_hmm(lag=1) 87 | ae = AE(2, 1, bias=False, alpha=None) 88 | ae.fit(train_loader, 20) 89 | out = whiten_data(ae.transform(transform_loader)).numpy().reshape((-1,)) 90 | traj -= np.mean(traj) 91 | traj /= np.std(traj, ddof=1) 92 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001) 93 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_api.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | import numpy as np 19 | from torch.utils.data import DataLoader 20 | from ..utils import create_dataset 21 | from ..utils import whiten_data 22 | from ..api import pca 23 | from ..api import tica 24 | from ..api import ae 25 | 26 | def generate_data_2state_hmm(length=10000): 27 | transition_matrix = np.asarray([[0.9, 0.1], [0.1, 0.9]]) 28 | phi = np.random.rand() * 2.0 * np.pi 29 | rot = np.asarray([ 30 | [np.cos(phi), -np.sin(phi)], 31 | [np.sin(phi), np.cos(phi)]]) 32 | trajs, rtrajs = [], [] 33 | for _ in range(np.random.randint(1, 5)): 34 | dtraj = np.zeros( 35 | shape=(length + np.random.randint(100),), dtype=np.intc) 36 | for i in range(1, len(dtraj)): 37 | dtraj[i] = np.random.choice( 38 | 2, size=1, p=transition_matrix[dtraj[i - 1], :]) 39 | traj = np.random.randn(len(dtraj)) 40 | traj[np.where(dtraj == 1)[0]] += 2.0 41 | traj_stacked = np.vstack((traj, np.zeros(len(traj)))) 42 | traj_rot = np.dot(rot, traj_stacked).T 43 | trajs.append(traj[:]) 44 | rtrajs.append(traj_rot[:]) 45 | if len(trajs) == 1: 46 | trajs = trajs[0] 47 | rtrajs = rtrajs[0] 48 | else: 49 | trajs = np.concatenate(trajs) 50 | trajs -= np.mean(trajs) 51 | trajs /= np.std(trajs, ddof=1) 52 | return trajs, rtrajs 53 | 54 | def checkpout_output(ref, data, out): 55 | if isinstance(data, (list, tuple)): 56 | np.testing.assert_array_equal( 57 | [o.shape[0] for o in out], 58 | [d.shape[0] for d in data]) 59 | out = np.concatenate(out) 60 | else: 61 | assert data.shape[0] == out.shape[0] 62 | out = out.reshape(-1) 63 | np.testing.assert_allclose(np.abs(np.mean(ref * out)), 1.0, atol=0.001) 64 | 65 | ################################################################################ 66 | # 67 | # PCA 68 | # 69 | ################################################################################ 70 | 71 | def test_pca_2state_hmm(): 72 | ref, data = generate_data_2state_hmm() 73 | out, train_loss, test_loss = pca(data, dim=1, whiten=True) 74 | checkpout_output(ref, data, out) 75 | 76 | ################################################################################ 77 | # 78 | # TICA 79 | # 80 | ################################################################################ 81 | 82 | def test_tica_2state_hmm(): 83 | ref, data = generate_data_2state_hmm() 84 | out, train_loss, test_loss = tica(data, dim=1, lag=1, whiten=True) 85 | checkpout_output(ref, data, out) 86 | 87 | ################################################################################ 88 | # 89 | # AUTOENCODER 90 | # 91 | ################################################################################ 92 | 93 | def test_ae_2state_hmm(): 94 | ref, data = generate_data_2state_hmm() 95 | out, train_loss, test_loss = ae( 96 | data, dim=1, lag=1, n_epochs=20, whiten=True, 97 | bias=False, hid_size=[], alpha=None) 98 | checkpout_output(ref, data, out) 99 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/toymodels.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | A collection of "difficult" toymodels. 20 | ''' 21 | 22 | import numpy as _np 23 | 24 | __all__ = ['sample_sqrt_model', 'sample_swissroll_model'] 25 | 26 | def sample_hmm(length, cov, states, transition_matrix): 27 |     '''Sample a hidden state trajectory and n-dimensional emissions. 28 | 29 |     We sample a hidden state trajectory using the given transition matrix. For 30 |     each hidden state, we compute Gaussian noise around the center of the state 31 |     using the given covariance matrix. 32 | 33 |     Arguments: 34 |         length (int): length of the resulting trajectories 35 |         cov (array-like of float): covariance matrix for the noise 36 |         states (array-like of float): centers for each state's emissions 37 |         transition_matrix (array-like of float): a transition matrix 38 |     ''' 39 |     cov = _np.asarray(cov, dtype=_np.float32) 40 |     states = _np.asarray(states, dtype=_np.float32) 41 |     transition_matrix = _np.asarray(transition_matrix, dtype=_np.float32) 42 |     dtraj = _np.zeros(shape=(length,), dtype=_np.intc) 43 |     dtraj[0] = _np.random.randint(low=0, high=len(states)) 44 |     for i in range(1, length): 45 |         dtraj[i] = _np.random.choice( 46 |             len(states), size=1, p=transition_matrix[dtraj[i - 1], :]) 47 |     traj = states[dtraj, :] + _np.random.multivariate_normal( 48 |         _np.zeros(len(cov)), cov, size=length, check_valid='ignore') 49 |     return traj, dtraj 50 | 51 | def sqrt_transform(traj): 52 |     '''Mask an emission trajectory using a sqrt transform. 53 | 54 |     We add the square root of the first dimension (which ideally has a large 55 |     variance) to the second (which is ideally the slowest degree of freedom) 56 |     to mask the slow process. 57 | 58 |     Arguments: 59 |         traj (array-like of float): a trajectory of emissions 60 |     ''' 61 |     transformed_traj = _np.asarray(traj).copy() 62 |     transformed_traj[:, 1] += _np.sqrt(_np.abs(traj[:, 0])) 63 |     return transformed_traj 64 | 65 | def sample_sqrt_model(length): 66 |     '''Sample a hidden state and a sqrt-transformed emission trajectory. 67 | 68 |     We sample a hidden state trajectory and sqrt-masked emissions in two 69 |     dimensions such that the two metastable states are not linearly separable. 70 | 71 |     Arguments: 72 |         length (int): length of the resulting trajectories 73 |     ''' 74 |     cov = [[30.0, 0.0], [0.0, 0.015]] 75 |     states = [[0.0, 1.0], [0.0, -1.0]] 76 |     transition_matrix = [[0.95, 0.05], [0.05, 0.95]] 77 |     traj, dtraj = sample_hmm(length, cov, states, transition_matrix) 78 |     return sqrt_transform(traj), dtraj 79 | 80 | def swissroll_transform(traj): 81 |     '''Mask an emission trajectory using a swissroll transform. 82 | 83 |     We roll two-dimensional emissions into a swissroll-style manifold in three 84 |     dimensions. 
85 | 86 | Arguments: 87 | traj (array-like of float): a trajectory of emissions 88 | ''' 89 | x = traj[:, 0] 90 | return _np.vstack([x * _np.cos(x), traj[:, 1], x * _np.sin(x)]).T 91 | 92 | def sample_swissroll_model(length): 93 | '''Sample a hidden state and a swissroll-transformed emission trajectory. 94 | 95 | We sample a hidden state trajectory and swissroll-masked emissions in two 96 | dimensions such that the four metastable states are not linearly separable. 97 | 98 | Arguments: 99 | length (int): length of the resulting trajectories 100 | ''' 101 | cov = [[1.0, 0.0], [0.0, 1.0]] 102 | states = [[7.5, 7.5], [7.5, 15.0], [15.0, 15.0], [15.0, 7.5]] 103 | transition_matrix = [ 104 | [0.95, 0.05, 0.00, 0.00], 105 | [0.05, 0.90, 0.05, 0.00], 106 | [0.00, 0.05, 0.90, 0.05], 107 | [0.00, 0.00, 0.05, 0.95]] 108 | traj, dtraj = sample_hmm(length, cov, states, transition_matrix) 109 | return swissroll_transform(traj), dtraj 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/api.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | A simple API to apply PCA, TICA, and AE to time series data. 20 | ''' 21 | 22 | from .models import PCA as _PCA 23 | from .models import TICA as _TICA 24 | from .models import AE as _AE 25 | from .models import VAE as _VAE 26 | from .models import VAMPNet as _VAMPNet 27 | from .utils import create_dataset as _create_dataset 28 | from .utils import random_split as _random_split 29 | from .utils import random_block_split as _random_block_split 30 | from .utils import whiten_data as _whiten_data 31 | from torch import nn as _nn 32 | from torch.utils.data import DataLoader as _DataLoader 33 | 34 | def _transform(model, data, data_0, batch_size, whiten, pin_memory=False): 35 | loader = _DataLoader(data_0, batch_size=batch_size, pin_memory=pin_memory) 36 | if whiten: 37 | transformed_data = _whiten_data(model.transform(loader)).numpy() 38 | else: 39 | transformed_data = model.transform(loader).numpy() 40 | if isinstance(data, (list, tuple)): 41 | collect = [] 42 | p = 0 43 | lengths = [d.shape[0] for d in data] 44 | for length in lengths: 45 | collect.append(transformed_data[p:p+length, :]) 46 | p += length 47 | return collect 48 | return transformed_data 49 | 50 | def pca(data, dim=None, validation_split=None, batch_size=100, whiten=False): 51 | '''Perform a principal component analysis for dimensionality reduction. 52 | 53 | We compute the first eigenvectors of the instantaneous covariance 54 | matrix and use them to rotate/project the data into a lower dimensional 55 | subspace. 56 | 57 | Arguments: 58 | data (numpy-ndarray of list thereof): the data to be transformed 59 | dim (int): the target dimensionality 60 | validation_split (float): fraction of the data reserved for validation 61 | batch_size (int): specify a batch size for the minibatch process 62 | whiten (boolean): set to True to whiten the transformed data 63 | 64 | Returns: 65 | (numpy.ndarray of list thereof): the transformed data 66 | (float): training loss 67 | (float): validation loss 68 | ''' 69 | data_0 = _create_dataset(data, lag=0) 70 | if validation_split is None: 71 | train_loader = _DataLoader(data_0, batch_size=batch_size) 72 | test_loader = None 73 | else: 74 | data_test, data_train = _random_split( 75 | data_0, f_active=validation_split) 76 | train_loader = _DataLoader(data_train, batch_size=batch_size) 77 | test_loader = _DataLoader(data_test, batch_size=batch_size) 78 | model = _PCA() 79 | train_loss, test_loss = model.fit( 80 | train_loader, dim=dim, test_loader=test_loader) 81 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 82 | return transformed_data, train_loss, test_loss 83 | 84 | def tica( 85 | data, dim=None, lag=1, kinetic_map=True, symmetrize=False, 86 | validation_split=None, batch_size=100, whiten=False): 87 | '''Perform a time-lagged independent component analysis for 88 | dimensionality reduction. 89 | 90 | We compute a rank-d approximation to the Koopman operator and use it to 91 | rotate/project the data into a lower dimensional subspace. 
92 | 93 | Arguments: 94 | data (numpy-ndarray of list thereof): the data to be transformed 95 | dim (int): the target dimensionality 96 | lag (int): specifies the lag in time steps 97 | kinetic_map (boolean): use the kinetic map variant of TICA 98 | symmetrize (boolean): enforce symmetry and reversibility 99 | validation_split (float): fraction of the data reserved for validation 100 | batch_size (int): specify a batch size for the minibatch process 101 | whiten (boolean): set to True to whiten the transformed data 102 | 103 | Returns: 104 | (numpy.ndarray of list thereof): the transformed data 105 | (float): training loss 106 | (float): validation loss 107 | ''' 108 | data_0 = _create_dataset(data, lag=0) 109 | data_lag = _create_dataset(data, lag=lag) 110 | if validation_split is None: 111 | train_loader = _DataLoader(data_lag, batch_size=batch_size) 112 | test_loader = None 113 | else: 114 | data_test, data_train = _random_block_split( 115 | data_lag, lag, f_active=validation_split) 116 | train_loader = _DataLoader(data_train, batch_size=batch_size) 117 | test_loader = _DataLoader(data_test, batch_size=batch_size) 118 | model = _TICA(kinetic_map=kinetic_map, symmetrize=symmetrize) 119 | train_loss, test_loss = model.fit( 120 | train_loader, dim=dim, test_loader=test_loader) 121 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 122 | return transformed_data, train_loss, test_loss 123 | 124 | def ae( 125 | data, dim=None, lag=1, n_epochs=50, validation_split=None, 126 | batch_size=100, whiten=False, pin_memory=False, **kwargs): 127 | '''Use a time-lagged autoencoder model for dimensionality reduction. 128 | 129 | We train a deep (or shallow) time-lagged autoencoder type neural network 130 | and use the first half (encoder stage) to transform the supplied data. 
131 | 132 | Arguments: 133 | data (numpy-ndarray of list thereof): the data to be transformed 134 | dim (int): the target dimensionality 135 | lag (int): specifies the lag in time steps 136 | n_epochs (int): number of training epochs 137 | validation_split (float): fraction of the data reserved for validation 138 | batch_size (int): specify a batch size for the minibatch process 139 | whiten (boolean): set to True to whiten the transformed data 140 | pin_memory (boolean): make DataLoaders return pinned memory 141 | 142 | Returns: 143 | (numpy.ndarray of list thereof): the transformed data 144 | (list of float): training loss 145 | (list of float): validation loss 146 | ''' 147 | ae_args = dict( 148 | hid_size=[100], 149 | dropout=0.5, 150 | alpha=0.01, 151 | prelu=False, 152 | bias=True, 153 | lr=0.001, 154 | cuda=False, 155 | non_blocking=False) 156 | ae_args.update(kwargs) 157 | try: 158 | size = data.shape[1] 159 | except AttributeError: 160 | size = data[0].shape[1] 161 | data_0 = _create_dataset(data, lag=0) 162 | data_lag = _create_dataset(data, lag=lag) 163 | if validation_split is None: 164 | train_loader = _DataLoader( 165 | data_lag, batch_size=batch_size, pin_memory=pin_memory) 166 | test_loader = None 167 | else: 168 | data_test, data_train = _random_block_split( 169 | data_lag, lag, f_active=validation_split) 170 | train_loader = _DataLoader( 171 | data_train, batch_size=batch_size, pin_memory=pin_memory) 172 | test_loader = _DataLoader( 173 | data_test, batch_size=batch_size, pin_memory=pin_memory) 174 | model = _AE(size, dim, **ae_args) 175 | train_loss, test_loss = model.fit( 176 | train_loader, n_epochs, test_loader=test_loader) 177 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 178 | return transformed_data, train_loss, test_loss 179 | 180 | def vae( 181 | data, dim=None, lag=1, n_epochs=50, validation_split=None, 182 | batch_size=100, whiten=False, pin_memory=False, **kwargs): 183 | '''Use a time-lagged variational autoencoder model for dimensionality 184 | reduction. 185 | 186 | We train a deep (or shallow) time-lagged variational autoencoder type 187 | neural network and use the first half (encoder stage) to transform the 188 | supplied data. 
189 | 190 | Arguments: 191 | data (numpy-ndarray of list thereof): the data to be transformed 192 | dim (int): the target dimensionality 193 | lag (int): specifies the lag in time steps 194 | n_epochs (int): number of training epochs 195 | validation_split (float): fraction of the data reserved for validation 196 | batch_size (int): specify a batch size for the minibatch process 197 | whiten (boolean): set to True to whiten the transformed data 198 | pin_memory (boolean): make DataLoaders return pinned memory 199 | 200 | Returns: 201 | (numpy.ndarray of list thereof): the transformed data 202 | (list of float): training loss 203 | (list of float): validation loss 204 | ''' 205 | vae_args = dict( 206 | hid_size=[100], 207 | beta=1.0, 208 | dropout=0.5, 209 | alpha=0.01, 210 | prelu=False, 211 | bias=True, 212 | lr=0.001, 213 | cuda=False, 214 | non_blocking=False) 215 | vae_args.update(kwargs) 216 | try: 217 | size = data.shape[1] 218 | except AttributeError: 219 | size = data[0].shape[1] 220 | data_0 = _create_dataset(data, lag=0) 221 | data_lag = _create_dataset(data, lag=lag) 222 | if validation_split is None: 223 | train_loader = _DataLoader( 224 | data_lag, batch_size=batch_size, pin_memory=pin_memory) 225 | test_loader = None 226 | else: 227 | data_test, data_train = _random_block_split( 228 | data_lag, lag, f_active=validation_split) 229 | train_loader = _DataLoader( 230 | data_train, batch_size=batch_size, pin_memory=pin_memory) 231 | test_loader = _DataLoader( 232 | data_test, batch_size=batch_size, pin_memory=pin_memory) 233 | model = _VAE(size, dim, **vae_args) 234 | train_loss, test_loss = model.fit( 235 | train_loader, n_epochs, test_loader=test_loader) 236 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 237 | return transformed_data, train_loss, test_loss 238 | 239 | ################################################################################ 240 | # 241 | # VAMPNET WORK IN PROGRESS 242 | # 243 | ################################################################################ 244 | 245 | def vampnet( 246 | data, dim=None, lag=1, n_epochs=50, validation_split=None, 247 | batch_size=100, whiten=False, pin_memory=False, **kwargs): 248 | '''Use a vampnet model for dimensionality reduction and/or clustering. 249 | 250 | .... 
251 | 252 | Arguments: 253 | data (numpy-ndarray of list thereof): the data to be transformed 254 | dim (int): the target dimensionality 255 | lag (int): specifies the lag in time steps 256 | n_epochs (int): number of training epochs 257 | validation_split (float): fraction of the data reserved for validation 258 | batch_size (int): specify a batch size for the minibatch process 259 | whiten (boolean): set to True to whiten the transformed data 260 | pin_memory (boolean): make DataLoaders return pinned memory 261 | 262 | Returns: 263 | (numpy.ndarray of list thereof): the transformed data 264 | (list of float): training score 265 | (list of float): validation score 266 | ''' 267 | vn_args = dict( 268 | hid_size=[100], 269 | dropout=0.5, 270 | alpha=0.01, 271 | prelu=False, 272 | bias=True, 273 | lr=0.001, 274 | cuda=False, 275 | non_blocking=False) 276 | vn_args.update(kwargs) 277 | try: 278 | size = data.shape[1] 279 | except AttributeError: 280 | size = data[0].shape[1] 281 | data_0 = _create_dataset(data, lag=0) 282 | data_lag = _create_dataset(data, lag=lag) 283 | if validation_split is None: 284 | train_loader = _DataLoader( 285 | data_lag, batch_size=batch_size, pin_memory=pin_memory) 286 | test_loader = None 287 | else: 288 | data_test, data_train = _random_block_split( 289 | data_lag, lag, f_active=validation_split) 290 | train_loader = _DataLoader( 291 | data_train, batch_size=batch_size, pin_memory=pin_memory) 292 | test_loader = _DataLoader( 293 | data_test, batch_size=batch_size, pin_memory=pin_memory) 294 | model = _VAMPNet(size, dim, **vn_args) 295 | train_loss, test_loss = model.fit( 296 | train_loader, n_epochs, test_loader=test_loader) 297 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 298 | train_loss = [-loss for loss in train_loss] 299 | test_loss = [-loss for loss in test_loss] 300 | return transformed_data, train_loss, test_loss 301 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_utils.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | import numpy as np 19 | import torch 20 | from torch.utils.data import DataLoader 21 | from ..utils import LaggedDataset 22 | from ..utils import MaskedDataset 23 | from ..utils import ensure_traj_format 24 | from ..utils import create_dataset 25 | from ..utils import stride_split 26 | from ..utils import random_split 27 | from ..utils import random_block_split 28 | from ..utils import get_mean 29 | from ..utils import get_covariance 30 | from ..utils import get_sqrt_inverse 31 | from ..utils import whiten_data 32 | from ..utils import cca 33 | from ..utils import BaseTransform 34 | from ..utils import Transform 35 | 36 | ################################################################################ 37 | # 38 | # DATASETS 39 | # 40 | ################################################################################ 41 | 42 | def test_lagged_dataset_at_default_lag(): 43 | data = np.arange( 44 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 45 | dataset = LaggedDataset(torch.Tensor(data), lag=0) 46 | for x, y in dataset: 47 | assert x[0] == y[0] 48 | 49 | def test_lagged_dataset_at_lag0(): 50 | data = np.arange( 51 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 52 | dataset = LaggedDataset(torch.Tensor(data), lag=0) 53 | for x, y in dataset: 54 | assert x[0] == y[0] 55 | 56 | def test_lagged_dataset_at_random_lag(): 57 | data = np.arange( 58 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 59 | lag = 1 + np.random.randint(50) 60 | dataset = LaggedDataset(torch.Tensor(data), lag) 61 | for x, y in dataset: 62 | assert x[0] + lag == y[0] 63 | 64 | def test_masked_dataset(): 65 | data = np.arange( 66 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 67 | active = np.random.choice(data[:, 0], size=100, replace=False) 68 | dataset = MaskedDataset(LaggedDataset(torch.Tensor(data), lag=0), active) 69 | assert len(dataset) == len(active) 70 | for (x, y), z in zip(dataset, active): 71 | assert x[0] == y[0] == z 72 | 73 | def test_ensure_traj_format_1d(): 74 | raw_data = np.arange(800 + np.random.randint(200)) 75 | data = ensure_traj_format(raw_data) 76 | assert isinstance(data, np.ndarray) 77 | assert data.dtype == np.float32 78 | assert data.ndim == 2 79 | np.testing.assert_array_equal(data.shape, [len(raw_data), 1]) 80 | np.testing.assert_allclose(raw_data.astype(np.float32), data[:, 0]) 81 | 82 | def test_ensure_traj_format_2d(): 83 | raw_data = np.arange(800 + np.random.randint(200)).reshape(-1, 1) 84 | data = ensure_traj_format(raw_data) 85 | assert isinstance(data, np.ndarray) 86 | assert data.dtype == np.float32 87 | assert data.ndim == 2 88 | np.testing.assert_array_equal(data.shape, raw_data.shape) 89 | np.testing.assert_allclose(raw_data.astype(np.float32), data) 90 | 91 | def test_create_dataset_single_file_1d(): 92 | data = np.arange( 93 | 800 + np.random.randint(200)) 94 | lag = np.random.randint(50) 95 | dataset = create_dataset(data, lag, dtype=np.float32) 96 | for x, y in dataset: 97 | assert x[0] + lag == y[0] 98 | 99 | def test_create_dataset_single_file_2d(): 100 | data = np.arange( 101 | 800 + np.random.randint(200)).reshape(-1, 1) 102 | lag = np.random.randint(50) 103 | dataset = create_dataset(data, lag, dtype=np.float32) 104 | for x, y in dataset: 105 | assert x[0] + lag == y[0] 106 | 107 | def test_create_dataset_multiple_files_1d(): 108 | data = [np.arange(800 + np.random.randint(200)) for _ in range(3)] 109 | lag = np.random.randint(50) 110 | dataset = create_dataset(data, lag, dtype=np.float32) 
111 | for x, y in dataset: 112 | assert x[0] + lag == y[0] 113 | 114 | def test_create_dataset_multiple_files_2d(): 115 | data = [np.arange( 116 | 800 + np.random.randint(200)).reshape(-1, 1) for _ in range(3)] 117 | lag = np.random.randint(50) 118 | dataset = create_dataset(data, lag, dtype=np.float32) 119 | for x, y in dataset: 120 | assert x[0] + lag == y[0] 121 | 122 | def test_stride_split(): 123 | data = np.arange( 124 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 125 | lag = 1 + np.random.randint(50) 126 | dataset = LaggedDataset(torch.Tensor(data), lag) 127 | stride = 1 + np.random.randint(10) 128 | offset = np.random.randint(stride) 129 | dataset_a, dataset_b = stride_split(dataset, stride=stride, offset=offset) 130 | assert len(dataset) == len(dataset_a) + len(dataset_b) 131 | for x, y in dataset_a: 132 | assert x[0] + lag == y[0] 133 | for x, y in dataset_b: 134 | assert x[0] + lag == y[0] 135 | 136 | def test_random_split(): 137 | data = np.arange( 138 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 139 | lag = 1 + np.random.randint(50) 140 | dataset = LaggedDataset(torch.Tensor(data), lag) 141 | dataset_a, dataset_b = random_split(dataset, f_active=0.5) 142 | assert len(dataset) == len(dataset_a) + len(dataset_b) 143 | for x, y in dataset_a: 144 | assert x[0] + lag == y[0] 145 | for x, y in dataset_b: 146 | assert x[0] + lag == y[0] 147 | 148 | def test_random_block_split(): 149 | data = np.arange( 150 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 151 | lag = 1 + np.random.randint(50) 152 | dataset = LaggedDataset(torch.Tensor(data), lag) 153 | dataset_a, dataset_b = random_block_split(dataset, lag, f_active=0.5) 154 | assert len(dataset) == len(dataset_a) + len(dataset_b) 155 | for x, y in dataset_a: 156 | assert x[0] + lag == y[0] 157 | for x, y in dataset_b: 158 | assert x[0] + lag == y[0] 159 | 160 | ################################################################################ 161 | # 162 | # STATISTICS 163 | # 164 | ################################################################################ 165 | 166 | def test_get_mean_via_normal_distribution_parameters(): 167 | data = torch.randn(10000, 1) 168 | dataset = LaggedDataset(data, lag=0) 169 | x, y = get_mean( 170 | DataLoader( 171 | dataset, batch_size=np.random.randint(low=10, high=100))) 172 | np.testing.assert_allclose(x.numpy(), 0.0, atol=0.05) 173 | np.testing.assert_allclose(y.numpy(), 0.0, atol=0.05) 174 | 175 | def test_get_mean_via_distribution_symmetry(): 176 | data = torch.rand(5000, 1) 177 | data = torch.cat([data, -data]) 178 | dataset = LaggedDataset(data, lag=0) 179 | x, y = get_mean( 180 | DataLoader( 181 | dataset, batch_size=np.random.randint(low=10, high=100))) 182 | np.testing.assert_allclose(x.numpy(), 0.0, atol=0.0001) 183 | np.testing.assert_allclose(y.numpy(), 0.0, atol=0.0001) 184 | 185 | def test_get_mean_vs_numpy(): 186 | data = torch.randn(10000, 1) 187 | dataset = LaggedDataset(data, lag=0) 188 | x, y = get_mean( 189 | DataLoader( 190 | dataset, batch_size=np.random.randint(low=10, high=100))) 191 | numpy_result = np.mean(data.numpy()) 192 | np.testing.assert_allclose(x.numpy(), numpy_result, atol=0.0001) 193 | np.testing.assert_allclose(y.numpy(), numpy_result, atol=0.0001) 194 | 195 | def test_get_covariance_via_normal_distribution_parameters(): 196 | data = torch.randn(10000, 1) 197 | dataset = LaggedDataset(data, lag=0) 198 | xx, xy, yy = get_covariance( 199 | DataLoader( 200 | dataset, batch_size=np.random.randint(low=10, 
high=100)), 201 | torch.Tensor([0]), torch.Tensor([0])) 202 | np.testing.assert_allclose(xx.numpy(), 1.0, atol=0.1) 203 | np.testing.assert_allclose(xy.numpy(), 1.0, atol=0.1) 204 | np.testing.assert_allclose(yy.numpy(), 1.0, atol=0.1) 205 | 206 | def test_get_covariance_vs_numpy(): 207 | data = torch.randn(10000, 1) 208 | dataset = LaggedDataset(data, lag=0) 209 | xx, xy, yy = get_covariance( 210 | DataLoader( 211 | dataset, batch_size=np.random.randint(low=10, high=100)), 212 | torch.Tensor([0]), torch.Tensor([0])) 213 | numpy_result = np.var(data.numpy(), ddof=1) 214 | np.testing.assert_allclose(xx.numpy(), numpy_result, atol=0.0005) 215 | np.testing.assert_allclose(xy.numpy(), numpy_result, atol=0.0005) 216 | np.testing.assert_allclose(yy.numpy(), numpy_result, atol=0.0005) 217 | 218 | ################################################################################ 219 | # 220 | # WHITENING 221 | # 222 | ################################################################################ 223 | 224 | def test_get_sqrt_inverse(): 225 | dim = 2 + np.random.randint(5) 226 | x = torch.rand(500, dim) 227 | x = torch.mm(x.t(), x) 228 | y = get_sqrt_inverse(x) 229 | y = torch.mm(y, y) 230 | np.testing.assert_allclose( 231 | x.mm(y).numpy(), 232 | np.diag([1.0] * dim).astype(np.float32), 233 | atol=0.0001) 234 | 235 | def test_whiten_data(): 236 | dim = 1 + np.random.randint(5) 237 | x = whiten_data(torch.rand(500, dim)) 238 | np.testing.assert_allclose( 239 | x.numpy().mean(axis=0), 240 | 0.0, 241 | atol=0.01) 242 | np.testing.assert_allclose( 243 | torch.mm(x.t(), x).div_(float(x.size()[0])).numpy(), 244 | np.diag([1.0] * dim).astype(np.float32), 245 | atol=0.01) 246 | 247 | ################################################################################ 248 | # 249 | # CCA 250 | # 251 | ################################################################################ 252 | 253 | def test_cca(): 254 | s = np.arange(1000) 255 | x = torch.from_numpy( 256 | np.vstack((s, np.random.randn(s.shape[0]))).T.astype(np.float32)) 257 | y = torch.from_numpy( 258 | np.vstack((np.random.randn(s.shape[0]), s)).T.astype(np.float32)) 259 | u, s, v = cca(x, y, batch_size=100) 260 | np.testing.assert_allclose(s.numpy(), [1.0, 0.0], atol=0.2) 261 | p = u.mm(torch.diag(s).mm(v)) 262 | np.testing.assert_allclose( 263 | np.abs(p.numpy()), [[0.0, 1.0], [0.0, 0.0]], atol=0.2) 264 | 265 | ################################################################################ 266 | # 267 | # TRANSFORMER 268 | # 269 | ################################################################################ 270 | 271 | def test_base_transform(): 272 | dim = 2 + np.random.randint(5) 273 | mean = 10.0 * (torch.rand(dim) - 0.5) 274 | sigma = torch.rand(dim, dim) 275 | sigma.add_(sigma.t()) 276 | data = torch.randn(500, dim).mm(sigma) + mean[None, :] 277 | loader = DataLoader(LaggedDataset(data, lag=0), batch_size=64) 278 | x_mean, y_mean = get_mean(loader) 279 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 280 | transformer = BaseTransform(mean=x_mean, covariance=cxx) 281 | transformed_data = [] 282 | for x, _ in loader: 283 | transformed_data.append(transformer(x)) 284 | y = torch.cat(transformed_data) 285 | np.testing.assert_allclose( 286 | y.numpy().mean(axis=0), 287 | 0.0, 288 | atol=0.01) 289 | np.testing.assert_allclose( 290 | torch.mm(y.t(), y).div_(float(y.size()[0])).numpy(), 291 | np.diag([1.0] * dim).astype(np.float32), 292 | atol=0.2) 293 | 294 | def test_transform(): 295 | dim = 2 + np.random.randint(5) 296 | mean = 10.0 
* (torch.rand(dim) - 0.5) 297 | sigma = torch.rand(dim, dim) 298 | sigma.add_(sigma.t()) 299 | data = torch.randn(500, dim).mm(sigma) + mean[None, :] 300 | loader = DataLoader(LaggedDataset(data, lag=0), batch_size=64) 301 | x_mean, y_mean = get_mean(loader) 302 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 303 | transformer = Transform( 304 | x_mean=x_mean, x_covariance=cxx, 305 | y_mean=x_mean, y_covariance=cyy) 306 | x_, y_ = [], [] 307 | for x, y in loader: 308 | x, y = transformer(x, y) 309 | x_.append(x) 310 | y_.append(y) 311 | x = torch.cat(x_) 312 | y = torch.cat(y_) 313 | np.testing.assert_allclose( 314 | x.numpy().mean(axis=0), 315 | 0.0, 316 | atol=0.1) 317 | np.testing.assert_allclose( 318 | torch.mm(x.t(), x).div_(float(x.size()[0])).numpy(), 319 | np.diag([1.0] * dim).astype(np.float32), 320 | atol=0.1) 321 | np.testing.assert_allclose( 322 | y.numpy().mean(axis=0), 323 | 0.0, 324 | atol=0.1) 325 | np.testing.assert_allclose( 326 | torch.mm(y.t(), y).div_(float(y.size()[0])).numpy(), 327 | np.diag([1.0] * dim).astype(np.float32), 328 | atol=0.1) 329 | -------------------------------------------------------------------------------- /vampnet/examples/1D_double_well.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "%matplotlib inline\n", 19 | "import vampnet\n", 20 | "from vampnet import data_generator\n", 21 | "from keras.models import Model\n", 22 | "from keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 23 | "from keras import optimizers\n", 24 | "import tensorflow as tf\n", 25 | "from keras.backend import clear_session" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# generate 50000 frames and energy values\n", 35 | "datapoints = int(5e4)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "traj_whole = data_generator.get_asymmetric_double_well_data(datapoints)\n", 45 | "# To fit the dataformat\n", 46 | "traj_whole = np.expand_dims(traj_whole, 1)\n", 47 | "traj_data_points, input_size = traj_whole.shape" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "x = np.linspace(-1,5,500)\n", 57 | "plt.figure(figsize=(6,2))\n", 58 | "plt.ylim(-15,10)\n", 59 | "plt.xlim(-1,5)\n", 60 | "plt.plot(x,data_generator.asymmetric_double_well_energy(x), lw = 2)\n", 61 | "plt.xlabel('Position x / a.u.', fontsize = 16)\n", 62 | "plt.ylabel('Pot. 
energy / a.u.', fontsize = 16)\n", 63 | "plt.xticks(fontsize = 14)\n", 64 | "\n", 65 | "plt.yticks(fontsize = 14);" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# All Hyperparameters\n", 75 | "\n", 76 | "# Tau, how much is the timeshift of the two datasets\n", 77 | "tau = 1\n", 78 | "\n", 79 | "# Batch size for Stochastic Gradient descent\n", 80 | "batch_size = 2048\n", 81 | "\n", 82 | "# Which trajectory points percentage is used as training\n", 83 | "train_ratio = 0.9\n", 84 | "\n", 85 | "# How many hidden layers the network has\n", 86 | "network_depth = 4\n", 87 | "\n", 88 | "# Width of every layer\n", 89 | "layer_width = 20\n", 90 | "nodes = [layer_width]*network_depth\n", 91 | "# Learning rate used for the ADAM optimizer\n", 92 | "learning_rate = 0.0001\n", 93 | "\n", 94 | "# How many output states the network has\n", 95 | "output_size = 5\n", 96 | "\n", 97 | "# Iteration over the training set in the fitting process\n", 98 | "nb_epoch = 300" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "epsilon = 1e-5\n", 108 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# Shuffle trajectory and lagged trajectory together\n", 118 | "length_data = traj_data_points - tau\n", 119 | "\n", 120 | "traj_ord= traj_whole[:length_data]\n", 121 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n", 122 | "\n", 123 | "indexes = np.arange(length_data)\n", 124 | "np.random.shuffle(indexes)\n", 125 | "\n", 126 | "\n", 127 | "\n", 128 | "traj = traj_ord[indexes]\n", 129 | "traj_lag = traj_ord_lag[indexes]\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# Prepare data for tensorflow usage\n", 139 | "length_train = int(np.floor(length_data * train_ratio))\n", 140 | "length_vali = length_data - length_train\n", 141 | "\n", 142 | "traj_data_train = traj[:length_train]\n", 143 | "traj_data_train_lag = traj_lag[:length_train]\n", 144 | "\n", 145 | "traj_data_valid = traj[length_train:]\n", 146 | "traj_data_valid_lag = traj_lag[length_train:]\n", 147 | "\n", 148 | "#Data used for states ordering\n", 149 | "X1 = traj_ord[:length_data].astype('float32')\n", 150 | "X2 = traj_ord_lag[:length_data].astype('float32')\n", 151 | "\n", 152 | "# Input of the first network\n", 153 | "X1_train = traj_data_train.astype('float32')\n", 154 | "X2_train = traj_data_train_lag.astype('float32')\n", 155 | "\n", 156 | "# Input for validation\n", 157 | "X1_vali = traj_data_valid.astype('float32')\n", 158 | "X2_vali = traj_data_valid_lag.astype('float32')\n", 159 | "\n", 160 | "# Needs a Y-train set which we dont have.\n", 161 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n", 162 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "if 'model' in globals():\n", 172 | " del model\n", 173 | " clear_session()\n", 174 | "# Build the model\n", 175 | "Data_X = Input(shape = (input_size,))\n", 176 | "Data_Y = Input(shape = (input_size,))\n", 177 | "\n", 178 | "# A batch normalization layer improves convergence 
speed\n", 179 | "# bn_layer = BatchNormalization()\n", 180 | "bn_layer = Activation('linear')\n", 181 | "\n", 182 | "# Instance layers and assign them to the two lobes of the network\n", 183 | "dense_layers = [Dense(node, activation = 'relu',)\n", 184 | " for node in nodes]\n", 185 | "\n", 186 | "lx_branch = bn_layer(Data_X)\n", 187 | "rx_branch = bn_layer(Data_Y)\n", 188 | "\n", 189 | "for i, layer in enumerate(dense_layers):\n", 190 | "\n", 191 | " lx_branch = dense_layers[i](lx_branch)\n", 192 | " rx_branch = dense_layers[i](rx_branch)\n", 193 | "\n", 194 | "\n", 195 | "# Add a softmax output layer.\n", 196 | "# Should be replaced with a linear activation layer if\n", 197 | "# the outputs of the network cannot be interpreted as states\n", 198 | "softmax = Dense(output_size, activation='softmax')\n", 199 | "\n", 200 | "lx_branch = softmax(lx_branch)\n", 201 | "rx_branch = softmax(rx_branch)\n", 202 | "\n", 203 | "# Merge both networks to train both at the same time\n", 204 | "merged = concatenate([lx_branch, rx_branch])\n", 205 | "\n", 206 | "# Initialize the model and the optimizer, and compile it with\n", 207 | "# the loss and metric functions from the VAMPnets package\n", 208 | "model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 209 | "# model.summary()\n", 210 | "# Compile it with our own loss-function\n", 211 | "adam = optimizers.adam(lr = learning_rate)\n", 212 | "\n", 213 | "\n", 214 | "# Pretraining with VAMP with 'symmetrized' matrices yields a bad approximation of the \n", 215 | "# eigenvectors per se, but improves the 'readability' of the states identified by VAMP-2\n", 216 | "# which would otherwise be difficult to interprete.\n", 217 | "\n", 218 | "\n", 219 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 220 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 221 | "\n", 222 | "losses = [\n", 223 | " vamp._loss_VAMP_sym,\n", 224 | " vamp.loss_VAMP2,\n", 225 | "]\n", 226 | "\n", 227 | "valid_metric = np.zeros((len(losses), nb_epoch))\n", 228 | "train_metric = np.zeros((len(losses), nb_epoch))\n", 229 | "\n", 230 | "for l_index, loss in enumerate(losses):\n", 231 | " \n", 232 | " model.compile(optimizer = 'adam', loss = loss, metrics = [vamp.metric_VAMP])\n", 233 | " \n", 234 | " hist = model.fit([X1_train, X2_train], Y_train ,batch_size=batch_size, epochs=nb_epoch, verbose=0,\n", 235 | " validation_data=([X1_vali, X2_vali], Y_vali))\n", 236 | " \n", 237 | " temp = model.predict([traj_ord, traj_ord_lag], batch_size=np.shape(X1_vali)[0])\n", 238 | " \n", 239 | " x_a = temp[:,:output_size]\n", 240 | "\n", 241 | "\n", 242 | " X_Validation = np.squeeze(traj_ord)\n", 243 | " for i in range(output_size):\n", 244 | " plt.scatter(X_Validation, x_a[:,i], label= 'state '+str(i))\n", 245 | " plt.title('State probabilities')\n", 246 | " plt.legend()\n", 247 | " plt.show()\n", 248 | "\n", 249 | "\n", 250 | "\n", 251 | "\n", 252 | " states_prob_meanfree = x_a - np.mean(x_a, axis=0)\n", 253 | " tau_msm = 5\n", 254 | " K_smt = vamp.estimate_koopman_op(states_prob_meanfree, tau_msm)\n", 255 | "\n", 256 | " K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n", 257 | "\n", 258 | " index = np.argmax(np.real(K_eigvals))\n", 259 | " real_eigfunc = states_prob_meanfree @ np.real(K_eigvec[:,index])\n", 260 | "\n", 261 | " plt.scatter(X_Validation, real_eigfunc)\n", 262 | " plt.title('Eigenvector')\n", 263 | " plt.show()\n", 264 | "\n", 265 | " valid_metric[l_index] = 
np.array(hist.history['val_metric_VAMP'])\n", 266 | " train_metric[l_index] = np.array(hist.history['metric_VAMP'])\n", 267 | "\n", 268 | "valid_metric = np.reshape(valid_metric, (-1))\n", 269 | "train_metric = np.reshape(train_metric, (-1))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "# Training result visualization" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "plt.plot(train_metric, label = 'Training')\n", 286 | "plt.legend()\n", 287 | "plt.plot(valid_metric, label = 'Validation')\n", 288 | "plt.legend()\n", 289 | "\n", 290 | "plt.show()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# Transform the input trajectory using the network" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 307 | "\n", 308 | "# Order the output states based on their population\n", 309 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 310 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 311 | "states_num = [len(i[0]) for i in indexes]\n", 312 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 313 | "\n", 314 | "pred_ord = states_prob[:,states_order]" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "# Visualize the population of the states" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "def print_states_pie_chart():\n", 331 | " coors = []\n", 332 | " maxi = np.max(pred_ord, axis= 1)\n", 333 | "\n", 334 | " for i in range(output_size):\n", 335 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 336 | " \n", 337 | " fig1, ax1 = plt.subplots()\n", 338 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 339 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 340 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 341 | " plt.show()\n", 342 | "\n", 343 | "print_states_pie_chart()" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "# Estimate the implied timescales" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "max_tau = 15\n", 360 | "lag = np.arange(1, max_tau, 1)\n", 361 | "its = vamp.get_its(pred_ord, lag)\n", 362 | "vamp.plot_its(its, lag)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "# Chapman-Kolmogorov test for the estimated koopman operator" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "steps = 16\n", 379 | "tau_msm = 1\n", 380 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n", 381 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "anaconda-cloud": {}, 394 | 
"kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.4" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 1 414 | } 415 | -------------------------------------------------------------------------------- /vampnet/vampnet/data_generator.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | """sample generator for the MCMM project's clustering stage""" 19 | 20 | import numpy as np 21 | 22 | ################################################################################ 23 | # # 24 | # defining test potentials # 25 | # # 26 | ################################################################################ 27 | 28 | class BrownianDynamics(object): 29 | r"""base class for Brownian dynamics integration""" 30 | def __init__(self, dim, dt, kT, mass, damping): 31 | self.dim = dim 32 | self.dt = dt 33 | self.kT = kT 34 | self.mass = mass 35 | self.daming = damping 36 | self.coeff_A = dt / (mass * damping) 37 | self.coeff_B = np.sqrt(2.0 * dt * kT / (mass * damping)) 38 | def gradient(self, x): 39 | r"""gradient of the yet unkown potential""" 40 | raise NotImplementedError("implement in child class") 41 | def step(self, x): 42 | r"""perform a single Brownian dynamics step""" 43 | return x - self.coeff_A * self.gradient(x) \ 44 | + self.coeff_B * np.random.normal(size=self.dim) 45 | 46 | 47 | ################################################################################ 48 | # # 49 | # defining test potentials # 50 | # # 51 | ################################################################################ 52 | 53 | def asymmetric_double_well_energy(x): 54 | r"""computes the potential energy at point x""" 55 | _x = x - 2.0 56 | return 2.0 * _x - 6.0 * _x**2 + _x**4 57 | 58 | def asymmetric_double_well_gradient(x): 59 | r"""computes the potential's gradient at point x""" 60 | return 4.0 * x**3 - 24.0 * x**2 + 36.0 * x - 6.0 61 | 62 | def prinz_energy(x): 63 | return 4*(x**8 + 0.8 * np.exp(-80*x**2) + 0.2*np.exp(-80*(x-0.5)**2) + 0.5*np.exp(-40.*(x+0.5)**2)) 64 | 65 | def prinz_gradient(x): 66 | return 4*(8*x**7 - 128. 
* np.exp(-80*x**2)*x - 32.*np.exp(-80*(x-0.5)**2) *(x-0.5) - 40*np.exp(-40.*(x+0.5)**2) *(x+0.5)) 67 | 68 | def folding_model_energy(rvec, rcut): 69 | r"""computes the potential energy at point rvec""" 70 | r = np.linalg.norm(rvec) - rcut 71 | rr = r**2 72 | if r < 0.0: 73 | return -2.5 * rr 74 | return 0.5 * (r - 2.0) * rr 75 | 76 | def folding_model_gradient(rvec, rcut): 77 | r"""computes the potential's gradient at point rvec""" 78 | rnorm = np.linalg.norm(rvec) 79 | if rnorm == 0.0: 80 | return np.zeros(rvec.shape) 81 | r = rnorm - rcut 82 | if r < 0.0: 83 | return -5.0 * r * rvec / rnorm 84 | return (1.5 * r - 2.0) * rvec / rnorm 85 | 86 | 87 | ################################################################################ 88 | # # 89 | # defining wrapper classes # 90 | # # 91 | ################################################################################ 92 | 93 | class AsymmetricDoubleWell(BrownianDynamics): 94 | r"""encapsulates the asymmetric double well potential""" 95 | def __init__(self, dt, kT, mass=1.0, damping=1.0): 96 | super(AsymmetricDoubleWell, self).__init__(1, dt, kT, mass, damping) 97 | def gradient(self, x): 98 | return asymmetric_double_well_gradient(x) 99 | def sample(self, x0, nsteps, nskip=1): 100 | r"""generate nsteps sample points""" 101 | x = np.zeros(shape=(nsteps+1,)) 102 | x[0] = x0 103 | for t in range(nsteps): 104 | q = x[t] 105 | for s in range(nskip): 106 | q = self.step(q) 107 | x[t+1] = q 108 | return x 109 | 110 | class FoldingModel(BrownianDynamics): 111 | r"""encapsulates the folding model potential""" 112 | def __init__(self, dt, kT, mass=1.0, damping=1.0, rcut=3.0): 113 | super(FoldingModel, self).__init__(5, dt, kT, mass, damping) 114 | self.rcut = rcut 115 | def gradient(self, x): 116 | return folding_model_gradient(x, self.rcut) 117 | def sample(self, rvec0, nsteps, nskip=1): 118 | r"""generate nsteps sample points""" 119 | rvec = np.zeros(shape=(nsteps+1, self.dim)) 120 | rvec[0, :] = rvec0[:] 121 | for t in range(nsteps): 122 | q = rvec[t, :] 123 | for s in range(nskip): 124 | q = self.step(q) 125 | rvec[t+1, :] = q[:] 126 | return rvec 127 | 128 | class PrinzModel(BrownianDynamics): 129 | r"""encapsulates the Prinz potential""" 130 | def __init__(self, dt, kT, mass=1.0, damping=1.0): 131 | super(PrinzModel, self).__init__(1, dt, kT, mass, damping) 132 | def gradient(self, x): 133 | return prinz_gradient(x) 134 | def sample(self, x0, nsteps, nskip=1): 135 | r"""generate nsteps sample points""" 136 | x = np.zeros(shape=(nsteps+1,)) 137 | x[0] = x0 138 | for t in range(nsteps): 139 | q = x[t] 140 | for s in range(nskip): 141 | q = self.step(q) 142 | x[t+1] = q 143 | return x 144 | 145 | 146 | ################################################################################ 147 | # # 148 | # main area # 149 | # # 150 | ################################################################################ 151 | 152 | def get_asymmetric_double_well_data(nstep, x0 = 0., nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0): 153 | r"""wrapper for the asymmetric double well generator""" 154 | adw = AsymmetricDoubleWell(dt, kT, mass=mass, damping=damping) 155 | return adw.sample(x0, nstep, nskip=nskip) 156 | 157 | def get_folding_model_data( 158 | nstep, rvec0 = np.zeros((5)), nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0, rcut=3.0): 159 | r"""wrapper for the folding model generator""" 160 | fm = FoldingModel(dt, kT, mass=mass, damping=damping, rcut=rcut) 161 | return fm.sample(rvec0, nstep, nskip=nskip) 162 | 163 | def get_prinz_pot(nstep, x0 = 0., 
nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0): 164 | r"""wrapper for the Prinz model generator""" 165 | pw = PrinzModel(dt, kT, mass=mass, damping=damping) 166 | return pw.sample(x0, nstep, nskip=nskip) 167 | 168 | def get_alanine_data(input_type = 'coordinates', return_dihedrals = True): 169 | 170 | import mdshare 171 | 172 | retval = [] 173 | 174 | if input_type == 'distances': 175 | 176 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-distances.npz') 177 | 178 | traj_whole = np.load(local_filename)['arr_0'] 179 | 180 | elif input_type == 'coordinates': 181 | 182 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-positions.npz') 183 | 184 | traj_whole = np.load(local_filename)['arr_0'] 185 | 186 | retval.append(traj_whole) 187 | 188 | if return_dihedrals: 189 | dihedral = np.load(mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz'))['arr_0'] 190 | retval.append(dihedral) 191 | 192 | 193 | return retval 194 | 195 | 196 | def build_generator_on_source(data_source, batch_size, lag, output_size): 197 | '''Function used to create a generator that will fetch data from a data source through an iterator. 198 | This can be passed as parameter to a keras fit_generator method. 199 | 200 | Parameters 201 | ---------- 202 | data_source: pyemma source object. 203 | Data files source. This has to be initialized with chunksize = batch_size 204 | 205 | batch_size: int 206 | Batch size to be used for the training 207 | 208 | lag: int 209 | time frames lag to be used in the training of the VAMPnets 210 | 211 | output_size: int 212 | How many output nodes the network has 213 | ''' 214 | 215 | counter_batches = 0 216 | 217 | 218 | # How many batches before the iterator has to be reinitialized 219 | steps_epoch = np.sum(np.ceil((data_source.trajectory_lengths()-lag)/batch_size)) 220 | 221 | data_iterator = data_source.iterator(chunk = batch_size, 222 | lag = lag, 223 | return_trajindex=False) 224 | 225 | while True: 226 | 227 | input_data = list(data_iterator.next()) 228 | 229 | # Create empty labels to accomodate keras' interface requirements 230 | labels = np.empty((input_data[0].shape[0],2*output_size)).astype('float32') 231 | data = input_data, labels 232 | counter_batches += 1 233 | 234 | if counter_batches == steps_epoch: 235 | data_iterator = data_source.iterator(chunk = batch_size, 236 | lag = lag, 237 | return_trajindex=False) 238 | counter_batches = 0 239 | 240 | yield data 241 | 242 | 243 | 244 | def build_generator_on_source_shuffle(data_source, batch_size, lag, output_size, preloaded_batches = 1): 245 | '''Function used to create a generator that will randomly access data and fetch them from a data 246 | source through an iterator. This can be passed as parameter to a keras fit_generator method. 247 | 248 | Parameters 249 | ---------- 250 | data_source: pyemma source object. 251 | Data files source. 
This has to be initialized with chunksize = batch_size 252 | 253 | batch_size: int 254 | Batch size to be used for the training 255 | 256 | lag: int 257 | time frames lag to be used in the training of the VAMPnets 258 | 259 | output_size: int 260 | How many output nodes the network has 261 | 262 | preloaded_batches: int 263 | How many batches of data should be loaded at once; higher values will improve 264 | execution speed but also memory consumption 265 | ''' 266 | 267 | counter_batches = 0 268 | 269 | 270 | # How many batches before the iterator has to be reinitialized 271 | steps_epoch = np.ceil(np.sum((data_source.trajectory_lengths()-lag)/ (batch_size* preloaded_batches))) 272 | input_size = data_source.dimension() 273 | 274 | 275 | traj_lengths = data_source.trajectory_lengths() 276 | remaining_frames = np.concatenate([[index_traj*np.ones((traj_len - lag)), np.arange(traj_len - lag)] for index_traj, traj_len in enumerate(traj_lengths)], axis = 1).T.astype('int') 277 | indexes = np.arange(remaining_frames.shape[0]) 278 | np.random.shuffle(indexes) 279 | 280 | while True: 281 | 282 | start = counter_batches * batch_size * preloaded_batches 283 | end = min(start + batch_size * preloaded_batches, remaining_frames.shape[0]) 284 | 285 | frames = remaining_frames[indexes[start:end]] 286 | 287 | fake_ind = frames[:,0]*(traj_lengths.sum()) + frames[:,1] 288 | arg_sort = np.argsort(fake_ind) 289 | sort_arg_sort = np.argsort(arg_sort) 290 | 291 | frames_tau = frames + np.array([np.zeros((frames.shape[0])), np.ones((frames.shape[0]))*lag], dtype = 'int').T 292 | 293 | 294 | data_iterator_t = data_source.iterator(stride=frames[arg_sort], 295 | return_trajindex=False) 296 | data_iterator_tau = data_source.iterator(stride=frames_tau[arg_sort], 297 | return_trajindex=False) 298 | 299 | data = np.empty((2, batch_size * preloaded_batches, input_size)) 300 | start_iter = 0 301 | for iter_data, iter_data_tau in zip(data_iterator_t, data_iterator_tau): 302 | temp_frames = iter_data.shape[0] 303 | end_iter = start_iter + temp_frames 304 | data[0, start_iter:end_iter] = iter_data 305 | data[1, start_iter:end_iter] = iter_data_tau 306 | start_iter = end_iter 307 | 308 | 309 | data = data[:, sort_arg_sort] 310 | 311 | index_preloaded = 0 312 | labels = np.empty((batch_size,2*output_size)).astype('float32') 313 | 314 | while index_preloaded < preloaded_batches: 315 | 316 | start_batch = index_preloaded * batch_size 317 | end_batch = start_batch + batch_size 318 | index_preloaded += 1 319 | 320 | if end_batch > data.shape[1]: 321 | end_batch = data.shape[1] 322 | index_preloaded = preloaded_batches 323 | labels = np.empty((end_batch - start_batch,2*output_size)).astype('float32') 324 | 325 | output_data = [data[0, start_batch:end_batch], data[1, start_batch:end_batch]], labels 326 | 327 | yield output_data 328 | 329 | 330 | counter_batches += 1 331 | 332 | if counter_batches == steps_epoch: 333 | 334 | counter_batches = 0 335 | indexes = np.arange(remaining_frames.shape[0]) 336 | np.random.shuffle(indexes) -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/utils.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | Tools to handle datasets, transformations, and statistics. 20 | ''' 21 | 22 | import numpy as _np 23 | import torch as _torch 24 | from torch import nn as _nn 25 | from torch.utils.data import Dataset as _Dataset 26 | from torch.utils.data import TensorDataset as _TensorDataset 27 | from torch.utils.data import ConcatDataset as _ConcatDataset 28 | from torch.utils.data import DataLoader as _DataLoader 29 | 30 | __all__ = [ 31 | 'LaggedDataset', 32 | 'MaskedDataset', 33 | 'create_dataset', 34 | 'stride_split', 35 | 'random_split', 36 | 'random_block_split', 37 | 'get_mean', 38 | 'get_covariance', 39 | 'get_sqrt_inverse', 40 | 'whiten_data', 41 | 'cca', 42 | 'Transform'] 43 | 44 | ################################################################################ 45 | # 46 | # DATASETS 47 | # 48 | ################################################################################ 49 | 50 | class LaggedDataset(_Dataset): 51 | '''Dataset for wrapping time-lagged data from a single stored time series. 52 | 53 | Each sample will contain the data_tensor at index t and the (not explicitly 54 | stored) target_tensor via data_tensor at index t+lag. We need this for 55 | training the time-lagged autoencoder and TICA. 56 | 57 | Arguments: 58 | data_tensor (Tensor): contains time series data 59 | lag (int): specifies the lag in time steps 60 | ''' 61 | def __init__(self, data_tensor, lag=1): 62 | assert data_tensor.size(0) > lag, 'you need more samples than lag' 63 | assert lag >= 0, 'you need a non-negative lagtime' 64 | self.data_tensor = data_tensor 65 | self.lag = lag 66 | def __getitem__(self, index): 67 | return self.data_tensor[index], self.data_tensor[index + self.lag] 68 | def __len__(self): 69 | return self.data_tensor.size(0) - self.lag 70 | 71 | class MaskedDataset(_Dataset): 72 | '''Dataset for wrapping a specified subset of another dataset. 73 | 74 | This helps to separate a dataset into two or more subsets, e.g., for 75 | training and testing. 
76 | 77 | Arguments: 78 | dataset (Dataset): the dataset you want to wrap 79 | active (sequence of int): indices of the active elements 80 | ''' 81 | def __init__(self, dataset, active): 82 | assert len(dataset) >= len(active), \ 83 | 'you cannot have less total samples than active' 84 | assert _np.all(0 <= active) and _np.all(active < len(dataset)), \ 85 | 'you must use only valid indices' 86 | assert len(active) == len(_np.unique(active)), \ 87 | 'you must use every active index only once' 88 | self.dataset = dataset 89 | self.active = active 90 | def __getitem__(self, index): 91 | return self.dataset[self.active[index]] 92 | def __len__(self): 93 | return len(self.active) 94 | 95 | def ensure_traj_format(data, dtype=_np.float32): 96 | data = _np.asarray(data, dtype=dtype) 97 | if data.ndim == 2: 98 | return data 99 | elif data.ndim == 1: 100 | return data.reshape(-1, 1) 101 | else: 102 | raise ValueError('data has incompatible ndim: ' + str(data.ndim)) 103 | 104 | def create_dataset(data, lag=0, dtype=_np.float32): 105 | '''Create a (time-lagged) dataset from one or more numpy.ndarrays. 106 | 107 | Arguments: 108 | data (numpy.ndarray or list thereof): data to create the dataset from 109 | lag (int): specifies the lag in time steps 110 | dtype (numpy.dtype): dtype of the resulting dataset 111 | ''' 112 | if isinstance(data, _np.ndarray): 113 | return LaggedDataset( 114 | _torch.from_numpy(ensure_traj_format(data, dtype=dtype)), 115 | lag=lag) 116 | elif isinstance(data, (list, tuple)): 117 | return _ConcatDataset([LaggedDataset( 118 | _torch.from_numpy(ensure_traj_format(d, dtype=dtype)), 119 | lag=lag) for d in data]) 120 | else: 121 | raise ValueError( 122 | 'use a single or a list of numpy.ndarrays of dim 1 or 2') 123 | 124 | def stride_split(dataset, stride=2, offset=0): 125 | '''Split one dataset into two parts based on a stride. 126 | 127 | This helps to separate a dataset into two or more subsets, e.g., for 128 | training and testing. Every stride-th element starting from offset 129 | goes into the first MaskedDataset, everything else into the second. 130 | 131 | Arguments: 132 | dataset (Dataset): contains the data you want to split 133 | stride (int): specify the size of the stride 134 | offset (int): specify where to start counting 135 | ''' 136 | assert 0 < stride < len(dataset), \ 137 | 'use a positive stride smaller than the length of the dataset' 138 | assert 0 <= offset < stride, \ 139 | 'use a non-negative offset smaller than the stride' 140 | active = _np.arange(offset, len(dataset), stride) 141 | complement = _np.setdiff1d( 142 | _np.arange(len(dataset)), active, assume_unique=True) 143 | return MaskedDataset(dataset, active), MaskedDataset(dataset, complement) 144 | 145 | def random_split(dataset, active=None, n_active=None, f_active=None): 146 | '''Split one dataset into two parts based on a random selection. 147 | 148 | This helps to separate a dataset into two or more subsets, e.g., for 149 | training and testing. Specify the active set either by giving the frame 150 | indices, the number of active frames or the fraction of active frames.
151 | 152 | Arguments: 153 | dataset (Dataset): contains the data you want to split 154 | active (iterable of int): specify the active frames 155 | n_active (int): number of active frames 156 | f_active (float): fraction of active frames 157 | ''' 158 | if active is None: 159 | if n_active is None: 160 | if f_active is None: 161 | raise ValueError( 162 | 'specify either active, n_active or f_active') 163 | else: 164 | assert 0 < f_active < 1, \ 165 | 'f_active must be 0 < f_active < 1' 166 | n_active = int(_np.floor(0.5 + f_active * len(dataset))) 167 | else: 168 | assert 0 < n_active < len(dataset), \ 169 | 'n_active must be 0 < n_active < len(dataset)' 170 | if f_active is not None: 171 | raise ValueError( 172 | 'do not specify f_active if n_active is given') 173 | active = _np.random.choice(len(dataset), size=n_active, replace=False) 174 | else: 175 | active = _np.asarray(active) 176 | assert len(active) == len(_np.unique(active)), \ 177 | 'you must use every active index only once' 178 | assert _np.all(0 <= active) and _np.all(active < len(dataset)), \ 179 | 'you must use only valid indices' 180 | if f_active is not None: 181 | raise ValueError( 182 | 'do not specify f_active if active is given') 183 | if n_active is not None: 184 | raise ValueError( 185 | 'do not specify n_active if active is given') 186 | complement = _np.setdiff1d( 187 | _np.arange(len(dataset)), active, assume_unique=True) 188 | return MaskedDataset(dataset, active), MaskedDataset(dataset, complement) 189 | 190 | def random_block_split(dataset, lag, f_active=0.5): 191 | '''Split one dataset into two parts based on a random selection of blocks. 192 | 193 | This helps to separate a dataset into two or more subsets, e.g., for 194 | training and testing. Specify the active set either by giving the fraction 195 | of active blocks (the total number of transitions is conserved). 196 | 197 | Arguments: 198 | dataset (Dataset): contains the data you want to split 199 | lag (int): specifies the lag in time steps 200 | f_active (float): fraction of active blocks 201 | ''' 202 | active = [] 203 | n = 0 204 | nmax = len(dataset) 205 | n_blocks = int(_np.ceil(float(nmax) / float(lag))) 206 | n_active_blocks = int(_np.floor(0.5 + f_active * n_blocks)) 207 | active_blocks = _np.random.choice( 208 | n_blocks, size=n_active_blocks, replace=False) 209 | for n in active_blocks: 210 | active += _np.arange(n * lag, min((n + 1) * lag, nmax)).tolist() 211 | return random_split(dataset, active=active) 212 | 213 | ################################################################################ 214 | # 215 | # STATISTICS 216 | # 217 | ################################################################################ 218 | 219 | def get_mean(loader): 220 | '''Compute the mean value via minibatch summation using a loader. 221 | 222 | Arguments: 223 | loader (DataLoader): contains the data you want to analyze 224 | ''' 225 | x_mean, y_mean = None, None 226 | for x, y in loader: 227 | try: 228 | x_mean.add_(x.sum(dim=0)) 229 | except AttributeError: 230 | x_mean = x.sum(dim=0) 231 | try: 232 | y_mean.add_(y.sum(dim=0)) 233 | except AttributeError: 234 | y_mean = y.sum(dim=0) 235 | x_mean.div_(float(len(loader.dataset))) 236 | y_mean.div_(float(len(loader.dataset))) 237 | return x_mean, y_mean 238 | 239 | def get_covariance(loader, x_mean, y_mean): 240 | '''Compute the instantaneous and time-lagged covariance matrices via 241 | minibatch summation using a loader. 
242 | 243 | Arguments: 244 | loader (DataLoader): contains the data you want to analyze 245 | x_mean (Tensor): mean value for the data_tensor 246 | y_mean (Tensor): mean value for the target_tensor 247 | ''' 248 | cxx = _torch.zeros(len(x_mean), len(x_mean)) 249 | cxy = _torch.zeros(len(x_mean), len(y_mean)) 250 | cyy = _torch.zeros(len(y_mean), len(y_mean)) 251 | for x, y in loader: 252 | x.sub_(x_mean[None, :]) 253 | y.sub_(y_mean[None, :]) 254 | cxx.add_(_torch.mm(x.t(), x)) 255 | cxy.add_(_torch.mm(x.t(), y)) 256 | cyy.add_(_torch.mm(y.t(), y)) 257 | cxx.div_(float(len(loader.dataset))) 258 | cxy.div_(float(len(loader.dataset))) 259 | cyy.div_(float(len(loader.dataset))) 260 | return cxx, cxy, cyy 261 | 262 | ################################################################################ 263 | # 264 | # WHITENING 265 | # 266 | ################################################################################ 267 | 268 | def get_sqrt_inverse(matrix, bias=1.0e-5): 269 | '''Compute the sqrt-inverse of the supplied symmetric/real matrix. 270 | 271 | We need this step for whitening and TICA. 272 | 273 | Arguments: 274 | matrix (Tensor): contains the matrix you want to transform 275 | bias (float): assures numerical stability 276 | ''' 277 | e, v = _torch.symeig(matrix, eigenvectors=True) 278 | d = _torch.diag(1.0 / _torch.sqrt(_torch.abs(e) + bias)) 279 | return _torch.mm(_torch.mm(v, d), v.t()) 280 | 281 | def whiten_data(data_tensor, batch_size=100): 282 | '''Whiten a Tensor in the PCA basis. 283 | 284 | Arguments: 285 | data_tensor (Tensor): contains the data you want to whiten 286 | batch_size (int): specify a batch size for the whitening process 287 | ''' 288 | loader = _DataLoader( 289 | LaggedDataset(data_tensor, lag=0), batch_size=batch_size) 290 | x_mean, y_mean = get_mean(loader) 291 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 292 | ixx = get_sqrt_inverse(cxx) 293 | whitened_data = [] 294 | for x, _ in loader: 295 | x.sub_(x_mean[None, :]) 296 | whitened_data.append(x.mm(ixx)) 297 | return _torch.cat(whitened_data) 298 | 299 | ################################################################################ 300 | # 301 | # CCA 302 | # 303 | ################################################################################ 304 | 305 | def cca(data_tensor_x, data_tensor_y, batch_size=100): 306 | '''Perform canonical correlation analysis for two data tensors. 
307 | 308 | Arguments: 309 | data_tensor_x (Tensor): contains the first data tensor 310 | data_tensor_y (Tensor): contains the second data tensor 311 | batch_size (int): specify a batch size for the CCA calculation 312 | ''' 313 | loader = _DataLoader( 314 | _TensorDataset(data_tensor_x, data_tensor_y), 315 | batch_size=batch_size) 316 | x_mean, y_mean = get_mean(loader) 317 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 318 | ixx = get_sqrt_inverse(cxx) 319 | iyy = get_sqrt_inverse(cyy) 320 | return _torch.svd(_torch.mm(_torch.mm(ixx, cxy), iyy)) 321 | 322 | ################################################################################ 323 | # 324 | # TRANSFORMER 325 | # 326 | ################################################################################ 327 | 328 | class BaseTransform(object): 329 | def __init__(self, mean=None, covariance=None): 330 | if mean is not None: 331 | self.sub = mean 332 | if covariance is not None: 333 | self.mul = get_sqrt_inverse(covariance) 334 | def __call__(self, x): 335 | try: 336 | x.sub_(self.sub[None, :]) 337 | except AttributeError: 338 | pass 339 | try: 340 | x = x.mm(self.mul) 341 | except AttributeError: 342 | pass 343 | return x 344 | 345 | class Transform(object): 346 | '''Apply whitening/centering transformations within a minibatch. 347 | 348 | As we do not want to preprocess and, thus, duplicate large datasets, 349 | we do the necessary whitening and centering operations on the fly while 350 | iterating over the datasets. 351 | 352 | Arguments: 353 | x_mean (Tensor): contains the mean of the data tensor 354 | x_covariance (Tensor): contains the covariance of the data tensor 355 | y_mean (Tensor): contains the mean of the target tensor 356 | y_covariance (Tensor): contains the covariance of the target tensor 357 | ''' 358 | def __init__( 359 | self, x_mean=None, x_covariance=None, y_mean=None, y_covariance=None): 360 | self.x = BaseTransform(mean=x_mean, covariance=x_covariance) 361 | self.y = BaseTransform(mean=y_mean, covariance=y_covariance) 362 | def __call__(self, x, y): 363 | return self.x(x), self.y(y) 364 | -------------------------------------------------------------------------------- /vampnet/examples/Folding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "%matplotlib inline\n", 19 | "import vampnet\n", 20 | "from vampnet import data_generator\n", 21 | "from keras.models import Model\n", 22 | "from keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 23 | "from keras import optimizers\n", 24 | "import tensorflow as tf\n", 25 | "from keras.backend import clear_session" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# generate 10^7 frames and energy values\n", 35 | "datapoints = int(1e6)\n", 36 | "stride = 10" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "x = data_generator.get_folding_model_data(datapoints, rvec0=2.0 * (np.random.rand(5) - 0.5), kT=1., dt = 0.1)\n", 46 | "r = np.linalg.norm(x, axis=-1)[::stride]" 47 | ] 48 
| }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "pot = np.zeros_like(r)\n", 56 | "for i in range(r.shape[0]):\n", 57 | " pot[i] = data_generator.folding_model_energy(r[i], 3)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "plt.plot(r[::stride], pot[::stride], '.')\n", 67 | "plt.show()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "traj_whole = x\n", 77 | "traj_data_points, input_size = traj_whole.shape" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# All Hyperparameters\n", 87 | "\n", 88 | "# Tau, how much is the timeshift of the two datasets\n", 89 | "tau = 10\n", 90 | "\n", 91 | "# Batch size for Stochastic Gradient descent\n", 92 | "batch_size = 2048\n", 93 | "\n", 94 | "# Which trajectory points percentage is used as training\n", 95 | "train_ratio = 0.9\n", 96 | "\n", 97 | "# How many hidden layers the network has\n", 98 | "network_depth = 4\n", 99 | "\n", 100 | "# Width of every layer\n", 101 | "layer_width = 20\n", 102 | "nodes = [layer_width]*network_depth\n", 103 | "# Learning rate used for the ADAM optimizer\n", 104 | "learning_rate = 0.0001\n", 105 | "\n", 106 | "# How many output states the network has\n", 107 | "output_size = 2\n", 108 | "\n", 109 | "# Iteration over the training set in the fitting process\n", 110 | "nb_epoch = 20\n", 111 | "\n", 112 | "plot_stride = 200" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "epsilon = 1e-5\n", 122 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# Shuffle trajectory and lagged trajectory together\n", 132 | "length_data = traj_data_points - tau\n", 133 | "\n", 134 | "traj_ord= traj_whole[:length_data]\n", 135 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n", 136 | "\n", 137 | "indexes = np.arange(length_data)\n", 138 | "np.random.shuffle(indexes)\n", 139 | "\n", 140 | "\n", 141 | "\n", 142 | "traj = traj_ord[indexes]\n", 143 | "traj_lag = traj_ord_lag[indexes]\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Prepare data for tensorflow usage\n", 153 | "length_train = int(np.floor(length_data * train_ratio))\n", 154 | "length_vali = length_data - length_train\n", 155 | "\n", 156 | "traj_data_train = traj[:length_train]\n", 157 | "traj_data_train_lag = traj_lag[:length_train]\n", 158 | "\n", 159 | "traj_data_valid = traj[length_train:]\n", 160 | "traj_data_valid_lag = traj_lag[length_train:]\n", 161 | "\n", 162 | "#Data used for states ordering\n", 163 | "X1 = traj_ord[:length_data].astype('float32')\n", 164 | "X2 = traj_ord_lag[:length_data].astype('float32')\n", 165 | "\n", 166 | "# Input of the first network\n", 167 | "X1_train = traj_data_train.astype('float32')\n", 168 | "X2_train = traj_data_train_lag.astype('float32')\n", 169 | "\n", 170 | "# Input for validation\n", 171 | "X1_vali = traj_data_valid.astype('float32')\n", 172 | "X2_vali = traj_data_valid_lag.astype('float32')\n", 173 | "\n", 174 | "# Needs a 
Y-train set which we dont have.\n", 175 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n", 176 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "if 'model' in globals():\n", 186 | " del model\n", 187 | " clear_session()\n", 188 | "\n", 189 | " \n", 190 | "# Build the model\n", 191 | "Data_X = Input(shape = (input_size,))\n", 192 | "Data_Y = Input(shape = (input_size,))\n", 193 | "\n", 194 | "# A batch normalization layer improves convergence speed\n", 195 | "# bn_layer = BatchNormalization()\n", 196 | "bn_layer = Activation('linear')\n", 197 | "\n", 198 | "# Instance layers and assign them to the two lobes of the network\n", 199 | "dense_layers = [Dense(node, activation = 'relu',)\n", 200 | " for node in nodes]\n", 201 | "\n", 202 | "lx_branch = bn_layer(Data_X)\n", 203 | "rx_branch = bn_layer(Data_Y)\n", 204 | "\n", 205 | "for i, layer in enumerate(dense_layers):\n", 206 | "\n", 207 | " lx_branch = dense_layers[i](lx_branch)\n", 208 | " rx_branch = dense_layers[i](rx_branch)\n", 209 | "\n", 210 | "\n", 211 | "# Add a softmax output layer.\n", 212 | "# Should be replaced with a linear activation layer if\n", 213 | "# the outputs of the network cannot be interpreted as states\n", 214 | "softmax = Dense(output_size, activation='softmax')\n", 215 | "\n", 216 | "lx_branch = softmax(lx_branch)\n", 217 | "rx_branch = softmax(rx_branch)\n", 218 | "\n", 219 | "# Merge both networks to train both at the same time\n", 220 | "merged = concatenate([lx_branch, rx_branch])\n", 221 | "\n", 222 | "# Initialize the model and the optimizer, and compile it with\n", 223 | "# the loss and metric functions from the VAMPnets package\n", 224 | "model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 225 | "# model.summary()\n", 226 | "# Compile it with our own loss-function\n", 227 | "adam = optimizers.adam(lr = learning_rate)\n", 228 | "\n", 229 | "\n", 230 | "# Pretraining with VAMP with 'symmetrized' matrices yields a bad approximation of the \n", 231 | "# eigenvectors per se, but improves the 'readability' of the states identified by VAMP-2\n", 232 | "# which would otherwise be difficult to interprete.\n", 233 | "\n", 234 | "\n", 235 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 236 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 237 | "\n", 238 | "losses = [\n", 239 | " vamp._loss_VAMP_sym,\n", 240 | " vamp.loss_VAMP2,\n", 241 | "]\n", 242 | "\n", 243 | "valid_metric = np.zeros((len(losses), nb_epoch))\n", 244 | "train_metric = np.zeros((len(losses), nb_epoch))\n", 245 | "\n", 246 | "for l_index, loss in enumerate(losses):\n", 247 | " \n", 248 | " model.compile(optimizer = 'adam', loss = loss, metrics = [vamp.metric_VAMP])\n", 249 | " \n", 250 | " hist = model.fit([X1_train, X2_train], Y_train ,batch_size=batch_size, epochs=nb_epoch, verbose=0,\n", 251 | " validation_data=([X1_vali, X2_vali], Y_vali))\n", 252 | " \n", 253 | " states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 254 | "\n", 255 | " # Order the output states based on their population\n", 256 | " coor_pred = np.argmax(states_prob, axis = 1)\n", 257 | " indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 258 | " states_num = [len(i[0]) for i in indexes]\n", 259 | " states_order = 
np.argsort(states_num).astype('int')[::-1]\n", 260 | "\n", 261 | " pred_ord = states_prob[:,states_order]\n", 262 | " \n", 263 | " X_Validation = np.linalg.norm(traj_ord, axis=1)\n", 264 | " for i in range(output_size):\n", 265 | " plt.plot(X_Validation[::plot_stride], pred_ord[::plot_stride,i], '.', label = 'state '+str(i))\n", 266 | " plt.legend()\n", 267 | " plt.title('States probabilites')\n", 268 | " plt.show()\n", 269 | " tau_msm = 20\n", 270 | " pred_ord_meanfree = pred_ord - pred_ord.mean(0)\n", 271 | " K_smt = vamp.estimate_koopman_op(pred_ord_meanfree, tau_msm)\n", 272 | "\n", 273 | " K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n", 274 | "\n", 275 | " index = np.argmax(np.real(K_eigvals))\n", 276 | " real_eigfunc = pred_ord_meanfree @ np.real(K_eigvec[:,index])\n", 277 | "\n", 278 | " plt.plot(X_Validation[::plot_stride], real_eigfunc[::plot_stride], '.')\n", 279 | " plt.title('Eigenvector')\n", 280 | " plt.show()\n", 281 | "\n", 282 | " valid_metric[l_index] = np.array(hist.history['val_metric_VAMP'])\n", 283 | " train_metric[l_index] = np.array(hist.history['metric_VAMP'])\n", 284 | "\n", 285 | "valid_metric = np.reshape(valid_metric, (-1))\n", 286 | "train_metric = np.reshape(train_metric, (-1))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# Training result visualization\n", 296 | "\n", 297 | "plt.plot(train_metric, label = 'Training')\n", 298 | "plt.legend()\n", 299 | "plt.plot(valid_metric, label = 'Validation')\n", 300 | "plt.legend()\n", 301 | "\n", 302 | "plt.show()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# Transform the input trajectory using the network\n", 312 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 313 | "\n", 314 | "# Order the output states based on their population\n", 315 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 316 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 317 | "states_num = [len(i[0]) for i in indexes]\n", 318 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 319 | "\n", 320 | "pred_ord = states_prob[:,states_order]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "X_Validation = np.linalg.norm(traj_ord, axis=1)\n", 330 | "for i in range(output_size):\n", 331 | " plt.plot(X_Validation[::plot_stride], pred_ord[::plot_stride,i], '.', label = 'state '+str(i))\n", 332 | " \n", 333 | "scaled_pot = (pot-pot.min())/(pot.max()-pot.min())\n", 334 | " \n", 335 | "plt.plot(r[::plot_stride], scaled_pot[::plot_stride], '.', label = 'Potential')\n", 336 | "plt.show()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "tau_msm = 20\n", 346 | "pred_ord_meanfree = pred_ord - pred_ord.mean(0)\n", 347 | "K_smt = vamp.estimate_koopman_op(pred_ord_meanfree, tau_msm)\n", 348 | "\n", 349 | "K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n", 350 | "\n", 351 | "index = np.argmax(np.real(K_eigvals))\n", 352 | "real_eigfunc = pred_ord_meanfree @ np.real(K_eigvec[:,index])\n", 353 | "\n", 354 | "plt.plot(X_Validation[::plot_stride], real_eigfunc[::plot_stride], '.')\n", 355 | "plt.title('Eigenvector')\n", 356 | "plt.show()" 357 | ] 
358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "# Visualize the population of the states" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "def print_states_pie_chart():\n", 373 | " coors = []\n", 374 | " maxi = np.max(pred_ord, axis= 1)\n", 375 | "\n", 376 | " for i in range(output_size):\n", 377 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 378 | " \n", 379 | " fig1, ax1 = plt.subplots()\n", 380 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 381 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 382 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 383 | " plt.show()\n", 384 | "\n", 385 | "print_states_pie_chart()" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "# Estimate the implied timescales" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "max_tau = 200\n", 402 | "lag = np.arange(1, max_tau, 1)\n", 403 | "its = vamp.get_its(pred_ord, lag)\n", 404 | "vamp.plot_its(its, lag)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "# Chapman-Kolmogorov test for the estimated koopman operator" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "steps = 24\n", 421 | "tau_msm = 50\n", 422 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n", 423 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [] 432 | } 433 | ], 434 | "metadata": { 435 | "anaconda-cloud": {}, 436 | "kernelspec": { 437 | "display_name": "Python 3", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.6.4" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 1 456 | } 457 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/benchmarks.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. 
If not, see . 17 | 18 | ''' 19 | Automatized benchmarks. 20 | ''' 21 | 22 | import multiprocessing as mp 23 | import numpy as np 24 | import torch 25 | import tae 26 | import os 27 | 28 | import tae 29 | import torch 30 | import pyemma 31 | from time import time 32 | 33 | try: 34 | import pyemma 35 | except ImportError: 36 | print('running benchmarks requires the pyemma package') 37 | 38 | try: 39 | from mdshare import load as _load 40 | except ImportError: 41 | print('running benchmarks requires the mdshare package') 42 | 43 | ################################################################################ 44 | # 45 | # BENCHMARKING THE SQRT TOY MODEL 46 | # 47 | ################################################################################ 48 | 49 | def evaluate_sqrt_model( 50 | length=10000, 51 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 52 | msm_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 53 | use_cuda=True): 54 | '''A wrapper to run the sqrt model benchmarks 55 | 56 | Arguments: 57 | length (int): length of the sampled trajectory 58 | trns_lags (list of int): lag times for the transformers 59 | msm_lags (list of int): lag times for the MSM validation 60 | use_cuda (boolean): use a GPU to run the benchmarks 61 | ''' 62 | def analyse(lat_data, ref_data, msm_lags): 63 | cca = tae.utils.cca(torch.from_numpy(lat_data), ref_data)[1].numpy() 64 | centers = np.linspace(np.min(lat_data), np.max(lat_data), 101) 65 | centers = 0.5 * (centers[:-1] + centers[1:]).reshape(-1, 1) 66 | dtraj = pyemma.coordinates.assign_to_centers(lat_data, centers) 67 | its = pyemma.msm.its(dtraj, lags=msm_lags, nits=1).timescales 68 | return cca, its 69 | data, dtraj = tae.toymodels.sample_sqrt_model(length) 70 | ref_data = tae.utils.whiten_data( 71 | torch.from_numpy(dtraj.reshape(-1, 1).astype(np.float32))) 72 | ref_its = pyemma.msm.its(dtraj, lags=msm_lags, nits=1).timescales 73 | lat, trn, val = tae.pca( 74 | data, dim=1, validation_split=0.5, batch_size=100, whiten=True) 75 | cca, its = analyse(lat, ref_data, msm_lags) 76 | result = dict( 77 | trns_lags=np.asarray(trns_lags), 78 | msm_lags=np.asarray(msm_lags), 79 | ref_its=np.asarray(ref_its), 80 | pca_its=np.asarray(its), 81 | pca_cca=np.asarray(cca), 82 | pca_trn=np.asarray(trn), 83 | pca_val=np.asarray(val)) 84 | for lag in trns_lags: 85 | lat, trn, val = tae.tica( 86 | data, dim=1, lag=lag, kinetic_map=True, symmetrize=True, 87 | validation_split=0.5, batch_size=100, whiten=True) 88 | cca, its = analyse(lat, ref_data, msm_lags) 89 | result.update({ 90 | 'tica_%d_its' % lag: np.asarray(its), 91 | 'tica_%d_cca' % lag: np.asarray(cca), 92 | 'tica_%d_trn' % lag: np.asarray(trn), 93 | 'tica_%d_val' % lag: np.asarray(val)}) 94 | lat, trn, val = tae.ae( 95 | data, dim=1, lag=lag, n_epochs=200, validation_split=0.5, 96 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 97 | cuda=use_cuda, non_blocking=use_cuda) 98 | cca, its = analyse(lat, ref_data, msm_lags) 99 | result.update({ 100 | 'ae_%d_its' % lag: np.asarray(its), 101 | 'ae_%d_cca' % lag: np.asarray(cca), 102 | 'ae_%d_trn' % lag: np.asarray(trn), 103 | 'ae_%d_val' % lag: np.asarray(val)}) 104 | return result 105 | 106 | ################################################################################ 107 | # 108 | # BENCHMARKING THE SWISSROLL TOY MODEL 109 | # 110 | ################################################################################ 111 | 112 | def evaluate_swissroll_model( 113 | dim=None, 114 | length=30000, 115 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 116 | msm_lags=[1, 
2, 3, 4, 5, 6, 7, 8, 9, 10], 117 | use_cuda=True): 118 | '''A wrapper to run the swissroll model benchmarks 119 | 120 | Arguments: 121 | dim (int): specify the latent dimension (1 or 2) 122 | length (int): length of the sampled trajectory 123 | trns_lags (list of int): lag times for the transformers 124 | msm_lags (list of int): lag times for the MSM validation 125 | use_cuda (boolean): use a GPU to run the benchmarks 126 | ''' 127 | def analyse(lat_data, ref_data, msm_lags): 128 | cca = tae.utils.cca(torch.from_numpy(lat_data), ref_data)[1].numpy() 129 | if lat_data.shape[1] == 1: 130 | centers = np.linspace(np.min(lat_data), np.max(lat_data), 101) 131 | centers = 0.5 * (centers[:-1] + centers[1:]).reshape(-1, 1) 132 | dtraj = pyemma.coordinates.assign_to_centers(lat_data, centers) 133 | else: 134 | dtraj = pyemma.coordinates.cluster_regspace( 135 | lat_data, dmin=0.2, max_centers=400).dtrajs 136 | its = pyemma.msm.its(dtraj, lags=msm_lags, nits=3).timescales 137 | return cca, its 138 | data, dtraj = tae.toymodels.sample_swissroll_model(length) 139 | ref_data = tae.utils.whiten_data( 140 | torch.from_numpy(dtraj.reshape(-1, 1).astype(np.float32))) 141 | ref_its = pyemma.msm.its(dtraj, lags=msm_lags, nits=3).timescales 142 | lat, trn, val = tae.pca( 143 | data, dim=dim, validation_split=0.5, batch_size=100, whiten=True) 144 | cca, its = analyse(lat, ref_data, msm_lags) 145 | result = dict( 146 | trns_lags=np.asarray(trns_lags), 147 | msm_lags=np.asarray(msm_lags), 148 | ref_its=np.asarray(ref_its), 149 | pca_its=np.asarray(its), 150 | pca_cca=np.asarray(cca), 151 | pca_trn=np.asarray(trn), 152 | pca_val=np.asarray(val)) 153 | for lag in trns_lags: 154 | lat, trn, val = tae.tica( 155 | data, dim=dim, lag=lag, kinetic_map=True, symmetrize=True, 156 | validation_split=0.5, batch_size=100, whiten=True) 157 | cca, its = analyse(lat, ref_data, msm_lags) 158 | result.update({ 159 | 'tica_%d_its' % lag: np.asarray(its), 160 | 'tica_%d_cca' % lag: np.asarray(cca), 161 | 'tica_%d_trn' % lag: np.asarray(trn), 162 | 'tica_%d_val' % lag: np.asarray(val)}) 163 | lat, trn, val = tae.ae( 164 | data, dim=dim, lag=lag, n_epochs=200, validation_split=0.5, 165 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 166 | cuda=use_cuda, non_blocking=use_cuda) 167 | cca, its = analyse(lat, ref_data, msm_lags) 168 | result.update({ 169 | 'ae_%d_its' % lag: np.asarray(its), 170 | 'ae_%d_cca' % lag: np.asarray(cca), 171 | 'ae_%d_trn' % lag: np.asarray(trn), 172 | 'ae_%d_val' % lag: np.asarray(val)}) 173 | return result 174 | 175 | ################################################################################ 176 | # 177 | # BENCHMARKING THE ALANINE DIPEPTIDE MD SIMULATIONS 178 | # 179 | ################################################################################ 180 | 181 | def evaluate_ala2_md( 182 | n_trajs=5, 183 | length=50000, 184 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 185 | msm_lags=[1, 2, 3, 5, 7, 10, 15, 20, 30, 40, 50], 186 | use_cuda=True): 187 | '''A wrapper to run the alanine dipeptide benchmarks 188 | 189 | Arguments: 190 | n_trajs (int): number of bootstrapped trajectories 191 | length (int): length of each bootstrapped trajectory 192 | trns_lags (list of int): lag times for the transformers 193 | msm_lags (list of int): lag times for the MSM validation 194 | use_cuda (boolean): use a GPU to run the benchmarks 195 | ''' 196 | def analyse(lat_data, ref_data, msm_lags): 197 | cca = tae.utils.cca( 198 | torch.cat([torch.from_numpy(array) for array in lat_data]), 199 | 
ref_data)[1].numpy() 200 | dtrajs = pyemma.coordinates.cluster_kmeans( 201 | lat_data, k=300, max_iter=50, stride=10).dtrajs 202 | its = pyemma.msm.its(dtrajs, lags=msm_lags, nits=3).timescales 203 | return cca, its 204 | with np.load(_load('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')) as fh: 205 | n_frames = [fh[key].shape[0] for key in sorted(fh.keys())] 206 | selection = [] 207 | for i in np.random.choice( 208 | len(n_frames), size=n_trajs, replace=True): 209 | selection.append( 210 | [i, np.random.randint(n_frames[i] - length)]) 211 | ref_data = [fh['arr_%d' % i][l:l+length] for i, l in selection] 212 | with np.load(_load('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')) as fh: 213 | data = [fh['arr_%d' % i][l:l+length] for i, l in selection] 214 | dtrajs = pyemma.coordinates.cluster_kmeans( 215 | ref_data, k=300, max_iter=50, stride=10).dtrajs 216 | ref_its = pyemma.msm.its(dtrajs, lags=msm_lags, nits=3).timescales 217 | ref_data = tae.utils.whiten_data( 218 | torch.cat([torch.from_numpy(array) for array in ref_data])) 219 | lat, trn, val = tae.pca( 220 | data, dim=2, validation_split=0.5, batch_size=100, whiten=True) 221 | cca, its = analyse(lat, ref_data, msm_lags) 222 | result = dict( 223 | trns_lags=np.asarray(trns_lags), 224 | msm_lags=np.asarray(msm_lags), 225 | ref_its=np.asarray(ref_its), 226 | pca_its=np.asarray(its), 227 | pca_cca=np.asarray(cca), 228 | pca_trn=np.asarray(trn), 229 | pca_val=np.asarray(val)) 230 | for lag in trns_lags: 231 | lat, trn, val = tae.tica( 232 | data, dim=2, lag=lag, kinetic_map=True, symmetrize=True, 233 | validation_split=0.5, batch_size=100, whiten=True) 234 | cca, its = analyse(lat, ref_data, msm_lags) 235 | result.update({ 236 | 'tica_%d_its' % lag: np.asarray(its), 237 | 'tica_%d_cca' % lag: np.asarray(cca), 238 | 'tica_%d_trn' % lag: np.asarray(trn), 239 | 'tica_%d_val' % lag: np.asarray(val)}) 240 | lat, trn, val = tae.ae( 241 | data, dim=2, lag=lag, n_epochs=200, validation_split=0.5, 242 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 243 | cuda=use_cuda, non_blocking=use_cuda) 244 | cca, its = analyse(lat, ref_data, msm_lags) 245 | result.update({ 246 | 'ae_%d_its' % lag: np.asarray(its), 247 | 'ae_%d_cca' % lag: np.asarray(cca), 248 | 'ae_%d_trn' % lag: np.asarray(trn), 249 | 'ae_%d_val' % lag: np.asarray(val)}) 250 | return result 251 | 252 | ################################################################################ 253 | # 254 | # BENCHMARKING THE VILLIN MD SIMULATIONS 255 | # 256 | ################################################################################ 257 | 258 | def evaluate_villin_md( 259 | data=None, 260 | n_blocks=10, 261 | trns_lags=[10, 20, 50, 100, 200, 500], 262 | msm_lags=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100, 125, 150, 175, 200, 250, 300, 400, 500, 700, 1000], 263 | use_cuda=True): 264 | '''An inner wrapper to run the villin benchmarks for a single featurization 265 | 266 | Arguments: 267 | data (numpy.ndarray): featurized md data 268 | n_blocks (int): number of blocks to divide the original trajectory in 269 | trns_lags (list of int): lag times for the transformers 270 | msm_lags (list of int): lag times for the MSM validation 271 | use_cuda (boolean): use a GPU to run the benchmarks 272 | ''' 273 | def analyse(lat_data, msm_lags): 274 | dtrajs = pyemma.coordinates.cluster_kmeans( 275 | lat_data, k=300, max_iter=50, stride=10).dtrajs 276 | return pyemma.msm.its(dtrajs, lags=msm_lags, nits=2).timescales 277 | nmax = len(data) 278 | length = int(np.floor(0.5 + 
float(nmax) / float(n_blocks))) 279 | active_blocks = np.random.choice(n_blocks, size=n_blocks, replace=True) 280 | _data = [data[n * length:min((n + 1) * length, nmax), :] for n in active_blocks] 281 | result = dict( 282 | trns_lags=np.asarray(trns_lags), 283 | msm_lags=np.asarray(msm_lags)) 284 | for lag in trns_lags: 285 | for dim in [2, 5]: 286 | lat, trn, val = tae.tica( 287 | _data, dim=2, lag=lag, kinetic_map=True, symmetrize=True, 288 | validation_split=0.5, batch_size=100, whiten=True) 289 | result.update({ 290 | 'tica_%d_%d_its' % (lag, dim): np.asarray(analyse(lat, msm_lags)), 291 | 'tica_%d_%d_trn' % (lag, dim): np.asarray(trn), 292 | 'tica_%d_%d_val' % (lag, dim): np.asarray(val)}) 293 | lat, trn, val = tae.ae( 294 | _data, dim=2, lag=lag, n_epochs=200, validation_split=0.5, 295 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 296 | cuda=use_cuda, non_blocking=use_cuda) 297 | result.update({ 298 | 'ae_%d_its' % lag: np.asarray(analyse(lat, msm_lags)), 299 | 'ae_%d_trn' % lag: np.asarray(trn), 300 | 'ae_%d_val' % lag: np.asarray(val)}) 301 | return result 302 | 303 | def evaluate_villin_md_wrapper( 304 | path_to_data=None, 305 | trns_lags=[10, 20, 50, 100, 200, 500], 306 | msm_lags=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100, 125, 150, 175, 200, 250, 300, 400, 500, 700, 1000], 307 | use_cuda=True): 308 | '''An outer wrapper to run the villin benchmarks for all featurizations 309 | 310 | Arguments: 311 | path_to_data (str): path to the villin data which we are not allowed to share 312 | n_blocks (int): number of blocks to divide the original trajectory in 313 | trns_lags (list of int): lag times for the transformers 314 | msm_lags (list of int): lag times for the MSM validation 315 | use_cuda (boolean): use a GPU to run the benchmarks 316 | ''' 317 | featurisations = dict({ 318 | 'bbt': 'villin-ff-1ns-backbone-torsions.npy', 319 | 'cap': 'villin-ff-1ns-ca-positions.npy', 320 | 'hap': 'villin-ff-1ns-heavy-atom-positions.npy', 321 | 'icad': 'villin-ff-1ns-inverse-ca-distances.npy'}) 322 | result = dict() 323 | for model in featurisations.keys(): 324 | data = np.load(os.path.join(path_to_data, featurisations[model])) 325 | model_result = evaluate_villin_md( 326 | data=data, trns_lags=trns_lags, 327 | msm_lags=msm_lags, use_cuda=use_cuda) 328 | for key in model_result.keys(): 329 | if key not in ['trns_lags', 'msm_lags']: 330 | result.update({'%s_%s' % (model, key): model_result[key]}) 331 | result.update(trns_lags=trns_lags, msm_lags=msm_lags) 332 | return result 333 | 334 | ################################################################################ 335 | # 336 | # MANUSCRIPT BENCHMARKS 337 | # 338 | ################################################################################ 339 | 340 | def worker(queue, gpu, seed, evaluate_func, evaluate_kwargs): 341 | with torch.cuda.device(gpu): 342 | np.random.seed(seed) 343 | torch.manual_seed(seed) 344 | torch.cuda.manual_seed(seed) 345 | try: 346 | result = evaluate_func(**evaluate_kwargs) 347 | except Exception as e: 348 | print(e) 349 | result = dict() 350 | queue.put(result) 351 | queue.task_done() 352 | 353 | def spawn( 354 | seed_generator, task_index, n_gpus, evaluate_func, evaluate_kwargs=dict()): 355 | processes = [] 356 | queue = mp.JoinableQueue() 357 | for gpu in range(n_gpus): 358 | seed = seed_generator(task_index, gpu, n_gpus=n_gpus) 359 | p = mp.Process( 360 | target=worker, 361 | args=[queue, gpu, seed, evaluate_func, evaluate_kwargs]) 362 | processes.append(p) 363 | print('Spawning task:%d on 
gpu:%d with seed:%d' % (task_index, gpu, seed)) 364 | for p in processes: 365 | p.start() 366 | queue.join() 367 | out = dict() 368 | for _ in processes: 369 | result = queue.get() 370 | for key in result.keys(): 371 | if key in ['trns_lags', 'msm_lags']: 372 | if key not in out: 373 | out.update({key: result[key]}) 374 | else: 375 | try: 376 | out[key].append(result[key]) 377 | except KeyError: 378 | out.update({key: [result[key]]}) 379 | return out 380 | -------------------------------------------------------------------------------- /vampnet/examples/Alanine_dipeptide.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "import vampnet\n", 22 | "from vampnet import data_generator as vamp_data_generator\n", 23 | "from tensorflow.contrib.keras.api.keras.models import Model\n", 24 | "from tensorflow.contrib.keras.api.keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 25 | "from tensorflow.contrib.keras.api.keras.optimizers import Adam\n", 26 | "import tensorflow as tf\n", 27 | "import matplotlib.gridspec as gridspec\n", 28 | "from tensorflow.contrib.keras.api.keras.backend import clear_session" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Necessary for downloading the trajectory data\n", 38 | "import mdshare" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Load Data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import pyemma.coordinates as pycoor" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "# Define Hyperparameters" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Tau, how much is the timeshift of the two datasets\n", 71 | "tau = 1\n", 72 | "\n", 73 | "# Batch size for Stochastic Gradient descent\n", 74 | "batch_size = 1000\n", 75 | "\n", 76 | "# Which trajectory points percentage is used as training\n", 77 | "train_ratio = 0.9\n", 78 | "\n", 79 | "# How many hidden layers the network has\n", 80 | "network_depth = 6\n", 81 | "\n", 82 | "# Width of every layer\n", 83 | "layer_width = 100\n", 84 | "\n", 85 | "# Learning rate used for the ADAM optimizer\n", 86 | "learning_rate = 1e-4\n", 87 | "\n", 88 | "# How many output states the network has\n", 89 | "output_size = 6\n", 90 | "\n", 91 | "# Iteration over the training set in the fitting process\n", 92 | "nb_epoch = 60\n", 93 | "\n", 94 | "epsilon = 1e-5" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "traj_whole, dihedral = vamp_data_generator.get_alanine_data()\n", 104 | "\n", 105 | "traj_data_points, input_size = traj_whole.shape" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "# Initialized the VAMPnets wrapper class" 113 | ] 114 | }, 115 | 
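# --- Editor's note (descriptive, not part of the original notebook code) ---
# The VampnetTools instance created in the next cell bundles the pieces of the
# vampnet package used across these example notebooks: the training losses
# (loss_VAMP2, loss_VAMP2_autograd), the Keras metrics (metric_VAMP,
# metric_VAMP2) and the Markov-model analysis helpers (estimate_koopman_op,
# get_its/plot_its, get_ck_test/plot_ck_test). The epsilon argument is assumed
# to act as a numerical regularizer for the covariance matrices entering the
# VAMP scores (assumption, not verified here).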
{ 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "# Shuffle trajectory and lagged trajectory together" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "length_data = traj_data_points - tau\n", 138 | "\n", 139 | "traj_ord = traj_whole[:length_data]\n", 140 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n", 141 | "\n", 142 | "\n", 143 | "dihedral_init = dihedral[:length_data]\n", 144 | "\n", 145 | "indexes = np.arange(length_data)\n", 146 | "np.random.shuffle(indexes)\n", 147 | "\n", 148 | "traj = traj_ord[indexes]\n", 149 | "traj_lag = traj_ord_lag[indexes]\n", 150 | "dihedral_shuffle = dihedral_init[indexes]" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "# Prepare data for tensorflow usage" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "length_train = int(np.floor(length_data * train_ratio))\n", 167 | "length_vali = length_data - length_train\n", 168 | "\n", 169 | "traj_data_train = traj[:length_train]\n", 170 | "traj_data_train_lag = traj_lag[:length_train]\n", 171 | "\n", 172 | "traj_data_valid = traj[length_train:]\n", 173 | "traj_data_valid_lag = traj_lag[length_train:]\n", 174 | "\n", 175 | "# Input of the first network\n", 176 | "X1_train = traj_data_train.astype('float32')\n", 177 | "X2_train = traj_data_train_lag.astype('float32')\n", 178 | "\n", 179 | "# Input for validation\n", 180 | "X1_vali = traj_data_valid.astype('float32')\n", 181 | "X2_vali = traj_data_valid_lag.astype('float32')\n", 182 | "\n", 183 | "# Needs a Y-train set which we dont have.\n", 184 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n", 185 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Run several model iterations saving the best one, to help finding sparcely populated states" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "scrolled": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "max_vm = 0\n", 204 | "attempts = 10\n", 205 | "\n", 206 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 207 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 208 | "\n", 209 | "losses = [\n", 210 | " vamp.loss_VAMP2_autograd,\n", 211 | "]\n", 212 | "\n", 213 | "\n", 214 | "for i in range(attempts): \n", 215 | "\n", 216 | " # Clear the previous tensorflow session to prevent memory leaks\n", 217 | " clear_session()\n", 218 | "\n", 219 | " # Build the model\n", 220 | "\n", 221 | "\n", 222 | " nodes = [layer_width]*network_depth\n", 223 | "\n", 224 | " Data_X = Input(shape = (input_size,))\n", 225 | " Data_Y = Input(shape = (input_size,))\n", 226 | "\n", 227 | " # A batch normalization layer improves convergence speed\n", 228 | " bn_layer = BatchNormalization()\n", 229 | "\n", 230 | " # Instance layers and assign them to the two lobes of the network\n", 231 | " dense_layers = [Dense(node, activation = 'elu')# if index_layer < 3 else 'linear 
nodes')\n", 232 | " for index_layer,node in enumerate(nodes)]\n", 233 | "\n", 234 | " lx_branch = bn_layer(Data_X)\n", 235 | " rx_branch = bn_layer(Data_Y)\n", 236 | "\n", 237 | " for i, layer in enumerate(dense_layers):\n", 238 | "\n", 239 | " lx_branch = dense_layers[i](lx_branch)\n", 240 | " rx_branch = dense_layers[i](rx_branch)\n", 241 | "\n", 242 | "\n", 243 | " # Add a softmax output layer.\n", 244 | " # Should be replaced with a linear activation layer if\n", 245 | " # the outputs of the network cannot be interpreted as states\n", 246 | " softmax = Dense(output_size, activation='softmax')\n", 247 | "\n", 248 | " lx_branch = softmax(lx_branch)\n", 249 | " rx_branch = softmax(rx_branch)\n", 250 | "\n", 251 | " # Merge both networks to train both at the same time\n", 252 | " merged = concatenate([lx_branch, rx_branch])\n", 253 | "\n", 254 | " # Initialize the model and the optimizer, and compile it with\n", 255 | " # the loss and metric functions from the VAMPnets package\n", 256 | " model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 257 | " adam = Adam(lr = learning_rate/10)\n", 258 | "\n", 259 | " vm1 = np.zeros((len(losses), nb_epoch))\n", 260 | " tm1 = np.zeros_like(vm1)\n", 261 | " vm2 = np.zeros_like(vm1)\n", 262 | " tm2 = np.zeros_like(vm1)\n", 263 | " vm3 = np.zeros_like(vm1)\n", 264 | " tm3 = np.zeros_like(vm1)\n", 265 | " \n", 266 | " for l_index, loss_function in enumerate(losses):\n", 267 | "\n", 268 | " \n", 269 | " model.compile(optimizer = adam,\n", 270 | " loss = loss_function,\n", 271 | " metrics = [\n", 272 | " vamp.metric_VAMP,\n", 273 | " vamp.metric_VAMP2,\n", 274 | " ])\n", 275 | "\n", 276 | "\n", 277 | " # Train the model\n", 278 | " \n", 279 | " hist = model.fit([X1_train, X2_train], Y_train ,\n", 280 | " batch_size=batch_size,\n", 281 | " epochs=nb_epoch,\n", 282 | " validation_data=([X1_vali, X2_vali], Y_vali ),\n", 283 | " verbose=0)\n", 284 | "\n", 285 | "\n", 286 | " vm1[l_index] = np.array(hist.history['val_metric_VAMP'])\n", 287 | " tm1[l_index] = np.array(hist.history['metric_VAMP'])\n", 288 | " \n", 289 | " \n", 290 | " vm2[l_index] = np.array(hist.history['val_metric_VAMP2'])\n", 291 | " tm2[l_index] = np.array(hist.history['metric_VAMP2'])\n", 292 | " \n", 293 | " vm3[l_index] = np.array(hist.history['val_loss'])\n", 294 | " tm3[l_index] = np.array(hist.history['loss'])\n", 295 | " \n", 296 | " \n", 297 | " vm1 = np.reshape(vm1, (-1))\n", 298 | " tm1 = np.reshape(tm1, (-1))\n", 299 | " vm2 = np.reshape(vm2, (-1))\n", 300 | " tm2 = np.reshape(tm2, (-1))\n", 301 | " vm3 = np.reshape(vm3, (-1))\n", 302 | " tm3 = np.reshape(tm3, (-1))\n", 303 | "\n", 304 | " # Average the score obtained in the last part of the training process\n", 305 | " # in order to estabilish which model is better and thus worth saving\n", 306 | "\n", 307 | "\n", 308 | " score = vm1[-5:].mean()\n", 309 | " extra_msg = ''\n", 310 | " if score > max_vm:\n", 311 | " extra_msg = ' - Highest'\n", 312 | " best_weights = model.get_weights()\n", 313 | " max_vm = score\n", 314 | " vm1_max = vm1\n", 315 | " tm1_max = tm1\n", 316 | " vm2_max = vm2\n", 317 | " tm2_max = tm2\n", 318 | " vm3_max = vm3\n", 319 | " tm3_max = tm3\n", 320 | " \n", 321 | " print('Score: {0:.2f}'.format(score) + extra_msg)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "# Recover the saved model and its training history" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | 
"source": [ 337 | "model.set_weights(best_weights)\n", 338 | "\n", 339 | "tm1 = np.array(tm1_max)\n", 340 | "tm2 = np.array(tm2_max)\n", 341 | "tm3 = np.array(tm3_max)\n", 342 | "vm1 = np.array(vm1_max)\n", 343 | "vm2 = np.array(vm2_max)\n", 344 | "vm3 = np.array(vm3_max)\n" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "# Training result visualization" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "scrolled": false 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "plt.plot(vm1, label = 'VAMP')\n", 363 | "plt.plot(vm2, label = 'VAMP2')\n", 364 | "plt.plot(-vm3, label = 'loss')\n", 365 | "plt.plot(tm1, label = 'training VAMP')\n", 366 | "plt.plot(tm2, label = 'training VAMP2')\n", 367 | "plt.plot(-tm3, label = 'training loss')\n", 368 | "plt.legend()\n", 369 | "plt.show()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# Transform the input trajectory using the network\n", 379 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 380 | "\n", 381 | "# Order the output states based on their population\n", 382 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 383 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 384 | "states_num = [len(i[0]) for i in indexes]\n", 385 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 386 | "\n", 387 | "pred_ord = states_prob[:,states_order]" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "# Visualize the population of the states" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "def print_states_pie_chart():\n", 404 | " coors = []\n", 405 | " maxi = np.max(pred_ord, axis= 1)\n", 406 | "\n", 407 | " for i in range(output_size):\n", 408 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 409 | " \n", 410 | " fig1, ax1 = plt.subplots()\n", 411 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 412 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 413 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 414 | " plt.show()\n", 415 | "\n", 416 | "print_states_pie_chart()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "# Visualize how the 4 states are placed on the Ramachandran plot" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "scrolled": false 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "maxi_train = np.max(pred_ord, axis= 1)\n", 435 | "coor_train = np.zeros_like(pred_ord)\n", 436 | "for i in range(output_size):\n", 437 | " coor_train = np.where(pred_ord[:,i]== maxi_train)[0]\n", 438 | " plt.scatter(dihedral_init[coor_train,0], dihedral_init[coor_train,1], s=5)\n", 439 | "plt.axes = [[-np.pi, np.pi],[-np.pi, np.pi]]" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "# For each state, visualize the probabilities the different trajectory points have to belong to it" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "scrolled": false 454 | }, 455 | "outputs": [], 456 | 
"source": [ 457 | "fig = plt.figure(figsize=(16, 16))\n", 458 | "\n", 459 | "gs1 = gridspec.GridSpec(2, int(np.ceil(output_size/2)))\n", 460 | "gs1.update(wspace=0.05, hspace = 0.05)\n", 461 | "\n", 462 | "for n in range(output_size):\n", 463 | " ax = plt.subplot(gs1[n])\n", 464 | " im = ax.scatter(dihedral_init[:,0], dihedral_init[:,1], s=30,\n", 465 | " c = pred_ord[:,n],\n", 466 | " alpha=0.5, edgecolor='',\n", 467 | " vmin = 0, vmax = 1\n", 468 | " )\n", 469 | " plt.axis('on')\n", 470 | " title = 'State '+str(n + 1)\n", 471 | "\n", 472 | " ax.text(.85, .15, title,\n", 473 | " horizontalalignment='center',\n", 474 | " transform=ax.transAxes, fontdict = {'size':36})\n", 475 | "\n", 476 | "\n", 477 | " if (n < 3):\n", 478 | " ax.set_xticks([-3, 0, 3])\n", 479 | " ax.set_xticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 480 | " ax.xaxis.set_tick_params(top='on', bottom='off', labeltop='on', labelbottom='off')\n", 481 | " ax.xaxis.set_tick_params(labelsize=40)\n", 482 | " else:\n", 483 | " ax.set_xticks([])\n", 484 | " if (n%3==0):\n", 485 | " ax.set_yticks([-3, 0, 3])\n", 486 | " ax.set_yticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 487 | " ax.yaxis.set_tick_params(labelsize=40)\n", 488 | " else:\n", 489 | " ax.set_yticks([])\n", 490 | "# ax.set_aspect('equal')\n", 491 | " ax.set_xlim([-np.pi, np.pi]);\n", 492 | " ax.set_ylim([-np.pi, np.pi]);\n", 493 | " \n", 494 | " if (n%3 == 0):\n", 495 | " ax.set_ylabel(r'$\\Psi$ [rad]', fontdict = {'size':40})\n", 496 | " if (n < 3):\n", 497 | " ax.set_xlabel(r'$\\Phi$ [rad]', fontdict = {'size':40}, position = 'top')\n", 498 | " ax.xaxis.set_label_coords(0.5,1.2)\n", 499 | "\n", 500 | "gs1.tight_layout(fig, rect=[0, 0.03, 0.95, 0.94])\n", 501 | "fig.show()\n", 502 | "\n", 503 | "cax = fig.add_axes([0.95, 0.05, 0.02, 0.8])\n", 504 | "cbar = fig.colorbar(im, cax=cax, ticks=[0, 1])\n", 505 | "cbar.ax.yaxis.set_tick_params(labelsize=40)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "# Markov Model Estimation" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "# Estimate the implied timescales" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "scrolled": false 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "max_tau = 200\n", 531 | "lag = np.arange(1, max_tau, 1)\n", 532 | "its = vamp.get_its(pred_ord, lag)\n", 533 | "vamp.plot_its(its, lag)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "# Chapman-Kolmogorov test for the estimated koopman operator" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "steps = 8\n", 550 | "tau_msm = 35\n", 551 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n", 552 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 553 | ] 554 | } 555 | ], 556 | "metadata": { 557 | "anaconda-cloud": {}, 558 | "kernelspec": { 559 | "display_name": "Python 3", 560 | "language": "python", 561 | "name": "python3" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.6.8" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 2 578 
| } 579 | -------------------------------------------------------------------------------- /vampnet/examples/Alanine_dipeptide_multiple_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "scrolled": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "import vampnet\n", 22 | "from vampnet import data_generator as vamp_data_loader\n", 23 | "from tensorflow.contrib.keras.api.keras.models import Model\n", 24 | "from tensorflow.contrib.keras.api.keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 25 | "from tensorflow.contrib.keras.api.keras.optimizers import Adam\n", 26 | "import tensorflow as tf\n", 27 | "import matplotlib.gridspec as gridspec\n", 28 | "from tensorflow.contrib.keras.api.keras.backend import clear_session" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Necessary for downloading the trajectory data\n", 38 | "import mdshare\n", 39 | "import pyemma.coordinates as pycoor" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Define Hyperparameters" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Tau, how much is the timeshift of the two datasets\n", 56 | "tau = 1\n", 57 | "\n", 58 | "# Batch size for Stochastic Gradient descent\n", 59 | "batch_size = 1000\n", 60 | "\n", 61 | "# Which trajectory points percentage is used as training\n", 62 | "train_ratio = 0.9\n", 63 | "\n", 64 | "# How many hidden layers the network has\n", 65 | "network_depth = 6\n", 66 | "\n", 67 | "# Width of every layer\n", 68 | "layer_width = 100\n", 69 | "\n", 70 | "# Learning rate used for the ADAM optimizer\n", 71 | "learning_rate = 1e-4\n", 72 | "\n", 73 | "# How many output states the network has\n", 74 | "output_size = 6\n", 75 | "\n", 76 | "# Iteration over the training set in the fitting process\n", 77 | "nb_epoch = 40\n", 78 | "\n", 79 | "epsilon = 1e-5" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Initialized the VAMPnets wrapper class" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# Load Data" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# #Download alanine coordinates and dihedral angles data\n", 112 | "mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')\n", 113 | "mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')\n", 114 | "\n", 115 | "alanine_files = np.load('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')\n", 116 | "\n", 117 | "# # Save the files separately\n", 118 | "np.save('traj0.npy', alanine_files['arr_0'])\n", 119 | "np.save('traj1.npy', alanine_files['arr_1'])\n", 120 | "np.save('traj2.npy', 
alanine_files['arr_2'])\n", 121 | "\n", 122 | "# Separate data files between training data and validation data\n", 123 | "\n", 124 | "train_data_files_list = [\n", 125 | " 'traj0.npy',\n", 126 | " 'traj1.npy',\n", 127 | "]\n", 128 | "\n", 129 | "valid_data_files_list = [\n", 130 | " 'traj2.npy',\n", 131 | "]\n", 132 | "\n", 133 | "total_data_files_list = train_data_files_list + valid_data_files_list" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "# Define the pyemma data sources and get basic info from the files, number of datapoints and system size\n", 143 | "\n", 144 | "train_data_source = pycoor.source(train_data_files_list,chunksize = batch_size)\n", 145 | "valid_data_source = pycoor.source(valid_data_files_list,chunksize = batch_size)\n", 146 | "total_data_source = pycoor.source(total_data_files_list,chunksize = batch_size)\n", 147 | "\n", 148 | "train_datapoints = train_data_source.n_frames_total()\n", 149 | "valid_datapoints = valid_data_source.n_frames_total()\n", 150 | "total_datapoints = total_data_source.n_frames_total()\n", 151 | " \n", 152 | "traj_lengths = total_data_source.trajectory_lengths()\n", 153 | "\n", 154 | "input_size = total_data_source.dimension()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "# Run several model iterations saving the best one, to help finding sparcely populated states" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "scrolled": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "max_vm = 0\n", 173 | "attempts_number = 10\n", 174 | "\n", 175 | "\n", 176 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 177 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 178 | "\n", 179 | "losses = [\n", 180 | " vamp.loss_VAMP2_autograd,\n", 181 | "]\n", 182 | "\n", 183 | "\n", 184 | "for attempt in range(attempts_number):\n", 185 | " \n", 186 | "\n", 187 | " # Clear the previous tensorflow session to prevent memory leaks\n", 188 | " clear_session()\n", 189 | "\n", 190 | " # Build the model\n", 191 | "\n", 192 | "\n", 193 | " nodes = [layer_width]*network_depth\n", 194 | "\n", 195 | " Data_X = Input(shape = (input_size,))\n", 196 | " Data_Y = Input(shape = (input_size,))\n", 197 | "\n", 198 | " # A batch normalization layer improves convergence speed\n", 199 | " bn_layer = BatchNormalization()\n", 200 | "\n", 201 | " # Instance layers and assign them to the two lobes of the network\n", 202 | " dense_layers = [Dense(node, activation = 'elu',)\n", 203 | " for node in nodes]\n", 204 | "\n", 205 | " lx_branch = bn_layer(Data_X)\n", 206 | " rx_branch = bn_layer(Data_Y)\n", 207 | "\n", 208 | " for i, layer in enumerate(dense_layers):\n", 209 | "\n", 210 | " lx_branch = dense_layers[i](lx_branch)\n", 211 | " rx_branch = dense_layers[i](rx_branch)\n", 212 | "\n", 213 | "\n", 214 | " # Add a softmax output layer.\n", 215 | " # Should be replaced with a linear activation layer if\n", 216 | " # the outputs of the network cannot be interpreted as states\n", 217 | " softmax = Dense(output_size, activation='softmax')\n", 218 | "\n", 219 | " lx_branch = softmax(lx_branch)\n", 220 | " rx_branch = softmax(rx_branch)\n", 221 | "\n", 222 | " # Merge both networks to train both at the same time\n", 223 | " merged = concatenate([lx_branch, rx_branch])\n", 224 | "\n", 225 | " # 
Initialize the model and the optimizer, and compile it with\n", 226 | "    # the loss and metric functions from the VAMPnets package\n", 227 | "    model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 228 | "    adam = Adam(lr = learning_rate)\n", 229 | "\n", 230 | "    vm1 = np.zeros((len(losses), nb_epoch))\n", 231 | "    tm1 = np.zeros_like(vm1)\n", 232 | "    vm2 = np.zeros_like(vm1)\n", 233 | "    tm2 = np.zeros_like(vm1)\n", 234 | "    \n", 235 | "    for l_index, loss_function in enumerate(losses):\n", 236 | "\n", 237 | "        \n", 238 | "        model.compile(optimizer = adam,\n", 239 | "                      loss = loss_function,\n", 240 | "                      metrics = [\n", 241 | "                          vamp.metric_VAMP,\n", 242 | "                          vamp.metric_VAMP2,\n", 243 | "                      ])\n", 244 | "\n", 245 | "\n", 246 | "        # Train the model\n", 247 | "        \n", 248 | "        steps_per_train_epoch = int(np.sum(np.ceil((train_data_source.trajectory_lengths()-tau)/batch_size)))\n", 249 | "        steps_per_valid_epoch = int(np.sum(np.ceil((valid_data_source.trajectory_lengths()-tau)/batch_size)))\n", 250 | "        \n", 251 | "        hist = model.fit_generator(generator = vamp_data_loader.build_generator_on_source_shuffle(train_data_source,\n", 252 | "                                                                                                  batch_size,\n", 253 | "                                                                                                  tau,\n", 254 | "                                                                                                  output_size,\n", 255 | "                                                                                                  ),\n", 256 | "                                   steps_per_epoch = steps_per_train_epoch,\n", 257 | "                                   epochs = nb_epoch,\n", 258 | "                                   verbose = 0,\n", 259 | "                                   validation_data = vamp_data_loader.build_generator_on_source_shuffle(valid_data_source,\n", 260 | "                                                                                                        batch_size,\n", 261 | "                                                                                                        tau,\n", 262 | "                                                                                                        output_size,\n", 263 | "                                                                                                        ),\n", 264 | "                                   validation_steps = steps_per_valid_epoch,\n", 265 | "                                   shuffle = True\n", 266 | "                                  )\n", 267 | "\n", 268 | "        vm1[l_index] = np.array(hist.history['val_metric_VAMP'])\n", 269 | "        tm1[l_index] = np.array(hist.history['metric_VAMP'])\n", 270 | "        \n", 271 | "        vm2[l_index] = np.array(hist.history['val_metric_VAMP2'])\n", 272 | "        tm2[l_index] = np.array(hist.history['metric_VAMP2'])\n", 273 | "    \n", 274 | "    \n", 275 | "    vm1 = np.reshape(vm1, (-1))\n", 276 | "    tm1 = np.reshape(tm1, (-1))\n", 277 | "    vm2 = np.reshape(vm2, (-1))\n", 278 | "    tm2 = np.reshape(tm2, (-1))\n", 279 | "\n", 280 | "    # Average the score obtained in the last part of the training process\n", 281 | "    # in order to establish which model is better and thus worth saving\n", 282 | "\n", 283 | "\n", 284 | "    score = vm1[-5:].mean()\n", 285 | "    t_score = tm1[-5:].mean()\n", 286 | "    extra_msg = ''\n", 287 | "    if score > max_vm:\n", 288 | "        extra_msg = ' - Highest'\n", 289 | "        best_weights = model.get_weights()\n", 290 | "        max_vm = score\n", 291 | "        vm1_max = vm1\n", 292 | "        tm1_max = tm1\n", 293 | "        vm2_max = vm2\n", 294 | "        tm2_max = tm2\n", 295 | "    \n", 296 | "    print('Attempt {0}, training score: {1:.2f}, validation score: {2:.2f}'.format(attempt+1, t_score, score) + extra_msg)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "# Recover the saved model and its training history" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "model.set_weights(best_weights)\n", 313 | "\n", 314 | "tm1 = np.array(tm1_max)\n", 315 | "tm2 = np.array(tm2_max)\n", 316 | "vm1 = np.array(vm1_max)\n", 317 | "vm2 = np.array(vm2_max)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "# Training result visualization" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "scrolled": false 332 | }, 333 | "outputs": [], 334 | 
"source": [ 335 | "plt.plot(vm1, label = 'VAMP')\n", 336 | "plt.plot(vm2, label = 'VAMP2')\n", 337 | "plt.plot(tm1, label = 'training VAMP')\n", 338 | "plt.plot(tm2, label = 'training VAMP2')\n", 339 | "plt.legend()\n", 340 | "plt.show()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "# Transform the input trajectory using the network\n", 350 | "states_prob_all = model.predict_generator(generator = vamp_data_loader.build_generator_on_source(total_data_source,\n", 351 | " batch_size,\n", 352 | " tau,\n", 353 | " output_size),\n", 354 | " steps = np.sum(np.ceil((total_data_source.trajectory_lengths()-tau)/batch_size)),\n", 355 | " verbose = 0)\n", 356 | "\n", 357 | "states_prob_t = states_prob_all[:,:output_size]\n", 358 | "states_prob_lag = states_prob_all[:,output_size:]\n", 359 | "\n", 360 | "# reorganize the output of the network in order to have every data point transformed by the network in one array\n", 361 | "start = 0\n", 362 | "states_prob = np.zeros((states_prob_t.shape[0]+len(traj_lengths)*tau, output_size))\n", 363 | "for l, length_i in enumerate(traj_lengths-tau):\n", 364 | " states_prob[start+l*tau:start+l*tau+length_i] = states_prob_t[start:start+length_i]\n", 365 | " states_prob[start+l*tau+length_i:start+l*tau+length_i+tau] = states_prob_lag[start+length_i-tau:start+length_i]\n", 366 | " start += length_i\n", 367 | "\n", 368 | "# Order the output states based on their population\n", 369 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 370 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 371 | "states_num = [len(i[0]) for i in indexes]\n", 372 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 373 | "\n", 374 | "pred_ord = states_prob[:,states_order]" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# Visualize the population of the states" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "def print_states_pie_chart():\n", 391 | " coors = []\n", 392 | " maxi = np.max(pred_ord, axis= 1)\n", 393 | "\n", 394 | " for i in range(output_size):\n", 395 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 396 | " \n", 397 | " fig1, ax1 = plt.subplots()\n", 398 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 399 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 400 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 401 | " plt.show()\n", 402 | "\n", 403 | "print_states_pie_chart()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "# Visualize how the 4 states are placed on the Ramachandran plot" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "dihedral_file = np.load('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')\n", 420 | "dihedral_init = np.concatenate([dihedral_file['arr_0'],\n", 421 | " dihedral_file['arr_1'],\n", 422 | " dihedral_file['arr_2'],\n", 423 | " ], axis = 0)\n" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "scrolled": false 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "maxi_train = np.max(pred_ord, axis= 1)\n", 435 | 
"coor_train = np.zeros_like(pred_ord)\n", 436 | "for i in range(output_size):\n", 437 | " coor_train = np.where(pred_ord[:,i]== maxi_train)[0]\n", 438 | " plt.scatter(dihedral_init[coor_train,0], dihedral_init[coor_train,1], s=1)\n", 439 | "plt.axes = [[-np.pi, np.pi],[-np.pi, np.pi]]" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "# For each state, visualize the probabilities the different trajectory points have to belong to it" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "scrolled": false 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "fig = plt.figure(figsize=(16, 16))\n", 458 | "\n", 459 | "gs1 = gridspec.GridSpec(int(np.ceil(output_size/2)), 2)\n", 460 | "gs1.update(wspace=0.05, hspace = 0.05)\n", 461 | "\n", 462 | "for n in range(output_size):\n", 463 | " ax = plt.subplot(gs1[n])\n", 464 | " im = ax.scatter(dihedral_init[:,0], dihedral_init[:,1], s=5,\n", 465 | " c = pred_ord[:,n],\n", 466 | " alpha=0.5, edgecolor='', vmin = 0, vmax = 1)\n", 467 | " plt.axis('on')\n", 468 | " title = 'State '+str(n + 1)\n", 469 | "\n", 470 | " ax.text(.85, .15, title,\n", 471 | " horizontalalignment='center',\n", 472 | " transform=ax.transAxes, fontdict = {'size':36})\n", 473 | "\n", 474 | "\n", 475 | " if (n < 3):\n", 476 | " ax.set_xticks([-3, 0, 3])\n", 477 | " ax.set_xticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 478 | " ax.xaxis.set_tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)\n", 479 | " ax.xaxis.set_tick_params(labelsize=40)\n", 480 | " else:\n", 481 | " ax.set_xticks([])\n", 482 | " if (n%3==0):\n", 483 | " ax.set_yticks([-3, 0, 3])\n", 484 | " ax.set_yticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 485 | " ax.yaxis.set_tick_params(labelsize=40)\n", 486 | " else:\n", 487 | " ax.set_yticks([])\n", 488 | "# ax.set_aspect('equal')\n", 489 | " ax.set_xlim([-np.pi, np.pi]);\n", 490 | " ax.set_ylim([-np.pi, np.pi]);\n", 491 | " \n", 492 | " if (n%3 == 0):\n", 493 | " ax.set_ylabel(r'$\\Psi$ [rad]', fontdict = {'size':40})\n", 494 | " if (n < 3):\n", 495 | " ax.set_xlabel(r'$\\Phi$ [rad]', fontdict = {'size':40}, position = 'top')\n", 496 | " ax.xaxis.set_label_coords(0.5,1.2)\n", 497 | "\n", 498 | "gs1.tight_layout(fig, rect=[0, 0.03, 0.95, 0.94])\n", 499 | "fig.show()\n", 500 | "\n", 501 | "cax = fig.add_axes([0.95, 0.05, 0.02, 0.8])\n", 502 | "cbar = fig.colorbar(im, cax=cax, ticks=[0, 1])\n", 503 | "cbar.ax.yaxis.set_tick_params(labelsize=40)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "# Markov Model Estimation" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "## Prepare multiple trajectories " 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "# separate the trajectories again as a list based on the length of them\n", 527 | "traj_list = []\n", 528 | "start = 0\n", 529 | "for length_i in traj_lengths:\n", 530 | " traj_list.append(pred_ord[start:start+length_i])\n", 531 | " start += length_i" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "# Estimate the implied timescales" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "scrolled": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "max_tau = 200\n", 550 | "lag 
= np.arange(1, max_tau, 1)\n", 551 | "its = vamp.get_its(traj_list, lag)\n", 552 | "vamp.plot_its(its, lag)" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "# Chapman-Kolmogorov test for the estimated koopman operator" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "steps = 8\n", 569 | "tau_msm = 35\n", 570 | "predicted, estimated = vamp.get_ck_test(traj_list, steps, tau_msm)\n", 571 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [] 580 | } 581 | ], 582 | "metadata": { 583 | "anaconda-cloud": {}, 584 | "kernelspec": { 585 | "display_name": "Python 3", 586 | "language": "python", 587 | "name": "python3" 588 | }, 589 | "language_info": { 590 | "codemirror_mode": { 591 | "name": "ipython", 592 | "version": 3 593 | }, 594 | "file_extension": ".py", 595 | "mimetype": "text/x-python", 596 | "name": "python", 597 | "nbconvert_exporter": "python", 598 | "pygments_lexer": "ipython3", 599 | "version": "3.6.8" 600 | }, 601 | "varInspector": { 602 | "cols": { 603 | "lenName": 16, 604 | "lenType": 16, 605 | "lenVar": 40 606 | }, 607 | "kernels_config": { 608 | "python": { 609 | "delete_cmd_postfix": "", 610 | "delete_cmd_prefix": "del ", 611 | "library": "var_list.py", 612 | "varRefreshCmd": "print(var_dic_list())" 613 | }, 614 | "r": { 615 | "delete_cmd_postfix": ") ", 616 | "delete_cmd_prefix": "rm(", 617 | "library": "var_list.r", 618 | "varRefreshCmd": "cat(var_dic_list()) " 619 | } 620 | }, 621 | "types_to_exclude": [ 622 | "module", 623 | "function", 624 | "builtin_function_or_method", 625 | "instance", 626 | "_Feature" 627 | ], 628 | "window_display": false 629 | } 630 | }, 631 | "nbformat": 4, 632 | "nbformat_minor": 2 633 | } 634 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/models.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | Implementations of PCA, TICA, AE, and VAE. 
20 | ''' 21 | 22 | from torch import svd as _svd 23 | from torch import nn as _nn 24 | from torch import optim as _optim 25 | from torch import diag as _diag 26 | from torch import cat as _cat 27 | from torch import randn as _randn 28 | from torch import sum as _sum 29 | from torch import mm as _mm 30 | from torch import symeig as _symeig 31 | from torch import abs as _abs 32 | from torch import arange as _arange 33 | from torch import sqrt as _sqrt 34 | from torch import zeros as _zeros 35 | from torch import no_grad as _no_grad 36 | from torch.autograd import Function as _Function 37 | from .utils import get_mean as _get_mean 38 | from .utils import get_covariance as _get_covariance 39 | from .utils import Transform as _Transform 40 | 41 | __all__ = ['PCA', 'TICA', 'AE', 'VAE', 'VAMPNet'] 42 | 43 | ################################################################################ 44 | # 45 | # PCA 46 | # 47 | ################################################################################ 48 | 49 | class PCA(object): 50 | '''Perform a principal component analysis for dimensionality reduction. 51 | 52 | We compute the first eigenvectors of the instantaneous covariance 53 | matrix and use them to rotate/project the data into a lower dimensional 54 | subspace. 55 | ''' 56 | def __init__(self): 57 | self.loss_function = _nn.MSELoss(size_average=False) 58 | def get_loss(self, loader): 59 | '''Train the model on the provided data loader. 60 | 61 | Arguments: 62 | loader (DataLoader): the data for loss calculation 63 | ''' 64 | if loader is None: 65 | return None 66 | loss = 0.0 67 | for x, y in loader: 68 | x, y = self.transformer(x, y) 69 | loss += self.loss_function(x.mm(self.score_matrix), y).item() 70 | return loss / float(len(loader.dataset)) 71 | def fit(self, train_loader, dim=None, test_loader=None): 72 | '''Train the model on the provided data loader. 73 | 74 | Arguments: 75 | train_loader (DataLoader): the training data 76 | dim (int): the target dimensionality 77 | test_loader (DataLoader): the data for validation 78 | ''' 79 | self.x_mean, y_mean = _get_mean(train_loader) 80 | self.cxx, cxy, cyy = _get_covariance( 81 | train_loader, self.x_mean, y_mean) 82 | self.transformer = _Transform( 83 | x_mean=self.x_mean, y_mean=self.x_mean) 84 | u, s, v = _svd(self.cxx) 85 | if dim is None: 86 | dim = s.size()[0] 87 | self.decoder_matrix = u[:, :dim] 88 | self.encoder_matrix = v.t()[:dim, :] 89 | self.score_matrix = self.decoder_matrix.mm(self.encoder_matrix) 90 | return self.get_loss(train_loader), self.get_loss(test_loader) 91 | def transform(self, loader): 92 | '''Apply the model on the provided data loader. 93 | 94 | Arguments: 95 | loader (DataLoader): the data you wish to transform 96 | ''' 97 | latent = [] 98 | for x, _ in loader: 99 | x = self.transformer.x(x) 100 | latent.append(x.mm(self.encoder_matrix.t())) 101 | return _cat(latent) 102 | 103 | ################################################################################ 104 | # 105 | # TICA 106 | # 107 | ################################################################################ 108 | 109 | class TICA(object): 110 | '''Perform a time-lagged independent component analysis for 111 | dimensionality reduction. 112 | 113 | We compute a rank-d approximation to the Koopman operator and use it to 114 | rotate/project the data into a lower dimensional subspace. 
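    In practice, the fit estimates the means and the covariance matrices
    cxx, cxy and cyy from the training pairs (symmetrizing them when
    symmetrize=True), whitens the data with cxx and cyy, and takes an SVD of
    the whitened time-lagged covariance; the singular vectors define the
    encoder/decoder matrices and, with kinetic_map=True, the latent
    coordinates are additionally scaled by the singular values.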
115 | 116 | Arguments: 117 | kinetic_map (boolean): use the kinetic map variant of TICA 118 | symmetrize (boolean): enforce symmetry and reversibility 119 | ''' 120 | def __init__(self, kinetic_map=True, symmetrize=False): 121 | self.loss_function = _nn.MSELoss(size_average=False) 122 | self.kinetic_map = kinetic_map 123 | self.symmetrize = symmetrize 124 | def get_loss(self, loader): 125 | '''Train the model on the provided data loader. 126 | 127 | Arguments: 128 | loader (DataLoader): the data for loss calculation 129 | ''' 130 | if loader is None: 131 | return None 132 | loss = 0.0 133 | for x, y in loader: 134 | x, y = self.transformer(x, y) 135 | loss += self.loss_function(x.mm(self.koopman_matrix), y).item() 136 | return loss / float(len(loader.dataset)) 137 | def fit(self, train_loader, dim=None, test_loader=None): 138 | '''Train the model on the provided data loader. 139 | 140 | Arguments: 141 | train_loader (DataLoader): the training data 142 | dim (int): the target dimensionality 143 | test_loader (DataLoader): the data for validation 144 | ''' 145 | self.x_mean, self.y_mean = _get_mean(train_loader) 146 | self.cxx, self.cxy, self.cyy = _get_covariance( 147 | train_loader, self.x_mean, self.y_mean) 148 | if self.symmetrize: 149 | self.cxx = 0.5 * (self.cxx + self.cyy) 150 | self.cyy.copy_(self.cxx) 151 | self.cxy = 0.5 * (self.cxy + self.cxy.t()) 152 | self.transformer = _Transform( 153 | x_mean=self.x_mean, x_covariance=self.cxx, 154 | y_mean=self.y_mean, y_covariance=self.cyy) 155 | self.ixx = self.transformer.x.mul 156 | self.iyy = self.transformer.y.mul 157 | u, s, v = _svd(self.ixx.mm(self.cxy.mm(self.iyy))) 158 | if dim is None: 159 | dim = s.size()[0] 160 | self.decoder_matrix = v[:, :dim] 161 | self.encoder_matrix = u.t()[:dim, :] 162 | if self.kinetic_map: 163 | self.encoder_matrix = _diag(s[:dim]).mm(self.encoder_matrix) 164 | else: 165 | self.decoder_matrix = self.decoder_matrix.mm(_diag(s[:dim])) 166 | self.koopman_matrix = self.decoder_matrix.mm(self.encoder_matrix) 167 | return self.get_loss(train_loader), self.get_loss(test_loader) 168 | def transform(self, loader): 169 | '''Apply the model on the provided data loader. 170 | 171 | Arguments: 172 | loader (DataLoader): the data you wish to transform 173 | ''' 174 | latent = [] 175 | for x, _ in loader: 176 | x = self.transformer.x(x) 177 | latent.append(x.mm(self.encoder_matrix.t())) 178 | return _cat(latent) 179 | 180 | ################################################################################ 181 | # 182 | # AUTOENCODER BASE CLASS 183 | # 184 | ################################################################################ 185 | 186 | class BaseNet(_nn.Module): 187 | '''Basic shape of a pytorch neural network model for dimension reduction. 188 | 189 | The BaseNet is the basis of more specialised dimension reduction networks 190 | and provides the full infrastructure for the setup and training process. 
191 | 192 | Arguments: 193 | inp_size (int): dimensionality of the full space 194 | lat_size (int): dimensionality of the desired latent space 195 | hid_size (sequence of int): sizes of the hidden layers 196 | normalize_batch (boolean): normalize over batches instead samples 197 | dropout (Dropout): dropout layer for each hidden layer 198 | alpha (float) activation parameter for the rectified linear units 199 | prelu (bool) use a learnable ReLU 200 | bias (boolean): specify usage of bias neurons 201 | lr (float): learning rate parameter for Adam 202 | cuda (boolean): use the GPU 203 | non_blocking (boolean): use asyncronous mode (GPU only) 204 | ''' 205 | def __init__( 206 | self, inp_size, lat_size, hid_size, normalize_batch, 207 | dropout, alpha, prelu, bias, lr, cuda, non_blocking): 208 | super(BaseNet, self).__init__() 209 | sizes = [inp_size] + list(hid_size) + [lat_size] 210 | self._last = len(sizes) - 2 211 | if isinstance(dropout, float): 212 | dropout = _nn.Dropout(p=dropout) 213 | self._setup(sizes, bias, alpha, prelu, dropout) 214 | self.optimizer = _optim.Adam(self.parameters(), lr=lr) 215 | self.normalize_batch = normalize_batch 216 | self.non_blocking = non_blocking 217 | if cuda: 218 | self.use_cuda = True 219 | self.cuda() # the non_blocking=... parameter is not accepted, here 220 | else: 221 | self.use_cuda = False 222 | def _setup(self, sizes, bias, alpha, prelu, dropout): 223 | '''Implement this in your derived class to create the necessary 224 | layers. 225 | ''' 226 | def _create_activation(self, key, idx, alpha, prelu, suffix=''): 227 | '''Helper function to create activations and initialize parameters.''' 228 | if alpha is None: 229 | activation = None 230 | elif alpha < 0.0: 231 | raise ValueError('alpha must be a non-negative number') 232 | elif alpha == 0.0: 233 | activation = _nn.ReLU() 234 | elif prelu: 235 | activation = _nn.PReLU(num_parameters=1, init=alpha) 236 | else: 237 | activation = _nn.LeakyReLU(negative_slope=alpha) 238 | if activation is not None: 239 | setattr(self, key + '_act_%d%s' % (idx, suffix), activation) 240 | layer = getattr(self, key + '_prm_%d%s' % (idx, suffix)) 241 | _nn.init.kaiming_normal_(layer.weight.data, a=alpha, mode='fan_in') 242 | try: 243 | layer.bias.data.uniform_(0.0, 0.1) 244 | except AttributeError: 245 | pass 246 | def _try_to_apply_module(self, key, value): 247 | '''Helper function to safely apply a module within the network.''' 248 | try: 249 | return getattr(self, key)(value) 250 | except AttributeError: 251 | return value 252 | def _apply_layer(self, key, idx, value): 253 | '''Helper function to safely apply a layer (module sequence) within 254 | the network. 
255 | ''' 256 | return self._try_to_apply_module( 257 | key + '_drp_%d' % idx, self._try_to_apply_module( 258 | key + '_act_%d' % idx, self._try_to_apply_module( 259 | key + '_prm_%d' % idx, value))) 260 | def forward_and_apply_loss_function(self, x, y): 261 | '''Implement this in your derived class''' 262 | raise NotImplementedError('Implement in child class') 263 | def train_step(self, loader): 264 | '''A single training epoch.''' 265 | self.train() 266 | train_loss = 0 267 | for x, y in loader: 268 | x, y = self.transformer(x, y) 269 | if self.use_cuda: 270 | x = x.cuda(non_blocking=self.non_blocking) 271 | y = y.cuda(non_blocking=self.non_blocking) 272 | self.optimizer.zero_grad() 273 | loss = self.forward_and_apply_loss_function(x, y) 274 | loss.backward() 275 | train_loss += loss.item() 276 | self.optimizer.step() 277 | if self.normalize_batch: 278 | return train_loss / float(len(loader)) 279 | return train_loss / float(len(loader.dataset)) 280 | def test_step(self, loader): 281 | '''A single validation epoch''' 282 | self.eval() 283 | test_loss = 0 284 | if loader is None: 285 | return None 286 | for x, y in loader: 287 | x, y = self.transformer(x, y) 288 | if self.use_cuda: 289 | x = x.cuda(non_blocking=self.non_blocking) 290 | y = y.cuda(non_blocking=self.non_blocking) 291 | test_loss += self.forward_and_apply_loss_function(x, y).item() 292 | if self.normalize_batch: 293 | return test_loss / float(len(loader)) 294 | return test_loss / float(len(loader.dataset)) 295 | def fit(self, train_loader, n_epochs, test_loader=None): 296 | '''Train the model on the provided data loader. 297 | 298 | Arguments: 299 | train_loader (DataLoader): the training data 300 | n_epochs (int): number of training epochs 301 | test_loader (DataLoader): the data for validation 302 | ''' 303 | x_mean, y_mean = _get_mean(train_loader) 304 | cxx, cxy, cyy = _get_covariance(train_loader, x_mean, y_mean) 305 | self.transformer = _Transform( 306 | x_mean=x_mean, x_covariance=cxx, y_mean=y_mean, y_covariance=cyy) 307 | train_loss, test_loss = [], [] 308 | for epoch in range(n_epochs): 309 | train_loss.append( 310 | self.train_step( 311 | train_loader)) 312 | with _no_grad(): 313 | test_loss.append( 314 | self.test_step(test_loader)) 315 | return train_loss, test_loss 316 | def transform(self, loader): 317 | '''Apply the model on the provided data loader. 318 | 319 | Arguments: 320 | loader (DataLoader): the data you wish to transform 321 | ''' 322 | self.eval() 323 | latent = [] 324 | for x, _ in loader: 325 | x = self.transformer.x(x) 326 | if self.use_cuda: 327 | x = x.cuda(non_blocking=self.non_blocking) 328 | y = self.encode(x) 329 | if self.cuda: 330 | y = y.cpu() 331 | latent.append(y) 332 | return _cat(latent).data 333 | 334 | ################################################################################ 335 | # 336 | # AUTOENCODER 337 | # 338 | ################################################################################ 339 | 340 | class AE(BaseNet): 341 | '''Use a time-lagged autoencoder model for dimensionality reduction. 342 | 343 | We train a time-lagged autoencoder type neural network. 
344 | 345 | Arguments: 346 | inp_size (int): dimensionality of the full space 347 | lat_size (int): dimensionality of the desired latent space 348 | hid_size (sequence of int): sizes of the hidden layers 349 | dropout (Dropout): dropout layer for each hidden layer 350 | alpha (float) activation parameter for the rectified linear units 351 | prelu (bool) use a learnable ReLU 352 | bias (boolean): specify usage of bias neurons 353 | lr (float): learning rate parameter for Adam 354 | cuda (boolean): use the GPU 355 | ''' 356 | def __init__( 357 | self, inp_size, lat_size, hid_size=[], 358 | dropout=0.5, alpha=0.01, prelu=False, 359 | bias=True, lr=0.001, cuda=False, non_blocking=False): 360 | super(AE, self).__init__( 361 | inp_size, lat_size, hid_size, False, 362 | dropout, alpha, prelu, bias, lr, cuda, non_blocking) 363 | self._mse_loss_function = _nn.MSELoss(size_average=False) 364 | def _setup(self, sizes, bias, alpha, prelu, dropout): 365 | '''Helper function to create al necessary layers.''' 366 | for c, idx in enumerate(range(1, len(sizes))): 367 | setattr( 368 | self, 369 | 'enc_prm_%d' % c, 370 | _nn.Linear(sizes[idx - 1], sizes[idx], bias=bias)) 371 | self._create_activation('enc', c, alpha, prelu) 372 | if c < self._last: 373 | if dropout is not None: 374 | setattr(self, 'enc_drp_%d' % c, dropout) 375 | for c, idx in enumerate(reversed(range(1, len(sizes)))): 376 | setattr( 377 | self, 378 | 'dec_prm_%d' % c, 379 | _nn.Linear(sizes[idx], sizes[idx - 1], bias=bias)) 380 | if c < self._last: 381 | self._create_activation('dec', c, alpha, prelu) 382 | if dropout is not None: 383 | setattr(self, 'dec_drp_%d' % c, dropout) 384 | else: 385 | self._create_activation('dec', c, None, None) 386 | def forward_and_apply_loss_function(self, x, y): 387 | '''Helper function to feed data through the network and compute the 388 | desired loss. 389 | ''' 390 | return self._mse_loss_function(self(x), y) 391 | def encode(self, x): 392 | '''Encode the given input.''' 393 | y = x 394 | for idx in range(self._last): 395 | y = self._apply_layer('enc', idx, y) 396 | return getattr(self, 'enc_prm_%d' % self._last)(y) 397 | def decode(self, z): 398 | '''Decode the given input.''' 399 | y = self._try_to_apply_module('enc_act_%d' % self._last, z) 400 | for idx in range(self._last): 401 | y = self._apply_layer('dec', idx, y) 402 | return getattr(self, 'dec_prm_%d' % self._last)(y) 403 | def forward(self, x): 404 | '''Forward the given input through the network.''' 405 | return self.decode(self.encode(x)) 406 | 407 | ################################################################################ 408 | # 409 | # VARIATIONAL AUTOENCODER 410 | # 411 | ################################################################################ 412 | 413 | class VAE(BaseNet): 414 | '''Use a time-lagged variational autoencoder model for dimensionality 415 | reduction. 416 | 417 | We train a time-lagged variational autoencoder type neural network. 
418 | 419 | Arguments: 420 | inp_size (int): dimensionality of the full space 421 | lat_size (int): dimensionality of the desired latent space 422 | hid_size (sequence of int): sizes of the hidden layers 423 | beta (float) : KLD weight for optimization 424 | dropout (Dropout): dropout layer for each hidden layer 425 | alpha (float) activation parameter for the rectified linear units 426 | prelu (bool) use a learnable ReLU 427 | bias (boolean): specify usage of bias neurons 428 | lr (float): learning rate parameter for Adam 429 | cuda (boolean): use the GPU 430 | ''' 431 | def __init__( 432 | self, inp_size, lat_size, hid_size=[], beta=1.0, 433 | dropout=0.5, alpha=0.01, prelu=False, 434 | bias=True, lr=0.001, cuda=False, non_blocking=False): 435 | super(VAE, self).__init__( 436 | inp_size, lat_size, hid_size, False, 437 | dropout, alpha, prelu, bias, lr, cuda, non_blocking) 438 | self.beta = beta 439 | self._mse_loss_function = _nn.MSELoss(size_average=False) 440 | def _setup(self, sizes, bias, alpha, prelu, dropout): 441 | '''Helper function to create al necessary layers.''' 442 | for c, idx in enumerate(range(1, len(sizes) - 1)): 443 | setattr( 444 | self, 445 | 'enc_prm_%d' % c, 446 | _nn.Linear(sizes[idx - 1], sizes[idx], bias=bias)) 447 | self._create_activation('enc', c, alpha, prelu) 448 | if dropout is not None: 449 | setattr(self, 'enc_drp_%d' % c, dropout) 450 | setattr( 451 | self, 452 | 'enc_prm_%d_mu' % self._last, 453 | _nn.Linear(sizes[-2], sizes[-1], bias=bias)) 454 | self._create_activation('enc', self._last, None, None, suffix='_mu') 455 | setattr( 456 | self, 457 | 'enc_prm_%d_lv' % self._last, 458 | _nn.Linear(sizes[-2], sizes[-1], bias=bias)) 459 | self._create_activation('enc', self._last, None, None, suffix='_lv') 460 | for c, idx in enumerate(reversed(range(1, len(sizes)))): 461 | setattr( 462 | self, 463 | 'dec_prm_%d' % c, 464 | _nn.Linear(sizes[idx], sizes[idx - 1], bias=bias)) 465 | if c < self._last: 466 | self._create_activation('dec', c, alpha, prelu) 467 | if dropout is not None: 468 | setattr(self, 'dec_drp_%d' % c, dropout) 469 | else: 470 | self._create_activation('dec', c, None, None) 471 | def forward_and_apply_loss_function(self, x, y): 472 | '''Helper function to feed data through the network and compute the 473 | desired loss. 
474 | ''' 475 | y_recon, mu, lv = self(x) 476 | mse = self._mse_loss_function(y_recon, y) 477 | kld = -0.5 * _sum(1.0 + lv - mu.pow(2) - lv.exp()) 478 | return mse + self.beta * kld / float(y.size(1)) 479 | def _encode(self, x): 480 | '''Encode the given input.''' 481 | y = x 482 | for idx in range(self._last): 483 | y = self._apply_layer('enc', idx, y) 484 | mu = getattr(self, 'enc_prm_%d_mu' % self._last)(y) 485 | lv = getattr(self, 'enc_prm_%d_lv' % self._last)(y) 486 | return mu, lv 487 | def _reparameterize(self, mu, lv): 488 | '''Reparametrize the given input.''' 489 | if self.training: 490 | std = lv.mul(0.5).exp_() 491 | eps = _randn(*std.size()) 492 | if self.use_cuda: 493 | eps = eps.cuda() 494 | return eps.mul(std).add_(mu) 495 | else: 496 | return mu 497 | def encode(self, x): 498 | '''Encode/reparametrize the given input.''' 499 | return self._reparameterize(*self._encode(x)) 500 | def decode(self, z): 501 | '''Decode the given input.''' 502 | y = z 503 | for idx in range(self._last): 504 | y = self._apply_layer('dec', idx, y) 505 | return getattr(self, 'dec_prm_%d' % self._last)(y) 506 | def forward(self, x): 507 | '''Forward the given input through the network.''' 508 | mu, lv = self._encode(x) 509 | return self.decode(self._reparameterize(mu, lv)), mu, lv 510 | 511 | ################################################################################ 512 | # 513 | # VAMPNET WORK IN PROGRESS 514 | # 515 | ################################################################################ 516 | 517 | class DecomposeRSPDMatrix(_Function): 518 | @staticmethod 519 | def forward(ctx, matrix): 520 | eigval, eigvec = _symeig(matrix, eigenvectors=True) 521 | eigval = _abs(eigval) + 1e-10 522 | ctx.eigval = eigval 523 | ctx.eigvec = eigvec 524 | return eigval, eigvec 525 | @staticmethod 526 | def backward(ctx, dval, dvec): 527 | eigval = ctx.eigval 528 | eigvec = ctx.eigvec 529 | n = len(eigval) 530 | eigval_dist = eigval[:, None] - eigval[None, :] 531 | idx = _arange(n).long().tolist() 532 | eigval_dist[idx, idx] = 1.0 533 | dval_out = eigvec[:, None, :] * eigvec[None, :, :] 534 | dvec_out = _zeros(n, n, n, n).type(eigval.type()) 535 | omega = _zeros(n, n).type(eigval.type()) 536 | for i in range(n): 537 | for j in range(n): 538 | omega[:, :] = eigvec[i, :, None] * eigvec[j, None, :] 539 | omega[idx, idx] = 0.0 540 | omega.div_(eigval_dist) 541 | dvec_out[i, j, :, :] = -_mm(eigvec, omega) 542 | dval = _sum(dval[None, None, :] * dval_out, -1) 543 | dvec = _sum(_sum(dvec[None, None, :, :] * dvec_out, -1), -1) 544 | return dval + dvec 545 | 546 | def covar(x, y): 547 | return _mm(x.t(), y).div_(len(x)) 548 | 549 | def sqrtinv(matrix): 550 | eigval, eigvec = DecomposeRSPDMatrix.apply(matrix) 551 | diag = _diag(1.0 / _sqrt(eigval)) 552 | return _mm(eigvec, _mm(diag, eigvec.t())) 553 | 554 | def get_koopman_matrix(x, y): 555 | ixx = sqrtinv(covar(x, x)) 556 | iyy = sqrtinv(covar(y, y)) 557 | cxy = covar(x, y) 558 | kmm = _mm(ixx, _mm(cxy, iyy)) 559 | return kmm.t() 560 | 561 | class VAMPNet(BaseNet): 562 | def __init__( 563 | self, inp_size, lat_size, hid_size=[], 564 | dropout=0.5, alpha=0.01, prelu=False, 565 | bias=True, lr=0.001, cuda=False, non_blocking=False): 566 | super(VAMPNet, self).__init__( 567 | inp_size, lat_size, hid_size, True, 568 | dropout, alpha, prelu, bias, lr, cuda, non_blocking) 569 | def _setup(self, sizes, bias, alpha, prelu, dropout): 570 | for c, idx in enumerate(range(1, len(sizes))): 571 | setattr( 572 | self, 573 | 'enc_prm_%d' % c, 574 | _nn.Linear(sizes[idx - 1], 
sizes[idx], bias=bias)) 575 | if c < self._last: 576 | self._create_activation('enc', c, alpha, prelu) 577 | if dropout is not None: 578 | setattr(self, 'enc_drp_%d' % c, dropout) 579 | self._create_activation('enc', self._last, None, None) 580 | self.max = _nn.Softmax(dim=1)  # softmax output layer: soft assignments to the output states 581 | def forward_and_apply_loss_function(self, x, y): 582 | koopman = self(x, y) 583 | return -_sum(koopman**2)  # negative VAMP-2 score (squared Frobenius norm of the Koopman matrix) 584 | def encode(self, x): 585 | y = x 586 | for idx in range(self._last): 587 | y = self._apply_layer('enc', idx, y) 588 | y = getattr(self, 'enc_prm_%d' % self._last)(y) 589 | return self.max(y) 590 | def forward(self, x, y): 591 | x_enc = self.encode(x) 592 | y_enc = self.encode(y) 593 | return get_koopman_matrix(x_enc, y_enc)  # Koopman matrix estimated from the encoded time-lagged pairs 594 | --------------------------------------------------------------------------------
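All models in `tae/models.py` share the same fit/transform workflow on PyTorch `DataLoader`s that yield time-lagged pairs `(x_t, x_{t+lag})`. The sketch below is not part of the repository; it is a minimal illustration of how the `AE` class defined above could be driven, assuming a PyTorch version contemporary with this code (the file still uses `size_average=` and `torch.symeig`) and illustrative choices for the toy trajectory, lag time, batch size, and layer sizes.

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

from tae.models import AE

# Toy trajectory: 1000 frames in a 10-dimensional feature space (illustrative only).
traj = torch.randn(1000, 10)

# Build time-lagged pairs (x_t, x_{t+lag}) by shifting the trajectory against itself.
lag = 1
pairs = TensorDataset(traj[:-lag], traj[lag:])
train_loader = DataLoader(pairs, batch_size=64, shuffle=True)

# Two-dimensional latent space, one hidden layer of 32 units; see AE.__init__ above.
model = AE(inp_size=10, lat_size=2, hid_size=[32])

# fit() returns per-epoch training and validation losses
# (the validation entries are None when no test_loader is given).
train_loss, test_loss = model.fit(train_loader, n_epochs=20)

# transform() concatenates the encoded batches into one tensor of latent coordinates.
latent = model.transform(DataLoader(pairs, batch_size=64, shuffle=False))
print(latent.shape)  # torch.Size([999, 2])
```

The linear estimators follow the same pattern with `fit(train_loader, dim=...)` in place of a number of epochs, and `VAE`/`VAMPNet` accept the same constructor arguments as `AE` (plus `beta` for the VAE), so switching estimators mostly amounts to changing the class used to build `model`.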