├── .git_archival.txt
├── .gitattributes
├── time-lagged-autoencoder
│   ├── tae
│   │   ├── test
│   │   │   ├── __init__.py
│   │   │   ├── test_toymodels.py
│   │   │   ├── test_models.py
│   │   │   ├── test_api.py
│   │   │   └── test_utils.py
│   │   ├── __init__.py
│   │   ├── toymodels.py
│   │   ├── api.py
│   │   ├── utils.py
│   │   ├── benchmarks.py
│   │   └── models.py
│   ├── setup.cfg
│   ├── setup.py
│   └── README.md
├── README.md
├── docs
│   └── wishlist.md
├── vampnet
│   ├── vampnet
│   │   ├── __init__.py
│   │   └── data_generator.py
│   ├── setup.py
│   ├── README.md
│   └── examples
│       ├── 1D_double_well.ipynb
│       ├── Folding.ipynb
│       ├── Alanine_dipeptide.ipynb
│       └── Alanine_dipeptide_multiple_files.ipynb
├── .gitignore
└── LICENSE
/.git_archival.txt:
--------------------------------------------------------------------------------
1 | ref-names: HEAD -> master
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | .git_archival.txt export-subst
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/setup.cfg:
--------------------------------------------------------------------------------
1 | [alias]
2 | test=pytest
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # deeptime
2 | Deep learning meets molecular dynamics.
3 |
4 | ## Contents
5 |
6 | - **time-lagged-autoencoder**: a toolbox for dimension reduction of time series data with a [time-lagged autoencoder](https://aip.scitation.org/doi/full/10.1063/1.5011399)-type deep neural network.
7 | - **vampnet**: VAMPnets (neural networks based on the Variational Approach for Markov Processes), see https://www.nature.com/articles/s41467-017-02388-1
8 |
--------------------------------------------------------------------------------
/docs/wishlist.md:
--------------------------------------------------------------------------------
1 | # General
2 | - want to be able to fit batch-wise, either by providing a numpy-style array (so everything fits into memory) or by providing a generator function
3 | - which framework is used should be decided based on what is available in the environment, or based on a user configuration if both frameworks are installed
4 | # Top level
5 | - invisible to the user which NN framework is used
6 | - have models `TAE` and `VAMPNet` which can be configured (layer sizes, dropout, batch size, learning rate, activation functions, etc.)
7 | - configured models can then be `fit`-ted on data
8 | # Mid level
9 | - abstraction layer between the actual NN-framework implementation and the top layer
10 | # Low level
11 | - specialization toward pytorch / TF as implementation of the abstraction layer
12 | - smaller dispatch interfaces separated through namespaces, e.g.,
13 | ```python
14 | deeptime.scores.tf.vamp
15 | deeptime.scores.torch.vamp
16 | ```
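17 | A minimal sketch of how such a dispatch could look; the `deeptime.scores.*` modules and the `vamp_score` signature below are assumptions for illustration, not existing code:
18 | ```python
19 | import importlib.util
20 | 
21 | def _select_backend(preferred=None):
22 |     # honour an explicit user configuration first, then fall back to
23 |     # whichever framework is importable in the current environment
24 |     for name in ([preferred] if preferred else ['torch', 'tensorflow']):
25 |         if name and importlib.util.find_spec(name) is not None:
26 |             return name
27 |     raise ImportError('neither pytorch nor tensorflow is installed')
28 | 
29 | def vamp_score(data_0, data_t, backend=None):
30 |     # the top level stays framework-agnostic; the namespaced module
31 |     # (e.g. deeptime.scores.torch.vamp) provides the actual implementation
32 |     if _select_backend(backend) == 'torch':
33 |         from deeptime.scores.torch import vamp
34 |     else:
35 |         from deeptime.scores.tf import vamp
36 |     return vamp(data_0, data_t)
37 | ```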
--------------------------------------------------------------------------------
/vampnet/vampnet/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from pkg_resources import get_distribution, DistributionNotFound
19 | try:
20 | __version__ = get_distribution(__name__).version
21 | except DistributionNotFound:
22 | __version__ = 'x.y.z'
23 | del get_distribution, DistributionNotFound
24 |
25 | __author__ = 'Andreas Mardt, Luca Pasquali'
26 | __email__ = 'andreas.mardt@fu-berlin.de, luca.pasquali@fu-berlin.de'
27 |
28 | from .vampnet import VampnetTools
29 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | '''
19 | A toolbox for dimension reduction of time series data with a
20 | time-lagged autoencoder.
21 | '''
22 |
23 | from pkg_resources import get_distribution, DistributionNotFound
24 | try:
25 | __version__ = get_distribution(__name__).version
26 | except DistributionNotFound:
27 | __version__ = 'x.y.z'
28 | del get_distribution, DistributionNotFound
29 |
30 | __author__ = 'Christoph Wehmeyer'
31 | __email__ = 'christoph.wehmeyer@fu-berlin.de'
32 |
33 | from .api import pca, tica, ae, vae, vampnet
34 | from .models import PCA, TICA, AE, VAE, VAMPNet
35 | from . import utils
36 | from . import toymodels
37 |
--------------------------------------------------------------------------------
/vampnet/setup.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from setuptools import setup, find_packages
19 |
20 | description = '''
21 | Collection of functions to implement neural networks based
22 | on the variational approach for Markov processes,
23 | as described in https://arxiv.org/abs/1710.06012
24 | '''
25 |
26 | setup(
27 | use_scm_version=dict(root='..', relative_to=__file__),
28 | name='vampnet',
29 | author='Andreas Mardt, Luca Pasquali',
30 | author_email='andreas.mardt@fu-berlin.de, luca.pasquali@fu-berlin.de',
31 | url='https://github.com/markovmodel/deeptime/tree/master/vampnet',
32 | description=description,
33 | packages=find_packages(),
34 | setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'],
35 | install_requires=[
36 | 'numpy',
37 | 'scipy',
38 | 'matplotlib'],
39 | zip_safe=False)
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | Untitled*.ipynb
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # dotenv
84 | .env
85 |
86 | # virtualenv
87 | .venv
88 | venv/
89 | ENV/
90 |
91 | # Spyder project settings
92 | .spyderproject
93 | .spyproject
94 |
95 | # Rope project settings
96 | .ropeproject
97 |
98 | # mkdocs documentation
99 | /site
100 |
101 | # mypy
102 | .mypy_cache/
103 | .idea/
104 |
--------------------------------------------------------------------------------
/vampnet/README.md:
--------------------------------------------------------------------------------
1 | # VAMPnet
2 | Variational Approach for Markov Processes networks.
3 |
4 |
5 | ## What is it?
6 | VAMPnet is an open source Python package implementing the VAMPnet method for dynamical systems analysis (described in https://www.nature.com/articles/s41467-017-02388-1). It includes loss functions, metrics, basic estimators for Koopman operators, and the most important validation tools for Koopman models.
7 |
8 | VAMPnet can be used from Jupyter (formerly IPython; recommended) or by
9 | writing Python scripts.
10 |
11 |
12 | ## Citation
13 | If you use VAMPnet in scientific work, please cite:
14 |
15 | Mardt, A., Pasquali, L., Wu, H., & Noé, F. (2018).
16 | VAMPnets for deep learning of molecular kinetics.
17 | Nature communications, 9(1), 5.
18 |
19 | ## Installation
20 |
21 | IMPORTANT: On TensorFlow 1.7 and 1.8 there is an unresolved issue that causes the
22 | eigenvalue decomposition to fail. This issue does not occur on TF 1.4-1.6
23 | and 1.9+, so please use one of those releases instead.
24 |
25 | This package requires [TensorFlow](https://www.tensorflow.org).
26 | Please install either tensorflow or tensorflow-gpu. Installation instructions:
27 |
28 | https://www.tensorflow.org/install/
29 |
30 | To install this package, first clone the repository:
31 |
32 | git clone https://github.com/markovmodel/deeptime.git
33 |
34 | Then install it (from within the cloned repository's `vampnet` directory):
35 |
36 | ```bash
37 | python setup.py install
38 | ```
39 |
40 | The examples are jupyter notebooks, so the jupyter package is needed to run them:
41 |
42 | http://jupyter.readthedocs.io/en/latest/install.html
43 |
44 | This is not needed if you only want to use the package itself.
45 |
46 |
47 | If you want to run the alanine dipeptide examples, you'll also need to install the mdshare package (needed to download the trajectory files):
48 |
49 | git clone https://github.com/markovmodel/mdshare.git
50 | pip install ./mdshare
51 |
52 | or
53 |
54 | conda install mdshare -c conda-forge
55 |
56 |
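57 | After installation, a quick sanity check from Python looks like this; it only relies on the `VampnetTools` class exported by `vampnet/__init__.py` and on the package's `__version__` attribute:
58 | 
59 | ```python
60 | # minimal post-installation check
61 | import vampnet
62 | from vampnet import VampnetTools
63 | 
64 | # __version__ falls back to 'x.y.z' when setuptools_scm metadata is unavailable
65 | print(vampnet.__version__)
66 | print(VampnetTools)
67 | ```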
--------------------------------------------------------------------------------
/time-lagged-autoencoder/setup.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from setuptools import setup, find_packages
19 | from setuptools.command.test import test as TestCommand
20 | import sys
21 |
22 | try:
23 | import torch
24 | except ImportError:
25 | # setup.py forces pytorch installation via pip and ignores an existing
26 | # conda installation. That's why we catch this here...
27 | print(
28 | 'Please install pytorch >=0.2.0_4 according to the instructions on '
29 | 'http://pytorch.org before you continue!')
30 | sys.exit(1)
31 |
32 | class PyTest(TestCommand):
33 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")]
34 | def initialize_options(self):
35 | TestCommand.initialize_options(self)
36 | self.pytest_args = ['tae']
37 | def run_tests(self):
38 | import pytest
39 | errno = pytest.main(self.pytest_args)
40 | sys.exit(errno)
41 |
42 | setup(
43 | cmdclass={'test': PyTest},
44 | use_scm_version=dict(root='..', relative_to=__file__),
45 | name='tae',
46 | author='Christoph Wehmeyer',
47 | author_email='christoph.wehmeyer@fu-berlin.de',
48 | url='https://github.com/markovmodel/deeptime/tree/master/time-lagged-autoencoder',
49 | description='A toolbox for dimension reduction of time series data with a time-lagged autoencoder.',
50 | packages=find_packages(),
51 | setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'],
52 | install_requires=['numpy'],
53 | tests_require=['pytest'],
54 | zip_safe=False)
55 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/test/test_toymodels.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import numpy as np
19 | from ..toymodels import sample_hmm
20 | from ..toymodels import sample_sqrt_model
21 | from ..toymodels import sample_swissroll_model
22 |
23 | def run_sample_hmm(ndim, nstates):
24 | length = 10000
25 | states = [np.random.randn(ndim) for i in range(nstates)]
26 | cov = np.random.rand(ndim, ndim)
27 | cov = np.matmul(cov.T, cov)
28 | transition_matrix = np.random.rand(nstates, nstates)
29 | transition_matrix = transition_matrix + transition_matrix.T
30 | transition_matrix /= transition_matrix.sum()
31 | pi = transition_matrix.sum(axis=1)
32 | transition_matrix /= pi[:, None]
33 | traj, dtraj = sample_hmm(length, cov, states, transition_matrix)
34 | sets = [np.where(dtraj == state)[0] for state in range(nstates)]
35 | np.testing.assert_allclose(
36 | [float(len(s)) / float(length) for s in sets],
37 | pi, atol=0.1)
38 | for i, s in enumerate(sets):
39 | mean = np.mean(traj[s, :], axis=0)
40 | np.testing.assert_allclose(
41 | mean, states[i], atol=0.2)
42 | traj[s, :] -= mean
43 | np.testing.assert_allclose(np.cov(traj.T), cov, atol=0.2)
44 |
45 | def test_sample_hmm_random():
46 | for _ in range(3):
47 | ndim = np.random.randint(low=2, high=5)
48 | nstates = np.random.randint(low=2, high=5)
49 | run_sample_hmm(ndim, nstates)
50 |
51 | def test_sample_sqrt_model():
52 | traj, dtraj = sample_sqrt_model(20000)
53 | np.testing.assert_allclose(
54 | np.mean(traj, axis=0), [0.0, 1.9], atol=0.2)
55 | np.testing.assert_allclose(
56 | np.std(traj, axis=0, ddof=1), [5.5, 1.3], atol=0.2)
57 |
58 | def test_sample_swissroll_model():
59 | traj, dtraj = sample_swissroll_model(20000)
60 | np.testing.assert_allclose(
61 | np.mean(traj, axis=0), [-3.1, 11.2, 4.9], atol=1.0)
62 | np.testing.assert_allclose(
63 | np.std(traj, axis=0, ddof=1), [7.9, 3.8, 6.7], atol=0.4)
64 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/README.md:
--------------------------------------------------------------------------------
1 | # time-lagged autoencoder
2 |
3 | A toolbox for dimension reduction of time series data with a [time-lagged autoencoder](https://arxiv.org/abs/1710.11239)-type deep neural network.
4 |
5 | ## Installation
6 | Make sure to install pytorch via conda (instructions at http://pytorch.org) before you install this module with
7 |
8 | ```bash
9 | python setup.py test
10 | python setup.py install
11 | ```
12 |
13 | To run the included benchmarks, you also need to install the packages [pyemma](https://github.com/markovmodel/pyemma) and [mdshare](https://github.com/markovmodel/mdshare).
14 |
15 | ## Methods
16 | This package implements
17 | - principal component analysis (PCA),
18 | - time-lagged independent component analysis (TICA),
19 | - time-lagged canonical correlation analysis (via TICA),
20 | - kinetic maps (via TICA), and
21 | - an autoencoder-type neural network (AE) trained in a time-lagged manner.
22 |
23 | ## Example
24 | Assume that `data` is a single `numpy.ndarray(shape=[n_frames, n_features])` object or a list thereof, where `n_frames` refers to the number of timesteps in the/each trajectory and `n_features` refers to the number of features extracted from the original molecular dynamics (MD) data. Now choose a target dimensionality `dim` and a transformation lag time `lag`, and run:
25 |
26 | ```python
27 | import tae
28 |
29 | # run PCA
30 | pca_transformed_data, pca_train_loss, pca_val_loss = tae.pca(data, dim=dim)
31 |
32 | # run TICA
33 | tica_transformed_data, tica_train_loss, tica_val_loss = tae.tica(data, dim=dim, lag=lag)
34 |
35 | # run AE
36 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae(data, dim=dim, lag=lag)
37 |
38 | # run VAE
39 | vae_transformed_data, vae_train_loss, vae_val_loss = tae.vae(data, dim=dim, lag=lag)
40 |
41 | # run AE on a GPU
42 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae(data, dim=dim, lag=lag, cuda=True)
43 | ```
44 |
45 | In this example, we get `*_val_loss=None` because we are training on the full data set. To exclude a randomly chosen fraction `fval` of the data from the training, add the parameter `validation_split=fval` to the function calls, e.g.:
46 |
47 | ```python
48 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae(
49 | data, dim=dim, lag=lag, validation_split=fval, cuda=True)
50 | ```
51 |
52 | ## Citation
53 | ```
54 | @article{time-lagged-autoencoder,
55 | Author = {Christoph Wehmeyer and Frank No{\'{e}}},
56 | Doi = {10.1063/1.5011399},
57 | Journal = {J. Chem. Phys.},
58 | Month = {jun},
59 | Number = {24},
60 | Pages = {241703},
61 | Publisher = {{AIP} Publishing},
62 | Title = {Time-lagged autoencoders: Deep learning of slow collective variables for molecular kinetics},
63 | Volume = {148},
64 | Year = {2018}}
65 | ```
66 |
67 | ## Development system
68 | This project was developed using the following python environment:
69 |
70 | | package | version | build | channel |
71 | |:---|:---|:---|:---|
72 | | python | 3.6.1 | 2 | |
73 | | conda | 4.3.29 | py36_0 | conda-forge |
74 | | numpy | 1.13.3 | py36_blas_openblas_200 [blas_openblas] | conda-forge |
75 | | pytorch | 0.2.0 | py36_4cu75 | soumith |
76 | | pyemma | 2.4 | np113py36_1 | conda-forge |
77 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/test/test_models.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import numpy as np
19 | from torch.utils.data import DataLoader
20 | from ..utils import create_dataset
21 | from ..utils import whiten_data
22 | from ..models import PCA
23 | from ..models import TICA
24 | from ..models import AE
25 |
26 | def generate_data_2state_hmm(length=10000, lag=0, batch_size=100):
27 | transition_matrix = np.asarray([[0.9, 0.1], [0.1, 0.9]])
28 | dtraj = np.zeros(shape=(length,), dtype=np.intc)
29 | for i in range(1, length):
30 | dtraj[i] = np.random.choice(
31 | 2, size=1, p=transition_matrix[dtraj[i - 1], :])
32 | traj = np.random.randn(len(dtraj))
33 | traj[np.where(dtraj == 1)[0]] += 2.0
34 | traj_stacked = np.vstack((traj, np.zeros(len(traj))))
35 | phi = np.random.rand() * 2.0 * np.pi
36 | rot = np.asarray([
37 | [np.cos(phi), -np.sin(phi)],
38 | [np.sin(phi), np.cos(phi)]])
39 | traj_rot = np.dot(rot, traj_stacked).T
40 | return traj, \
41 | DataLoader(
42 | create_dataset(traj_rot, lag=lag),
43 | batch_size=batch_size,
44 | shuffle=True), \
45 | DataLoader(
46 | create_dataset(traj_rot, lag=0),
47 | batch_size=batch_size)
48 |
49 | ################################################################################
50 | #
51 | # PCA
52 | #
53 | ################################################################################
54 |
55 | def test_pca_2state_hmm():
56 | traj, train_loader, transform_loader = generate_data_2state_hmm()
57 | pca = PCA()
58 | pca.fit(train_loader, dim=1)
59 | out = whiten_data(pca.transform(transform_loader)).numpy().reshape((-1,))
60 | traj -= np.mean(traj)
61 | traj /= np.std(traj, ddof=1)
62 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001)
63 |
64 | ################################################################################
65 | #
66 | # TICA
67 | #
68 | ################################################################################
69 |
70 | def test_tica_2state_hmm():
71 | traj, train_loader, transform_loader = generate_data_2state_hmm(lag=1)
72 | tica = TICA()
73 | tica.fit(train_loader, dim=1)
74 | out = whiten_data(tica.transform(transform_loader)).numpy().reshape((-1,))
75 | traj -= np.mean(traj)
76 | traj /= np.std(traj, ddof=1)
77 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001)
78 |
79 | ################################################################################
80 | #
81 | # AUTOENCODER
82 | #
83 | ################################################################################
84 |
85 | def test_ae_2state_hmm():
86 | traj, train_loader, transform_loader = generate_data_2state_hmm(lag=1)
87 | ae = AE(2, 1, bias=False, alpha=None)
88 | ae.fit(train_loader, 20)
89 | out = whiten_data(ae.transform(transform_loader)).numpy().reshape((-1,))
90 | traj -= np.mean(traj)
91 | traj /= np.std(traj, ddof=1)
92 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001)
93 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/test/test_api.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import numpy as np
19 | from torch.utils.data import DataLoader
20 | from ..utils import create_dataset
21 | from ..utils import whiten_data
22 | from ..api import pca
23 | from ..api import tica
24 | from ..api import ae
25 |
26 | def generate_data_2state_hmm(length=10000):
27 | transition_matrix = np.asarray([[0.9, 0.1], [0.1, 0.9]])
28 | phi = np.random.rand() * 2.0 * np.pi
29 | rot = np.asarray([
30 | [np.cos(phi), -np.sin(phi)],
31 | [np.sin(phi), np.cos(phi)]])
32 | trajs, rtrajs = [], []
33 | for _ in range(np.random.randint(1, 5)):
34 | dtraj = np.zeros(
35 | shape=(length + np.random.randint(100),), dtype=np.intc)
36 | for i in range(1, len(dtraj)):
37 | dtraj[i] = np.random.choice(
38 | 2, size=1, p=transition_matrix[dtraj[i - 1], :])
39 | traj = np.random.randn(len(dtraj))
40 | traj[np.where(dtraj == 1)[0]] += 2.0
41 | traj_stacked = np.vstack((traj, np.zeros(len(traj))))
42 | traj_rot = np.dot(rot, traj_stacked).T
43 | trajs.append(traj[:])
44 | rtrajs.append(traj_rot[:])
45 | if len(trajs) == 1:
46 | trajs = trajs[0]
47 | rtrajs = rtrajs[0]
48 | else:
49 | trajs = np.concatenate(trajs)
50 | trajs -= np.mean(trajs)
51 | trajs /= np.std(trajs, ddof=1)
52 | return trajs, rtrajs
53 |
54 | def check_output(ref, data, out):
55 | if isinstance(data, (list, tuple)):
56 | np.testing.assert_array_equal(
57 | [o.shape[0] for o in out],
58 | [d.shape[0] for d in data])
59 | out = np.concatenate(out)
60 | else:
61 | assert data.shape[0] == out.shape[0]
62 | out = out.reshape(-1)
63 | np.testing.assert_allclose(np.abs(np.mean(ref * out)), 1.0, atol=0.001)
64 |
65 | ################################################################################
66 | #
67 | # PCA
68 | #
69 | ################################################################################
70 |
71 | def test_pca_2state_hmm():
72 | ref, data = generate_data_2state_hmm()
73 | out, train_loss, test_loss = pca(data, dim=1, whiten=True)
74 | check_output(ref, data, out)
75 |
76 | ################################################################################
77 | #
78 | # TICA
79 | #
80 | ################################################################################
81 |
82 | def test_tica_2state_hmm():
83 | ref, data = generate_data_2state_hmm()
84 | out, train_loss, test_loss = tica(data, dim=1, lag=1, whiten=True)
85 | check_output(ref, data, out)
86 |
87 | ################################################################################
88 | #
89 | # AUTOENCODER
90 | #
91 | ################################################################################
92 |
93 | def test_ae_2state_hmm():
94 | ref, data = generate_data_2state_hmm()
95 | out, train_loss, test_loss = ae(
96 | data, dim=1, lag=1, n_epochs=20, whiten=True,
97 | bias=False, hid_size=[], alpha=None)
98 | check_output(ref, data, out)
99 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/toymodels.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | '''
19 | A collection of "difficult" toymodels.
20 | '''
21 |
22 | import numpy as _np
23 |
24 | __all__ = ['sample_sqrt_model', 'sample_swissroll_model']
25 |
26 | def sample_hmm(length, cov, states, transition_matrix):
27 | '''Sample a hidden state trajectory and n-dimensional emissions.
28 |
29 | We sample a hidden state trajectory using the given transition matrix. For
30 | each hidden state, we compute Gaussian noise around the center of the state
31 | using the given covariance matrix.
32 |
33 | Arguments:
34 | length (int): length of the resulting trajectories
35 | cov (array-like of float): covariance matrix for the noise
36 | states (array-like of float): centers for each state's emissions
37 | transition_matrix (array-like of float): a transition matrix
38 | '''
39 | cov = _np.asarray(cov, dtype=_np.float32)
40 | states = _np.asarray(states, dtype=_np.float32)
41 | transition_matrix = _np.asarray(transition_matrix, dtype=_np.float32)
42 | dtraj = _np.zeros(shape=(length,), dtype=_np.intc)
43 | dtraj[0] = _np.random.randint(low=0, high=len(states))
44 | for i in range(1, length):
45 | dtraj[i] = _np.random.choice(
46 | len(states), size=1, p=transition_matrix[dtraj[i - 1], :])
47 | traj = states[dtraj, :] + _np.random.multivariate_normal(
48 | _np.zeros(len(cov)), cov, size=length, check_valid='ignore')
49 | return traj, dtraj
50 |
51 | def sqrt_transform(traj):
52 | '''Mask an emission trajectory using a sqrt transform.
53 |
54 | We add the square root of the first dimension (which ideally has a large
55 | variance) to the second (which is ideally the slowest degree of freedom)
56 | to mask the slow process.
57 |
58 | Arguments:
59 | traj (array-like of float): a trajectory of emissions
60 | '''
61 | transformed_traj = _np.asarray(traj).copy()
62 | transformed_traj[:, 1] += _np.sqrt(_np.abs(traj[:, 0]))
63 | return transformed_traj
64 |
65 | def sample_sqrt_model(length):
66 | '''Sample a hidden state and an sqrt-transformed emission trajectory.
67 |
68 | We sample a hidden state trajectory and sqrt-masked emissions in two
69 | dimensions such that the two metastable states are not linearly separable.
70 |
71 | Arguments:
72 | length (int): length of the resulting trajectories
73 | '''
74 | cov = [[30.0, 0.0], [0.0, 0.015]]
75 | states = [[0.0, 1.0], [0.0, -1.0]]
76 | transition_matrix = [[0.95, 0.05], [0.05, 0.95]]
77 | traj, dtraj = sample_hmm(length, cov, states, transition_matrix)
78 | return sqrt_transform(traj), dtraj
79 |
80 | def swissroll_transform(traj):
81 | '''Mask an emission trajectory using a swissroll transform.
82 |
83 | We roll two dimensional emissions into a swissroll style manifold in three
84 | dimensions.
85 |
86 | Arguments:
87 | traj (array-like of float): a trajectory of emissions
88 | '''
89 | x = traj[:, 0]
90 | return _np.vstack([x * _np.cos(x), traj[:, 1], x * _np.sin(x)]).T
91 |
92 | def sample_swissroll_model(length):
93 | '''Sample a hidden state and a swissroll-transformed emission trajectory.
94 |
95 | We sample a hidden state trajectory and swissroll-masked emissions in three
96 | dimensions such that the four metastable states are not linearly separable.
97 |
98 | Arguments:
99 | length (int): length of the resulting trajectories
100 | '''
101 | cov = [[1.0, 0.0], [0.0, 1.0]]
102 | states = [[7.5, 7.5], [7.5, 15.0], [15.0, 15.0], [15.0, 7.5]]
103 | transition_matrix = [
104 | [0.95, 0.05, 0.00, 0.00],
105 | [0.05, 0.90, 0.05, 0.00],
106 | [0.00, 0.05, 0.90, 0.05],
107 | [0.00, 0.00, 0.05, 0.95]]
108 | traj, dtraj = sample_hmm(length, cov, states, transition_matrix)
109 | return swissroll_transform(traj), dtraj
110 |
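111 | # The block below is only a small, optional demonstration of the samplers
112 | # defined above; it is not used anywhere else in the package.
113 | if __name__ == '__main__':
114 |     # draw a short sqrt-model trajectory and report how often each hidden
115 |     # state was visited
116 |     traj, dtraj = sample_sqrt_model(1000)
117 |     print('emission trajectory shape:', traj.shape)
118 |     print('hidden state counts:', _np.bincount(dtraj))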
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/api.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | '''
19 | A simple API to apply PCA, TICA, AE, VAE, and VAMPNet to time series data.
20 | '''
21 |
22 | from .models import PCA as _PCA
23 | from .models import TICA as _TICA
24 | from .models import AE as _AE
25 | from .models import VAE as _VAE
26 | from .models import VAMPNet as _VAMPNet
27 | from .utils import create_dataset as _create_dataset
28 | from .utils import random_split as _random_split
29 | from .utils import random_block_split as _random_block_split
30 | from .utils import whiten_data as _whiten_data
31 | from torch import nn as _nn
32 | from torch.utils.data import DataLoader as _DataLoader
33 |
34 | def _transform(model, data, data_0, batch_size, whiten, pin_memory=False):
35 | loader = _DataLoader(data_0, batch_size=batch_size, pin_memory=pin_memory)
36 | if whiten:
37 | transformed_data = _whiten_data(model.transform(loader)).numpy()
38 | else:
39 | transformed_data = model.transform(loader).numpy()
40 | if isinstance(data, (list, tuple)):
41 | collect = []
42 | p = 0
43 | lengths = [d.shape[0] for d in data]
44 | for length in lengths:
45 | collect.append(transformed_data[p:p+length, :])
46 | p += length
47 | return collect
48 | return transformed_data
49 |
50 | def pca(data, dim=None, validation_split=None, batch_size=100, whiten=False):
51 | '''Perform a principal component analysis for dimensionality reduction.
52 |
53 | We compute the first eigenvectors of the instantaneous covariance
54 | matrix and use them to rotate/project the data into a lower dimensional
55 | subspace.
56 |
57 | Arguments:
58 | data (numpy.ndarray or list thereof): the data to be transformed
59 | dim (int): the target dimensionality
60 | validation_split (float): fraction of the data reserved for validation
61 | batch_size (int): specify a batch size for the minibatch process
62 | whiten (boolean): set to True to whiten the transformed data
63 |
64 | Returns:
65 | (numpy.ndarray or list thereof): the transformed data
66 | (float): training loss
67 | (float): validation loss
68 | '''
69 | data_0 = _create_dataset(data, lag=0)
70 | if validation_split is None:
71 | train_loader = _DataLoader(data_0, batch_size=batch_size)
72 | test_loader = None
73 | else:
74 | data_test, data_train = _random_split(
75 | data_0, f_active=validation_split)
76 | train_loader = _DataLoader(data_train, batch_size=batch_size)
77 | test_loader = _DataLoader(data_test, batch_size=batch_size)
78 | model = _PCA()
79 | train_loss, test_loss = model.fit(
80 | train_loader, dim=dim, test_loader=test_loader)
81 | transformed_data = _transform(model, data, data_0, batch_size, whiten)
82 | return transformed_data, train_loss, test_loss
83 |
84 | def tica(
85 | data, dim=None, lag=1, kinetic_map=True, symmetrize=False,
86 | validation_split=None, batch_size=100, whiten=False):
87 | '''Perform a time-lagged independent component analysis for
88 | dimensionality reduction.
89 |
90 | We compute a rank-d approximation to the Koopman operator and use it to
91 | rotate/project the data into a lower dimensional subspace.
92 |
93 | Arguments:
94 | data (numpy.ndarray or list thereof): the data to be transformed
95 | dim (int): the target dimensionality
96 | lag (int): specifies the lag in time steps
97 | kinetic_map (boolean): use the kinetic map variant of TICA
98 | symmetrize (boolean): enforce symmetry and reversibility
99 | validation_split (float): fraction of the data reserved for validation
100 | batch_size (int): specify a batch size for the minibatch process
101 | whiten (boolean): set to True to whiten the transformed data
102 |
103 | Returns:
104 | (numpy.ndarray or list thereof): the transformed data
105 | (float): training loss
106 | (float): validation loss
107 | '''
108 | data_0 = _create_dataset(data, lag=0)
109 | data_lag = _create_dataset(data, lag=lag)
110 | if validation_split is None:
111 | train_loader = _DataLoader(data_lag, batch_size=batch_size)
112 | test_loader = None
113 | else:
114 | data_test, data_train = _random_block_split(
115 | data_lag, lag, f_active=validation_split)
116 | train_loader = _DataLoader(data_train, batch_size=batch_size)
117 | test_loader = _DataLoader(data_test, batch_size=batch_size)
118 | model = _TICA(kinetic_map=kinetic_map, symmetrize=symmetrize)
119 | train_loss, test_loss = model.fit(
120 | train_loader, dim=dim, test_loader=test_loader)
121 | transformed_data = _transform(model, data, data_0, batch_size, whiten)
122 | return transformed_data, train_loss, test_loss
123 |
124 | def ae(
125 | data, dim=None, lag=1, n_epochs=50, validation_split=None,
126 | batch_size=100, whiten=False, pin_memory=False, **kwargs):
127 | '''Use a time-lagged autoencoder model for dimensionality reduction.
128 |
129 | We train a deep (or shallow) time-lagged autoencoder type neural network
130 | and use the first half (encoder stage) to transform the supplied data.
131 |
132 | Arguments:
133 | data (numpy.ndarray or list thereof): the data to be transformed
134 | dim (int): the target dimensionality
135 | lag (int): specifies the lag in time steps
136 | n_epochs (int): number of training epochs
137 | validation_split (float): fraction of the data reserved for validation
138 | batch_size (int): specify a batch size for the minibatch process
139 | whiten (boolean): set to True to whiten the transformed data
140 | pin_memory (boolean): make DataLoaders return pinned memory
141 |
142 | Returns:
143 | (numpy.ndarray or list thereof): the transformed data
144 | (list of float): training loss
145 | (list of float): validation loss
146 | '''
147 | ae_args = dict(
148 | hid_size=[100],
149 | dropout=0.5,
150 | alpha=0.01,
151 | prelu=False,
152 | bias=True,
153 | lr=0.001,
154 | cuda=False,
155 | non_blocking=False)
156 | ae_args.update(kwargs)
157 | try:
158 | size = data.shape[1]
159 | except AttributeError:
160 | size = data[0].shape[1]
161 | data_0 = _create_dataset(data, lag=0)
162 | data_lag = _create_dataset(data, lag=lag)
163 | if validation_split is None:
164 | train_loader = _DataLoader(
165 | data_lag, batch_size=batch_size, pin_memory=pin_memory)
166 | test_loader = None
167 | else:
168 | data_test, data_train = _random_block_split(
169 | data_lag, lag, f_active=validation_split)
170 | train_loader = _DataLoader(
171 | data_train, batch_size=batch_size, pin_memory=pin_memory)
172 | test_loader = _DataLoader(
173 | data_test, batch_size=batch_size, pin_memory=pin_memory)
174 | model = _AE(size, dim, **ae_args)
175 | train_loss, test_loss = model.fit(
176 | train_loader, n_epochs, test_loader=test_loader)
177 | transformed_data = _transform(model, data, data_0, batch_size, whiten)
178 | return transformed_data, train_loss, test_loss
179 |
180 | def vae(
181 | data, dim=None, lag=1, n_epochs=50, validation_split=None,
182 | batch_size=100, whiten=False, pin_memory=False, **kwargs):
183 | '''Use a time-lagged variational autoencoder model for dimensionality
184 | reduction.
185 |
186 | We train a deep (or shallow) time-lagged variational autoencoder type
187 | neural network and use the first half (encoder stage) to transform the
188 | supplied data.
189 |
190 | Arguments:
191 | data (numpy.ndarray or list thereof): the data to be transformed
192 | dim (int): the target dimensionality
193 | lag (int): specifies the lag in time steps
194 | n_epochs (int): number of training epochs
195 | validation_split (float): fraction of the data reserved for validation
196 | batch_size (int): specify a batch size for the minibatch process
197 | whiten (boolean): set to True to whiten the transformed data
198 | pin_memory (boolean): make DataLoaders return pinned memory
199 |
200 | Returns:
201 | (numpy.ndarray or list thereof): the transformed data
202 | (list of float): training loss
203 | (list of float): validation loss
204 | '''
205 | vae_args = dict(
206 | hid_size=[100],
207 | beta=1.0,
208 | dropout=0.5,
209 | alpha=0.01,
210 | prelu=False,
211 | bias=True,
212 | lr=0.001,
213 | cuda=False,
214 | non_blocking=False)
215 | vae_args.update(kwargs)
216 | try:
217 | size = data.shape[1]
218 | except AttributeError:
219 | size = data[0].shape[1]
220 | data_0 = _create_dataset(data, lag=0)
221 | data_lag = _create_dataset(data, lag=lag)
222 | if validation_split is None:
223 | train_loader = _DataLoader(
224 | data_lag, batch_size=batch_size, pin_memory=pin_memory)
225 | test_loader = None
226 | else:
227 | data_test, data_train = _random_block_split(
228 | data_lag, lag, f_active=validation_split)
229 | train_loader = _DataLoader(
230 | data_train, batch_size=batch_size, pin_memory=pin_memory)
231 | test_loader = _DataLoader(
232 | data_test, batch_size=batch_size, pin_memory=pin_memory)
233 | model = _VAE(size, dim, **vae_args)
234 | train_loss, test_loss = model.fit(
235 | train_loader, n_epochs, test_loader=test_loader)
236 | transformed_data = _transform(model, data, data_0, batch_size, whiten)
237 | return transformed_data, train_loss, test_loss
238 |
239 | ################################################################################
240 | #
241 | # VAMPNET WORK IN PROGRESS
242 | #
243 | ################################################################################
244 |
245 | def vampnet(
246 | data, dim=None, lag=1, n_epochs=50, validation_split=None,
247 | batch_size=100, whiten=False, pin_memory=False, **kwargs):
248 | '''Use a vampnet model for dimensionality reduction and/or clustering.
249 |
250 | ....
251 |
252 | Arguments:
253 | data (numpy.ndarray or list thereof): the data to be transformed
254 | dim (int): the target dimensionality
255 | lag (int): specifies the lag in time steps
256 | n_epochs (int): number of training epochs
257 | validation_split (float): fraction of the data reserved for validation
258 | batch_size (int): specify a batch size for the minibatch process
259 | whiten (boolean): set to True to whiten the transformed data
260 | pin_memory (boolean): make DataLoaders return pinned memory
261 |
262 | Returns:
263 | (numpy.ndarray or list thereof): the transformed data
264 | (list of float): training score
265 | (list of float): validation score
266 | '''
267 | vn_args = dict(
268 | hid_size=[100],
269 | dropout=0.5,
270 | alpha=0.01,
271 | prelu=False,
272 | bias=True,
273 | lr=0.001,
274 | cuda=False,
275 | non_blocking=False)
276 | vn_args.update(kwargs)
277 | try:
278 | size = data.shape[1]
279 | except AttributeError:
280 | size = data[0].shape[1]
281 | data_0 = _create_dataset(data, lag=0)
282 | data_lag = _create_dataset(data, lag=lag)
283 | if validation_split is None:
284 | train_loader = _DataLoader(
285 | data_lag, batch_size=batch_size, pin_memory=pin_memory)
286 | test_loader = None
287 | else:
288 | data_test, data_train = _random_block_split(
289 | data_lag, lag, f_active=validation_split)
290 | train_loader = _DataLoader(
291 | data_train, batch_size=batch_size, pin_memory=pin_memory)
292 | test_loader = _DataLoader(
293 | data_test, batch_size=batch_size, pin_memory=pin_memory)
294 | model = _VAMPNet(size, dim, **vn_args)
295 | train_loss, test_loss = model.fit(
296 | train_loader, n_epochs, test_loader=test_loader)
297 | transformed_data = _transform(model, data, data_0, batch_size, whiten)
298 | train_loss = [-loss for loss in train_loss]
299 | test_loss = [-loss for loss in test_loss]
300 | return transformed_data, train_loss, test_loss
301 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/test/test_utils.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import numpy as np
19 | import torch
20 | from torch.utils.data import DataLoader
21 | from ..utils import LaggedDataset
22 | from ..utils import MaskedDataset
23 | from ..utils import ensure_traj_format
24 | from ..utils import create_dataset
25 | from ..utils import stride_split
26 | from ..utils import random_split
27 | from ..utils import random_block_split
28 | from ..utils import get_mean
29 | from ..utils import get_covariance
30 | from ..utils import get_sqrt_inverse
31 | from ..utils import whiten_data
32 | from ..utils import cca
33 | from ..utils import BaseTransform
34 | from ..utils import Transform
35 |
36 | ################################################################################
37 | #
38 | # DATASETS
39 | #
40 | ################################################################################
41 |
42 | def test_lagged_dataset_at_default_lag():
43 | data = np.arange(
44 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32)
45 | dataset = LaggedDataset(torch.Tensor(data), lag=0)
46 | for x, y in dataset:
47 | assert x[0] == y[0]
48 |
49 | def test_lagged_dataset_at_lag0():
50 | data = np.arange(
51 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32)
52 | dataset = LaggedDataset(torch.Tensor(data), lag=0)
53 | for x, y in dataset:
54 | assert x[0] == y[0]
55 |
56 | def test_lagged_dataset_at_random_lag():
57 | data = np.arange(
58 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32)
59 | lag = 1 + np.random.randint(50)
60 | dataset = LaggedDataset(torch.Tensor(data), lag)
61 | for x, y in dataset:
62 | assert x[0] + lag == y[0]
63 |
64 | def test_masked_dataset():
65 | data = np.arange(
66 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32)
67 | active = np.random.choice(data[:, 0], size=100, replace=False)
68 | dataset = MaskedDataset(LaggedDataset(torch.Tensor(data), lag=0), active)
69 | assert len(dataset) == len(active)
70 | for (x, y), z in zip(dataset, active):
71 | assert x[0] == y[0] == z
72 |
73 | def test_ensure_traj_format_1d():
74 | raw_data = np.arange(800 + np.random.randint(200))
75 | data = ensure_traj_format(raw_data)
76 | assert isinstance(data, np.ndarray)
77 | assert data.dtype == np.float32
78 | assert data.ndim == 2
79 | np.testing.assert_array_equal(data.shape, [len(raw_data), 1])
80 | np.testing.assert_allclose(raw_data.astype(np.float32), data[:, 0])
81 |
82 | def test_ensure_traj_format_2d():
83 | raw_data = np.arange(800 + np.random.randint(200)).reshape(-1, 1)
84 | data = ensure_traj_format(raw_data)
85 | assert isinstance(data, np.ndarray)
86 | assert data.dtype == np.float32
87 | assert data.ndim == 2
88 | np.testing.assert_array_equal(data.shape, raw_data.shape)
89 | np.testing.assert_allclose(raw_data.astype(np.float32), data)
90 |
91 | def test_create_dataset_single_file_1d():
92 | data = np.arange(
93 | 800 + np.random.randint(200))
94 | lag = np.random.randint(50)
95 | dataset = create_dataset(data, lag, dtype=np.float32)
96 | for x, y in dataset:
97 | assert x[0] + lag == y[0]
98 |
99 | def test_create_dataset_single_file_2d():
100 | data = np.arange(
101 | 800 + np.random.randint(200)).reshape(-1, 1)
102 | lag = np.random.randint(50)
103 | dataset = create_dataset(data, lag, dtype=np.float32)
104 | for x, y in dataset:
105 | assert x[0] + lag == y[0]
106 |
107 | def test_create_dataset_multiple_files_1d():
108 | data = [np.arange(800 + np.random.randint(200)) for _ in range(3)]
109 | lag = np.random.randint(50)
110 | dataset = create_dataset(data, lag, dtype=np.float32)
111 | for x, y in dataset:
112 | assert x[0] + lag == y[0]
113 |
114 | def test_create_dataset_multiple_files_2d():
115 | data = [np.arange(
116 | 800 + np.random.randint(200)).reshape(-1, 1) for _ in range(3)]
117 | lag = np.random.randint(50)
118 | dataset = create_dataset(data, lag, dtype=np.float32)
119 | for x, y in dataset:
120 | assert x[0] + lag == y[0]
121 |
122 | def test_stride_split():
123 | data = np.arange(
124 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32)
125 | lag = 1 + np.random.randint(50)
126 | dataset = LaggedDataset(torch.Tensor(data), lag)
127 | stride = 1 + np.random.randint(10)
128 | offset = np.random.randint(stride)
129 | dataset_a, dataset_b = stride_split(dataset, stride=stride, offset=offset)
130 | assert len(dataset) == len(dataset_a) + len(dataset_b)
131 | for x, y in dataset_a:
132 | assert x[0] + lag == y[0]
133 | for x, y in dataset_b:
134 | assert x[0] + lag == y[0]
135 |
136 | def test_random_split():
137 | data = np.arange(
138 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32)
139 | lag = 1 + np.random.randint(50)
140 | dataset = LaggedDataset(torch.Tensor(data), lag)
141 | dataset_a, dataset_b = random_split(dataset, f_active=0.5)
142 | assert len(dataset) == len(dataset_a) + len(dataset_b)
143 | for x, y in dataset_a:
144 | assert x[0] + lag == y[0]
145 | for x, y in dataset_b:
146 | assert x[0] + lag == y[0]
147 |
148 | def test_random_block_split():
149 | data = np.arange(
150 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32)
151 | lag = 1 + np.random.randint(50)
152 | dataset = LaggedDataset(torch.Tensor(data), lag)
153 | dataset_a, dataset_b = random_block_split(dataset, lag, f_active=0.5)
154 | assert len(dataset) == len(dataset_a) + len(dataset_b)
155 | for x, y in dataset_a:
156 | assert x[0] + lag == y[0]
157 | for x, y in dataset_b:
158 | assert x[0] + lag == y[0]
159 |
160 | ################################################################################
161 | #
162 | # STATISTICS
163 | #
164 | ################################################################################
165 |
166 | def test_get_mean_via_normal_distribution_parameters():
167 | data = torch.randn(10000, 1)
168 | dataset = LaggedDataset(data, lag=0)
169 | x, y = get_mean(
170 | DataLoader(
171 | dataset, batch_size=np.random.randint(low=10, high=100)))
172 | np.testing.assert_allclose(x.numpy(), 0.0, atol=0.05)
173 | np.testing.assert_allclose(y.numpy(), 0.0, atol=0.05)
174 |
175 | def test_get_mean_via_distribution_symmetry():
176 | data = torch.rand(5000, 1)
177 | data = torch.cat([data, -data])
178 | dataset = LaggedDataset(data, lag=0)
179 | x, y = get_mean(
180 | DataLoader(
181 | dataset, batch_size=np.random.randint(low=10, high=100)))
182 | np.testing.assert_allclose(x.numpy(), 0.0, atol=0.0001)
183 | np.testing.assert_allclose(y.numpy(), 0.0, atol=0.0001)
184 |
185 | def test_get_mean_vs_numpy():
186 | data = torch.randn(10000, 1)
187 | dataset = LaggedDataset(data, lag=0)
188 | x, y = get_mean(
189 | DataLoader(
190 | dataset, batch_size=np.random.randint(low=10, high=100)))
191 | numpy_result = np.mean(data.numpy())
192 | np.testing.assert_allclose(x.numpy(), numpy_result, atol=0.0001)
193 | np.testing.assert_allclose(y.numpy(), numpy_result, atol=0.0001)
194 |
195 | def test_get_covariance_via_normal_distribution_parameters():
196 | data = torch.randn(10000, 1)
197 | dataset = LaggedDataset(data, lag=0)
198 | xx, xy, yy = get_covariance(
199 | DataLoader(
200 | dataset, batch_size=np.random.randint(low=10, high=100)),
201 | torch.Tensor([0]), torch.Tensor([0]))
202 | np.testing.assert_allclose(xx.numpy(), 1.0, atol=0.1)
203 | np.testing.assert_allclose(xy.numpy(), 1.0, atol=0.1)
204 | np.testing.assert_allclose(yy.numpy(), 1.0, atol=0.1)
205 |
206 | def test_get_covariance_vs_numpy():
207 | data = torch.randn(10000, 1)
208 | dataset = LaggedDataset(data, lag=0)
209 | xx, xy, yy = get_covariance(
210 | DataLoader(
211 | dataset, batch_size=np.random.randint(low=10, high=100)),
212 | torch.Tensor([0]), torch.Tensor([0]))
213 | numpy_result = np.var(data.numpy(), ddof=1)
214 | np.testing.assert_allclose(xx.numpy(), numpy_result, atol=0.0005)
215 | np.testing.assert_allclose(xy.numpy(), numpy_result, atol=0.0005)
216 | np.testing.assert_allclose(yy.numpy(), numpy_result, atol=0.0005)
217 |
218 | ################################################################################
219 | #
220 | # WHITENING
221 | #
222 | ################################################################################
223 |
224 | def test_get_sqrt_inverse():
225 | dim = 2 + np.random.randint(5)
226 | x = torch.rand(500, dim)
227 | x = torch.mm(x.t(), x)
228 | y = get_sqrt_inverse(x)
229 | y = torch.mm(y, y)
230 | np.testing.assert_allclose(
231 | x.mm(y).numpy(),
232 | np.diag([1.0] * dim).astype(np.float32),
233 | atol=0.0001)
234 |
235 | def test_whiten_data():
236 | dim = 1 + np.random.randint(5)
237 | x = whiten_data(torch.rand(500, dim))
238 | np.testing.assert_allclose(
239 | x.numpy().mean(axis=0),
240 | 0.0,
241 | atol=0.01)
242 | np.testing.assert_allclose(
243 | torch.mm(x.t(), x).div_(float(x.size()[0])).numpy(),
244 | np.diag([1.0] * dim).astype(np.float32),
245 | atol=0.01)
246 |
247 | ################################################################################
248 | #
249 | # CCA
250 | #
251 | ################################################################################
252 |
253 | def test_cca():
254 | s = np.arange(1000)
255 | x = torch.from_numpy(
256 | np.vstack((s, np.random.randn(s.shape[0]))).T.astype(np.float32))
257 | y = torch.from_numpy(
258 | np.vstack((np.random.randn(s.shape[0]), s)).T.astype(np.float32))
259 | u, s, v = cca(x, y, batch_size=100)
260 | np.testing.assert_allclose(s.numpy(), [1.0, 0.0], atol=0.2)
261 | p = u.mm(torch.diag(s).mm(v))
262 | np.testing.assert_allclose(
263 | np.abs(p.numpy()), [[0.0, 1.0], [0.0, 0.0]], atol=0.2)
264 |
265 | ################################################################################
266 | #
267 | # TRANSFORMER
268 | #
269 | ################################################################################
270 |
271 | def test_base_transform():
272 | dim = 2 + np.random.randint(5)
273 | mean = 10.0 * (torch.rand(dim) - 0.5)
274 | sigma = torch.rand(dim, dim)
275 | sigma.add_(sigma.t())
276 | data = torch.randn(500, dim).mm(sigma) + mean[None, :]
277 | loader = DataLoader(LaggedDataset(data, lag=0), batch_size=64)
278 | x_mean, y_mean = get_mean(loader)
279 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean)
280 | transformer = BaseTransform(mean=x_mean, covariance=cxx)
281 | transformed_data = []
282 | for x, _ in loader:
283 | transformed_data.append(transformer(x))
284 | y = torch.cat(transformed_data)
285 | np.testing.assert_allclose(
286 | y.numpy().mean(axis=0),
287 | 0.0,
288 | atol=0.01)
289 | np.testing.assert_allclose(
290 | torch.mm(y.t(), y).div_(float(y.size()[0])).numpy(),
291 | np.diag([1.0] * dim).astype(np.float32),
292 | atol=0.2)
293 |
294 | def test_transform():
295 | dim = 2 + np.random.randint(5)
296 | mean = 10.0 * (torch.rand(dim) - 0.5)
297 | sigma = torch.rand(dim, dim)
298 | sigma.add_(sigma.t())
299 | data = torch.randn(500, dim).mm(sigma) + mean[None, :]
300 | loader = DataLoader(LaggedDataset(data, lag=0), batch_size=64)
301 | x_mean, y_mean = get_mean(loader)
302 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean)
303 | transformer = Transform(
304 | x_mean=x_mean, x_covariance=cxx,
305 | y_mean=x_mean, y_covariance=cyy)
306 | x_, y_ = [], []
307 | for x, y in loader:
308 | x, y = transformer(x, y)
309 | x_.append(x)
310 | y_.append(y)
311 | x = torch.cat(x_)
312 | y = torch.cat(y_)
313 | np.testing.assert_allclose(
314 | x.numpy().mean(axis=0),
315 | 0.0,
316 | atol=0.1)
317 | np.testing.assert_allclose(
318 | torch.mm(x.t(), x).div_(float(x.size()[0])).numpy(),
319 | np.diag([1.0] * dim).astype(np.float32),
320 | atol=0.1)
321 | np.testing.assert_allclose(
322 | y.numpy().mean(axis=0),
323 | 0.0,
324 | atol=0.1)
325 | np.testing.assert_allclose(
326 | torch.mm(y.t(), y).div_(float(y.size()[0])).numpy(),
327 | np.diag([1.0] * dim).astype(np.float32),
328 | atol=0.1)
329 |
--------------------------------------------------------------------------------
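
The tests above exercise each dataset helper in isolation. For orientation, here is a minimal sketch (assuming the `tae` package and PyTorch are installed) of how `create_dataset`, `random_block_split`, and a `DataLoader` combine into a train/validation split of time-lagged pairs, mirroring the pattern used in `tae/api.py`:

```python
import numpy as np
from torch.utils.data import DataLoader
from tae.utils import create_dataset, random_block_split

# three toy trajectories, each a (n_frames, 3) array
trajs = [np.random.randn(1000, 3) for _ in range(3)]
lag = 5

# concatenated dataset of (frame_t, frame_t+lag) pairs
dataset = create_dataset(trajs, lag=lag)

# block-wise random split that preserves the number of transitions
train_set, test_set = random_block_split(dataset, lag, f_active=0.5)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
test_loader = DataLoader(test_set, batch_size=128)

x, y = next(iter(train_loader))  # x: frames at time t, y: frames at time t+lag
assert x.shape == y.shape
```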
/vampnet/examples/1D_double_well.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Import all the packages used"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import matplotlib.pyplot as plt\n",
18 | "%matplotlib inline\n",
19 | "import vampnet\n",
20 | "from vampnet import data_generator\n",
21 | "from keras.models import Model\n",
22 | "from keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n",
23 | "from keras import optimizers\n",
24 | "import tensorflow as tf\n",
25 | "from keras.backend import clear_session"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# generate 50000 frames and energy values\n",
35 | "datapoints = int(5e4)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "traj_whole = data_generator.get_asymmetric_double_well_data(datapoints)\n",
45 | "# To fit the dataformat\n",
46 | "traj_whole = np.expand_dims(traj_whole, 1)\n",
47 | "traj_data_points, input_size = traj_whole.shape"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "x = np.linspace(-1,5,500)\n",
57 | "plt.figure(figsize=(6,2))\n",
58 | "plt.ylim(-15,10)\n",
59 | "plt.xlim(-1,5)\n",
60 | "plt.plot(x,data_generator.asymmetric_double_well_energy(x), lw = 2)\n",
61 | "plt.xlabel('Position x / a.u.', fontsize = 16)\n",
62 | "plt.ylabel('Pot. energy / a.u.', fontsize = 16)\n",
63 | "plt.xticks(fontsize = 14)\n",
64 | "\n",
65 | "plt.yticks(fontsize = 14);"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# All Hyperparameters\n",
75 | "\n",
76 | "# Tau: the time shift (in frames) between the two datasets\n",
77 | "tau = 1\n",
78 | "\n",
79 | "# Batch size for Stochastic Gradient descent\n",
80 | "batch_size = 2048\n",
81 | "\n",
82 | "# Fraction of the trajectory points used for training\n",
83 | "train_ratio = 0.9\n",
84 | "\n",
85 | "# How many hidden layers the network has\n",
86 | "network_depth = 4\n",
87 | "\n",
88 | "# Width of every layer\n",
89 | "layer_width = 20\n",
90 | "nodes = [layer_width]*network_depth\n",
91 | "# Learning rate used for the ADAM optimizer\n",
92 | "learning_rate = 0.0001\n",
93 | "\n",
94 | "# How many output states the network has\n",
95 | "output_size = 5\n",
96 | "\n",
97 | "# Iteration over the training set in the fitting process\n",
98 | "nb_epoch = 300"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "epsilon = 1e-5\n",
108 | "vamp = vampnet.VampnetTools(epsilon = epsilon)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# Shuffle trajectory and lagged trajectory together\n",
118 | "length_data = traj_data_points - tau\n",
119 | "\n",
120 | "traj_ord= traj_whole[:length_data]\n",
121 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n",
122 | "\n",
123 | "indexes = np.arange(length_data)\n",
124 | "np.random.shuffle(indexes)\n",
125 | "\n",
126 | "\n",
127 | "\n",
128 | "traj = traj_ord[indexes]\n",
129 | "traj_lag = traj_ord_lag[indexes]\n"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# Prepare data for tensorflow usage\n",
139 | "length_train = int(np.floor(length_data * train_ratio))\n",
140 | "length_vali = length_data - length_train\n",
141 | "\n",
142 | "traj_data_train = traj[:length_train]\n",
143 | "traj_data_train_lag = traj_lag[:length_train]\n",
144 | "\n",
145 | "traj_data_valid = traj[length_train:]\n",
146 | "traj_data_valid_lag = traj_lag[length_train:]\n",
147 | "\n",
148 | "#Data used for states ordering\n",
149 | "X1 = traj_ord[:length_data].astype('float32')\n",
150 | "X2 = traj_ord_lag[:length_data].astype('float32')\n",
151 | "\n",
152 | "# Input of the first network\n",
153 | "X1_train = traj_data_train.astype('float32')\n",
154 | "X2_train = traj_data_train_lag.astype('float32')\n",
155 | "\n",
156 | "# Input for validation\n",
157 | "X1_vali = traj_data_valid.astype('float32')\n",
158 | "X2_vali = traj_data_valid_lag.astype('float32')\n",
159 | "\n",
160 | "# Keras expects label arrays, which VAMPnets do not use, so pass zero placeholders.\n",
161 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n",
162 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "if 'model' in globals():\n",
172 | " del model\n",
173 | " clear_session()\n",
174 | "# Build the model\n",
175 | "Data_X = Input(shape = (input_size,))\n",
176 | "Data_Y = Input(shape = (input_size,))\n",
177 | "\n",
178 | "# A batch normalization layer improves convergence speed\n",
179 | "# bn_layer = BatchNormalization()\n",
180 | "bn_layer = Activation('linear')\n",
181 | "\n",
182 | "# Instantiate layers and assign them to the two lobes of the network\n",
183 | "dense_layers = [Dense(node, activation = 'relu',)\n",
184 | " for node in nodes]\n",
185 | "\n",
186 | "lx_branch = bn_layer(Data_X)\n",
187 | "rx_branch = bn_layer(Data_Y)\n",
188 | "\n",
189 | "for i, layer in enumerate(dense_layers):\n",
190 | "\n",
191 | " lx_branch = dense_layers[i](lx_branch)\n",
192 | " rx_branch = dense_layers[i](rx_branch)\n",
193 | "\n",
194 | "\n",
195 | "# Add a softmax output layer.\n",
196 | "# Should be replaced with a linear activation layer if\n",
197 | "# the outputs of the network cannot be interpreted as states\n",
198 | "softmax = Dense(output_size, activation='softmax')\n",
199 | "\n",
200 | "lx_branch = softmax(lx_branch)\n",
201 | "rx_branch = softmax(rx_branch)\n",
202 | "\n",
203 | "# Merge both networks to train both at the same time\n",
204 | "merged = concatenate([lx_branch, rx_branch])\n",
205 | "\n",
206 | "# Initialize the model and the optimizer, and compile it with\n",
207 | "# the loss and metric functions from the VAMPnets package\n",
208 | "model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n",
209 | "# model.summary()\n",
210 | "# Compile it with our own loss-function\n",
211 | "adam = optimizers.adam(lr = learning_rate)\n",
212 | "\n",
213 | "\n",
214 | "# Pretraining with VAMP with 'symmetrized' matrices yields a bad approximation of the \n",
215 | "# eigenvectors per se, but improves the 'readability' of the states identified by VAMP-2\n",
216 | "# which would otherwise be difficult to interpret.\n",
217 | "\n",
218 | "\n",
219 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n",
220 | "# For older versions of TF, use the function vamp.loss_VAMP2\n",
221 | "\n",
222 | "losses = [\n",
223 | " vamp._loss_VAMP_sym,\n",
224 | " vamp.loss_VAMP2,\n",
225 | "]\n",
226 | "\n",
227 | "valid_metric = np.zeros((len(losses), nb_epoch))\n",
228 | "train_metric = np.zeros((len(losses), nb_epoch))\n",
229 | "\n",
230 | "for l_index, loss in enumerate(losses):\n",
231 | " \n",
232 | "    model.compile(optimizer = adam, loss = loss, metrics = [vamp.metric_VAMP])\n",
233 | " \n",
234 | " hist = model.fit([X1_train, X2_train], Y_train ,batch_size=batch_size, epochs=nb_epoch, verbose=0,\n",
235 | " validation_data=([X1_vali, X2_vali], Y_vali))\n",
236 | " \n",
237 | " temp = model.predict([traj_ord, traj_ord_lag], batch_size=np.shape(X1_vali)[0])\n",
238 | " \n",
239 | " x_a = temp[:,:output_size]\n",
240 | "\n",
241 | "\n",
242 | " X_Validation = np.squeeze(traj_ord)\n",
243 | " for i in range(output_size):\n",
244 | " plt.scatter(X_Validation, x_a[:,i], label= 'state '+str(i))\n",
245 | " plt.title('State probabilities')\n",
246 | " plt.legend()\n",
247 | " plt.show()\n",
248 | "\n",
249 | "\n",
250 | "\n",
251 | "\n",
252 | " states_prob_meanfree = x_a - np.mean(x_a, axis=0)\n",
253 | " tau_msm = 5\n",
254 | " K_smt = vamp.estimate_koopman_op(states_prob_meanfree, tau_msm)\n",
255 | "\n",
256 | " K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n",
257 | "\n",
258 | " index = np.argmax(np.real(K_eigvals))\n",
259 | " real_eigfunc = states_prob_meanfree @ np.real(K_eigvec[:,index])\n",
260 | "\n",
261 | " plt.scatter(X_Validation, real_eigfunc)\n",
262 | " plt.title('Eigenvector')\n",
263 | " plt.show()\n",
264 | "\n",
265 | " valid_metric[l_index] = np.array(hist.history['val_metric_VAMP'])\n",
266 | " train_metric[l_index] = np.array(hist.history['metric_VAMP'])\n",
267 | "\n",
268 | "valid_metric = np.reshape(valid_metric, (-1))\n",
269 | "train_metric = np.reshape(train_metric, (-1))"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "# Training result visualization"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "plt.plot(train_metric, label = 'Training')\n",
286 | "plt.legend()\n",
287 | "plt.plot(valid_metric, label = 'Validation')\n",
288 | "plt.legend()\n",
289 | "\n",
290 | "plt.show()"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "# Transform the input trajectory using the network"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n",
307 | "\n",
308 | "# Order the output states based on their population\n",
309 | "coor_pred = np.argmax(states_prob, axis = 1)\n",
310 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n",
311 | "states_num = [len(i[0]) for i in indexes]\n",
312 | "states_order = np.argsort(states_num).astype('int')[::-1]\n",
313 | "\n",
314 | "pred_ord = states_prob[:,states_order]"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "# Visualize the population of the states"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "def print_states_pie_chart():\n",
331 | " coors = []\n",
332 | " maxi = np.max(pred_ord, axis= 1)\n",
333 | "\n",
334 | " for i in range(output_size):\n",
335 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n",
336 | " \n",
337 | " fig1, ax1 = plt.subplots()\n",
338 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n",
339 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
340 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n",
341 | " plt.show()\n",
342 | "\n",
343 | "print_states_pie_chart()"
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "metadata": {},
349 | "source": [
350 | "# Estimate the implied timescales"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "max_tau = 15\n",
360 | "lag = np.arange(1, max_tau, 1)\n",
361 | "its = vamp.get_its(pred_ord, lag)\n",
362 | "vamp.plot_its(its, lag)"
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {},
368 | "source": [
369 | "# Chapman-Kolmogorov test for the estimated Koopman operator"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "steps = 16\n",
379 | "tau_msm = 1\n",
380 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n",
381 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": []
390 | }
391 | ],
392 | "metadata": {
393 | "anaconda-cloud": {},
394 | "kernelspec": {
395 | "display_name": "Python 3",
396 | "language": "python",
397 | "name": "python3"
398 | },
399 | "language_info": {
400 | "codemirror_mode": {
401 | "name": "ipython",
402 | "version": 3
403 | },
404 | "file_extension": ".py",
405 | "mimetype": "text/x-python",
406 | "name": "python",
407 | "nbconvert_exporter": "python",
408 | "pygments_lexer": "ipython3",
409 | "version": "3.6.4"
410 | }
411 | },
412 | "nbformat": 4,
413 | "nbformat_minor": 1
414 | }
415 |
--------------------------------------------------------------------------------
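
The last two analysis cells lean on standard Markov-model relations: `vamp.get_its` maps Koopman eigenvalues to implied timescales, and the Chapman-Kolmogorov test compares the propagated model against estimates at longer lags. As a hedged sketch of the timescale relation only (the helper below is illustrative, not the library's implementation), each eigenvalue lambda_i of the Koopman operator estimated at lag tau corresponds to an implied timescale t_i = -tau / ln|lambda_i|:

```python
import numpy as np

def implied_timescales(koopman_op, tau):
    """Implied timescales t_i = -tau / ln|lambda_i| from the eigenvalues of K(tau)."""
    eigvals = np.linalg.eigvals(koopman_op)
    eigvals = np.sort(np.abs(np.real(eigvals)))[::-1]
    # drop the stationary eigenvalue (~1) and guard the logarithm
    nontrivial = np.clip(eigvals[1:], 1e-12, 1.0 - 1e-12)
    return -tau / np.log(nontrivial)
```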
/vampnet/vampnet/data_generator.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | """sample generator for the MCMM project's clustering stage"""
19 |
20 | import numpy as np
21 |
22 | ################################################################################
23 | # #
24 | #   Brownian dynamics integrator                                               #
25 | # #
26 | ################################################################################
27 |
28 | class BrownianDynamics(object):
29 | r"""base class for Brownian dynamics integration"""
30 | def __init__(self, dim, dt, kT, mass, damping):
31 | self.dim = dim
32 | self.dt = dt
33 | self.kT = kT
34 | self.mass = mass
35 |         self.damping = damping
36 | self.coeff_A = dt / (mass * damping)
37 | self.coeff_B = np.sqrt(2.0 * dt * kT / (mass * damping))
38 | def gradient(self, x):
39 |         r"""gradient of the yet unknown potential"""
40 | raise NotImplementedError("implement in child class")
41 | def step(self, x):
42 | r"""perform a single Brownian dynamics step"""
43 | return x - self.coeff_A * self.gradient(x) \
44 | + self.coeff_B * np.random.normal(size=self.dim)
45 |
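# The step() method above is the overdamped Langevin (Euler-Maruyama) update with
# the coefficients defined in __init__:
#     x_{t+1} = x_t - dt / (mass * damping) * grad U(x_t)
#               + sqrt(2 * dt * kT / (mass * damping)) * xi_t,   xi_t ~ N(0, I)
# so kT sets the noise amplitude and dt / (mass * damping) sets the drift step.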
46 |
47 | ################################################################################
48 | # #
49 | # defining test potentials #
50 | # #
51 | ################################################################################
52 |
53 | def asymmetric_double_well_energy(x):
54 | r"""computes the potential energy at point x"""
55 | _x = x - 2.0
56 | return 2.0 * _x - 6.0 * _x**2 + _x**4
57 |
58 | def asymmetric_double_well_gradient(x):
59 | r"""computes the potential's gradient at point x"""
60 | return 4.0 * x**3 - 24.0 * x**2 + 36.0 * x - 6.0
61 |
62 | def prinz_energy(x):
63 | return 4*(x**8 + 0.8 * np.exp(-80*x**2) + 0.2*np.exp(-80*(x-0.5)**2) + 0.5*np.exp(-40.*(x+0.5)**2))
64 |
65 | def prinz_gradient(x):
66 | return 4*(8*x**7 - 128. * np.exp(-80*x**2)*x - 32.*np.exp(-80*(x-0.5)**2) *(x-0.5) - 40*np.exp(-40.*(x+0.5)**2) *(x+0.5))
67 |
68 | def folding_model_energy(rvec, rcut):
69 | r"""computes the potential energy at point rvec"""
70 | r = np.linalg.norm(rvec) - rcut
71 | rr = r**2
72 | if r < 0.0:
73 | return -2.5 * rr
74 | return 0.5 * (r - 2.0) * rr
75 |
76 | def folding_model_gradient(rvec, rcut):
77 | r"""computes the potential's gradient at point rvec"""
78 | rnorm = np.linalg.norm(rvec)
79 | if rnorm == 0.0:
80 | return np.zeros(rvec.shape)
81 | r = rnorm - rcut
82 | if r < 0.0:
83 | return -5.0 * r * rvec / rnorm
84 | return (1.5 * r - 2.0) * rvec / rnorm
85 |
86 |
87 | ################################################################################
88 | # #
89 | # defining wrapper classes #
90 | # #
91 | ################################################################################
92 |
93 | class AsymmetricDoubleWell(BrownianDynamics):
94 | r"""encapsulates the asymmetric double well potential"""
95 | def __init__(self, dt, kT, mass=1.0, damping=1.0):
96 | super(AsymmetricDoubleWell, self).__init__(1, dt, kT, mass, damping)
97 | def gradient(self, x):
98 | return asymmetric_double_well_gradient(x)
99 | def sample(self, x0, nsteps, nskip=1):
100 | r"""generate nsteps sample points"""
101 | x = np.zeros(shape=(nsteps+1,))
102 | x[0] = x0
103 | for t in range(nsteps):
104 | q = x[t]
105 | for s in range(nskip):
106 | q = self.step(q)
107 | x[t+1] = q
108 | return x
109 |
110 | class FoldingModel(BrownianDynamics):
111 | r"""encapsulates the folding model potential"""
112 | def __init__(self, dt, kT, mass=1.0, damping=1.0, rcut=3.0):
113 | super(FoldingModel, self).__init__(5, dt, kT, mass, damping)
114 | self.rcut = rcut
115 | def gradient(self, x):
116 | return folding_model_gradient(x, self.rcut)
117 | def sample(self, rvec0, nsteps, nskip=1):
118 | r"""generate nsteps sample points"""
119 | rvec = np.zeros(shape=(nsteps+1, self.dim))
120 | rvec[0, :] = rvec0[:]
121 | for t in range(nsteps):
122 | q = rvec[t, :]
123 | for s in range(nskip):
124 | q = self.step(q)
125 | rvec[t+1, :] = q[:]
126 | return rvec
127 |
128 | class PrinzModel(BrownianDynamics):
129 | r"""encapsulates the Prinz potential"""
130 | def __init__(self, dt, kT, mass=1.0, damping=1.0):
131 | super(PrinzModel, self).__init__(1, dt, kT, mass, damping)
132 | def gradient(self, x):
133 | return prinz_gradient(x)
134 | def sample(self, x0, nsteps, nskip=1):
135 | r"""generate nsteps sample points"""
136 | x = np.zeros(shape=(nsteps+1,))
137 | x[0] = x0
138 | for t in range(nsteps):
139 | q = x[t]
140 | for s in range(nskip):
141 | q = self.step(q)
142 | x[t+1] = q
143 | return x
144 |
145 |
146 | ################################################################################
147 | # #
148 | # main area #
149 | # #
150 | ################################################################################
151 |
152 | def get_asymmetric_double_well_data(nstep, x0 = 0., nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0):
153 | r"""wrapper for the asymmetric double well generator"""
154 | adw = AsymmetricDoubleWell(dt, kT, mass=mass, damping=damping)
155 | return adw.sample(x0, nstep, nskip=nskip)
156 |
157 | def get_folding_model_data(
158 | nstep, rvec0 = np.zeros((5)), nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0, rcut=3.0):
159 | r"""wrapper for the folding model generator"""
160 | fm = FoldingModel(dt, kT, mass=mass, damping=damping, rcut=rcut)
161 | return fm.sample(rvec0, nstep, nskip=nskip)
162 |
163 | def get_prinz_pot(nstep, x0 = 0., nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0):
164 | r"""wrapper for the Prinz model generator"""
165 | pw = PrinzModel(dt, kT, mass=mass, damping=damping)
166 | return pw.sample(x0, nstep, nskip=nskip)
167 |
168 | def get_alanine_data(input_type = 'coordinates', return_dihedrals = True):
169 |
170 | import mdshare
171 |
172 | retval = []
173 |
174 | if input_type == 'distances':
175 |
176 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-distances.npz')
177 |
178 | traj_whole = np.load(local_filename)['arr_0']
179 |
180 | elif input_type == 'coordinates':
181 |
182 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')
183 |
184 | traj_whole = np.load(local_filename)['arr_0']
185 |
186 | retval.append(traj_whole)
187 |
188 | if return_dihedrals:
189 | dihedral = np.load(mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz'))['arr_0']
190 | retval.append(dihedral)
191 |
192 |
193 | return retval
194 |
195 |
196 | def build_generator_on_source(data_source, batch_size, lag, output_size):
197 | '''Function used to create a generator that will fetch data from a data source through an iterator.
198 |     This can be passed as a parameter to keras' fit_generator method.
199 |
200 | Parameters
201 | ----------
202 | data_source: pyemma source object.
203 | Data files source. This has to be initialized with chunksize = batch_size
204 |
205 | batch_size: int
206 | Batch size to be used for the training
207 |
208 | lag: int
209 | time frames lag to be used in the training of the VAMPnets
210 |
211 | output_size: int
212 | How many output nodes the network has
213 | '''
214 |
215 | counter_batches = 0
216 |
217 |
218 | # How many batches before the iterator has to be reinitialized
219 | steps_epoch = np.sum(np.ceil((data_source.trajectory_lengths()-lag)/batch_size))
220 |
221 | data_iterator = data_source.iterator(chunk = batch_size,
222 | lag = lag,
223 | return_trajindex=False)
224 |
225 | while True:
226 |
227 |         input_data = list(next(data_iterator))
228 |
229 |         # Create empty labels to accommodate keras' interface requirements
230 | labels = np.empty((input_data[0].shape[0],2*output_size)).astype('float32')
231 | data = input_data, labels
232 | counter_batches += 1
233 |
234 | if counter_batches == steps_epoch:
235 | data_iterator = data_source.iterator(chunk = batch_size,
236 | lag = lag,
237 | return_trajindex=False)
238 | counter_batches = 0
239 |
240 | yield data
241 |
242 |
243 |
244 | def build_generator_on_source_shuffle(data_source, batch_size, lag, output_size, preloaded_batches = 1):
245 | '''Function used to create a generator that will randomly access data and fetch them from a data
246 | source through an iterator. This can be passed as parameter to a keras fit_generator method.
247 |
248 | Parameters
249 | ----------
250 | data_source: pyemma source object.
251 | Data files source. This has to be initialized with chunksize = batch_size
252 |
253 | batch_size: int
254 | Batch size to be used for the training
255 |
256 | lag: int
257 | time frames lag to be used in the training of the VAMPnets
258 |
259 | output_size: int
260 | How many output nodes the network has
261 |
262 | preloaded_batches: int
263 | How many batches of data should be loaded at once; higher values will improve
264 | execution speed but also memory consumption
265 | '''
266 |
267 | counter_batches = 0
268 |
269 |
270 | # How many batches before the iterator has to be reinitialized
271 | steps_epoch = np.ceil(np.sum((data_source.trajectory_lengths()-lag)/ (batch_size* preloaded_batches)))
272 | input_size = data_source.dimension()
273 |
274 |
275 | traj_lengths = data_source.trajectory_lengths()
276 | remaining_frames = np.concatenate([[index_traj*np.ones((traj_len - lag)), np.arange(traj_len - lag)] for index_traj, traj_len in enumerate(traj_lengths)], axis = 1).T.astype('int')
277 | indexes = np.arange(remaining_frames.shape[0])
278 | np.random.shuffle(indexes)
279 |
280 | while True:
281 |
282 | start = counter_batches * batch_size * preloaded_batches
283 | end = min(start + batch_size * preloaded_batches, remaining_frames.shape[0])
284 |
285 | frames = remaining_frames[indexes[start:end]]
286 |
287 | fake_ind = frames[:,0]*(traj_lengths.sum()) + frames[:,1]
288 | arg_sort = np.argsort(fake_ind)
289 | sort_arg_sort = np.argsort(arg_sort)
290 |
291 | frames_tau = frames + np.array([np.zeros((frames.shape[0])), np.ones((frames.shape[0]))*lag], dtype = 'int').T
292 |
293 |
294 | data_iterator_t = data_source.iterator(stride=frames[arg_sort],
295 | return_trajindex=False)
296 | data_iterator_tau = data_source.iterator(stride=frames_tau[arg_sort],
297 | return_trajindex=False)
298 |
299 | data = np.empty((2, batch_size * preloaded_batches, input_size))
300 | start_iter = 0
301 | for iter_data, iter_data_tau in zip(data_iterator_t, data_iterator_tau):
302 | temp_frames = iter_data.shape[0]
303 | end_iter = start_iter + temp_frames
304 | data[0, start_iter:end_iter] = iter_data
305 | data[1, start_iter:end_iter] = iter_data_tau
306 | start_iter = end_iter
307 |
308 |
309 | data = data[:, sort_arg_sort]
310 |
311 | index_preloaded = 0
312 | labels = np.empty((batch_size,2*output_size)).astype('float32')
313 |
314 | while index_preloaded < preloaded_batches:
315 |
316 | start_batch = index_preloaded * batch_size
317 | end_batch = start_batch + batch_size
318 | index_preloaded += 1
319 |
320 | if end_batch > data.shape[1]:
321 | end_batch = data.shape[1]
322 | index_preloaded = preloaded_batches
323 | labels = np.empty((end_batch - start_batch,2*output_size)).astype('float32')
324 |
325 | output_data = [data[0, start_batch:end_batch], data[1, start_batch:end_batch]], labels
326 |
327 | yield output_data
328 |
329 |
330 | counter_batches += 1
331 |
332 | if counter_batches == steps_epoch:
333 |
334 | counter_batches = 0
335 | indexes = np.arange(remaining_frames.shape[0])
336 | np.random.shuffle(indexes)
--------------------------------------------------------------------------------
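
The wrapper functions at the end of `data_generator.py` are what the example notebooks call to produce toy trajectories. A minimal usage sketch, assuming the `vampnet` package from this repository is importable (mdshare is only needed for `get_alanine_data`):

```python
import numpy as np
from vampnet import data_generator

# 1D asymmetric double well, shape (nstep + 1,)
x = data_generator.get_asymmetric_double_well_data(10000, x0=0.0, kT=10.0)

# 5D folding model, shape (nstep + 1, 5); the relevant coordinate is the radius
rvec = data_generator.get_folding_model_data(10000, rvec0=np.zeros(5), kT=1.0, dt=0.1)
r = np.linalg.norm(rvec, axis=-1)

# 1D Prinz potential, shape (nstep + 1,)
p = data_generator.get_prinz_pot(10000, x0=0.0, kT=1.0)

print(x.shape, rvec.shape, p.shape, r.min(), r.max())
```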
/time-lagged-autoencoder/tae/utils.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | '''
19 | Tools to handle datasets, transformations, and statistics.
20 | '''
21 |
22 | import numpy as _np
23 | import torch as _torch
24 | from torch import nn as _nn
25 | from torch.utils.data import Dataset as _Dataset
26 | from torch.utils.data import TensorDataset as _TensorDataset
27 | from torch.utils.data import ConcatDataset as _ConcatDataset
28 | from torch.utils.data import DataLoader as _DataLoader
29 |
30 | __all__ = [
31 | 'LaggedDataset',
32 | 'MaskedDataset',
33 | 'create_dataset',
34 | 'stride_split',
35 | 'random_split',
36 | 'random_block_split',
37 | 'get_mean',
38 | 'get_covariance',
39 | 'get_sqrt_inverse',
40 | 'whiten_data',
41 | 'cca',
42 | 'Transform']
43 |
44 | ################################################################################
45 | #
46 | # DATASETS
47 | #
48 | ################################################################################
49 |
50 | class LaggedDataset(_Dataset):
51 | '''Dataset for wrapping time-lagged data from a single stored time series.
52 |
53 | Each sample will contain the data_tensor at index t and the (not explicitly
54 | stored) target_tensor via data_tensor at index t+lag. We need this for
55 | training the time-lagged autoencoder and TICA.
56 |
57 | Arguments:
58 | data_tensor (Tensor): contains time series data
59 | lag (int): specifies the lag in time steps
60 | '''
61 | def __init__(self, data_tensor, lag=1):
62 | assert data_tensor.size(0) > lag, 'you need more samples than lag'
63 | assert lag >= 0, 'you need a non-negative lagtime'
64 | self.data_tensor = data_tensor
65 | self.lag = lag
66 | def __getitem__(self, index):
67 | return self.data_tensor[index], self.data_tensor[index + self.lag]
68 | def __len__(self):
69 | return self.data_tensor.size(0) - self.lag
70 |
71 | class MaskedDataset(_Dataset):
72 | '''Dataset for wrapping a specified subset of another dataset.
73 |
74 | This helps to separate a dataset into two or more subsets, e.g., for
75 | training and testing.
76 |
77 | Arguments:
78 |         dataset (Dataset): the dataset to wrap
79 | active (sequence of int): indices of the active elements
80 | '''
81 | def __init__(self, dataset, active):
82 | assert len(dataset) >= len(active), \
83 | 'you cannot have less total samples than active'
84 | assert _np.all(0 <= active) and _np.all(active < len(dataset)), \
85 | 'you must use only valid indices'
86 | assert len(active) == len(_np.unique(active)), \
87 | 'you must use every active index only once'
88 | self.dataset = dataset
89 | self.active = active
90 | def __getitem__(self, index):
91 | return self.dataset[self.active[index]]
92 | def __len__(self):
93 | return len(self.active)
94 |
95 | def ensure_traj_format(data, dtype=_np.float32):
96 | data = _np.asarray(data, dtype=dtype)
97 | if data.ndim == 2:
98 | return data
99 | elif data.ndim == 1:
100 | return data.reshape(-1, 1)
101 | else:
102 |         raise ValueError('data has incompatible ndim: ' + str(data.ndim))
103 |
104 | def create_dataset(data, lag=0, dtype=_np.float32):
105 | '''Create a (time-lagged) dataset from one or more numpy.ndarrays.
106 |
107 | Arguments:
108 | data (numpy.ndarray of list thereof): data to create the dataset from
109 | lag (int): specifies the lag in time steps
110 | dtype (numpy.dtype): dtype of the resulting dataset
111 | '''
112 | if isinstance(data, _np.ndarray):
113 | return LaggedDataset(
114 | _torch.from_numpy(ensure_traj_format(data, dtype=dtype)),
115 | lag=lag)
116 | elif isinstance(data, (list, tuple)):
117 | return _ConcatDataset([LaggedDataset(
118 | _torch.from_numpy(ensure_traj_format(d, dtype=dtype)),
119 | lag=lag) for d in data])
120 | else:
121 | raise ValueError(
122 | 'use a single or a list of numpy.ndarrays of dim 1 or 2')
123 |
124 | def stride_split(dataset, stride=2, offset=0):
125 | '''Split one dataset into two parts based on a stride.
126 |
127 | This helps to separate a dataset into two or more subsets, e.g., for
128 |     training and testing. Every stride-th element starting from offset goes
129 |     into the first MaskedDataset, everything else into the second.
130 |
131 | Arguments:
132 | dataset (Dataset): contains the data you want to split
133 | stride (int): specify the size of the stride
134 | offset (int): specify where to start counting
135 | '''
136 | assert 0 < stride < len(dataset), \
137 | 'use a positive stride smaller than the length of the dataset'
138 | assert 0 <= offset < stride, \
139 | 'use a non-negative offset smaller than the stride'
140 | active = _np.arange(offset, len(dataset), stride)
141 | complement = _np.setdiff1d(
142 | _np.arange(len(dataset)), active, assume_unique=True)
143 | return MaskedDataset(dataset, active), MaskedDataset(dataset, complement)
144 |
145 | def random_split(dataset, active=None, n_active=None, f_active=None):
146 | '''Split one dataset into two parts based on a random selection.
147 |
148 | This helps to separate a dataset into two or more subsets, e.g., for
149 | training and testing. Specify the active set either by giving the frame
150 | indices, the number of active frames or the fraction of active frames.
151 |
152 | Arguments:
153 | dataset (Dataset): contains the data you want to split
154 | active (iterable of int): specify the active frames
155 | n_active (int): number of active frames
156 | f_active (float): fraction of active frames
157 | '''
158 | if active is None:
159 | if n_active is None:
160 | if f_active is None:
161 | raise ValueError(
162 | 'specify either active, n_active or f_active')
163 | else:
164 | assert 0 < f_active < 1, \
165 | 'f_active must be 0 < f_active < 1'
166 | n_active = int(_np.floor(0.5 + f_active * len(dataset)))
167 | else:
168 | assert 0 < n_active < len(dataset), \
169 | 'n_active must be 0 < n_active < len(dataset)'
170 | if f_active is not None:
171 | raise ValueError(
172 | 'do not specify f_active if n_active is given')
173 | active = _np.random.choice(len(dataset), size=n_active, replace=False)
174 | else:
175 | active = _np.asarray(active)
176 | assert len(active) == len(_np.unique(active)), \
177 | 'you must use every active index only once'
178 | assert _np.all(0 <= active) and _np.all(active < len(dataset)), \
179 | 'you must use only valid indices'
180 | if f_active is not None:
181 | raise ValueError(
182 | 'do not specify f_active if active is given')
183 | if n_active is not None:
184 | raise ValueError(
185 | 'do not specify n_active if active is given')
186 | complement = _np.setdiff1d(
187 | _np.arange(len(dataset)), active, assume_unique=True)
188 | return MaskedDataset(dataset, active), MaskedDataset(dataset, complement)
189 |
190 | def random_block_split(dataset, lag, f_active=0.5):
191 | '''Split one dataset into two parts based on a random selection of blocks.
192 |
193 | This helps to separate a dataset into two or more subsets, e.g., for
194 |     training and testing. Specify the active set by giving the fraction of
195 |     active blocks (the total number of transitions is conserved).
196 |
197 | Arguments:
198 | dataset (Dataset): contains the data you want to split
199 | lag (int): specifies the lag in time steps
200 | f_active (float): fraction of active blocks
201 | '''
202 | active = []
203 | n = 0
204 | nmax = len(dataset)
205 | n_blocks = int(_np.ceil(float(nmax) / float(lag)))
206 | n_active_blocks = int(_np.floor(0.5 + f_active * n_blocks))
207 | active_blocks = _np.random.choice(
208 | n_blocks, size=n_active_blocks, replace=False)
209 | for n in active_blocks:
210 | active += _np.arange(n * lag, min((n + 1) * lag, nmax)).tolist()
211 | return random_split(dataset, active=active)
212 |
213 | ################################################################################
214 | #
215 | # STATISTICS
216 | #
217 | ################################################################################
218 |
219 | def get_mean(loader):
220 | '''Compute the mean value via minibatch summation using a loader.
221 |
222 | Arguments:
223 | loader (DataLoader): contains the data you want to analyze
224 | '''
225 | x_mean, y_mean = None, None
226 | for x, y in loader:
227 | try:
228 | x_mean.add_(x.sum(dim=0))
229 | except AttributeError:
230 | x_mean = x.sum(dim=0)
231 | try:
232 | y_mean.add_(y.sum(dim=0))
233 | except AttributeError:
234 | y_mean = y.sum(dim=0)
235 | x_mean.div_(float(len(loader.dataset)))
236 | y_mean.div_(float(len(loader.dataset)))
237 | return x_mean, y_mean
238 |
239 | def get_covariance(loader, x_mean, y_mean):
240 | '''Compute the instantaneous and time-lagged covariance matrices via
241 | minibatch summation using a loader.
242 |
243 | Arguments:
244 | loader (DataLoader): contains the data you want to analyze
245 | x_mean (Tensor): mean value for the data_tensor
246 | y_mean (Tensor): mean value for the target_tensor
247 | '''
248 | cxx = _torch.zeros(len(x_mean), len(x_mean))
249 | cxy = _torch.zeros(len(x_mean), len(y_mean))
250 | cyy = _torch.zeros(len(y_mean), len(y_mean))
251 | for x, y in loader:
252 | x.sub_(x_mean[None, :])
253 | y.sub_(y_mean[None, :])
254 | cxx.add_(_torch.mm(x.t(), x))
255 | cxy.add_(_torch.mm(x.t(), y))
256 | cyy.add_(_torch.mm(y.t(), y))
257 | cxx.div_(float(len(loader.dataset)))
258 | cxy.div_(float(len(loader.dataset)))
259 | cyy.div_(float(len(loader.dataset)))
260 | return cxx, cxy, cyy
261 |
262 | ################################################################################
263 | #
264 | # WHITENING
265 | #
266 | ################################################################################
267 |
268 | def get_sqrt_inverse(matrix, bias=1.0e-5):
269 | '''Compute the sqrt-inverse of the supplied symmetric/real matrix.
270 |
271 | We need this step for whitening and TICA.
272 |
273 | Arguments:
274 | matrix (Tensor): contains the matrix you want to transform
275 | bias (float): assures numerical stability
276 | '''
277 | e, v = _torch.symeig(matrix, eigenvectors=True)
278 | d = _torch.diag(1.0 / _torch.sqrt(_torch.abs(e) + bias))
279 | return _torch.mm(_torch.mm(v, d), v.t())
280 |
281 | def whiten_data(data_tensor, batch_size=100):
282 | '''Whiten a Tensor in the PCA basis.
283 |
284 | Arguments:
285 | data_tensor (Tensor): contains the data you want to whiten
286 | batch_size (int): specify a batch size for the whitening process
287 | '''
288 | loader = _DataLoader(
289 | LaggedDataset(data_tensor, lag=0), batch_size=batch_size)
290 | x_mean, y_mean = get_mean(loader)
291 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean)
292 | ixx = get_sqrt_inverse(cxx)
293 | whitened_data = []
294 | for x, _ in loader:
295 | x.sub_(x_mean[None, :])
296 | whitened_data.append(x.mm(ixx))
297 | return _torch.cat(whitened_data)
298 |
299 | ################################################################################
300 | #
301 | # CCA
302 | #
303 | ################################################################################
304 |
305 | def cca(data_tensor_x, data_tensor_y, batch_size=100):
306 | '''Perform canonical correlation analysis for two data tensors.
307 |
308 | Arguments:
309 | data_tensor_x (Tensor): contains the first data tensor
310 | data_tensor_y (Tensor): contains the second data tensor
311 | batch_size (int): specify a batch size for the CCA calculation
312 | '''
313 | loader = _DataLoader(
314 | _TensorDataset(data_tensor_x, data_tensor_y),
315 | batch_size=batch_size)
316 | x_mean, y_mean = get_mean(loader)
317 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean)
318 | ixx = get_sqrt_inverse(cxx)
319 | iyy = get_sqrt_inverse(cyy)
320 | return _torch.svd(_torch.mm(_torch.mm(ixx, cxy), iyy))
321 |
322 | ################################################################################
323 | #
324 | # TRANSFORMER
325 | #
326 | ################################################################################
327 |
328 | class BaseTransform(object):
329 | def __init__(self, mean=None, covariance=None):
330 | if mean is not None:
331 | self.sub = mean
332 | if covariance is not None:
333 | self.mul = get_sqrt_inverse(covariance)
334 | def __call__(self, x):
335 | try:
336 | x.sub_(self.sub[None, :])
337 | except AttributeError:
338 | pass
339 | try:
340 | x = x.mm(self.mul)
341 | except AttributeError:
342 | pass
343 | return x
344 |
345 | class Transform(object):
346 | '''Apply whitening/centering transformations within a minibatch.
347 |
348 | As we do not want to preprocess and, thus, duplicate large datasets,
349 | we do the necessary whitening and centering operations on the fly while
350 | iterating over the datasets.
351 |
352 | Arguments:
353 | x_mean (Tensor): contains the mean of the data tensor
354 | x_covariance (Tensor): contains the covariance of the data tensor
355 | y_mean (Tensor): contains the mean of the target tensor
356 | y_covariance (Tensor): contains the covariance of the target tensor
357 | '''
358 | def __init__(
359 | self, x_mean=None, x_covariance=None, y_mean=None, y_covariance=None):
360 | self.x = BaseTransform(mean=x_mean, covariance=x_covariance)
361 | self.y = BaseTransform(mean=y_mean, covariance=y_covariance)
362 | def __call__(self, x, y):
363 | return self.x(x), self.y(y)
364 |
--------------------------------------------------------------------------------
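
Taken together, the whitening and CCA helpers above implement a small linear-algebra pipeline: center the data, multiply by C_xx^(-1/2) (the symmetric eigendecomposition in `get_sqrt_inverse`), and, for CCA, take the SVD of C_xx^(-1/2) C_xy C_yy^(-1/2). A minimal sketch, assuming the `tae` package and PyTorch are installed:

```python
import torch
from tae.utils import whiten_data, cca

# correlated, shifted toy data with very different scales per dimension
data = torch.randn(2000, 4) * torch.tensor([3.0, 1.0, 0.5, 0.1]) + 2.0

# whitening: center, then multiply by C_xx^(-1/2)
white = whiten_data(data, batch_size=100)
cov = white.t().mm(white) / float(white.size(0))
print(torch.allclose(cov, torch.eye(4), atol=0.05))  # covariance is ~identity

# CCA: the singular values of C_xx^(-1/2) C_xy C_yy^(-1/2) are the canonical correlations
noisy_copy = data + 0.01 * torch.randn(2000, 4)
u, s, v = cca(data, noisy_copy, batch_size=100)
print(s)  # all four values close to 1 for a nearly identical copy
```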
/vampnet/examples/Folding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Import all the packages used"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import matplotlib.pyplot as plt\n",
18 | "%matplotlib inline\n",
19 | "import vampnet\n",
20 | "from vampnet import data_generator\n",
21 | "from keras.models import Model\n",
22 | "from keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n",
23 | "from keras import optimizers\n",
24 | "import tensorflow as tf\n",
25 | "from keras.backend import clear_session"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# generate 10^6 frames and energy values\n",
35 | "datapoints = int(1e6)\n",
36 | "stride = 10"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "x = data_generator.get_folding_model_data(datapoints, rvec0=2.0 * (np.random.rand(5) - 0.5), kT=1., dt = 0.1)\n",
46 | "r = np.linalg.norm(x, axis=-1)[::stride]"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "pot = np.zeros_like(r)\n",
56 | "for i in range(r.shape[0]):\n",
57 | " pot[i] = data_generator.folding_model_energy(r[i], 3)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "plt.plot(r[::stride], pot[::stride], '.')\n",
67 | "plt.show()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "traj_whole = x\n",
77 | "traj_data_points, input_size = traj_whole.shape"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "# All Hyperparameters\n",
87 | "\n",
88 | "# Tau: the time shift (in frames) between the two datasets\n",
89 | "tau = 10\n",
90 | "\n",
91 | "# Batch size for Stochastic Gradient descent\n",
92 | "batch_size = 2048\n",
93 | "\n",
94 | "# Fraction of the trajectory points used for training\n",
95 | "train_ratio = 0.9\n",
96 | "\n",
97 | "# How many hidden layers the network has\n",
98 | "network_depth = 4\n",
99 | "\n",
100 | "# Width of every layer\n",
101 | "layer_width = 20\n",
102 | "nodes = [layer_width]*network_depth\n",
103 | "# Learning rate used for the ADAM optimizer\n",
104 | "learning_rate = 0.0001\n",
105 | "\n",
106 | "# How many output states the network has\n",
107 | "output_size = 2\n",
108 | "\n",
109 | "# Iteration over the training set in the fitting process\n",
110 | "nb_epoch = 20\n",
111 | "\n",
112 | "plot_stride = 200"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "epsilon = 1e-5\n",
122 | "vamp = vampnet.VampnetTools(epsilon = epsilon)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# Shuffle trajectory and lagged trajectory together\n",
132 | "length_data = traj_data_points - tau\n",
133 | "\n",
134 | "traj_ord= traj_whole[:length_data]\n",
135 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n",
136 | "\n",
137 | "indexes = np.arange(length_data)\n",
138 | "np.random.shuffle(indexes)\n",
139 | "\n",
140 | "\n",
141 | "\n",
142 | "traj = traj_ord[indexes]\n",
143 | "traj_lag = traj_ord_lag[indexes]\n"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "# Prepare data for tensorflow usage\n",
153 | "length_train = int(np.floor(length_data * train_ratio))\n",
154 | "length_vali = length_data - length_train\n",
155 | "\n",
156 | "traj_data_train = traj[:length_train]\n",
157 | "traj_data_train_lag = traj_lag[:length_train]\n",
158 | "\n",
159 | "traj_data_valid = traj[length_train:]\n",
160 | "traj_data_valid_lag = traj_lag[length_train:]\n",
161 | "\n",
162 | "#Data used for states ordering\n",
163 | "X1 = traj_ord[:length_data].astype('float32')\n",
164 | "X2 = traj_ord_lag[:length_data].astype('float32')\n",
165 | "\n",
166 | "# Input of the first network\n",
167 | "X1_train = traj_data_train.astype('float32')\n",
168 | "X2_train = traj_data_train_lag.astype('float32')\n",
169 | "\n",
170 | "# Input for validation\n",
171 | "X1_vali = traj_data_valid.astype('float32')\n",
172 | "X2_vali = traj_data_valid_lag.astype('float32')\n",
173 | "\n",
174 | "# Keras expects label arrays, which VAMPnets do not use, so pass zero placeholders.\n",
175 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n",
176 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "if 'model' in globals():\n",
186 | " del model\n",
187 | " clear_session()\n",
188 | "\n",
189 | " \n",
190 | "# Build the model\n",
191 | "Data_X = Input(shape = (input_size,))\n",
192 | "Data_Y = Input(shape = (input_size,))\n",
193 | "\n",
194 | "# A batch normalization layer improves convergence speed\n",
195 | "# bn_layer = BatchNormalization()\n",
196 | "bn_layer = Activation('linear')\n",
197 | "\n",
198 | "# Instantiate layers and assign them to the two lobes of the network\n",
199 | "dense_layers = [Dense(node, activation = 'relu',)\n",
200 | " for node in nodes]\n",
201 | "\n",
202 | "lx_branch = bn_layer(Data_X)\n",
203 | "rx_branch = bn_layer(Data_Y)\n",
204 | "\n",
205 | "for i, layer in enumerate(dense_layers):\n",
206 | "\n",
207 | " lx_branch = dense_layers[i](lx_branch)\n",
208 | " rx_branch = dense_layers[i](rx_branch)\n",
209 | "\n",
210 | "\n",
211 | "# Add a softmax output layer.\n",
212 | "# Should be replaced with a linear activation layer if\n",
213 | "# the outputs of the network cannot be interpreted as states\n",
214 | "softmax = Dense(output_size, activation='softmax')\n",
215 | "\n",
216 | "lx_branch = softmax(lx_branch)\n",
217 | "rx_branch = softmax(rx_branch)\n",
218 | "\n",
219 | "# Merge both networks to train both at the same time\n",
220 | "merged = concatenate([lx_branch, rx_branch])\n",
221 | "\n",
222 | "# Initialize the model and the optimizer, and compile it with\n",
223 | "# the loss and metric functions from the VAMPnets package\n",
224 | "model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n",
225 | "# model.summary()\n",
226 | "# Compile it with our own loss-function\n",
227 | "adam = optimizers.adam(lr = learning_rate)\n",
228 | "\n",
229 | "\n",
230 | "# Pretraining with VAMP with 'symmetrized' matrices yields a bad approximation of the \n",
231 | "# eigenvectors per se, but improves the 'readability' of the states identified by VAMP-2\n",
232 | "# which would otherwise be difficult to interpret.\n",
233 | "\n",
234 | "\n",
235 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n",
236 | "# For older versions of TF, use the function vamp.loss_VAMP2\n",
237 | "\n",
238 | "losses = [\n",
239 | " vamp._loss_VAMP_sym,\n",
240 | " vamp.loss_VAMP2,\n",
241 | "]\n",
242 | "\n",
243 | "valid_metric = np.zeros((len(losses), nb_epoch))\n",
244 | "train_metric = np.zeros((len(losses), nb_epoch))\n",
245 | "\n",
246 | "for l_index, loss in enumerate(losses):\n",
247 | " \n",
248 | " model.compile(optimizer = 'adam', loss = loss, metrics = [vamp.metric_VAMP])\n",
249 | " \n",
250 | " hist = model.fit([X1_train, X2_train], Y_train ,batch_size=batch_size, epochs=nb_epoch, verbose=0,\n",
251 | " validation_data=([X1_vali, X2_vali], Y_vali))\n",
252 | " \n",
253 | " states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n",
254 | "\n",
255 | " # Order the output states based on their population\n",
256 | " coor_pred = np.argmax(states_prob, axis = 1)\n",
257 | " indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n",
258 | " states_num = [len(i[0]) for i in indexes]\n",
259 | " states_order = np.argsort(states_num).astype('int')[::-1]\n",
260 | "\n",
261 | " pred_ord = states_prob[:,states_order]\n",
262 | " \n",
263 | " X_Validation = np.linalg.norm(traj_ord, axis=1)\n",
264 | " for i in range(output_size):\n",
265 | " plt.plot(X_Validation[::plot_stride], pred_ord[::plot_stride,i], '.', label = 'state '+str(i))\n",
266 | " plt.legend()\n",
267 | " plt.title('States probabilites')\n",
268 | " plt.show()\n",
269 | " tau_msm = 20\n",
270 | " pred_ord_meanfree = pred_ord - pred_ord.mean(0)\n",
271 | " K_smt = vamp.estimate_koopman_op(pred_ord_meanfree, tau_msm)\n",
272 | "\n",
273 | " K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n",
274 | "\n",
275 | " index = np.argmax(np.real(K_eigvals))\n",
276 | " real_eigfunc = pred_ord_meanfree @ np.real(K_eigvec[:,index])\n",
277 | "\n",
278 | " plt.plot(X_Validation[::plot_stride], real_eigfunc[::plot_stride], '.')\n",
279 | " plt.title('Eigenvector')\n",
280 | " plt.show()\n",
281 | "\n",
282 | " valid_metric[l_index] = np.array(hist.history['val_metric_VAMP'])\n",
283 | " train_metric[l_index] = np.array(hist.history['metric_VAMP'])\n",
284 | "\n",
285 | "valid_metric = np.reshape(valid_metric, (-1))\n",
286 | "train_metric = np.reshape(train_metric, (-1))"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "# Training result visualization\n",
296 | "\n",
297 | "plt.plot(train_metric, label = 'Training')\n",
298 | "plt.legend()\n",
299 | "plt.plot(valid_metric, label = 'Validation')\n",
300 | "plt.legend()\n",
301 | "\n",
302 | "plt.show()"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "# Transform the input trajectory using the network\n",
312 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n",
313 | "\n",
314 | "# Order the output states based on their population\n",
315 | "coor_pred = np.argmax(states_prob, axis = 1)\n",
316 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n",
317 | "states_num = [len(i[0]) for i in indexes]\n",
318 | "states_order = np.argsort(states_num).astype('int')[::-1]\n",
319 | "\n",
320 | "pred_ord = states_prob[:,states_order]"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "X_Validation = np.linalg.norm(traj_ord, axis=1)\n",
330 | "for i in range(output_size):\n",
331 | " plt.plot(X_Validation[::plot_stride], pred_ord[::plot_stride,i], '.', label = 'state '+str(i))\n",
332 | " \n",
333 | "scaled_pot = (pot-pot.min())/(pot.max()-pot.min())\n",
334 | " \n",
335 | "plt.plot(r[::plot_stride], scaled_pot[::plot_stride], '.', label = 'Potential')\n",
336 | "plt.show()"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "tau_msm = 20\n",
346 | "pred_ord_meanfree = pred_ord - pred_ord.mean(0)\n",
347 | "K_smt = vamp.estimate_koopman_op(pred_ord_meanfree, tau_msm)\n",
348 | "\n",
349 | "K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n",
350 | "\n",
351 | "index = np.argmax(np.real(K_eigvals))\n",
352 | "real_eigfunc = pred_ord_meanfree @ np.real(K_eigvec[:,index])\n",
353 | "\n",
354 | "plt.plot(X_Validation[::plot_stride], real_eigfunc[::plot_stride], '.')\n",
355 | "plt.title('Eigenvector')\n",
356 | "plt.show()"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "# Visualize the population of the states"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "def print_states_pie_chart():\n",
373 | " coors = []\n",
374 | " maxi = np.max(pred_ord, axis= 1)\n",
375 | "\n",
376 | " for i in range(output_size):\n",
377 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n",
378 | " \n",
379 | " fig1, ax1 = plt.subplots()\n",
380 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n",
381 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
382 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n",
383 | " plt.show()\n",
384 | "\n",
385 | "print_states_pie_chart()"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "# Estimate the implied timescales"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {},
399 | "outputs": [],
400 | "source": [
401 | "max_tau = 200\n",
402 | "lag = np.arange(1, max_tau, 1)\n",
403 | "its = vamp.get_its(pred_ord, lag)\n",
404 | "vamp.plot_its(its, lag)"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "metadata": {},
410 | "source": [
411 | "# Chapman-Kolmogorov test for the estimated koopman operator"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": null,
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "steps = 24\n",
421 | "tau_msm = 50\n",
422 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n",
423 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": []
432 | }
433 | ],
434 | "metadata": {
435 | "anaconda-cloud": {},
436 | "kernelspec": {
437 | "display_name": "Python 3",
438 | "language": "python",
439 | "name": "python3"
440 | },
441 | "language_info": {
442 | "codemirror_mode": {
443 | "name": "ipython",
444 | "version": 3
445 | },
446 | "file_extension": ".py",
447 | "mimetype": "text/x-python",
448 | "name": "python",
449 | "nbconvert_exporter": "python",
450 | "pygments_lexer": "ipython3",
451 | "version": "3.6.4"
452 | }
453 | },
454 | "nbformat": 4,
455 | "nbformat_minor": 1
456 | }
457 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/benchmarks.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 |
18 | '''
19 | Automated benchmarks.
20 | '''
21 |
22 | import multiprocessing as mp
23 | import numpy as np
24 | import torch
25 | import tae
26 | import os
27 |
33 | try:
34 | import pyemma
35 | except ImportError:
36 | print('running benchmarks requires the pyemma package')
37 |
38 | try:
39 | from mdshare import load as _load
40 | except ImportError:
41 | print('running benchmarks requires the mdshare package')
42 |
43 | ################################################################################
44 | #
45 | # BENCHMARKING THE SQRT TOY MODEL
46 | #
47 | ################################################################################
48 |
49 | def evaluate_sqrt_model(
50 | length=10000,
51 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
52 | msm_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
53 | use_cuda=True):
54 | '''A wrapper to run the sqrt model benchmarks
55 |
56 | Arguments:
57 | length (int): length of the sampled trajectory
58 | trns_lags (list of int): lag times for the transformers
59 | msm_lags (list of int): lag times for the MSM validation
60 | use_cuda (boolean): use a GPU to run the benchmarks
61 | '''
62 | def analyse(lat_data, ref_data, msm_lags):
63 | cca = tae.utils.cca(torch.from_numpy(lat_data), ref_data)[1].numpy()
64 | centers = np.linspace(np.min(lat_data), np.max(lat_data), 101)
65 | centers = 0.5 * (centers[:-1] + centers[1:]).reshape(-1, 1)
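        # discretize the 1D latent space on 100 bin centers (midpoints of 101 equally spaced edges) for the MSM validation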
66 | dtraj = pyemma.coordinates.assign_to_centers(lat_data, centers)
67 | its = pyemma.msm.its(dtraj, lags=msm_lags, nits=1).timescales
68 | return cca, its
69 | data, dtraj = tae.toymodels.sample_sqrt_model(length)
70 | ref_data = tae.utils.whiten_data(
71 | torch.from_numpy(dtraj.reshape(-1, 1).astype(np.float32)))
72 | ref_its = pyemma.msm.its(dtraj, lags=msm_lags, nits=1).timescales
73 | lat, trn, val = tae.pca(
74 | data, dim=1, validation_split=0.5, batch_size=100, whiten=True)
75 | cca, its = analyse(lat, ref_data, msm_lags)
76 | result = dict(
77 | trns_lags=np.asarray(trns_lags),
78 | msm_lags=np.asarray(msm_lags),
79 | ref_its=np.asarray(ref_its),
80 | pca_its=np.asarray(its),
81 | pca_cca=np.asarray(cca),
82 | pca_trn=np.asarray(trn),
83 | pca_val=np.asarray(val))
84 | for lag in trns_lags:
85 | lat, trn, val = tae.tica(
86 | data, dim=1, lag=lag, kinetic_map=True, symmetrize=True,
87 | validation_split=0.5, batch_size=100, whiten=True)
88 | cca, its = analyse(lat, ref_data, msm_lags)
89 | result.update({
90 | 'tica_%d_its' % lag: np.asarray(its),
91 | 'tica_%d_cca' % lag: np.asarray(cca),
92 | 'tica_%d_trn' % lag: np.asarray(trn),
93 | 'tica_%d_val' % lag: np.asarray(val)})
94 | lat, trn, val = tae.ae(
95 | data, dim=1, lag=lag, n_epochs=200, validation_split=0.5,
96 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100],
97 | cuda=use_cuda, non_blocking=use_cuda)
98 | cca, its = analyse(lat, ref_data, msm_lags)
99 | result.update({
100 | 'ae_%d_its' % lag: np.asarray(its),
101 | 'ae_%d_cca' % lag: np.asarray(cca),
102 | 'ae_%d_trn' % lag: np.asarray(trn),
103 | 'ae_%d_val' % lag: np.asarray(val)})
104 | return result
105 |
106 | ################################################################################
107 | #
108 | # BENCHMARKING THE SWISSROLL TOY MODEL
109 | #
110 | ################################################################################
111 |
112 | def evaluate_swissroll_model(
113 | dim=None,
114 | length=30000,
115 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
116 | msm_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
117 | use_cuda=True):
118 | '''A wrapper to run the swissroll model benchmarks
119 |
120 | Arguments:
121 | dim (int): specify the latent dimension (1 or 2)
122 | length (int): length of the sampled trajectory
123 | trns_lags (list of int): lag times for the transformers
124 | msm_lags (list of int): lag times for the MSM validation
125 | use_cuda (boolean): use a GPU to run the benchmarks
126 | '''
127 | def analyse(lat_data, ref_data, msm_lags):
128 | cca = tae.utils.cca(torch.from_numpy(lat_data), ref_data)[1].numpy()
129 | if lat_data.shape[1] == 1:
130 | centers = np.linspace(np.min(lat_data), np.max(lat_data), 101)
131 | centers = 0.5 * (centers[:-1] + centers[1:]).reshape(-1, 1)
132 | dtraj = pyemma.coordinates.assign_to_centers(lat_data, centers)
133 | else:
134 | dtraj = pyemma.coordinates.cluster_regspace(
135 | lat_data, dmin=0.2, max_centers=400).dtrajs
136 | its = pyemma.msm.its(dtraj, lags=msm_lags, nits=3).timescales
137 | return cca, its
138 | data, dtraj = tae.toymodels.sample_swissroll_model(length)
139 | ref_data = tae.utils.whiten_data(
140 | torch.from_numpy(dtraj.reshape(-1, 1).astype(np.float32)))
141 | ref_its = pyemma.msm.its(dtraj, lags=msm_lags, nits=3).timescales
142 | lat, trn, val = tae.pca(
143 | data, dim=dim, validation_split=0.5, batch_size=100, whiten=True)
144 | cca, its = analyse(lat, ref_data, msm_lags)
145 | result = dict(
146 | trns_lags=np.asarray(trns_lags),
147 | msm_lags=np.asarray(msm_lags),
148 | ref_its=np.asarray(ref_its),
149 | pca_its=np.asarray(its),
150 | pca_cca=np.asarray(cca),
151 | pca_trn=np.asarray(trn),
152 | pca_val=np.asarray(val))
153 | for lag in trns_lags:
154 | lat, trn, val = tae.tica(
155 | data, dim=dim, lag=lag, kinetic_map=True, symmetrize=True,
156 | validation_split=0.5, batch_size=100, whiten=True)
157 | cca, its = analyse(lat, ref_data, msm_lags)
158 | result.update({
159 | 'tica_%d_its' % lag: np.asarray(its),
160 | 'tica_%d_cca' % lag: np.asarray(cca),
161 | 'tica_%d_trn' % lag: np.asarray(trn),
162 | 'tica_%d_val' % lag: np.asarray(val)})
163 | lat, trn, val = tae.ae(
164 | data, dim=dim, lag=lag, n_epochs=200, validation_split=0.5,
165 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100],
166 | cuda=use_cuda, non_blocking=use_cuda)
167 | cca, its = analyse(lat, ref_data, msm_lags)
168 | result.update({
169 | 'ae_%d_its' % lag: np.asarray(its),
170 | 'ae_%d_cca' % lag: np.asarray(cca),
171 | 'ae_%d_trn' % lag: np.asarray(trn),
172 | 'ae_%d_val' % lag: np.asarray(val)})
173 | return result
174 |
175 | ################################################################################
176 | #
177 | # BENCHMARKING THE ALANINE DIPEPTIDE MD SIMULATIONS
178 | #
179 | ################################################################################
180 |
181 | def evaluate_ala2_md(
182 | n_trajs=5,
183 | length=50000,
184 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
185 | msm_lags=[1, 2, 3, 5, 7, 10, 15, 20, 30, 40, 50],
186 | use_cuda=True):
187 | '''A wrapper to run the alanine dipeptide benchmarks
188 |
189 | Arguments:
190 | n_trajs (int): number of bootstrapped trajectories
191 | length (int): length of each bootstrapped trajectory
192 | trns_lags (list of int): lag times for the transformers
193 | msm_lags (list of int): lag times for the MSM validation
194 | use_cuda (boolean): use a GPU to run the benchmarks
195 | '''
196 | def analyse(lat_data, ref_data, msm_lags):
197 | cca = tae.utils.cca(
198 | torch.cat([torch.from_numpy(array) for array in lat_data]),
199 | ref_data)[1].numpy()
200 | dtrajs = pyemma.coordinates.cluster_kmeans(
201 | lat_data, k=300, max_iter=50, stride=10).dtrajs
202 | its = pyemma.msm.its(dtrajs, lags=msm_lags, nits=3).timescales
203 | return cca, its
204 | with np.load(_load('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')) as fh:
205 | n_frames = [fh[key].shape[0] for key in sorted(fh.keys())]
206 | selection = []
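        # bootstrap: for each draw, pick a source trajectory and a random contiguous window of 'length' frames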
207 | for i in np.random.choice(
208 | len(n_frames), size=n_trajs, replace=True):
209 | selection.append(
210 | [i, np.random.randint(n_frames[i] - length)])
211 | ref_data = [fh['arr_%d' % i][l:l+length] for i, l in selection]
212 | with np.load(_load('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')) as fh:
213 | data = [fh['arr_%d' % i][l:l+length] for i, l in selection]
214 | dtrajs = pyemma.coordinates.cluster_kmeans(
215 | ref_data, k=300, max_iter=50, stride=10).dtrajs
216 | ref_its = pyemma.msm.its(dtrajs, lags=msm_lags, nits=3).timescales
217 | ref_data = tae.utils.whiten_data(
218 | torch.cat([torch.from_numpy(array) for array in ref_data]))
219 | lat, trn, val = tae.pca(
220 | data, dim=2, validation_split=0.5, batch_size=100, whiten=True)
221 | cca, its = analyse(lat, ref_data, msm_lags)
222 | result = dict(
223 | trns_lags=np.asarray(trns_lags),
224 | msm_lags=np.asarray(msm_lags),
225 | ref_its=np.asarray(ref_its),
226 | pca_its=np.asarray(its),
227 | pca_cca=np.asarray(cca),
228 | pca_trn=np.asarray(trn),
229 | pca_val=np.asarray(val))
230 | for lag in trns_lags:
231 | lat, trn, val = tae.tica(
232 | data, dim=2, lag=lag, kinetic_map=True, symmetrize=True,
233 | validation_split=0.5, batch_size=100, whiten=True)
234 | cca, its = analyse(lat, ref_data, msm_lags)
235 | result.update({
236 | 'tica_%d_its' % lag: np.asarray(its),
237 | 'tica_%d_cca' % lag: np.asarray(cca),
238 | 'tica_%d_trn' % lag: np.asarray(trn),
239 | 'tica_%d_val' % lag: np.asarray(val)})
240 | lat, trn, val = tae.ae(
241 | data, dim=2, lag=lag, n_epochs=200, validation_split=0.5,
242 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100],
243 | cuda=use_cuda, non_blocking=use_cuda)
244 | cca, its = analyse(lat, ref_data, msm_lags)
245 | result.update({
246 | 'ae_%d_its' % lag: np.asarray(its),
247 | 'ae_%d_cca' % lag: np.asarray(cca),
248 | 'ae_%d_trn' % lag: np.asarray(trn),
249 | 'ae_%d_val' % lag: np.asarray(val)})
250 | return result
251 |
252 | ################################################################################
253 | #
254 | # BENCHMARKING THE VILLIN MD SIMULATIONS
255 | #
256 | ################################################################################
257 |
258 | def evaluate_villin_md(
259 | data=None,
260 | n_blocks=10,
261 | trns_lags=[10, 20, 50, 100, 200, 500],
262 | msm_lags=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100, 125, 150, 175, 200, 250, 300, 400, 500, 700, 1000],
263 | use_cuda=True):
264 | '''An inner wrapper to run the villin benchmarks for a single featurization
265 |
266 | Arguments:
267 | data (numpy.ndarray): featurized md data
268 | n_blocks (int): number of blocks to divide the original trajectory in
269 | trns_lags (list of int): lag times for the transformers
270 | msm_lags (list of int): lag times for the MSM validation
271 | use_cuda (boolean): use a GPU to run the benchmarks
272 | '''
273 | def analyse(lat_data, msm_lags):
274 | dtrajs = pyemma.coordinates.cluster_kmeans(
275 | lat_data, k=300, max_iter=50, stride=10).dtrajs
276 | return pyemma.msm.its(dtrajs, lags=msm_lags, nits=2).timescales
277 | nmax = len(data)
278 | length = int(np.floor(0.5 + float(nmax) / float(n_blocks)))
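    # block bootstrap: split the trajectory into n_blocks contiguous blocks and resample them with replacement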
279 | active_blocks = np.random.choice(n_blocks, size=n_blocks, replace=True)
280 | _data = [data[n * length:min((n + 1) * length, nmax), :] for n in active_blocks]
281 | result = dict(
282 | trns_lags=np.asarray(trns_lags),
283 | msm_lags=np.asarray(msm_lags))
284 | for lag in trns_lags:
285 | for dim in [2, 5]:
286 | lat, trn, val = tae.tica(
287 |                 _data, dim=dim, lag=lag, kinetic_map=True, symmetrize=True,
288 | validation_split=0.5, batch_size=100, whiten=True)
289 | result.update({
290 | 'tica_%d_%d_its' % (lag, dim): np.asarray(analyse(lat, msm_lags)),
291 | 'tica_%d_%d_trn' % (lag, dim): np.asarray(trn),
292 | 'tica_%d_%d_val' % (lag, dim): np.asarray(val)})
293 | lat, trn, val = tae.ae(
294 | _data, dim=2, lag=lag, n_epochs=200, validation_split=0.5,
295 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100],
296 | cuda=use_cuda, non_blocking=use_cuda)
297 | result.update({
298 | 'ae_%d_its' % lag: np.asarray(analyse(lat, msm_lags)),
299 | 'ae_%d_trn' % lag: np.asarray(trn),
300 | 'ae_%d_val' % lag: np.asarray(val)})
301 | return result
302 |
303 | def evaluate_villin_md_wrapper(
304 | path_to_data=None,
305 | trns_lags=[10, 20, 50, 100, 200, 500],
306 | msm_lags=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100, 125, 150, 175, 200, 250, 300, 400, 500, 700, 1000],
307 | use_cuda=True):
308 | '''An outer wrapper to run the villin benchmarks for all featurizations
309 |
310 | Arguments:
311 | path_to_data (str): path to the villin data which we are not allowed to share
313 | trns_lags (list of int): lag times for the transformers
314 | msm_lags (list of int): lag times for the MSM validation
315 | use_cuda (boolean): use a GPU to run the benchmarks
316 | '''
317 | featurisations = dict({
318 | 'bbt': 'villin-ff-1ns-backbone-torsions.npy',
319 | 'cap': 'villin-ff-1ns-ca-positions.npy',
320 | 'hap': 'villin-ff-1ns-heavy-atom-positions.npy',
321 | 'icad': 'villin-ff-1ns-inverse-ca-distances.npy'})
322 | result = dict()
323 | for model in featurisations.keys():
324 | data = np.load(os.path.join(path_to_data, featurisations[model]))
325 | model_result = evaluate_villin_md(
326 | data=data, trns_lags=trns_lags,
327 | msm_lags=msm_lags, use_cuda=use_cuda)
328 | for key in model_result.keys():
329 | if key not in ['trns_lags', 'msm_lags']:
330 | result.update({'%s_%s' % (model, key): model_result[key]})
331 | result.update(trns_lags=trns_lags, msm_lags=msm_lags)
332 | return result
333 |
334 | ################################################################################
335 | #
336 | # MANUSCRIPT BENCHMARKS
337 | #
338 | ################################################################################
339 |
340 | def worker(queue, gpu, seed, evaluate_func, evaluate_kwargs):
341 | with torch.cuda.device(gpu):
342 | np.random.seed(seed)
343 | torch.manual_seed(seed)
344 | torch.cuda.manual_seed(seed)
345 | try:
346 | result = evaluate_func(**evaluate_kwargs)
347 | except Exception as e:
348 | print(e)
349 | result = dict()
350 | queue.put(result)
351 | queue.task_done()
352 |
353 | def spawn(
354 | seed_generator, task_index, n_gpus, evaluate_func, evaluate_kwargs=dict()):
355 | processes = []
356 | queue = mp.JoinableQueue()
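    # one worker process per GPU, each with its own random seed; results are collected through the joinable queue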
357 | for gpu in range(n_gpus):
358 | seed = seed_generator(task_index, gpu, n_gpus=n_gpus)
359 | p = mp.Process(
360 | target=worker,
361 | args=[queue, gpu, seed, evaluate_func, evaluate_kwargs])
362 | processes.append(p)
363 | print('Spawning task:%d on gpu:%d with seed:%d' % (task_index, gpu, seed))
364 | for p in processes:
365 | p.start()
366 | queue.join()
367 | out = dict()
368 | for _ in processes:
369 | result = queue.get()
370 | for key in result.keys():
371 | if key in ['trns_lags', 'msm_lags']:
372 | if key not in out:
373 | out.update({key: result[key]})
374 | else:
375 | try:
376 | out[key].append(result[key])
377 | except KeyError:
378 | out.update({key: [result[key]]})
379 | return out
380 |
--------------------------------------------------------------------------------
/vampnet/examples/Alanine_dipeptide.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Import all the packages used"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "scrolled": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "%matplotlib inline\n",
21 | "import vampnet\n",
22 | "from vampnet import data_generator as vamp_data_generator\n",
23 | "from tensorflow.contrib.keras.api.keras.models import Model\n",
24 | "from tensorflow.contrib.keras.api.keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n",
25 | "from tensorflow.contrib.keras.api.keras.optimizers import Adam\n",
26 | "import tensorflow as tf\n",
27 | "import matplotlib.gridspec as gridspec\n",
28 | "from tensorflow.contrib.keras.api.keras.backend import clear_session"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# Necessary for downloading the trajectory data\n",
38 | "import mdshare"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "# Load Data"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "import pyemma.coordinates as pycoor"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "# Define Hyperparameters"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Tau, how much is the timeshift of the two datasets\n",
71 | "tau = 1\n",
72 | "\n",
73 | "# Batch size for Stochastic Gradient descent\n",
74 | "batch_size = 1000\n",
75 | "\n",
76 | "# Which trajectory points percentage is used as training\n",
77 | "train_ratio = 0.9\n",
78 | "\n",
79 | "# How many hidden layers the network has\n",
80 | "network_depth = 6\n",
81 | "\n",
82 | "# Width of every layer\n",
83 | "layer_width = 100\n",
84 | "\n",
85 | "# Learning rate used for the ADAM optimizer\n",
86 | "learning_rate = 1e-4\n",
87 | "\n",
88 | "# How many output states the network has\n",
89 | "output_size = 6\n",
90 | "\n",
91 | "# Iteration over the training set in the fitting process\n",
92 | "nb_epoch = 60\n",
93 | "\n",
94 | "epsilon = 1e-5"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "traj_whole, dihedral = vamp_data_generator.get_alanine_data()\n",
104 | "\n",
105 | "traj_data_points, input_size = traj_whole.shape"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "# Initialized the VAMPnets wrapper class"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "vamp = vampnet.VampnetTools(epsilon = epsilon)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "# Shuffle trajectory and lagged trajectory together"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "length_data = traj_data_points - tau\n",
138 | "\n",
139 | "traj_ord = traj_whole[:length_data]\n",
140 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n",
141 | "\n",
142 | "\n",
143 | "dihedral_init = dihedral[:length_data]\n",
144 | "\n",
145 | "indexes = np.arange(length_data)\n",
146 | "np.random.shuffle(indexes)\n",
147 | "\n",
148 | "traj = traj_ord[indexes]\n",
149 | "traj_lag = traj_ord_lag[indexes]\n",
150 | "dihedral_shuffle = dihedral_init[indexes]"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "# Prepare data for tensorflow usage"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "length_train = int(np.floor(length_data * train_ratio))\n",
167 | "length_vali = length_data - length_train\n",
168 | "\n",
169 | "traj_data_train = traj[:length_train]\n",
170 | "traj_data_train_lag = traj_lag[:length_train]\n",
171 | "\n",
172 | "traj_data_valid = traj[length_train:]\n",
173 | "traj_data_valid_lag = traj_lag[length_train:]\n",
174 | "\n",
175 | "# Input of the first network\n",
176 | "X1_train = traj_data_train.astype('float32')\n",
177 | "X2_train = traj_data_train_lag.astype('float32')\n",
178 | "\n",
179 | "# Input for validation\n",
180 | "X1_vali = traj_data_valid.astype('float32')\n",
181 | "X2_vali = traj_data_valid_lag.astype('float32')\n",
182 | "\n",
183 | "# Needs a Y-train set which we dont have.\n",
184 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n",
185 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "# Run several model iterations saving the best one, to help finding sparcely populated states"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {
199 | "scrolled": true
200 | },
201 | "outputs": [],
202 | "source": [
203 | "max_vm = 0\n",
204 | "attempts = 10\n",
205 | "\n",
206 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n",
207 | "# For older versions of TF, use the function vamp.loss_VAMP2\n",
208 | "\n",
209 | "losses = [\n",
210 | " vamp.loss_VAMP2_autograd,\n",
211 | "]\n",
212 | "\n",
213 | "\n",
214 | "for i in range(attempts): \n",
215 | "\n",
216 | " # Clear the previous tensorflow session to prevent memory leaks\n",
217 | " clear_session()\n",
218 | "\n",
219 | " # Build the model\n",
220 | "\n",
221 | "\n",
222 | " nodes = [layer_width]*network_depth\n",
223 | "\n",
224 | " Data_X = Input(shape = (input_size,))\n",
225 | " Data_Y = Input(shape = (input_size,))\n",
226 | "\n",
227 | " # A batch normalization layer improves convergence speed\n",
228 | " bn_layer = BatchNormalization()\n",
229 | "\n",
230 | " # Instance layers and assign them to the two lobes of the network\n",
231 | " dense_layers = [Dense(node, activation = 'elu')# if index_layer < 3 else 'linear nodes')\n",
232 | " for index_layer,node in enumerate(nodes)]\n",
233 | "\n",
234 | " lx_branch = bn_layer(Data_X)\n",
235 | " rx_branch = bn_layer(Data_Y)\n",
236 | "\n",
237 | " for i, layer in enumerate(dense_layers):\n",
238 | "\n",
239 | " lx_branch = dense_layers[i](lx_branch)\n",
240 | " rx_branch = dense_layers[i](rx_branch)\n",
241 | "\n",
242 | "\n",
243 | " # Add a softmax output layer.\n",
244 | " # Should be replaced with a linear activation layer if\n",
245 | " # the outputs of the network cannot be interpreted as states\n",
246 | " softmax = Dense(output_size, activation='softmax')\n",
247 | "\n",
248 | " lx_branch = softmax(lx_branch)\n",
249 | " rx_branch = softmax(rx_branch)\n",
250 | "\n",
251 | " # Merge both networks to train both at the same time\n",
252 | " merged = concatenate([lx_branch, rx_branch])\n",
253 | "\n",
254 | " # Initialize the model and the optimizer, and compile it with\n",
255 | " # the loss and metric functions from the VAMPnets package\n",
256 | " model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n",
257 | " adam = Adam(lr = learning_rate/10)\n",
258 | "\n",
259 | " vm1 = np.zeros((len(losses), nb_epoch))\n",
260 | " tm1 = np.zeros_like(vm1)\n",
261 | " vm2 = np.zeros_like(vm1)\n",
262 | " tm2 = np.zeros_like(vm1)\n",
263 | " vm3 = np.zeros_like(vm1)\n",
264 | " tm3 = np.zeros_like(vm1)\n",
265 | " \n",
266 | " for l_index, loss_function in enumerate(losses):\n",
267 | "\n",
268 | " \n",
269 | " model.compile(optimizer = adam,\n",
270 | " loss = loss_function,\n",
271 | " metrics = [\n",
272 | " vamp.metric_VAMP,\n",
273 | " vamp.metric_VAMP2,\n",
274 | " ])\n",
275 | "\n",
276 | "\n",
277 | " # Train the model\n",
278 | " \n",
279 | " hist = model.fit([X1_train, X2_train], Y_train ,\n",
280 | " batch_size=batch_size,\n",
281 | " epochs=nb_epoch,\n",
282 | " validation_data=([X1_vali, X2_vali], Y_vali ),\n",
283 | " verbose=0)\n",
284 | "\n",
285 | "\n",
286 | " vm1[l_index] = np.array(hist.history['val_metric_VAMP'])\n",
287 | " tm1[l_index] = np.array(hist.history['metric_VAMP'])\n",
288 | " \n",
289 | " \n",
290 | " vm2[l_index] = np.array(hist.history['val_metric_VAMP2'])\n",
291 | " tm2[l_index] = np.array(hist.history['metric_VAMP2'])\n",
292 | " \n",
293 | " vm3[l_index] = np.array(hist.history['val_loss'])\n",
294 | " tm3[l_index] = np.array(hist.history['loss'])\n",
295 | " \n",
296 | " \n",
297 | " vm1 = np.reshape(vm1, (-1))\n",
298 | " tm1 = np.reshape(tm1, (-1))\n",
299 | " vm2 = np.reshape(vm2, (-1))\n",
300 | " tm2 = np.reshape(tm2, (-1))\n",
301 | " vm3 = np.reshape(vm3, (-1))\n",
302 | " tm3 = np.reshape(tm3, (-1))\n",
303 | "\n",
304 | " # Average the score obtained in the last part of the training process\n",
305 | " # in order to estabilish which model is better and thus worth saving\n",
306 | "\n",
307 | "\n",
308 | " score = vm1[-5:].mean()\n",
309 | " extra_msg = ''\n",
310 | " if score > max_vm:\n",
311 | " extra_msg = ' - Highest'\n",
312 | " best_weights = model.get_weights()\n",
313 | " max_vm = score\n",
314 | " vm1_max = vm1\n",
315 | " tm1_max = tm1\n",
316 | " vm2_max = vm2\n",
317 | " tm2_max = tm2\n",
318 | " vm3_max = vm3\n",
319 | " tm3_max = tm3\n",
320 | " \n",
321 | " print('Score: {0:.2f}'.format(score) + extra_msg)"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "# Recover the saved model and its training history"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "model.set_weights(best_weights)\n",
338 | "\n",
339 | "tm1 = np.array(tm1_max)\n",
340 | "tm2 = np.array(tm2_max)\n",
341 | "tm3 = np.array(tm3_max)\n",
342 | "vm1 = np.array(vm1_max)\n",
343 | "vm2 = np.array(vm2_max)\n",
344 | "vm3 = np.array(vm3_max)\n"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "# Training result visualization"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "metadata": {
358 | "scrolled": false
359 | },
360 | "outputs": [],
361 | "source": [
362 | "plt.plot(vm1, label = 'VAMP')\n",
363 | "plt.plot(vm2, label = 'VAMP2')\n",
364 | "plt.plot(-vm3, label = 'loss')\n",
365 | "plt.plot(tm1, label = 'training VAMP')\n",
366 | "plt.plot(tm2, label = 'training VAMP2')\n",
367 | "plt.plot(-tm3, label = 'training loss')\n",
368 | "plt.legend()\n",
369 | "plt.show()"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "# Transform the input trajectory using the network\n",
379 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n",
380 | "\n",
381 | "# Order the output states based on their population\n",
382 | "coor_pred = np.argmax(states_prob, axis = 1)\n",
383 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n",
384 | "states_num = [len(i[0]) for i in indexes]\n",
385 | "states_order = np.argsort(states_num).astype('int')[::-1]\n",
386 | "\n",
387 | "pred_ord = states_prob[:,states_order]"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "# Visualize the population of the states"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "def print_states_pie_chart():\n",
404 | " coors = []\n",
405 | " maxi = np.max(pred_ord, axis= 1)\n",
406 | "\n",
407 | " for i in range(output_size):\n",
408 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n",
409 | " \n",
410 | " fig1, ax1 = plt.subplots()\n",
411 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n",
412 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
413 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n",
414 | " plt.show()\n",
415 | "\n",
416 | "print_states_pie_chart()"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "# Visualize how the 4 states are placed on the Ramachandran plot"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "scrolled": false
431 | },
432 | "outputs": [],
433 | "source": [
434 | "maxi_train = np.max(pred_ord, axis= 1)\n",
435 | "coor_train = np.zeros_like(pred_ord)\n",
436 | "for i in range(output_size):\n",
437 | " coor_train = np.where(pred_ord[:,i]== maxi_train)[0]\n",
438 | " plt.scatter(dihedral_init[coor_train,0], dihedral_init[coor_train,1], s=5)\n",
439 | "plt.axes = [[-np.pi, np.pi],[-np.pi, np.pi]]"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "# For each state, visualize the probabilities the different trajectory points have to belong to it"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {
453 | "scrolled": false
454 | },
455 | "outputs": [],
456 | "source": [
457 | "fig = plt.figure(figsize=(16, 16))\n",
458 | "\n",
459 | "gs1 = gridspec.GridSpec(2, int(np.ceil(output_size/2)))\n",
460 | "gs1.update(wspace=0.05, hspace = 0.05)\n",
461 | "\n",
462 | "for n in range(output_size):\n",
463 | " ax = plt.subplot(gs1[n])\n",
464 | " im = ax.scatter(dihedral_init[:,0], dihedral_init[:,1], s=30,\n",
465 | " c = pred_ord[:,n],\n",
466 | " alpha=0.5, edgecolor='',\n",
467 | " vmin = 0, vmax = 1\n",
468 | " )\n",
469 | " plt.axis('on')\n",
470 | " title = 'State '+str(n + 1)\n",
471 | "\n",
472 | " ax.text(.85, .15, title,\n",
473 | " horizontalalignment='center',\n",
474 | " transform=ax.transAxes, fontdict = {'size':36})\n",
475 | "\n",
476 | "\n",
477 | " if (n < 3):\n",
478 | " ax.set_xticks([-3, 0, 3])\n",
479 | " ax.set_xticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n",
480 | " ax.xaxis.set_tick_params(top='on', bottom='off', labeltop='on', labelbottom='off')\n",
481 | " ax.xaxis.set_tick_params(labelsize=40)\n",
482 | " else:\n",
483 | " ax.set_xticks([])\n",
484 | " if (n%3==0):\n",
485 | " ax.set_yticks([-3, 0, 3])\n",
486 | " ax.set_yticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n",
487 | " ax.yaxis.set_tick_params(labelsize=40)\n",
488 | " else:\n",
489 | " ax.set_yticks([])\n",
490 | "# ax.set_aspect('equal')\n",
491 | " ax.set_xlim([-np.pi, np.pi]);\n",
492 | " ax.set_ylim([-np.pi, np.pi]);\n",
493 | " \n",
494 | " if (n%3 == 0):\n",
495 | " ax.set_ylabel(r'$\\Psi$ [rad]', fontdict = {'size':40})\n",
496 | " if (n < 3):\n",
497 | " ax.set_xlabel(r'$\\Phi$ [rad]', fontdict = {'size':40}, position = 'top')\n",
498 | " ax.xaxis.set_label_coords(0.5,1.2)\n",
499 | "\n",
500 | "gs1.tight_layout(fig, rect=[0, 0.03, 0.95, 0.94])\n",
501 | "fig.show()\n",
502 | "\n",
503 | "cax = fig.add_axes([0.95, 0.05, 0.02, 0.8])\n",
504 | "cbar = fig.colorbar(im, cax=cax, ticks=[0, 1])\n",
505 | "cbar.ax.yaxis.set_tick_params(labelsize=40)"
506 | ]
507 | },
508 | {
509 | "cell_type": "markdown",
510 | "metadata": {},
511 | "source": [
512 | "# Markov Model Estimation"
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "# Estimate the implied timescales"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {
526 | "scrolled": false
527 | },
528 | "outputs": [],
529 | "source": [
530 | "max_tau = 200\n",
531 | "lag = np.arange(1, max_tau, 1)\n",
532 | "its = vamp.get_its(pred_ord, lag)\n",
533 | "vamp.plot_its(its, lag)"
534 | ]
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "metadata": {},
539 | "source": [
540 | "# Chapman-Kolmogorov test for the estimated koopman operator"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "metadata": {},
547 | "outputs": [],
548 | "source": [
549 | "steps = 8\n",
550 | "tau_msm = 35\n",
551 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n",
552 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)"
553 | ]
554 | }
555 | ],
556 | "metadata": {
557 | "anaconda-cloud": {},
558 | "kernelspec": {
559 | "display_name": "Python 3",
560 | "language": "python",
561 | "name": "python3"
562 | },
563 | "language_info": {
564 | "codemirror_mode": {
565 | "name": "ipython",
566 | "version": 3
567 | },
568 | "file_extension": ".py",
569 | "mimetype": "text/x-python",
570 | "name": "python",
571 | "nbconvert_exporter": "python",
572 | "pygments_lexer": "ipython3",
573 | "version": "3.6.8"
574 | }
575 | },
576 | "nbformat": 4,
577 | "nbformat_minor": 2
578 | }
579 |
--------------------------------------------------------------------------------
/vampnet/examples/Alanine_dipeptide_multiple_files.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Import all the packages used"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "scrolled": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "%matplotlib inline\n",
21 | "import vampnet\n",
22 | "from vampnet import data_generator as vamp_data_loader\n",
23 | "from tensorflow.contrib.keras.api.keras.models import Model\n",
24 | "from tensorflow.contrib.keras.api.keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n",
25 | "from tensorflow.contrib.keras.api.keras.optimizers import Adam\n",
26 | "import tensorflow as tf\n",
27 | "import matplotlib.gridspec as gridspec\n",
28 | "from tensorflow.contrib.keras.api.keras.backend import clear_session"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# Necessary for downloading the trajectory data\n",
38 | "import mdshare\n",
39 | "import pyemma.coordinates as pycoor"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "# Define Hyperparameters"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# Tau, how much is the timeshift of the two datasets\n",
56 | "tau = 1\n",
57 | "\n",
58 | "# Batch size for Stochastic Gradient descent\n",
59 | "batch_size = 1000\n",
60 | "\n",
61 | "# Which trajectory points percentage is used as training\n",
62 | "train_ratio = 0.9\n",
63 | "\n",
64 | "# How many hidden layers the network has\n",
65 | "network_depth = 6\n",
66 | "\n",
67 | "# Width of every layer\n",
68 | "layer_width = 100\n",
69 | "\n",
70 | "# Learning rate used for the ADAM optimizer\n",
71 | "learning_rate = 1e-4\n",
72 | "\n",
73 | "# How many output states the network has\n",
74 | "output_size = 6\n",
75 | "\n",
76 | "# Iteration over the training set in the fitting process\n",
77 | "nb_epoch = 40\n",
78 | "\n",
79 | "epsilon = 1e-5"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "# Initialized the VAMPnets wrapper class"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "vamp = vampnet.VampnetTools(epsilon = epsilon)"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "# Load Data"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "# #Download alanine coordinates and dihedral angles data\n",
112 | "mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')\n",
113 | "mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')\n",
114 | "\n",
115 | "alanine_files = np.load('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')\n",
116 | "\n",
117 | "# # Save the files separately\n",
118 | "np.save('traj0.npy', alanine_files['arr_0'])\n",
119 | "np.save('traj1.npy', alanine_files['arr_1'])\n",
120 | "np.save('traj2.npy', alanine_files['arr_2'])\n",
121 | "\n",
122 | "# Separate data files between training data and validation data\n",
123 | "\n",
124 | "train_data_files_list = [\n",
125 | " 'traj0.npy',\n",
126 | " 'traj1.npy',\n",
127 | "]\n",
128 | "\n",
129 | "valid_data_files_list = [\n",
130 | " 'traj2.npy',\n",
131 | "]\n",
132 | "\n",
133 | "total_data_files_list = train_data_files_list + valid_data_files_list"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "# Define the pyemma data sources and get basic info from the files, number of datapoints and system size\n",
143 | "\n",
144 | "train_data_source = pycoor.source(train_data_files_list,chunksize = batch_size)\n",
145 | "valid_data_source = pycoor.source(valid_data_files_list,chunksize = batch_size)\n",
146 | "total_data_source = pycoor.source(total_data_files_list,chunksize = batch_size)\n",
147 | "\n",
148 | "train_datapoints = train_data_source.n_frames_total()\n",
149 | "valid_datapoints = valid_data_source.n_frames_total()\n",
150 | "total_datapoints = total_data_source.n_frames_total()\n",
151 | " \n",
152 | "traj_lengths = total_data_source.trajectory_lengths()\n",
153 | "\n",
154 | "input_size = total_data_source.dimension()"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "# Run several model iterations saving the best one, to help finding sparcely populated states"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "scrolled": true
169 | },
170 | "outputs": [],
171 | "source": [
172 | "max_vm = 0\n",
173 | "attempts_number = 10\n",
174 | "\n",
175 | "\n",
176 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n",
177 | "# For older versions of TF, use the function vamp.loss_VAMP2\n",
178 | "\n",
179 | "losses = [\n",
180 | " vamp.loss_VAMP2_autograd,\n",
181 | "]\n",
182 | "\n",
183 | "\n",
184 | "for attempt in range(attempts_number):\n",
185 | " \n",
186 | "\n",
187 | " # Clear the previous tensorflow session to prevent memory leaks\n",
188 | " clear_session()\n",
189 | "\n",
190 | " # Build the model\n",
191 | "\n",
192 | "\n",
193 | " nodes = [layer_width]*network_depth\n",
194 | "\n",
195 | " Data_X = Input(shape = (input_size,))\n",
196 | " Data_Y = Input(shape = (input_size,))\n",
197 | "\n",
198 | " # A batch normalization layer improves convergence speed\n",
199 | " bn_layer = BatchNormalization()\n",
200 | "\n",
201 | " # Instance layers and assign them to the two lobes of the network\n",
202 | " dense_layers = [Dense(node, activation = 'elu',)\n",
203 | " for node in nodes]\n",
204 | "\n",
205 | " lx_branch = bn_layer(Data_X)\n",
206 | " rx_branch = bn_layer(Data_Y)\n",
207 | "\n",
208 | " for i, layer in enumerate(dense_layers):\n",
209 | "\n",
210 | " lx_branch = dense_layers[i](lx_branch)\n",
211 | " rx_branch = dense_layers[i](rx_branch)\n",
212 | "\n",
213 | "\n",
214 | " # Add a softmax output layer.\n",
215 | " # Should be replaced with a linear activation layer if\n",
216 | " # the outputs of the network cannot be interpreted as states\n",
217 | " softmax = Dense(output_size, activation='softmax')\n",
218 | "\n",
219 | " lx_branch = softmax(lx_branch)\n",
220 | " rx_branch = softmax(rx_branch)\n",
221 | "\n",
222 | " # Merge both networks to train both at the same time\n",
223 | " merged = concatenate([lx_branch, rx_branch])\n",
224 | "\n",
225 | " # Initialize the model and the optimizer, and compile it with\n",
226 | " # the loss and metric functions from the VAMPnets package\n",
227 | " model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n",
228 | " adam = Adam(lr = learning_rate)\n",
229 | "\n",
230 | " vm1 = np.zeros((len(losses), nb_epoch))\n",
231 | " tm1 = np.zeros_like(vm1)\n",
232 | " vm2 = np.zeros_like(vm1)\n",
233 | " tm2 = np.zeros_like(vm1)\n",
234 | " \n",
235 | " for l_index, loss_function in enumerate(losses):\n",
236 | "\n",
237 | " \n",
238 | " model.compile(optimizer = adam,\n",
239 | " loss = loss_function,\n",
240 | " metrics = [\n",
241 | " vamp.metric_VAMP,\n",
242 | " vamp.metric_VAMP2,\n",
243 | " ])\n",
244 | "\n",
245 | "\n",
246 | " # Train the model\n",
247 | " \n",
248 | " steps_per_train_epoch = int(np.sum(np.ceil((train_data_source.trajectory_lengths()-tau)/batch_size)))\n",
249 | " steps_per_valid_epoch = int(np.sum(np.ceil((valid_data_source.trajectory_lengths()-tau)/batch_size)))\n",
250 | " \n",
251 | " hist = model.fit_generator(generator = vamp_data_loader.build_generator_on_source_shuffle(train_data_source,\n",
252 | " batch_size,\n",
253 | " tau,\n",
254 | " output_size,\n",
255 | " ),\n",
256 | " steps_per_epoch = steps_per_train_epoch,\n",
257 | " epochs = nb_epoch,\n",
258 | " verbose = 0,\n",
259 | " validation_data = vamp_data_loader.build_generator_on_source_shuffle(valid_data_source,\n",
260 | " batch_size,\n",
261 | " tau,\n",
262 | " output_size,\n",
263 | " ),\n",
264 | " validation_steps = steps_per_valid_epoch,\n",
265 | " shuffle = True\n",
266 | " )\n",
267 | "\n",
268 | " vm1[l_index] = np.array(hist.history['val_metric_VAMP'])\n",
269 | " tm1[l_index] = np.array(hist.history['metric_VAMP'])\n",
270 | " \n",
271 | " vm2[l_index] = np.array(hist.history['val_metric_VAMP2'])\n",
272 | " tm2[l_index] = np.array(hist.history['metric_VAMP2'])\n",
273 | " \n",
274 | " \n",
275 | " vm1 = np.reshape(vm1, (-1))\n",
276 | " tm1 = np.reshape(tm1, (-1))\n",
277 | " vm2 = np.reshape(vm2, (-1))\n",
278 | " tm2 = np.reshape(tm2, (-1))\n",
279 | "\n",
280 | " # Average the score obtained in the last part of the training process\n",
281 | " # in order to estabilish which model is better and thus worth saving\n",
282 | "\n",
283 | "\n",
284 | " score = vm1[-5:].mean()\n",
285 | " t_score = tm1[-5:].mean()\n",
286 | " extra_msg = ''\n",
287 | " if score > max_vm:\n",
288 | " extra_msg = ' - Highest'\n",
289 | " best_weights = model.get_weights()\n",
290 | " max_vm = score\n",
291 | " vm1_max = vm1\n",
292 | " tm1_max = tm1\n",
293 | " vm2_max = vm2\n",
294 | " tm2_max = tm2\n",
295 | " \n",
296 | " print('Attempt {0}, training score: {1:.2f}, validation score: {2:.2f}'.format(attempt+1, t_score, score) + extra_msg)"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "# Recover the saved model and its training history"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "model.set_weights(best_weights)\n",
313 | "\n",
314 | "tm1 = np.array(vm1_max)\n",
315 | "tm2 = np.array(tm1_max)\n",
316 | "vm1 = np.array(vm2_max)\n",
317 | "vm2 = np.array(tm2_max)"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "# Training result visualization"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {
331 | "scrolled": false
332 | },
333 | "outputs": [],
334 | "source": [
335 | "plt.plot(vm1, label = 'VAMP')\n",
336 | "plt.plot(vm2, label = 'VAMP2')\n",
337 | "plt.plot(tm1, label = 'training VAMP')\n",
338 | "plt.plot(tm2, label = 'training VAMP2')\n",
339 | "plt.legend()\n",
340 | "plt.show()"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "# Transform the input trajectory using the network\n",
350 | "states_prob_all = model.predict_generator(generator = vamp_data_loader.build_generator_on_source(total_data_source,\n",
351 | " batch_size,\n",
352 | " tau,\n",
353 | " output_size),\n",
354 | " steps = np.sum(np.ceil((total_data_source.trajectory_lengths()-tau)/batch_size)),\n",
355 | " verbose = 0)\n",
356 | "\n",
357 | "states_prob_t = states_prob_all[:,:output_size]\n",
358 | "states_prob_lag = states_prob_all[:,output_size:]\n",
359 | "\n",
360 | "# reorganize the output of the network in order to have every data point transformed by the network in one array\n",
361 | "start = 0\n",
362 | "states_prob = np.zeros((states_prob_t.shape[0]+len(traj_lengths)*tau, output_size))\n",
363 | "for l, length_i in enumerate(traj_lengths-tau):\n",
364 | " states_prob[start+l*tau:start+l*tau+length_i] = states_prob_t[start:start+length_i]\n",
365 | " states_prob[start+l*tau+length_i:start+l*tau+length_i+tau] = states_prob_lag[start+length_i-tau:start+length_i]\n",
366 | " start += length_i\n",
367 | "\n",
368 | "# Order the output states based on their population\n",
369 | "coor_pred = np.argmax(states_prob, axis = 1)\n",
370 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n",
371 | "states_num = [len(i[0]) for i in indexes]\n",
372 | "states_order = np.argsort(states_num).astype('int')[::-1]\n",
373 | "\n",
374 | "pred_ord = states_prob[:,states_order]"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "# Visualize the population of the states"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": [
390 | "def print_states_pie_chart():\n",
391 | " coors = []\n",
392 | " maxi = np.max(pred_ord, axis= 1)\n",
393 | "\n",
394 | " for i in range(output_size):\n",
395 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n",
396 | " \n",
397 | " fig1, ax1 = plt.subplots()\n",
398 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n",
399 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
400 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n",
401 | " plt.show()\n",
402 | "\n",
403 | "print_states_pie_chart()"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "# Visualize how the 4 states are placed on the Ramachandran plot"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "dihedral_file = np.load('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')\n",
420 | "dihedral_init = np.concatenate([dihedral_file['arr_0'],\n",
421 | " dihedral_file['arr_1'],\n",
422 | " dihedral_file['arr_2'],\n",
423 | " ], axis = 0)\n"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "scrolled": false
431 | },
432 | "outputs": [],
433 | "source": [
434 | "maxi_train = np.max(pred_ord, axis= 1)\n",
435 | "coor_train = np.zeros_like(pred_ord)\n",
436 | "for i in range(output_size):\n",
437 | " coor_train = np.where(pred_ord[:,i]== maxi_train)[0]\n",
438 | " plt.scatter(dihedral_init[coor_train,0], dihedral_init[coor_train,1], s=1)\n",
439 | "plt.axes = [[-np.pi, np.pi],[-np.pi, np.pi]]"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "# For each state, visualize the probabilities the different trajectory points have to belong to it"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {
453 | "scrolled": false
454 | },
455 | "outputs": [],
456 | "source": [
457 | "fig = plt.figure(figsize=(16, 16))\n",
458 | "\n",
459 | "gs1 = gridspec.GridSpec(int(np.ceil(output_size/2)), 2)\n",
460 | "gs1.update(wspace=0.05, hspace = 0.05)\n",
461 | "\n",
462 | "for n in range(output_size):\n",
463 | " ax = plt.subplot(gs1[n])\n",
464 | " im = ax.scatter(dihedral_init[:,0], dihedral_init[:,1], s=5,\n",
465 | " c = pred_ord[:,n],\n",
466 | " alpha=0.5, edgecolor='', vmin = 0, vmax = 1)\n",
467 | " plt.axis('on')\n",
468 | " title = 'State '+str(n + 1)\n",
469 | "\n",
470 | " ax.text(.85, .15, title,\n",
471 | " horizontalalignment='center',\n",
472 | " transform=ax.transAxes, fontdict = {'size':36})\n",
473 | "\n",
474 | "\n",
475 | " if (n < 3):\n",
476 | " ax.set_xticks([-3, 0, 3])\n",
477 | " ax.set_xticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n",
478 | " ax.xaxis.set_tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)\n",
479 | " ax.xaxis.set_tick_params(labelsize=40)\n",
480 | " else:\n",
481 | " ax.set_xticks([])\n",
482 | " if (n%3==0):\n",
483 | " ax.set_yticks([-3, 0, 3])\n",
484 | " ax.set_yticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n",
485 | " ax.yaxis.set_tick_params(labelsize=40)\n",
486 | " else:\n",
487 | " ax.set_yticks([])\n",
488 | "# ax.set_aspect('equal')\n",
489 | " ax.set_xlim([-np.pi, np.pi]);\n",
490 | " ax.set_ylim([-np.pi, np.pi]);\n",
491 | " \n",
492 | " if (n%3 == 0):\n",
493 | " ax.set_ylabel(r'$\\Psi$ [rad]', fontdict = {'size':40})\n",
494 | " if (n < 3):\n",
495 | " ax.set_xlabel(r'$\\Phi$ [rad]', fontdict = {'size':40}, position = 'top')\n",
496 | " ax.xaxis.set_label_coords(0.5,1.2)\n",
497 | "\n",
498 | "gs1.tight_layout(fig, rect=[0, 0.03, 0.95, 0.94])\n",
499 | "fig.show()\n",
500 | "\n",
501 | "cax = fig.add_axes([0.95, 0.05, 0.02, 0.8])\n",
502 | "cbar = fig.colorbar(im, cax=cax, ticks=[0, 1])\n",
503 | "cbar.ax.yaxis.set_tick_params(labelsize=40)"
504 | ]
505 | },
506 | {
507 | "cell_type": "markdown",
508 | "metadata": {},
509 | "source": [
510 | "# Markov Model Estimation"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "## Prepare multiple trajectories "
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": null,
523 | "metadata": {},
524 | "outputs": [],
525 | "source": [
526 | "# separate the trajectories again as a list based on the length of them\n",
527 | "traj_list = []\n",
528 | "start = 0\n",
529 | "for length_i in traj_lengths:\n",
530 | " traj_list.append(pred_ord[start:start+length_i])\n",
531 | " start += length_i"
532 | ]
533 | },
534 | {
535 | "cell_type": "markdown",
536 | "metadata": {},
537 | "source": [
538 | "# Estimate the implied timescales"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": null,
544 | "metadata": {
545 | "scrolled": false
546 | },
547 | "outputs": [],
548 | "source": [
549 | "max_tau = 200\n",
550 | "lag = np.arange(1, max_tau, 1)\n",
551 | "its = vamp.get_its(traj_list, lag)\n",
552 | "vamp.plot_its(its, lag)"
553 | ]
554 | },
555 | {
556 | "cell_type": "markdown",
557 | "metadata": {},
558 | "source": [
559 | "# Chapman-Kolmogorov test for the estimated koopman operator"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": null,
565 | "metadata": {},
566 | "outputs": [],
567 | "source": [
568 | "steps = 8\n",
569 | "tau_msm = 35\n",
570 | "predicted, estimated = vamp.get_ck_test(traj_list, steps, tau_msm)\n",
571 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": null,
577 | "metadata": {},
578 | "outputs": [],
579 | "source": []
580 | }
581 | ],
582 | "metadata": {
583 | "anaconda-cloud": {},
584 | "kernelspec": {
585 | "display_name": "Python 3",
586 | "language": "python",
587 | "name": "python3"
588 | },
589 | "language_info": {
590 | "codemirror_mode": {
591 | "name": "ipython",
592 | "version": 3
593 | },
594 | "file_extension": ".py",
595 | "mimetype": "text/x-python",
596 | "name": "python",
597 | "nbconvert_exporter": "python",
598 | "pygments_lexer": "ipython3",
599 | "version": "3.6.8"
600 | },
601 | "varInspector": {
602 | "cols": {
603 | "lenName": 16,
604 | "lenType": 16,
605 | "lenVar": 40
606 | },
607 | "kernels_config": {
608 | "python": {
609 | "delete_cmd_postfix": "",
610 | "delete_cmd_prefix": "del ",
611 | "library": "var_list.py",
612 | "varRefreshCmd": "print(var_dic_list())"
613 | },
614 | "r": {
615 | "delete_cmd_postfix": ") ",
616 | "delete_cmd_prefix": "rm(",
617 | "library": "var_list.r",
618 | "varRefreshCmd": "cat(var_dic_list()) "
619 | }
620 | },
621 | "types_to_exclude": [
622 | "module",
623 | "function",
624 | "builtin_function_or_method",
625 | "instance",
626 | "_Feature"
627 | ],
628 | "window_display": false
629 | }
630 | },
631 | "nbformat": 4,
632 | "nbformat_minor": 2
633 | }
634 |
--------------------------------------------------------------------------------
/time-lagged-autoencoder/tae/models.py:
--------------------------------------------------------------------------------
1 | # This file is part of the markovmodel/deeptime repository.
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group,
3 | # Freie Universitaet Berlin (GER)
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 |
18 | '''
19 | Implementations of PCA, TICA, AE, and VAE, plus an experimental VAMPNet.
20 | '''
21 |
22 | from torch import svd as _svd
23 | from torch import nn as _nn
24 | from torch import optim as _optim
25 | from torch import diag as _diag
26 | from torch import cat as _cat
27 | from torch import randn as _randn
28 | from torch import sum as _sum
29 | from torch import mm as _mm
30 | from torch import symeig as _symeig
31 | from torch import abs as _abs
32 | from torch import arange as _arange
33 | from torch import sqrt as _sqrt
34 | from torch import zeros as _zeros
35 | from torch import no_grad as _no_grad
36 | from torch.autograd import Function as _Function
37 | from .utils import get_mean as _get_mean
38 | from .utils import get_covariance as _get_covariance
39 | from .utils import Transform as _Transform
40 |
41 | __all__ = ['PCA', 'TICA', 'AE', 'VAE', 'VAMPNet']
42 |
43 | ################################################################################
44 | #
45 | # PCA
46 | #
47 | ################################################################################
48 |
49 | class PCA(object):
50 | '''Perform a principal component analysis for dimensionality reduction.
51 |
52 |     We compute the dominant eigenvectors of the instantaneous covariance
53 |     matrix and use them to rotate/project the data into a lower-dimensional
54 |     subspace.
55 | '''
56 | def __init__(self):
57 | self.loss_function = _nn.MSELoss(size_average=False)
58 | def get_loss(self, loader):
59 |         '''Compute the loss on the provided data loader.
60 |
61 | Arguments:
62 | loader (DataLoader): the data for loss calculation
63 | '''
64 | if loader is None:
65 | return None
66 | loss = 0.0
67 | for x, y in loader:
68 | x, y = self.transformer(x, y)
69 | loss += self.loss_function(x.mm(self.score_matrix), y).item()
70 | return loss / float(len(loader.dataset))
71 | def fit(self, train_loader, dim=None, test_loader=None):
72 | '''Train the model on the provided data loader.
73 |
74 | Arguments:
75 | train_loader (DataLoader): the training data
76 | dim (int): the target dimensionality
77 | test_loader (DataLoader): the data for validation
78 | '''
79 | self.x_mean, y_mean = _get_mean(train_loader)
80 | self.cxx, cxy, cyy = _get_covariance(
81 | train_loader, self.x_mean, y_mean)
82 | self.transformer = _Transform(
83 | x_mean=self.x_mean, y_mean=self.x_mean)
84 | u, s, v = _svd(self.cxx)
85 | if dim is None:
86 | dim = s.size()[0]
87 | self.decoder_matrix = u[:, :dim]
88 | self.encoder_matrix = v.t()[:dim, :]
89 | self.score_matrix = self.decoder_matrix.mm(self.encoder_matrix)
90 | return self.get_loss(train_loader), self.get_loss(test_loader)
91 | def transform(self, loader):
92 | '''Apply the model on the provided data loader.
93 |
94 | Arguments:
95 | loader (DataLoader): the data you wish to transform
96 | '''
97 | latent = []
98 | for x, _ in loader:
99 | x = self.transformer.x(x)
100 | latent.append(x.mm(self.encoder_matrix.t()))
101 | return _cat(latent)
102 |
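# A minimal usage sketch (illustrative only): it assumes DataLoaders that yield
# (x, y) pairs of instantaneous and time-lagged frames, as the rest of this
# package constructs them, and example parameter values.
#
#     pca = PCA()
#     train_loss, test_loss = pca.fit(train_loader, dim=2, test_loader=test_loader)
#     latent = pca.transform(train_loader)
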
103 | ################################################################################
104 | #
105 | # TICA
106 | #
107 | ################################################################################
108 |
109 | class TICA(object):
110 | '''Perform a time-lagged independent component analysis for
111 | dimensionality reduction.
112 |
113 | We compute a rank-d approximation to the Koopman operator and use it to
114 | rotate/project the data into a lower dimensional subspace.
115 |
116 | Arguments:
117 | kinetic_map (boolean): use the kinetic map variant of TICA
118 | symmetrize (boolean): enforce symmetry and reversibility
119 | '''
120 | def __init__(self, kinetic_map=True, symmetrize=False):
121 | self.loss_function = _nn.MSELoss(size_average=False)
122 | self.kinetic_map = kinetic_map
123 | self.symmetrize = symmetrize
124 | def get_loss(self, loader):
125 |         '''Compute the loss on the provided data loader.
126 |
127 | Arguments:
128 | loader (DataLoader): the data for loss calculation
129 | '''
130 | if loader is None:
131 | return None
132 | loss = 0.0
133 | for x, y in loader:
134 | x, y = self.transformer(x, y)
135 | loss += self.loss_function(x.mm(self.koopman_matrix), y).item()
136 | return loss / float(len(loader.dataset))
137 | def fit(self, train_loader, dim=None, test_loader=None):
138 | '''Train the model on the provided data loader.
139 |
140 | Arguments:
141 | train_loader (DataLoader): the training data
142 | dim (int): the target dimensionality
143 | test_loader (DataLoader): the data for validation
144 | '''
145 | self.x_mean, self.y_mean = _get_mean(train_loader)
146 | self.cxx, self.cxy, self.cyy = _get_covariance(
147 | train_loader, self.x_mean, self.y_mean)
148 | if self.symmetrize:
149 | self.cxx = 0.5 * (self.cxx + self.cyy)
150 | self.cyy.copy_(self.cxx)
151 | self.cxy = 0.5 * (self.cxy + self.cxy.t())
152 | self.transformer = _Transform(
153 | x_mean=self.x_mean, x_covariance=self.cxx,
154 | y_mean=self.y_mean, y_covariance=self.cyy)
155 | self.ixx = self.transformer.x.mul
156 | self.iyy = self.transformer.y.mul
157 | u, s, v = _svd(self.ixx.mm(self.cxy.mm(self.iyy)))
158 | if dim is None:
159 | dim = s.size()[0]
160 | self.decoder_matrix = v[:, :dim]
161 | self.encoder_matrix = u.t()[:dim, :]
162 | if self.kinetic_map:
163 | self.encoder_matrix = _diag(s[:dim]).mm(self.encoder_matrix)
164 | else:
165 | self.decoder_matrix = self.decoder_matrix.mm(_diag(s[:dim]))
166 | self.koopman_matrix = self.decoder_matrix.mm(self.encoder_matrix)
167 | return self.get_loss(train_loader), self.get_loss(test_loader)
168 | def transform(self, loader):
169 | '''Apply the model on the provided data loader.
170 |
171 | Arguments:
172 | loader (DataLoader): the data you wish to transform
173 | '''
174 | latent = []
175 | for x, _ in loader:
176 | x = self.transformer.x(x)
177 | latent.append(x.mm(self.encoder_matrix.t()))
178 | return _cat(latent)
179 |
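# The fit() above follows the usual whitened construction: the time-lagged
# covariance is mapped to K = Cxx^(-1/2) Cxy Cyy^(-1/2) and decomposed by SVD;
# with kinetic_map=True the encoder is additionally scaled by the singular values.
# Illustrative usage, with the same DataLoader conventions as for PCA:
#
#     tica = TICA(kinetic_map=True)
#     train_loss, test_loss = tica.fit(train_loader, dim=2)
#     latent = tica.transform(train_loader)
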
180 | ################################################################################
181 | #
182 | # AUTOENCODER BASE CLASS
183 | #
184 | ################################################################################
185 |
186 | class BaseNet(_nn.Module):
187 |     '''Base class for pytorch neural network models for dimension reduction.
188 |
189 | The BaseNet is the basis of more specialised dimension reduction networks
190 | and provides the full infrastructure for the setup and training process.
191 |
192 | Arguments:
193 | inp_size (int): dimensionality of the full space
194 | lat_size (int): dimensionality of the desired latent space
195 | hid_size (sequence of int): sizes of the hidden layers
196 |         normalize_batch (boolean): normalize the loss over batches instead of samples
197 | dropout (Dropout): dropout layer for each hidden layer
198 |         alpha (float): activation parameter for the rectified linear units
199 |         prelu (bool): use a learnable ReLU
200 | bias (boolean): specify usage of bias neurons
201 | lr (float): learning rate parameter for Adam
202 | cuda (boolean): use the GPU
203 |         non_blocking (boolean): use asynchronous mode (GPU only)
204 | '''
205 | def __init__(
206 | self, inp_size, lat_size, hid_size, normalize_batch,
207 | dropout, alpha, prelu, bias, lr, cuda, non_blocking):
208 | super(BaseNet, self).__init__()
209 | sizes = [inp_size] + list(hid_size) + [lat_size]
210 | self._last = len(sizes) - 2
211 | if isinstance(dropout, float):
212 | dropout = _nn.Dropout(p=dropout)
213 | self._setup(sizes, bias, alpha, prelu, dropout)
214 | self.optimizer = _optim.Adam(self.parameters(), lr=lr)
215 | self.normalize_batch = normalize_batch
216 | self.non_blocking = non_blocking
217 | if cuda:
218 | self.use_cuda = True
219 |             self.cuda()  # Module.cuda() does not accept the non_blocking=... parameter here
220 | else:
221 | self.use_cuda = False
222 | def _setup(self, sizes, bias, alpha, prelu, dropout):
223 | '''Implement this in your derived class to create the necessary
224 | layers.
225 | '''
226 | def _create_activation(self, key, idx, alpha, prelu, suffix=''):
227 | '''Helper function to create activations and initialize parameters.'''
228 | if alpha is None:
229 | activation = None
230 | elif alpha < 0.0:
231 | raise ValueError('alpha must be a non-negative number')
232 | elif alpha == 0.0:
233 | activation = _nn.ReLU()
234 | elif prelu:
235 | activation = _nn.PReLU(num_parameters=1, init=alpha)
236 | else:
237 | activation = _nn.LeakyReLU(negative_slope=alpha)
238 | if activation is not None:
239 | setattr(self, key + '_act_%d%s' % (idx, suffix), activation)
240 | layer = getattr(self, key + '_prm_%d%s' % (idx, suffix))
241 | _nn.init.kaiming_normal_(layer.weight.data, a=alpha, mode='fan_in')
242 | try:
243 | layer.bias.data.uniform_(0.0, 0.1)
244 | except AttributeError:
245 | pass
246 | def _try_to_apply_module(self, key, value):
247 | '''Helper function to safely apply a module within the network.'''
248 | try:
249 | return getattr(self, key)(value)
250 | except AttributeError:
251 | return value
252 | def _apply_layer(self, key, idx, value):
253 | '''Helper function to safely apply a layer (module sequence) within
254 | the network.
255 | '''
256 | return self._try_to_apply_module(
257 | key + '_drp_%d' % idx, self._try_to_apply_module(
258 | key + '_act_%d' % idx, self._try_to_apply_module(
259 | key + '_prm_%d' % idx, value)))
260 | def forward_and_apply_loss_function(self, x, y):
261 | '''Implement this in your derived class'''
262 | raise NotImplementedError('Implement in child class')
263 | def train_step(self, loader):
264 | '''A single training epoch.'''
265 | self.train()
266 | train_loss = 0
267 | for x, y in loader:
268 | x, y = self.transformer(x, y)
269 | if self.use_cuda:
270 | x = x.cuda(non_blocking=self.non_blocking)
271 | y = y.cuda(non_blocking=self.non_blocking)
272 | self.optimizer.zero_grad()
273 | loss = self.forward_and_apply_loss_function(x, y)
274 | loss.backward()
275 | train_loss += loss.item()
276 | self.optimizer.step()
277 | if self.normalize_batch:
278 | return train_loss / float(len(loader))
279 | return train_loss / float(len(loader.dataset))
280 | def test_step(self, loader):
281 | '''A single validation epoch'''
282 | self.eval()
283 | test_loss = 0
284 | if loader is None:
285 | return None
286 | for x, y in loader:
287 | x, y = self.transformer(x, y)
288 | if self.use_cuda:
289 | x = x.cuda(non_blocking=self.non_blocking)
290 | y = y.cuda(non_blocking=self.non_blocking)
291 | test_loss += self.forward_and_apply_loss_function(x, y).item()
292 | if self.normalize_batch:
293 | return test_loss / float(len(loader))
294 | return test_loss / float(len(loader.dataset))
295 | def fit(self, train_loader, n_epochs, test_loader=None):
296 | '''Train the model on the provided data loader.
297 |
298 | Arguments:
299 | train_loader (DataLoader): the training data
300 | n_epochs (int): number of training epochs
301 | test_loader (DataLoader): the data for validation
302 | '''
303 | x_mean, y_mean = _get_mean(train_loader)
304 | cxx, cxy, cyy = _get_covariance(train_loader, x_mean, y_mean)
305 | self.transformer = _Transform(
306 | x_mean=x_mean, x_covariance=cxx, y_mean=y_mean, y_covariance=cyy)
307 | train_loss, test_loss = [], []
308 | for epoch in range(n_epochs):
309 | train_loss.append(
310 | self.train_step(
311 | train_loader))
312 | with _no_grad():
313 | test_loss.append(
314 | self.test_step(test_loader))
315 | return train_loss, test_loss
316 | def transform(self, loader):
317 | '''Apply the model on the provided data loader.
318 |
319 | Arguments:
320 | loader (DataLoader): the data you wish to transform
321 | '''
322 | self.eval()
323 | latent = []
324 | for x, _ in loader:
325 | x = self.transformer.x(x)
326 | if self.use_cuda:
327 | x = x.cuda(non_blocking=self.non_blocking)
328 | y = self.encode(x)
329 |             if self.use_cuda:
330 | y = y.cpu()
331 | latent.append(y)
332 | return _cat(latent).data
333 |
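# Note on the training loop above: fit() expects DataLoaders yielding (x, y)
# batches where y is the time-lagged counterpart of x; both are passed through
# the Transform built from the training means and covariances before reaching
# the network, and the summed loss is normalized per batch or per sample
# depending on normalize_batch.
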
334 | ################################################################################
335 | #
336 | # AUTOENCODER
337 | #
338 | ################################################################################
339 |
340 | class AE(BaseNet):
341 | '''Use a time-lagged autoencoder model for dimensionality reduction.
342 |
343 | We train a time-lagged autoencoder type neural network.
344 |
345 | Arguments:
346 | inp_size (int): dimensionality of the full space
347 | lat_size (int): dimensionality of the desired latent space
348 | hid_size (sequence of int): sizes of the hidden layers
349 | dropout (Dropout): dropout layer for each hidden layer
350 |         alpha (float): activation parameter for the rectified linear units
351 |         prelu (bool): use a learnable ReLU
352 | bias (boolean): specify usage of bias neurons
353 | lr (float): learning rate parameter for Adam
354 | cuda (boolean): use the GPU
355 | '''
356 | def __init__(
357 | self, inp_size, lat_size, hid_size=[],
358 | dropout=0.5, alpha=0.01, prelu=False,
359 | bias=True, lr=0.001, cuda=False, non_blocking=False):
360 | super(AE, self).__init__(
361 | inp_size, lat_size, hid_size, False,
362 | dropout, alpha, prelu, bias, lr, cuda, non_blocking)
363 | self._mse_loss_function = _nn.MSELoss(size_average=False)
364 | def _setup(self, sizes, bias, alpha, prelu, dropout):
365 |         '''Helper function to create all necessary layers.'''
366 | for c, idx in enumerate(range(1, len(sizes))):
367 | setattr(
368 | self,
369 | 'enc_prm_%d' % c,
370 | _nn.Linear(sizes[idx - 1], sizes[idx], bias=bias))
371 | self._create_activation('enc', c, alpha, prelu)
372 | if c < self._last:
373 | if dropout is not None:
374 | setattr(self, 'enc_drp_%d' % c, dropout)
375 | for c, idx in enumerate(reversed(range(1, len(sizes)))):
376 | setattr(
377 | self,
378 | 'dec_prm_%d' % c,
379 | _nn.Linear(sizes[idx], sizes[idx - 1], bias=bias))
380 | if c < self._last:
381 | self._create_activation('dec', c, alpha, prelu)
382 | if dropout is not None:
383 | setattr(self, 'dec_drp_%d' % c, dropout)
384 | else:
385 | self._create_activation('dec', c, None, None)
386 | def forward_and_apply_loss_function(self, x, y):
387 | '''Helper function to feed data through the network and compute the
388 | desired loss.
389 | '''
390 | return self._mse_loss_function(self(x), y)
391 | def encode(self, x):
392 | '''Encode the given input.'''
393 | y = x
394 | for idx in range(self._last):
395 | y = self._apply_layer('enc', idx, y)
396 | return getattr(self, 'enc_prm_%d' % self._last)(y)
397 | def decode(self, z):
398 | '''Decode the given input.'''
399 | y = self._try_to_apply_module('enc_act_%d' % self._last, z)
400 | for idx in range(self._last):
401 | y = self._apply_layer('dec', idx, y)
402 | return getattr(self, 'dec_prm_%d' % self._last)(y)
403 | def forward(self, x):
404 | '''Forward the given input through the network.'''
405 | return self.decode(self.encode(x))
406 |
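# A minimal time-lagged autoencoder sketch (illustrative parameter values only):
#
#     ae = AE(inp_size=30, lat_size=2, hid_size=[50, 50], dropout=0.5, lr=1e-3)
#     train_loss, test_loss = ae.fit(train_loader, n_epochs=50, test_loader=test_loader)
#     latent = ae.transform(train_loader)
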
407 | ################################################################################
408 | #
409 | # VARIATIONAL AUTOENCODER
410 | #
411 | ################################################################################
412 |
413 | class VAE(BaseNet):
414 | '''Use a time-lagged variational autoencoder model for dimensionality
415 | reduction.
416 |
417 | We train a time-lagged variational autoencoder type neural network.
418 |
419 | Arguments:
420 | inp_size (int): dimensionality of the full space
421 | lat_size (int): dimensionality of the desired latent space
422 | hid_size (sequence of int): sizes of the hidden layers
423 |         beta (float): weight of the KLD term in the loss
424 | dropout (Dropout): dropout layer for each hidden layer
425 |         alpha (float): activation parameter for the rectified linear units
426 |         prelu (bool): use a learnable ReLU
427 | bias (boolean): specify usage of bias neurons
428 | lr (float): learning rate parameter for Adam
429 | cuda (boolean): use the GPU
430 | '''
431 | def __init__(
432 | self, inp_size, lat_size, hid_size=[], beta=1.0,
433 | dropout=0.5, alpha=0.01, prelu=False,
434 | bias=True, lr=0.001, cuda=False, non_blocking=False):
435 | super(VAE, self).__init__(
436 | inp_size, lat_size, hid_size, False,
437 | dropout, alpha, prelu, bias, lr, cuda, non_blocking)
438 | self.beta = beta
439 | self._mse_loss_function = _nn.MSELoss(size_average=False)
440 | def _setup(self, sizes, bias, alpha, prelu, dropout):
441 |         '''Helper function to create all necessary layers.'''
442 | for c, idx in enumerate(range(1, len(sizes) - 1)):
443 | setattr(
444 | self,
445 | 'enc_prm_%d' % c,
446 | _nn.Linear(sizes[idx - 1], sizes[idx], bias=bias))
447 | self._create_activation('enc', c, alpha, prelu)
448 | if dropout is not None:
449 | setattr(self, 'enc_drp_%d' % c, dropout)
450 | setattr(
451 | self,
452 | 'enc_prm_%d_mu' % self._last,
453 | _nn.Linear(sizes[-2], sizes[-1], bias=bias))
454 | self._create_activation('enc', self._last, None, None, suffix='_mu')
455 | setattr(
456 | self,
457 | 'enc_prm_%d_lv' % self._last,
458 | _nn.Linear(sizes[-2], sizes[-1], bias=bias))
459 | self._create_activation('enc', self._last, None, None, suffix='_lv')
460 | for c, idx in enumerate(reversed(range(1, len(sizes)))):
461 | setattr(
462 | self,
463 | 'dec_prm_%d' % c,
464 | _nn.Linear(sizes[idx], sizes[idx - 1], bias=bias))
465 | if c < self._last:
466 | self._create_activation('dec', c, alpha, prelu)
467 | if dropout is not None:
468 | setattr(self, 'dec_drp_%d' % c, dropout)
469 | else:
470 | self._create_activation('dec', c, None, None)
471 | def forward_and_apply_loss_function(self, x, y):
472 | '''Helper function to feed data through the network and compute the
473 | desired loss.
474 | '''
475 | y_recon, mu, lv = self(x)
476 | mse = self._mse_loss_function(y_recon, y)
477 | kld = -0.5 * _sum(1.0 + lv - mu.pow(2) - lv.exp())
478 | return mse + self.beta * kld / float(y.size(1))
479 | def _encode(self, x):
480 | '''Encode the given input.'''
481 | y = x
482 | for idx in range(self._last):
483 | y = self._apply_layer('enc', idx, y)
484 | mu = getattr(self, 'enc_prm_%d_mu' % self._last)(y)
485 | lv = getattr(self, 'enc_prm_%d_lv' % self._last)(y)
486 | return mu, lv
487 | def _reparameterize(self, mu, lv):
488 | '''Reparametrize the given input.'''
489 | if self.training:
490 | std = lv.mul(0.5).exp_()
491 | eps = _randn(*std.size())
492 | if self.use_cuda:
493 | eps = eps.cuda()
494 | return eps.mul(std).add_(mu)
495 | else:
496 | return mu
497 | def encode(self, x):
498 | '''Encode/reparametrize the given input.'''
499 | return self._reparameterize(*self._encode(x))
500 | def decode(self, z):
501 | '''Decode the given input.'''
502 | y = z
503 | for idx in range(self._last):
504 | y = self._apply_layer('dec', idx, y)
505 | return getattr(self, 'dec_prm_%d' % self._last)(y)
506 | def forward(self, x):
507 | '''Forward the given input through the network.'''
508 | mu, lv = self._encode(x)
509 | return self.decode(self._reparameterize(mu, lv)), mu, lv
510 |
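# During training the VAE samples the latent code via the reparameterization
# trick, z = mu + eps * exp(lv / 2) with eps ~ N(0, I), and the loss adds the
# beta-weighted KL divergence -0.5 * sum(1 + lv - mu^2 - exp(lv)) to the MSE
# reconstruction error; in evaluation mode encode() simply returns mu.
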
511 | ################################################################################
512 | #
513 | # VAMPNET WORK IN PROGRESS
514 | #
515 | ################################################################################
516 |
517 | class DecomposeRSPDMatrix(_Function):
518 | @staticmethod
519 | def forward(ctx, matrix):
520 | eigval, eigvec = _symeig(matrix, eigenvectors=True)
521 | eigval = _abs(eigval) + 1e-10
522 | ctx.eigval = eigval
523 | ctx.eigvec = eigvec
524 | return eigval, eigvec
525 | @staticmethod
526 | def backward(ctx, dval, dvec):
527 | eigval = ctx.eigval
528 | eigvec = ctx.eigvec
529 | n = len(eigval)
530 | eigval_dist = eigval[:, None] - eigval[None, :]
531 | idx = _arange(n).long().tolist()
532 | eigval_dist[idx, idx] = 1.0
533 | dval_out = eigvec[:, None, :] * eigvec[None, :, :]
534 | dvec_out = _zeros(n, n, n, n).type(eigval.type())
535 | omega = _zeros(n, n).type(eigval.type())
536 | for i in range(n):
537 | for j in range(n):
538 | omega[:, :] = eigvec[i, :, None] * eigvec[j, None, :]
539 | omega[idx, idx] = 0.0
540 | omega.div_(eigval_dist)
541 | dvec_out[i, j, :, :] = -_mm(eigvec, omega)
542 | dval = _sum(dval[None, None, :] * dval_out, -1)
543 | dvec = _sum(_sum(dvec[None, None, :, :] * dvec_out, -1), -1)
544 | return dval + dvec
545 |
546 | def covar(x, y):
547 | return _mm(x.t(), y).div_(len(x))
548 |
549 | def sqrtinv(matrix):
550 | eigval, eigvec = DecomposeRSPDMatrix.apply(matrix)
551 | diag = _diag(1.0 / _sqrt(eigval))
552 | return _mm(eigvec, _mm(diag, eigvec.t()))
553 |
554 | def get_koopman_matrix(x, y):
555 | ixx = sqrtinv(covar(x, x))
556 | iyy = sqrtinv(covar(y, y))
557 | cxy = covar(x, y)
558 | kmm = _mm(ixx, _mm(cxy, iyy))
559 | return kmm.t()
560 |
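# get_koopman_matrix() returns the (transposed) half-weighted Koopman matrix
# K = Cxx^(-1/2) Cxy Cyy^(-1/2) computed from the encoded batches; VAMPNet
# trains by maximizing its squared Frobenius norm (the VAMP-2 score), which is
# why forward_and_apply_loss_function() returns the negative of that sum.
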
561 | class VAMPNet(BaseNet):
562 | def __init__(
563 | self, inp_size, lat_size, hid_size=[],
564 | dropout=0.5, alpha=0.01, prelu=False,
565 | bias=True, lr=0.001, cuda=False, non_blocking=False):
566 | super(VAMPNet, self).__init__(
567 | inp_size, lat_size, hid_size, True,
568 | dropout, alpha, prelu, bias, lr, cuda, non_blocking)
569 | def _setup(self, sizes, bias, alpha, prelu, dropout):
570 | for c, idx in enumerate(range(1, len(sizes))):
571 | setattr(
572 | self,
573 | 'enc_prm_%d' % c,
574 | _nn.Linear(sizes[idx - 1], sizes[idx], bias=bias))
575 | if c < self._last:
576 | self._create_activation('enc', c, alpha, prelu)
577 | if dropout is not None:
578 | setattr(self, 'enc_drp_%d' % c, dropout)
579 | self._create_activation('enc', self._last, None, None)
580 | self.max = _nn.Softmax(dim=1)
581 | def forward_and_apply_loss_function(self, x, y):
582 | koopman = self(x, y)
583 | return -_sum(koopman**2)
584 | def encode(self, x):
585 | y = x
586 | for idx in range(self._last):
587 | y = self._apply_layer('enc', idx, y)
588 | y = getattr(self, 'enc_prm_%d' % self._last)(y)
589 | return self.max(y)
590 | def forward(self, x, y):
591 | x_enc = self.encode(x)
592 | y_enc = self.encode(y)
593 | return get_koopman_matrix(x_enc, y_enc)
594 |
--------------------------------------------------------------------------------