├── .git_archival.txt ├── .gitattributes ├── time-lagged-autoencoder ├── tae │ ├── test │ │ ├── __init__.py │ │ ├── test_toymodels.py │ │ ├── test_models.py │ │ ├── test_api.py │ │ └── test_utils.py │ ├── __init__.py │ ├── toymodels.py │ ├── api.py │ ├── utils.py │ ├── benchmarks.py │ └── models.py ├── setup.cfg ├── setup.py └── README.md ├── README.md ├── docs └── wishlist.md ├── vampnet ├── vampnet │ ├── __init__.py │ └── data_generator.py ├── setup.py ├── README.md └── examples │ ├── 1D_double_well.ipynb │ ├── Folding.ipynb │ ├── Alanine_dipeptide.ipynb │ └── Alanine_dipeptide_multiple_files.ipynb ├── .gitignore └── LICENSE /.git_archival.txt: -------------------------------------------------------------------------------- 1 | ref-names: HEAD -> master -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | .git_archival.txt export-subst -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/setup.cfg: -------------------------------------------------------------------------------- 1 | [alias] 2 | test=pytest -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deeptime 2 | Deep learning meets molecular dynamics. 3 | 4 | ## Contents 5 | 6 | - **time-lagged-autoencoder**: a toolbox for dimension reduction of time series data with a [time-lagged autoencoder](https://aip.scitation.org/doi/full/10.1063/1.5011399)-type deep neural network. 7 | - **vampnet**: Variational Approach for Markov Processes networks, see https://www.nature.com/articles/s41467-017-02388-1 8 | -------------------------------------------------------------------------------- /docs/wishlist.md: -------------------------------------------------------------------------------- 1 | # General 2 | - want to be able to fit batch-wise, either by providing a numpy-style array (so everything fits into memory) or by providing a generator function 3 | - which framework is used should be decided based on what is in the environment, or based on a user configuration if both frameworks are available 4 | # Top level 5 | - it should be invisible to the user which NN framework is used 6 | - have Models `TAE` and `VAMPNet` which can be "trained" (layer sizes, dropout, batch size, learning rate, activation functions, etc.) 7 | - Trained models can be `fit`-ted on data 8 | # Mid level 9 | - abstraction layer between the actual NN-framework implementation and the top layer 10 | # Low level 11 | - specialization toward pytorch / TF as implementation of the abstraction layer 12 | - smaller dispatch interfaces separated through namespaces, e.g., 13 | ```python 14 | deeptime.scores.tf.vamp 15 | deeptime.scores.torch.vamp 16 | ``` -------------------------------------------------------------------------------- /vampnet/vampnet/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | from pkg_resources import get_distribution, DistributionNotFound 19 | try: 20 | __version__ = get_distribution(__name__).version 21 | except DistributionNotFound: 22 | __version__ = 'x.y.z' 23 | del get_distribution, DistributionNotFound 24 | 25 | __author__ = 'Andreas Mardt, Luca Pasquali' 26 | __email__ = 'andreas.mardt@fu-berlin.de, luca.pasquali@fu-berlin.de' 27 | 28 | from .vampnet import VampnetTools 29 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | A toolbox for dimension reduction of time series data with a 20 | time-lagged autoencoder. 21 | ''' 22 | 23 | from pkg_resources import get_distribution, DistributionNotFound 24 | try: 25 | __version__ = get_distribution(__name__).version 26 | except DistributionNotFound: 27 | __version__ = 'x.y.z' 28 | del get_distribution, DistributionNotFound 29 | 30 | __author__ = 'Christoph Wehmeyer' 31 | __email__ = 'christoph.wehmeyer@fu-berlin.de' 32 | 33 | from .api import pca, tica, ae, vae, vampnet 34 | from .models import PCA, TICA, AE, VAE, VAMPNet 35 | from . import utils 36 | from . import toymodels 37 | -------------------------------------------------------------------------------- /vampnet/setup.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | from setuptools import setup, find_packages 19 | 20 | description = ''' 21 | Collection of functions to implement neural networks based 22 | on the variational approach for Markov processes, 23 | as described in https://arxiv.org/abs/1710.06012 24 | ''' 25 | 26 | setup( 27 | use_scm_version=dict(root='..', relative_to=__file__), 28 | name='vampnet', 29 | author='Andreas Mardt, Luca Pasquali', 30 | author_email='andreas.mardt@fu-berlin.de, luca.pasquali@fu-berlin.de', 31 | url='https://github.com/markovmodel/deeptime/tree/master/vampnet', 32 | description=description, 33 | packages=find_packages(), 34 | setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'], 35 | install_requires=[ 36 | 'numpy', 37 | 'scipy', 38 | 'matplotlib'], 39 | zip_safe=False) 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | Untitled*.ipynb 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | .idea/ 104 | -------------------------------------------------------------------------------- /vampnet/README.md: -------------------------------------------------------------------------------- 1 | # VAMPnet 2 | Variational Approach for Markov Processes networks. 3 | 4 | 5 | ## What is it? 6 | VAMPnet is an open source Python package for the implementation of the VAMPnet method for dynamical systems analysis (described in https://www.nature.com/articles/s41467-017-02388-1). 
It includes loss functions, metrics, basic estimators for Koopman operators, and the most important validation tools for Koopman models. 7 | 8 | VAMPnet can be used from Jupyter (formerly IPython, recommended) or by 9 | writing Python scripts. 10 | 11 | 12 | ## Citation 13 | If you use VAMPnet in scientific work, please cite: 14 | 15 |     Mardt, A., Pasquali, L., Wu, H., & Noé, F. (2018). 16 |     VAMPnets for deep learning of molecular kinetics. 17 |     Nature communications, 9(1), 5. 18 | 19 | ## Installation 20 | 21 | IMPORTANT: Tensorflow 1.7 and 1.8 have an unresolved issue that causes the 22 | eigenvalue decomposition to fail. This issue does not occur on TF 1.4-1.6 23 | and 1.9+, so please use one of those releases instead. 24 | 25 | This package requires [Tensorflow](https://www.tensorflow.org). 26 | Please install either tensorflow or tensorflow-gpu. Installation instructions: 27 | 28 | https://www.tensorflow.org/install/ 29 | 30 | To install this package, first clone the repository: 31 | 32 |     git clone https://github.com/markovmodel/deeptime.git 33 | 34 | Then install it with: 35 | 36 | ```bash 37 | python setup.py install 38 | ``` 39 | 40 | The examples are Jupyter notebooks, so the jupyter package is needed to run them: 41 | 42 | http://jupyter.readthedocs.io/en/latest/install.html 43 | 44 | This is not needed if you only want to use the package. 45 | 46 | 47 | If you want to run the alanine dipeptide example, you'll also need to install the mdshare package (needed to download the trajectory files): 48 | 49 |     git clone https://github.com/markovmodel/mdshare.git 50 |     pip install ./mdshare 51 | 52 | or 53 | 54 |     conda install mdshare -c conda-forge 55 | 56 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/setup.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | from setuptools import setup, find_packages 19 | from setuptools.command.test import test as TestCommand 20 | import sys 21 | 22 | try: 23 |     import torch 24 | except ImportError: 25 |     # setup.py forces pytorch installation via pip and ignores an existing 26 |     # conda installation. That's why we catch this here... 
27 | print( 28 | 'Please install pytorch >=0.2.0_4 according to the instructions on ' 29 | 'http://pytorch.org before you continue!') 30 | sys.exit(1) 31 | 32 | class PyTest(TestCommand): 33 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 34 | def initialize_options(self): 35 | TestCommand.initialize_options(self) 36 | self.pytest_args = ['tae'] 37 | def run_tests(self): 38 | import pytest 39 | errno = pytest.main(self.pytest_args) 40 | sys.exit(errno) 41 | 42 | setup( 43 | cmdclass={'test': PyTest}, 44 | use_scm_version=dict(root='..', relative_to=__file__), 45 | name='tae', 46 | author='Christoph Wehmeyer', 47 | author_email='christoph.wehmeyer@fu-berlin.de', 48 | url='https://github.com/markovmodel/deeptime/tree/master/time-lagged-autoencoder', 49 | description='A toolbox for dimension reduction of time series data with a time-lagged autoencoder.', 50 | packages=find_packages(), 51 | setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'], 52 | install_requires=['numpy'], 53 | tests_require=['pytest'], 54 | zip_safe=False) 55 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_toymodels.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | import numpy as np 19 | from ..toymodels import sample_hmm 20 | from ..toymodels import sample_sqrt_model 21 | from ..toymodels import sample_swissroll_model 22 | 23 | def run_sample_hmm(ndim, nstates): 24 |     length = 10000 25 |     states = [np.random.randn(ndim) for i in range(nstates)] 26 |     cov = np.random.rand(ndim, ndim) 27 |     cov = np.matmul(cov.T, cov) 28 |     transition_matrix = np.random.rand(nstates, nstates) 29 |     transition_matrix = transition_matrix + transition_matrix.T 30 |     transition_matrix /= transition_matrix.sum() 31 |     pi = transition_matrix.sum(axis=1) 32 |     transition_matrix /= pi[:, None] 33 |     traj, dtraj = sample_hmm(length, cov, states, transition_matrix) 34 |     sets = [np.where(dtraj == state)[0] for state in range(nstates)] 35 |     np.testing.assert_allclose( 36 |         [float(len(s)) / float(length) for s in sets], 37 |         pi, atol=0.1) 38 |     for i, s in enumerate(sets): 39 |         mean = np.mean(traj[s, :], axis=0) 40 |         np.testing.assert_allclose( 41 |             mean, states[i], atol=0.2) 42 |         traj[s, :] -= mean 43 |     np.testing.assert_allclose(np.cov(traj.T), cov, atol=0.2) 44 | 45 | def test_sample_hmm_random(): 46 |     for _ in range(3): 47 |         ndim = np.random.randint(low=2, high=5) 48 |         nstates = np.random.randint(low=2, high=5) 49 |         run_sample_hmm(ndim, nstates) 50 | 51 | def test_sample_sqrt_model(): 52 |     traj, dtraj = sample_sqrt_model(20000) 53 |     np.testing.assert_allclose( 54 |         np.mean(traj, axis=0), [0.0, 1.9], atol=0.2) 55 |     np.testing.assert_allclose( 56 |         np.std(traj, axis=0, ddof=1), [5.5, 1.3], atol=0.2) 57 | 58 | def test_sample_swissroll_model(): 59 |     traj, dtraj = sample_swissroll_model(20000) 60 |     np.testing.assert_allclose( 61 |         np.mean(traj, axis=0), [-3.1, 11.2, 4.9], atol=1.0) 62 |     np.testing.assert_allclose( 63 |         np.std(traj, axis=0, ddof=1), [7.9, 3.8, 6.7], atol=0.4) 64 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/README.md: -------------------------------------------------------------------------------- 1 | # time-lagged autoencoder 2 | 3 | A toolbox for dimension reduction of time series data with a [time-lagged autoencoder](https://arxiv.org/abs/1710.11239)-type deep neural network. 4 | 5 | ## Installation 6 | Make sure to install pytorch via conda (instructions on http://pytorch.org) before you install the present module with 7 | 8 | ```bash 9 | python setup.py test 10 | python setup.py install 11 | ``` 12 | 13 | To run the included benchmarks, you also need to install the packages [pyemma](https://github.com/markovmodel/pyemma) and [mdshare](https://github.com/markovmodel/mdshare). 14 | 15 | ## Methods 16 | This package implements 17 | - principal component analysis (PCA), 18 | - time-lagged independent component analysis (TICA), 19 | - time-lagged canonical correlation analysis (via TICA), 20 | - kinetic maps (via TICA), and 21 | - an autoencoder-type neural network (AE) trained in a time-lagged manner. 22 | 23 | ## Example 24 | Assume that `data` is a single `numpy.ndarray(shape=[n_frames, n_features])` object or a list thereof, where `n_frames` refers to the number of timesteps in each trajectory and `n_features` refers to the number of features extracted from the original molecular dynamics (MD) data. 
Now choose a target dimensionality `dim` and a transformation lag time `lag`, and run: 25 | 26 | ```python 27 | import tae 28 | 29 | # run PCA 30 | pca_transformed_data, pca_train_loss, pca_val_loss = tae.pca(data, dim=dim) 31 | 32 | # run TICA 33 | tica_transformed_data, tica_train_loss, tica_val_loss = tae.tica(data, dim=dim, lag=lag) 34 | 35 | # run AE 36 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae(data, dim=dim, lag=lag) 37 | 38 | # run VAE 39 | vae_transformed_data, vae_train_loss, vae_val_loss = tae.vae(data, dim=dim, lag=lag) 40 | 41 | # run AE on a GPU 42 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae(data, dim=dim, lag=lag, cuda=True) 43 | ``` 44 | 45 | In this example, we get `*_val_loss=None` because we are training on the full data set. To exclude a randomly chosen fraction `fval` of the data from the training, add the parameter `validation_split=fval` to the function calls, e.g.: 46 | 47 | ```python 48 | ae_transformed_data, ae_train_loss, ae_val_loss = tae.ae( 49 | data, dim=dim, lag=lag, validation_split=fval, cuda=True) 50 | ``` 51 | 52 | ## Citation 53 | ``` 54 | @article{time-lagged-autoencoder, 55 | Author = {Christoph Wehmeyer and Frank No{\'{e}}}, 56 | Doi = {10.1063/1.5011399}, 57 | Journal = {J. Chem. Phys.}, 58 | Month = {jun}, 59 | Number = {24}, 60 | Pages = {241703}, 61 | Publisher = {{AIP} Publishing}, 62 | Title = {Time-lagged autoencoders: Deep learning of slow collective variables for molecular kinetics}, 63 | Volume = {148}, 64 | Year = {2018}} 65 | ``` 66 | 67 | ## Development system 68 | This project was developed using the following python environment: 69 | 70 | | package | version | | channel | 71 | |:---|:---|:---|:---| 72 | | python | 3.6.1 | 2 | | 73 | | conda | 4.3.29 | py36_0 | conda-forge | 74 | | numpy | 1.13.3 | py36_blas_openblas_200 [blas_openblas] | conda-forge | 75 | | pytorch | 0.2.0 | py36_4cu75 | soumith | 76 | | pyemma | 2.4 | np113py36_1 | conda-forge | 77 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_models.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | import numpy as np 19 | from torch.utils.data import DataLoader 20 | from ..utils import create_dataset 21 | from ..utils import whiten_data 22 | from ..models import PCA 23 | from ..models import TICA 24 | from ..models import AE 25 | 26 | def generate_data_2state_hmm(length=10000, lag=0, batch_size=100): 27 | transition_matrix = np.asarray([[0.9, 0.1], [0.1, 0.9]]) 28 | dtraj = np.zeros(shape=(length,), dtype=np.intc) 29 | for i in range(1, length): 30 | dtraj[i] = np.random.choice( 31 | 2, size=1, p=transition_matrix[dtraj[i - 1], :]) 32 | traj = np.random.randn(len(dtraj)) 33 | traj[np.where(dtraj == 1)[0]] += 2.0 34 | traj_stacked = np.vstack((traj, np.zeros(len(traj)))) 35 | phi = np.random.rand() * 2.0 * np.pi 36 | rot = np.asarray([ 37 | [np.cos(phi), -np.sin(phi)], 38 | [np.sin(phi), np.cos(phi)]]) 39 | traj_rot = np.dot(rot, traj_stacked).T 40 | return traj, \ 41 | DataLoader( 42 | create_dataset(traj_rot, lag=lag), 43 | batch_size=batch_size, 44 | shuffle=True), \ 45 | DataLoader( 46 | create_dataset(traj_rot, lag=0), 47 | batch_size=batch_size) 48 | 49 | ################################################################################ 50 | # 51 | # PCA 52 | # 53 | ################################################################################ 54 | 55 | def test_pca_2state_hmm(): 56 | traj, train_loader, transform_loader = generate_data_2state_hmm() 57 | pca = PCA() 58 | pca.fit(train_loader, dim=1) 59 | out = whiten_data(pca.transform(transform_loader)).numpy().reshape((-1,)) 60 | traj -= np.mean(traj) 61 | traj /= np.std(traj, ddof=1) 62 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001) 63 | 64 | ################################################################################ 65 | # 66 | # TICA 67 | # 68 | ################################################################################ 69 | 70 | def test_tica_2state_hmm(): 71 | traj, train_loader, transform_loader = generate_data_2state_hmm(lag=1) 72 | tica = TICA() 73 | tica.fit(train_loader, dim=1) 74 | out = whiten_data(tica.transform(transform_loader)).numpy().reshape((-1,)) 75 | traj -= np.mean(traj) 76 | traj /= np.std(traj, ddof=1) 77 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001) 78 | 79 | ################################################################################ 80 | # 81 | # AUTOENCODER 82 | # 83 | ################################################################################ 84 | 85 | def test_ae_2state_hmm(): 86 | traj, train_loader, transform_loader = generate_data_2state_hmm(lag=1) 87 | ae = AE(2, 1, bias=False, alpha=None) 88 | ae.fit(train_loader, 20) 89 | out = whiten_data(ae.transform(transform_loader)).numpy().reshape((-1,)) 90 | traj -= np.mean(traj) 91 | traj /= np.std(traj, ddof=1) 92 | np.testing.assert_allclose(np.abs(np.mean(traj * out)), 1.0, atol=0.001) 93 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_api.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | import numpy as np 19 | from torch.utils.data import DataLoader 20 | from ..utils import create_dataset 21 | from ..utils import whiten_data 22 | from ..api import pca 23 | from ..api import tica 24 | from ..api import ae 25 | 26 | def generate_data_2state_hmm(length=10000): 27 | transition_matrix = np.asarray([[0.9, 0.1], [0.1, 0.9]]) 28 | phi = np.random.rand() * 2.0 * np.pi 29 | rot = np.asarray([ 30 | [np.cos(phi), -np.sin(phi)], 31 | [np.sin(phi), np.cos(phi)]]) 32 | trajs, rtrajs = [], [] 33 | for _ in range(np.random.randint(1, 5)): 34 | dtraj = np.zeros( 35 | shape=(length + np.random.randint(100),), dtype=np.intc) 36 | for i in range(1, len(dtraj)): 37 | dtraj[i] = np.random.choice( 38 | 2, size=1, p=transition_matrix[dtraj[i - 1], :]) 39 | traj = np.random.randn(len(dtraj)) 40 | traj[np.where(dtraj == 1)[0]] += 2.0 41 | traj_stacked = np.vstack((traj, np.zeros(len(traj)))) 42 | traj_rot = np.dot(rot, traj_stacked).T 43 | trajs.append(traj[:]) 44 | rtrajs.append(traj_rot[:]) 45 | if len(trajs) == 1: 46 | trajs = trajs[0] 47 | rtrajs = rtrajs[0] 48 | else: 49 | trajs = np.concatenate(trajs) 50 | trajs -= np.mean(trajs) 51 | trajs /= np.std(trajs, ddof=1) 52 | return trajs, rtrajs 53 | 54 | def checkpout_output(ref, data, out): 55 | if isinstance(data, (list, tuple)): 56 | np.testing.assert_array_equal( 57 | [o.shape[0] for o in out], 58 | [d.shape[0] for d in data]) 59 | out = np.concatenate(out) 60 | else: 61 | assert data.shape[0] == out.shape[0] 62 | out = out.reshape(-1) 63 | np.testing.assert_allclose(np.abs(np.mean(ref * out)), 1.0, atol=0.001) 64 | 65 | ################################################################################ 66 | # 67 | # PCA 68 | # 69 | ################################################################################ 70 | 71 | def test_pca_2state_hmm(): 72 | ref, data = generate_data_2state_hmm() 73 | out, train_loss, test_loss = pca(data, dim=1, whiten=True) 74 | checkpout_output(ref, data, out) 75 | 76 | ################################################################################ 77 | # 78 | # TICA 79 | # 80 | ################################################################################ 81 | 82 | def test_tica_2state_hmm(): 83 | ref, data = generate_data_2state_hmm() 84 | out, train_loss, test_loss = tica(data, dim=1, lag=1, whiten=True) 85 | checkpout_output(ref, data, out) 86 | 87 | ################################################################################ 88 | # 89 | # AUTOENCODER 90 | # 91 | ################################################################################ 92 | 93 | def test_ae_2state_hmm(): 94 | ref, data = generate_data_2state_hmm() 95 | out, train_loss, test_loss = ae( 96 | data, dim=1, lag=1, n_epochs=20, whiten=True, 97 | bias=False, hid_size=[], alpha=None) 98 | checkpout_output(ref, data, out) 99 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/toymodels.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | A collection of "difficult" toymodels. 20 | ''' 21 | 22 | import numpy as _np 23 | 24 | __all__ = ['sample_sqrt_model', 'sample_swissroll_model'] 25 | 26 | def sample_hmm(length, cov, states, transition_matrix): 27 |     '''Sample a hidden state trajectory and n-dimensional emissions. 28 | 29 |     We sample a hidden state trajectory using the given transition matrix. For 30 |     each hidden state, we compute Gaussian noise around the center of the state 31 |     using the given covariance matrix. 32 | 33 |     Arguments: 34 |         length (int): length of the resulting trajectories 35 |         cov (array-like of float): covariance matrix for the noise 36 |         states (array-like of float): centers for each state's emissions 37 |         transition_matrix (array-like of float): a transition matrix 38 |     ''' 39 |     cov = _np.asarray(cov, dtype=_np.float32) 40 |     states = _np.asarray(states, dtype=_np.float32) 41 |     transition_matrix = _np.asarray(transition_matrix, dtype=_np.float32) 42 |     dtraj = _np.zeros(shape=(length,), dtype=_np.intc) 43 |     dtraj[0] = _np.random.randint(low=0, high=len(states)) 44 |     for i in range(1, length): 45 |         dtraj[i] = _np.random.choice( 46 |             len(states), size=1, p=transition_matrix[dtraj[i - 1], :]) 47 |     traj = states[dtraj, :] + _np.random.multivariate_normal( 48 |         _np.zeros(len(cov)), cov, size=length, check_valid='ignore') 49 |     return traj, dtraj 50 | 51 | def sqrt_transform(traj): 52 |     '''Mask an emission trajectory using a sqrt transform. 53 | 54 |     We add the square root of the first dimension (which ideally has a large 55 |     variance) to the second (which is ideally the slowest degree of freedom) 56 |     to mask the slow process. 57 | 58 |     Arguments: 59 |         traj (array-like of float): a trajectory of emissions 60 |     ''' 61 |     transformed_traj = _np.asarray(traj).copy() 62 |     transformed_traj[:, 1] += _np.sqrt(_np.abs(traj[:, 0])) 63 |     return transformed_traj 64 | 65 | def sample_sqrt_model(length): 66 |     '''Sample a hidden state and a sqrt-transformed emission trajectory. 67 | 68 |     We sample a hidden state trajectory and sqrt-masked emissions in two 69 |     dimensions such that the two metastable states are not linearly separable. 70 | 71 |     Arguments: 72 |         length (int): length of the resulting trajectories 73 |     ''' 74 |     cov = [[30.0, 0.0], [0.0, 0.015]] 75 |     states = [[0.0, 1.0], [0.0, -1.0]] 76 |     transition_matrix = [[0.95, 0.05], [0.05, 0.95]] 77 |     traj, dtraj = sample_hmm(length, cov, states, transition_matrix) 78 |     return sqrt_transform(traj), dtraj 79 | 80 | def swissroll_transform(traj): 81 |     '''Mask an emission trajectory using a swissroll transform. 82 | 83 |     We roll two-dimensional emissions into a swissroll-style manifold in three 84 |     dimensions. 
85 | 86 | Arguments: 87 | traj (array-like of float): a trajectory of emissions 88 | ''' 89 | x = traj[:, 0] 90 | return _np.vstack([x * _np.cos(x), traj[:, 1], x * _np.sin(x)]).T 91 | 92 | def sample_swissroll_model(length): 93 | '''Sample a hidden state and a swissroll-transformed emission trajectory. 94 | 95 | We sample a hidden state trajectory and swissroll-masked emissions in two 96 | dimensions such that the four metastable states are not linearly separable. 97 | 98 | Arguments: 99 | length (int): length of the resulting trajectories 100 | ''' 101 | cov = [[1.0, 0.0], [0.0, 1.0]] 102 | states = [[7.5, 7.5], [7.5, 15.0], [15.0, 15.0], [15.0, 7.5]] 103 | transition_matrix = [ 104 | [0.95, 0.05, 0.00, 0.00], 105 | [0.05, 0.90, 0.05, 0.00], 106 | [0.00, 0.05, 0.90, 0.05], 107 | [0.00, 0.00, 0.05, 0.95]] 108 | traj, dtraj = sample_hmm(length, cov, states, transition_matrix) 109 | return swissroll_transform(traj), dtraj 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/api.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | A simple API to apply PCA, TICA, and AE to time series data. 20 | ''' 21 | 22 | from .models import PCA as _PCA 23 | from .models import TICA as _TICA 24 | from .models import AE as _AE 25 | from .models import VAE as _VAE 26 | from .models import VAMPNet as _VAMPNet 27 | from .utils import create_dataset as _create_dataset 28 | from .utils import random_split as _random_split 29 | from .utils import random_block_split as _random_block_split 30 | from .utils import whiten_data as _whiten_data 31 | from torch import nn as _nn 32 | from torch.utils.data import DataLoader as _DataLoader 33 | 34 | def _transform(model, data, data_0, batch_size, whiten, pin_memory=False): 35 | loader = _DataLoader(data_0, batch_size=batch_size, pin_memory=pin_memory) 36 | if whiten: 37 | transformed_data = _whiten_data(model.transform(loader)).numpy() 38 | else: 39 | transformed_data = model.transform(loader).numpy() 40 | if isinstance(data, (list, tuple)): 41 | collect = [] 42 | p = 0 43 | lengths = [d.shape[0] for d in data] 44 | for length in lengths: 45 | collect.append(transformed_data[p:p+length, :]) 46 | p += length 47 | return collect 48 | return transformed_data 49 | 50 | def pca(data, dim=None, validation_split=None, batch_size=100, whiten=False): 51 | '''Perform a principal component analysis for dimensionality reduction. 52 | 53 | We compute the first eigenvectors of the instantaneous covariance 54 | matrix and use them to rotate/project the data into a lower dimensional 55 | subspace. 56 | 57 | Arguments: 58 | data (numpy-ndarray of list thereof): the data to be transformed 59 | dim (int): the target dimensionality 60 | validation_split (float): fraction of the data reserved for validation 61 | batch_size (int): specify a batch size for the minibatch process 62 | whiten (boolean): set to True to whiten the transformed data 63 | 64 | Returns: 65 | (numpy.ndarray of list thereof): the transformed data 66 | (float): training loss 67 | (float): validation loss 68 | ''' 69 | data_0 = _create_dataset(data, lag=0) 70 | if validation_split is None: 71 | train_loader = _DataLoader(data_0, batch_size=batch_size) 72 | test_loader = None 73 | else: 74 | data_test, data_train = _random_split( 75 | data_0, f_active=validation_split) 76 | train_loader = _DataLoader(data_train, batch_size=batch_size) 77 | test_loader = _DataLoader(data_test, batch_size=batch_size) 78 | model = _PCA() 79 | train_loss, test_loss = model.fit( 80 | train_loader, dim=dim, test_loader=test_loader) 81 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 82 | return transformed_data, train_loss, test_loss 83 | 84 | def tica( 85 | data, dim=None, lag=1, kinetic_map=True, symmetrize=False, 86 | validation_split=None, batch_size=100, whiten=False): 87 | '''Perform a time-lagged independent component analysis for 88 | dimensionality reduction. 89 | 90 | We compute a rank-d approximation to the Koopman operator and use it to 91 | rotate/project the data into a lower dimensional subspace. 
92 | 93 | Arguments: 94 | data (numpy-ndarray of list thereof): the data to be transformed 95 | dim (int): the target dimensionality 96 | lag (int): specifies the lag in time steps 97 | kinetic_map (boolean): use the kinetic map variant of TICA 98 | symmetrize (boolean): enforce symmetry and reversibility 99 | validation_split (float): fraction of the data reserved for validation 100 | batch_size (int): specify a batch size for the minibatch process 101 | whiten (boolean): set to True to whiten the transformed data 102 | 103 | Returns: 104 | (numpy.ndarray of list thereof): the transformed data 105 | (float): training loss 106 | (float): validation loss 107 | ''' 108 | data_0 = _create_dataset(data, lag=0) 109 | data_lag = _create_dataset(data, lag=lag) 110 | if validation_split is None: 111 | train_loader = _DataLoader(data_lag, batch_size=batch_size) 112 | test_loader = None 113 | else: 114 | data_test, data_train = _random_block_split( 115 | data_lag, lag, f_active=validation_split) 116 | train_loader = _DataLoader(data_train, batch_size=batch_size) 117 | test_loader = _DataLoader(data_test, batch_size=batch_size) 118 | model = _TICA(kinetic_map=kinetic_map, symmetrize=symmetrize) 119 | train_loss, test_loss = model.fit( 120 | train_loader, dim=dim, test_loader=test_loader) 121 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 122 | return transformed_data, train_loss, test_loss 123 | 124 | def ae( 125 | data, dim=None, lag=1, n_epochs=50, validation_split=None, 126 | batch_size=100, whiten=False, pin_memory=False, **kwargs): 127 | '''Use a time-lagged autoencoder model for dimensionality reduction. 128 | 129 | We train a deep (or shallow) time-lagged autoencoder type neural network 130 | and use the first half (encoder stage) to transform the supplied data. 
131 | 132 | Arguments: 133 | data (numpy-ndarray of list thereof): the data to be transformed 134 | dim (int): the target dimensionality 135 | lag (int): specifies the lag in time steps 136 | n_epochs (int): number of training epochs 137 | validation_split (float): fraction of the data reserved for validation 138 | batch_size (int): specify a batch size for the minibatch process 139 | whiten (boolean): set to True to whiten the transformed data 140 | pin_memory (boolean): make DataLoaders return pinned memory 141 | 142 | Returns: 143 | (numpy.ndarray of list thereof): the transformed data 144 | (list of float): training loss 145 | (list of float): validation loss 146 | ''' 147 | ae_args = dict( 148 | hid_size=[100], 149 | dropout=0.5, 150 | alpha=0.01, 151 | prelu=False, 152 | bias=True, 153 | lr=0.001, 154 | cuda=False, 155 | non_blocking=False) 156 | ae_args.update(kwargs) 157 | try: 158 | size = data.shape[1] 159 | except AttributeError: 160 | size = data[0].shape[1] 161 | data_0 = _create_dataset(data, lag=0) 162 | data_lag = _create_dataset(data, lag=lag) 163 | if validation_split is None: 164 | train_loader = _DataLoader( 165 | data_lag, batch_size=batch_size, pin_memory=pin_memory) 166 | test_loader = None 167 | else: 168 | data_test, data_train = _random_block_split( 169 | data_lag, lag, f_active=validation_split) 170 | train_loader = _DataLoader( 171 | data_train, batch_size=batch_size, pin_memory=pin_memory) 172 | test_loader = _DataLoader( 173 | data_test, batch_size=batch_size, pin_memory=pin_memory) 174 | model = _AE(size, dim, **ae_args) 175 | train_loss, test_loss = model.fit( 176 | train_loader, n_epochs, test_loader=test_loader) 177 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 178 | return transformed_data, train_loss, test_loss 179 | 180 | def vae( 181 | data, dim=None, lag=1, n_epochs=50, validation_split=None, 182 | batch_size=100, whiten=False, pin_memory=False, **kwargs): 183 | '''Use a time-lagged variational autoencoder model for dimensionality 184 | reduction. 185 | 186 | We train a deep (or shallow) time-lagged variational autoencoder type 187 | neural network and use the first half (encoder stage) to transform the 188 | supplied data. 
189 | 190 | Arguments: 191 | data (numpy-ndarray of list thereof): the data to be transformed 192 | dim (int): the target dimensionality 193 | lag (int): specifies the lag in time steps 194 | n_epochs (int): number of training epochs 195 | validation_split (float): fraction of the data reserved for validation 196 | batch_size (int): specify a batch size for the minibatch process 197 | whiten (boolean): set to True to whiten the transformed data 198 | pin_memory (boolean): make DataLoaders return pinned memory 199 | 200 | Returns: 201 | (numpy.ndarray of list thereof): the transformed data 202 | (list of float): training loss 203 | (list of float): validation loss 204 | ''' 205 | vae_args = dict( 206 | hid_size=[100], 207 | beta=1.0, 208 | dropout=0.5, 209 | alpha=0.01, 210 | prelu=False, 211 | bias=True, 212 | lr=0.001, 213 | cuda=False, 214 | non_blocking=False) 215 | vae_args.update(kwargs) 216 | try: 217 | size = data.shape[1] 218 | except AttributeError: 219 | size = data[0].shape[1] 220 | data_0 = _create_dataset(data, lag=0) 221 | data_lag = _create_dataset(data, lag=lag) 222 | if validation_split is None: 223 | train_loader = _DataLoader( 224 | data_lag, batch_size=batch_size, pin_memory=pin_memory) 225 | test_loader = None 226 | else: 227 | data_test, data_train = _random_block_split( 228 | data_lag, lag, f_active=validation_split) 229 | train_loader = _DataLoader( 230 | data_train, batch_size=batch_size, pin_memory=pin_memory) 231 | test_loader = _DataLoader( 232 | data_test, batch_size=batch_size, pin_memory=pin_memory) 233 | model = _VAE(size, dim, **vae_args) 234 | train_loss, test_loss = model.fit( 235 | train_loader, n_epochs, test_loader=test_loader) 236 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 237 | return transformed_data, train_loss, test_loss 238 | 239 | ################################################################################ 240 | # 241 | # VAMPNET WORK IN PROGRESS 242 | # 243 | ################################################################################ 244 | 245 | def vampnet( 246 | data, dim=None, lag=1, n_epochs=50, validation_split=None, 247 | batch_size=100, whiten=False, pin_memory=False, **kwargs): 248 | '''Use a vampnet model for dimensionality reduction and/or clustering. 249 | 250 | .... 
251 | 252 | Arguments: 253 | data (numpy-ndarray of list thereof): the data to be transformed 254 | dim (int): the target dimensionality 255 | lag (int): specifies the lag in time steps 256 | n_epochs (int): number of training epochs 257 | validation_split (float): fraction of the data reserved for validation 258 | batch_size (int): specify a batch size for the minibatch process 259 | whiten (boolean): set to True to whiten the transformed data 260 | pin_memory (boolean): make DataLoaders return pinned memory 261 | 262 | Returns: 263 | (numpy.ndarray of list thereof): the transformed data 264 | (list of float): training score 265 | (list of float): validation score 266 | ''' 267 | vn_args = dict( 268 | hid_size=[100], 269 | dropout=0.5, 270 | alpha=0.01, 271 | prelu=False, 272 | bias=True, 273 | lr=0.001, 274 | cuda=False, 275 | non_blocking=False) 276 | vn_args.update(kwargs) 277 | try: 278 | size = data.shape[1] 279 | except AttributeError: 280 | size = data[0].shape[1] 281 | data_0 = _create_dataset(data, lag=0) 282 | data_lag = _create_dataset(data, lag=lag) 283 | if validation_split is None: 284 | train_loader = _DataLoader( 285 | data_lag, batch_size=batch_size, pin_memory=pin_memory) 286 | test_loader = None 287 | else: 288 | data_test, data_train = _random_block_split( 289 | data_lag, lag, f_active=validation_split) 290 | train_loader = _DataLoader( 291 | data_train, batch_size=batch_size, pin_memory=pin_memory) 292 | test_loader = _DataLoader( 293 | data_test, batch_size=batch_size, pin_memory=pin_memory) 294 | model = _VAMPNet(size, dim, **vn_args) 295 | train_loss, test_loss = model.fit( 296 | train_loader, n_epochs, test_loader=test_loader) 297 | transformed_data = _transform(model, data, data_0, batch_size, whiten) 298 | train_loss = [-loss for loss in train_loss] 299 | test_loss = [-loss for loss in test_loss] 300 | return transformed_data, train_loss, test_loss 301 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/test/test_utils.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | import numpy as np 19 | import torch 20 | from torch.utils.data import DataLoader 21 | from ..utils import LaggedDataset 22 | from ..utils import MaskedDataset 23 | from ..utils import ensure_traj_format 24 | from ..utils import create_dataset 25 | from ..utils import stride_split 26 | from ..utils import random_split 27 | from ..utils import random_block_split 28 | from ..utils import get_mean 29 | from ..utils import get_covariance 30 | from ..utils import get_sqrt_inverse 31 | from ..utils import whiten_data 32 | from ..utils import cca 33 | from ..utils import BaseTransform 34 | from ..utils import Transform 35 | 36 | ################################################################################ 37 | # 38 | # DATASETS 39 | # 40 | ################################################################################ 41 | 42 | def test_lagged_dataset_at_default_lag(): 43 | data = np.arange( 44 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 45 | dataset = LaggedDataset(torch.Tensor(data), lag=0) 46 | for x, y in dataset: 47 | assert x[0] == y[0] 48 | 49 | def test_lagged_dataset_at_lag0(): 50 | data = np.arange( 51 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 52 | dataset = LaggedDataset(torch.Tensor(data), lag=0) 53 | for x, y in dataset: 54 | assert x[0] == y[0] 55 | 56 | def test_lagged_dataset_at_random_lag(): 57 | data = np.arange( 58 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 59 | lag = 1 + np.random.randint(50) 60 | dataset = LaggedDataset(torch.Tensor(data), lag) 61 | for x, y in dataset: 62 | assert x[0] + lag == y[0] 63 | 64 | def test_masked_dataset(): 65 | data = np.arange( 66 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 67 | active = np.random.choice(data[:, 0], size=100, replace=False) 68 | dataset = MaskedDataset(LaggedDataset(torch.Tensor(data), lag=0), active) 69 | assert len(dataset) == len(active) 70 | for (x, y), z in zip(dataset, active): 71 | assert x[0] == y[0] == z 72 | 73 | def test_ensure_traj_format_1d(): 74 | raw_data = np.arange(800 + np.random.randint(200)) 75 | data = ensure_traj_format(raw_data) 76 | assert isinstance(data, np.ndarray) 77 | assert data.dtype == np.float32 78 | assert data.ndim == 2 79 | np.testing.assert_array_equal(data.shape, [len(raw_data), 1]) 80 | np.testing.assert_allclose(raw_data.astype(np.float32), data[:, 0]) 81 | 82 | def test_ensure_traj_format_2d(): 83 | raw_data = np.arange(800 + np.random.randint(200)).reshape(-1, 1) 84 | data = ensure_traj_format(raw_data) 85 | assert isinstance(data, np.ndarray) 86 | assert data.dtype == np.float32 87 | assert data.ndim == 2 88 | np.testing.assert_array_equal(data.shape, raw_data.shape) 89 | np.testing.assert_allclose(raw_data.astype(np.float32), data) 90 | 91 | def test_create_dataset_single_file_1d(): 92 | data = np.arange( 93 | 800 + np.random.randint(200)) 94 | lag = np.random.randint(50) 95 | dataset = create_dataset(data, lag, dtype=np.float32) 96 | for x, y in dataset: 97 | assert x[0] + lag == y[0] 98 | 99 | def test_create_dataset_single_file_2d(): 100 | data = np.arange( 101 | 800 + np.random.randint(200)).reshape(-1, 1) 102 | lag = np.random.randint(50) 103 | dataset = create_dataset(data, lag, dtype=np.float32) 104 | for x, y in dataset: 105 | assert x[0] + lag == y[0] 106 | 107 | def test_create_dataset_multiple_files_1d(): 108 | data = [np.arange(800 + np.random.randint(200)) for _ in range(3)] 109 | lag = np.random.randint(50) 110 | dataset = create_dataset(data, lag, dtype=np.float32) 
111 | for x, y in dataset: 112 | assert x[0] + lag == y[0] 113 | 114 | def test_create_dataset_multiple_files_2d(): 115 | data = [np.arange( 116 | 800 + np.random.randint(200)).reshape(-1, 1) for _ in range(3)] 117 | lag = np.random.randint(50) 118 | dataset = create_dataset(data, lag, dtype=np.float32) 119 | for x, y in dataset: 120 | assert x[0] + lag == y[0] 121 | 122 | def test_stride_split(): 123 | data = np.arange( 124 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 125 | lag = 1 + np.random.randint(50) 126 | dataset = LaggedDataset(torch.Tensor(data), lag) 127 | stride = 1 + np.random.randint(10) 128 | offset = np.random.randint(stride) 129 | dataset_a, dataset_b = stride_split(dataset, stride=stride, offset=offset) 130 | assert len(dataset) == len(dataset_a) + len(dataset_b) 131 | for x, y in dataset_a: 132 | assert x[0] + lag == y[0] 133 | for x, y in dataset_b: 134 | assert x[0] + lag == y[0] 135 | 136 | def test_random_split(): 137 | data = np.arange( 138 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 139 | lag = 1 + np.random.randint(50) 140 | dataset = LaggedDataset(torch.Tensor(data), lag) 141 | dataset_a, dataset_b = random_split(dataset, f_active=0.5) 142 | assert len(dataset) == len(dataset_a) + len(dataset_b) 143 | for x, y in dataset_a: 144 | assert x[0] + lag == y[0] 145 | for x, y in dataset_b: 146 | assert x[0] + lag == y[0] 147 | 148 | def test_random_block_split(): 149 | data = np.arange( 150 | 800 + np.random.randint(200)).reshape(-1, 1).astype(np.float32) 151 | lag = 1 + np.random.randint(50) 152 | dataset = LaggedDataset(torch.Tensor(data), lag) 153 | dataset_a, dataset_b = random_block_split(dataset, lag, f_active=0.5) 154 | assert len(dataset) == len(dataset_a) + len(dataset_b) 155 | for x, y in dataset_a: 156 | assert x[0] + lag == y[0] 157 | for x, y in dataset_b: 158 | assert x[0] + lag == y[0] 159 | 160 | ################################################################################ 161 | # 162 | # STATISTICS 163 | # 164 | ################################################################################ 165 | 166 | def test_get_mean_via_normal_distribution_parameters(): 167 | data = torch.randn(10000, 1) 168 | dataset = LaggedDataset(data, lag=0) 169 | x, y = get_mean( 170 | DataLoader( 171 | dataset, batch_size=np.random.randint(low=10, high=100))) 172 | np.testing.assert_allclose(x.numpy(), 0.0, atol=0.05) 173 | np.testing.assert_allclose(y.numpy(), 0.0, atol=0.05) 174 | 175 | def test_get_mean_via_distribution_symmetry(): 176 | data = torch.rand(5000, 1) 177 | data = torch.cat([data, -data]) 178 | dataset = LaggedDataset(data, lag=0) 179 | x, y = get_mean( 180 | DataLoader( 181 | dataset, batch_size=np.random.randint(low=10, high=100))) 182 | np.testing.assert_allclose(x.numpy(), 0.0, atol=0.0001) 183 | np.testing.assert_allclose(y.numpy(), 0.0, atol=0.0001) 184 | 185 | def test_get_mean_vs_numpy(): 186 | data = torch.randn(10000, 1) 187 | dataset = LaggedDataset(data, lag=0) 188 | x, y = get_mean( 189 | DataLoader( 190 | dataset, batch_size=np.random.randint(low=10, high=100))) 191 | numpy_result = np.mean(data.numpy()) 192 | np.testing.assert_allclose(x.numpy(), numpy_result, atol=0.0001) 193 | np.testing.assert_allclose(y.numpy(), numpy_result, atol=0.0001) 194 | 195 | def test_get_covariance_via_normal_distribution_parameters(): 196 | data = torch.randn(10000, 1) 197 | dataset = LaggedDataset(data, lag=0) 198 | xx, xy, yy = get_covariance( 199 | DataLoader( 200 | dataset, batch_size=np.random.randint(low=10, 
high=100)), 201 | torch.Tensor([0]), torch.Tensor([0])) 202 | np.testing.assert_allclose(xx.numpy(), 1.0, atol=0.1) 203 | np.testing.assert_allclose(xy.numpy(), 1.0, atol=0.1) 204 | np.testing.assert_allclose(yy.numpy(), 1.0, atol=0.1) 205 | 206 | def test_get_covariance_vs_numpy(): 207 | data = torch.randn(10000, 1) 208 | dataset = LaggedDataset(data, lag=0) 209 | xx, xy, yy = get_covariance( 210 | DataLoader( 211 | dataset, batch_size=np.random.randint(low=10, high=100)), 212 | torch.Tensor([0]), torch.Tensor([0])) 213 | numpy_result = np.var(data.numpy(), ddof=1) 214 | np.testing.assert_allclose(xx.numpy(), numpy_result, atol=0.0005) 215 | np.testing.assert_allclose(xy.numpy(), numpy_result, atol=0.0005) 216 | np.testing.assert_allclose(yy.numpy(), numpy_result, atol=0.0005) 217 | 218 | ################################################################################ 219 | # 220 | # WHITENING 221 | # 222 | ################################################################################ 223 | 224 | def test_get_sqrt_inverse(): 225 | dim = 2 + np.random.randint(5) 226 | x = torch.rand(500, dim) 227 | x = torch.mm(x.t(), x) 228 | y = get_sqrt_inverse(x) 229 | y = torch.mm(y, y) 230 | np.testing.assert_allclose( 231 | x.mm(y).numpy(), 232 | np.diag([1.0] * dim).astype(np.float32), 233 | atol=0.0001) 234 | 235 | def test_whiten_data(): 236 | dim = 1 + np.random.randint(5) 237 | x = whiten_data(torch.rand(500, dim)) 238 | np.testing.assert_allclose( 239 | x.numpy().mean(axis=0), 240 | 0.0, 241 | atol=0.01) 242 | np.testing.assert_allclose( 243 | torch.mm(x.t(), x).div_(float(x.size()[0])).numpy(), 244 | np.diag([1.0] * dim).astype(np.float32), 245 | atol=0.01) 246 | 247 | ################################################################################ 248 | # 249 | # CCA 250 | # 251 | ################################################################################ 252 | 253 | def test_cca(): 254 | s = np.arange(1000) 255 | x = torch.from_numpy( 256 | np.vstack((s, np.random.randn(s.shape[0]))).T.astype(np.float32)) 257 | y = torch.from_numpy( 258 | np.vstack((np.random.randn(s.shape[0]), s)).T.astype(np.float32)) 259 | u, s, v = cca(x, y, batch_size=100) 260 | np.testing.assert_allclose(s.numpy(), [1.0, 0.0], atol=0.2) 261 | p = u.mm(torch.diag(s).mm(v)) 262 | np.testing.assert_allclose( 263 | np.abs(p.numpy()), [[0.0, 1.0], [0.0, 0.0]], atol=0.2) 264 | 265 | ################################################################################ 266 | # 267 | # TRANSFORMER 268 | # 269 | ################################################################################ 270 | 271 | def test_base_transform(): 272 | dim = 2 + np.random.randint(5) 273 | mean = 10.0 * (torch.rand(dim) - 0.5) 274 | sigma = torch.rand(dim, dim) 275 | sigma.add_(sigma.t()) 276 | data = torch.randn(500, dim).mm(sigma) + mean[None, :] 277 | loader = DataLoader(LaggedDataset(data, lag=0), batch_size=64) 278 | x_mean, y_mean = get_mean(loader) 279 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 280 | transformer = BaseTransform(mean=x_mean, covariance=cxx) 281 | transformed_data = [] 282 | for x, _ in loader: 283 | transformed_data.append(transformer(x)) 284 | y = torch.cat(transformed_data) 285 | np.testing.assert_allclose( 286 | y.numpy().mean(axis=0), 287 | 0.0, 288 | atol=0.01) 289 | np.testing.assert_allclose( 290 | torch.mm(y.t(), y).div_(float(y.size()[0])).numpy(), 291 | np.diag([1.0] * dim).astype(np.float32), 292 | atol=0.2) 293 | 294 | def test_transform(): 295 | dim = 2 + np.random.randint(5) 296 | mean = 10.0 
* (torch.rand(dim) - 0.5) 297 | sigma = torch.rand(dim, dim) 298 | sigma.add_(sigma.t()) 299 | data = torch.randn(500, dim).mm(sigma) + mean[None, :] 300 | loader = DataLoader(LaggedDataset(data, lag=0), batch_size=64) 301 | x_mean, y_mean = get_mean(loader) 302 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 303 | transformer = Transform( 304 | x_mean=x_mean, x_covariance=cxx, 305 | y_mean=x_mean, y_covariance=cyy) 306 | x_, y_ = [], [] 307 | for x, y in loader: 308 | x, y = transformer(x, y) 309 | x_.append(x) 310 | y_.append(y) 311 | x = torch.cat(x_) 312 | y = torch.cat(y_) 313 | np.testing.assert_allclose( 314 | x.numpy().mean(axis=0), 315 | 0.0, 316 | atol=0.1) 317 | np.testing.assert_allclose( 318 | torch.mm(x.t(), x).div_(float(x.size()[0])).numpy(), 319 | np.diag([1.0] * dim).astype(np.float32), 320 | atol=0.1) 321 | np.testing.assert_allclose( 322 | y.numpy().mean(axis=0), 323 | 0.0, 324 | atol=0.1) 325 | np.testing.assert_allclose( 326 | torch.mm(y.t(), y).div_(float(y.size()[0])).numpy(), 327 | np.diag([1.0] * dim).astype(np.float32), 328 | atol=0.1) 329 | -------------------------------------------------------------------------------- /vampnet/examples/1D_double_well.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "%matplotlib inline\n", 19 | "import vampnet\n", 20 | "from vampnet import data_generator\n", 21 | "from keras.models import Model\n", 22 | "from keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 23 | "from keras import optimizers\n", 24 | "import tensorflow as tf\n", 25 | "from keras.backend import clear_session" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# generate 50000 frames and energy values\n", 35 | "datapoints = int(5e4)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "traj_whole = data_generator.get_asymmetric_double_well_data(datapoints)\n", 45 | "# To fit the dataformat\n", 46 | "traj_whole = np.expand_dims(traj_whole, 1)\n", 47 | "traj_data_points, input_size = traj_whole.shape" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "x = np.linspace(-1,5,500)\n", 57 | "plt.figure(figsize=(6,2))\n", 58 | "plt.ylim(-15,10)\n", 59 | "plt.xlim(-1,5)\n", 60 | "plt.plot(x,data_generator.asymmetric_double_well_energy(x), lw = 2)\n", 61 | "plt.xlabel('Position x / a.u.', fontsize = 16)\n", 62 | "plt.ylabel('Pot. 
energy / a.u.', fontsize = 16)\n", 63 | "plt.xticks(fontsize = 14)\n", 64 | "\n", 65 | "plt.yticks(fontsize = 14);" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# All Hyperparameters\n", 75 | "\n", 76 | "# Tau, how much is the timeshift of the two datasets\n", 77 | "tau = 1\n", 78 | "\n", 79 | "# Batch size for Stochastic Gradient descent\n", 80 | "batch_size = 2048\n", 81 | "\n", 82 | "# Which trajectory points percentage is used as training\n", 83 | "train_ratio = 0.9\n", 84 | "\n", 85 | "# How many hidden layers the network has\n", 86 | "network_depth = 4\n", 87 | "\n", 88 | "# Width of every layer\n", 89 | "layer_width = 20\n", 90 | "nodes = [layer_width]*network_depth\n", 91 | "# Learning rate used for the ADAM optimizer\n", 92 | "learning_rate = 0.0001\n", 93 | "\n", 94 | "# How many output states the network has\n", 95 | "output_size = 5\n", 96 | "\n", 97 | "# Iteration over the training set in the fitting process\n", 98 | "nb_epoch = 300" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "epsilon = 1e-5\n", 108 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# Shuffle trajectory and lagged trajectory together\n", 118 | "length_data = traj_data_points - tau\n", 119 | "\n", 120 | "traj_ord= traj_whole[:length_data]\n", 121 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n", 122 | "\n", 123 | "indexes = np.arange(length_data)\n", 124 | "np.random.shuffle(indexes)\n", 125 | "\n", 126 | "\n", 127 | "\n", 128 | "traj = traj_ord[indexes]\n", 129 | "traj_lag = traj_ord_lag[indexes]\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# Prepare data for tensorflow usage\n", 139 | "length_train = int(np.floor(length_data * train_ratio))\n", 140 | "length_vali = length_data - length_train\n", 141 | "\n", 142 | "traj_data_train = traj[:length_train]\n", 143 | "traj_data_train_lag = traj_lag[:length_train]\n", 144 | "\n", 145 | "traj_data_valid = traj[length_train:]\n", 146 | "traj_data_valid_lag = traj_lag[length_train:]\n", 147 | "\n", 148 | "#Data used for states ordering\n", 149 | "X1 = traj_ord[:length_data].astype('float32')\n", 150 | "X2 = traj_ord_lag[:length_data].astype('float32')\n", 151 | "\n", 152 | "# Input of the first network\n", 153 | "X1_train = traj_data_train.astype('float32')\n", 154 | "X2_train = traj_data_train_lag.astype('float32')\n", 155 | "\n", 156 | "# Input for validation\n", 157 | "X1_vali = traj_data_valid.astype('float32')\n", 158 | "X2_vali = traj_data_valid_lag.astype('float32')\n", 159 | "\n", 160 | "# Needs a Y-train set which we dont have.\n", 161 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n", 162 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "if 'model' in globals():\n", 172 | " del model\n", 173 | " clear_session()\n", 174 | "# Build the model\n", 175 | "Data_X = Input(shape = (input_size,))\n", 176 | "Data_Y = Input(shape = (input_size,))\n", 177 | "\n", 178 | "# A batch normalization layer improves convergence 
speed\n", 179 | "# bn_layer = BatchNormalization()\n", 180 | "bn_layer = Activation('linear')\n", 181 | "\n", 182 | "# Instance layers and assign them to the two lobes of the network\n", 183 | "dense_layers = [Dense(node, activation = 'relu',)\n", 184 | " for node in nodes]\n", 185 | "\n", 186 | "lx_branch = bn_layer(Data_X)\n", 187 | "rx_branch = bn_layer(Data_Y)\n", 188 | "\n", 189 | "for i, layer in enumerate(dense_layers):\n", 190 | "\n", 191 | " lx_branch = dense_layers[i](lx_branch)\n", 192 | " rx_branch = dense_layers[i](rx_branch)\n", 193 | "\n", 194 | "\n", 195 | "# Add a softmax output layer.\n", 196 | "# Should be replaced with a linear activation layer if\n", 197 | "# the outputs of the network cannot be interpreted as states\n", 198 | "softmax = Dense(output_size, activation='softmax')\n", 199 | "\n", 200 | "lx_branch = softmax(lx_branch)\n", 201 | "rx_branch = softmax(rx_branch)\n", 202 | "\n", 203 | "# Merge both networks to train both at the same time\n", 204 | "merged = concatenate([lx_branch, rx_branch])\n", 205 | "\n", 206 | "# Initialize the model and the optimizer, and compile it with\n", 207 | "# the loss and metric functions from the VAMPnets package\n", 208 | "model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 209 | "# model.summary()\n", 210 | "# Compile it with our own loss-function\n", 211 | "adam = optimizers.adam(lr = learning_rate)\n", 212 | "\n", 213 | "\n", 214 | "# Pretraining with VAMP with 'symmetrized' matrices yields a bad approximation of the \n", 215 | "# eigenvectors per se, but improves the 'readability' of the states identified by VAMP-2\n", 216 | "# which would otherwise be difficult to interprete.\n", 217 | "\n", 218 | "\n", 219 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 220 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 221 | "\n", 222 | "losses = [\n", 223 | " vamp._loss_VAMP_sym,\n", 224 | " vamp.loss_VAMP2,\n", 225 | "]\n", 226 | "\n", 227 | "valid_metric = np.zeros((len(losses), nb_epoch))\n", 228 | "train_metric = np.zeros((len(losses), nb_epoch))\n", 229 | "\n", 230 | "for l_index, loss in enumerate(losses):\n", 231 | " \n", 232 | " model.compile(optimizer = 'adam', loss = loss, metrics = [vamp.metric_VAMP])\n", 233 | " \n", 234 | " hist = model.fit([X1_train, X2_train], Y_train ,batch_size=batch_size, epochs=nb_epoch, verbose=0,\n", 235 | " validation_data=([X1_vali, X2_vali], Y_vali))\n", 236 | " \n", 237 | " temp = model.predict([traj_ord, traj_ord_lag], batch_size=np.shape(X1_vali)[0])\n", 238 | " \n", 239 | " x_a = temp[:,:output_size]\n", 240 | "\n", 241 | "\n", 242 | " X_Validation = np.squeeze(traj_ord)\n", 243 | " for i in range(output_size):\n", 244 | " plt.scatter(X_Validation, x_a[:,i], label= 'state '+str(i))\n", 245 | " plt.title('State probabilities')\n", 246 | " plt.legend()\n", 247 | " plt.show()\n", 248 | "\n", 249 | "\n", 250 | "\n", 251 | "\n", 252 | " states_prob_meanfree = x_a - np.mean(x_a, axis=0)\n", 253 | " tau_msm = 5\n", 254 | " K_smt = vamp.estimate_koopman_op(states_prob_meanfree, tau_msm)\n", 255 | "\n", 256 | " K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n", 257 | "\n", 258 | " index = np.argmax(np.real(K_eigvals))\n", 259 | " real_eigfunc = states_prob_meanfree @ np.real(K_eigvec[:,index])\n", 260 | "\n", 261 | " plt.scatter(X_Validation, real_eigfunc)\n", 262 | " plt.title('Eigenvector')\n", 263 | " plt.show()\n", 264 | "\n", 265 | " valid_metric[l_index] = 
np.array(hist.history['val_metric_VAMP'])\n", 266 | " train_metric[l_index] = np.array(hist.history['metric_VAMP'])\n", 267 | "\n", 268 | "valid_metric = np.reshape(valid_metric, (-1))\n", 269 | "train_metric = np.reshape(train_metric, (-1))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "# Training result visualization" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "plt.plot(train_metric, label = 'Training')\n", 286 | "plt.legend()\n", 287 | "plt.plot(valid_metric, label = 'Validation')\n", 288 | "plt.legend()\n", 289 | "\n", 290 | "plt.show()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# Transform the input trajectory using the network" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 307 | "\n", 308 | "# Order the output states based on their population\n", 309 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 310 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 311 | "states_num = [len(i[0]) for i in indexes]\n", 312 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 313 | "\n", 314 | "pred_ord = states_prob[:,states_order]" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "# Visualize the population of the states" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "def print_states_pie_chart():\n", 331 | " coors = []\n", 332 | " maxi = np.max(pred_ord, axis= 1)\n", 333 | "\n", 334 | " for i in range(output_size):\n", 335 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 336 | " \n", 337 | " fig1, ax1 = plt.subplots()\n", 338 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 339 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 340 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 341 | " plt.show()\n", 342 | "\n", 343 | "print_states_pie_chart()" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "# Estimate the implied timescales" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "max_tau = 15\n", 360 | "lag = np.arange(1, max_tau, 1)\n", 361 | "its = vamp.get_its(pred_ord, lag)\n", 362 | "vamp.plot_its(its, lag)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "# Chapman-Kolmogorov test for the estimated koopman operator" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "steps = 16\n", 379 | "tau_msm = 1\n", 380 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n", 381 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "anaconda-cloud": {}, 394 | 
"kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.4" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 1 414 | } 415 | -------------------------------------------------------------------------------- /vampnet/vampnet/data_generator.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | """sample generator for the MCMM project's clustering stage""" 19 | 20 | import numpy as np 21 | 22 | ################################################################################ 23 | # # 24 | # defining test potentials # 25 | # # 26 | ################################################################################ 27 | 28 | class BrownianDynamics(object): 29 | r"""base class for Brownian dynamics integration""" 30 | def __init__(self, dim, dt, kT, mass, damping): 31 | self.dim = dim 32 | self.dt = dt 33 | self.kT = kT 34 | self.mass = mass 35 | self.daming = damping 36 | self.coeff_A = dt / (mass * damping) 37 | self.coeff_B = np.sqrt(2.0 * dt * kT / (mass * damping)) 38 | def gradient(self, x): 39 | r"""gradient of the yet unkown potential""" 40 | raise NotImplementedError("implement in child class") 41 | def step(self, x): 42 | r"""perform a single Brownian dynamics step""" 43 | return x - self.coeff_A * self.gradient(x) \ 44 | + self.coeff_B * np.random.normal(size=self.dim) 45 | 46 | 47 | ################################################################################ 48 | # # 49 | # defining test potentials # 50 | # # 51 | ################################################################################ 52 | 53 | def asymmetric_double_well_energy(x): 54 | r"""computes the potential energy at point x""" 55 | _x = x - 2.0 56 | return 2.0 * _x - 6.0 * _x**2 + _x**4 57 | 58 | def asymmetric_double_well_gradient(x): 59 | r"""computes the potential's gradient at point x""" 60 | return 4.0 * x**3 - 24.0 * x**2 + 36.0 * x - 6.0 61 | 62 | def prinz_energy(x): 63 | return 4*(x**8 + 0.8 * np.exp(-80*x**2) + 0.2*np.exp(-80*(x-0.5)**2) + 0.5*np.exp(-40.*(x+0.5)**2)) 64 | 65 | def prinz_gradient(x): 66 | return 4*(8*x**7 - 128. 
* np.exp(-80*x**2)*x - 32.*np.exp(-80*(x-0.5)**2) *(x-0.5) - 40*np.exp(-40.*(x+0.5)**2) *(x+0.5)) 67 | 68 | def folding_model_energy(rvec, rcut): 69 | r"""computes the potential energy at point rvec""" 70 | r = np.linalg.norm(rvec) - rcut 71 | rr = r**2 72 | if r < 0.0: 73 | return -2.5 * rr 74 | return 0.5 * (r - 2.0) * rr 75 | 76 | def folding_model_gradient(rvec, rcut): 77 | r"""computes the potential's gradient at point rvec""" 78 | rnorm = np.linalg.norm(rvec) 79 | if rnorm == 0.0: 80 | return np.zeros(rvec.shape) 81 | r = rnorm - rcut 82 | if r < 0.0: 83 | return -5.0 * r * rvec / rnorm 84 | return (1.5 * r - 2.0) * rvec / rnorm 85 | 86 | 87 | ################################################################################ 88 | # # 89 | # defining wrapper classes # 90 | # # 91 | ################################################################################ 92 | 93 | class AsymmetricDoubleWell(BrownianDynamics): 94 | r"""encapsulates the asymmetric double well potential""" 95 | def __init__(self, dt, kT, mass=1.0, damping=1.0): 96 | super(AsymmetricDoubleWell, self).__init__(1, dt, kT, mass, damping) 97 | def gradient(self, x): 98 | return asymmetric_double_well_gradient(x) 99 | def sample(self, x0, nsteps, nskip=1): 100 | r"""generate nsteps sample points""" 101 | x = np.zeros(shape=(nsteps+1,)) 102 | x[0] = x0 103 | for t in range(nsteps): 104 | q = x[t] 105 | for s in range(nskip): 106 | q = self.step(q) 107 | x[t+1] = q 108 | return x 109 | 110 | class FoldingModel(BrownianDynamics): 111 | r"""encapsulates the folding model potential""" 112 | def __init__(self, dt, kT, mass=1.0, damping=1.0, rcut=3.0): 113 | super(FoldingModel, self).__init__(5, dt, kT, mass, damping) 114 | self.rcut = rcut 115 | def gradient(self, x): 116 | return folding_model_gradient(x, self.rcut) 117 | def sample(self, rvec0, nsteps, nskip=1): 118 | r"""generate nsteps sample points""" 119 | rvec = np.zeros(shape=(nsteps+1, self.dim)) 120 | rvec[0, :] = rvec0[:] 121 | for t in range(nsteps): 122 | q = rvec[t, :] 123 | for s in range(nskip): 124 | q = self.step(q) 125 | rvec[t+1, :] = q[:] 126 | return rvec 127 | 128 | class PrinzModel(BrownianDynamics): 129 | r"""encapsulates the Prinz potential""" 130 | def __init__(self, dt, kT, mass=1.0, damping=1.0): 131 | super(PrinzModel, self).__init__(1, dt, kT, mass, damping) 132 | def gradient(self, x): 133 | return prinz_gradient(x) 134 | def sample(self, x0, nsteps, nskip=1): 135 | r"""generate nsteps sample points""" 136 | x = np.zeros(shape=(nsteps+1,)) 137 | x[0] = x0 138 | for t in range(nsteps): 139 | q = x[t] 140 | for s in range(nskip): 141 | q = self.step(q) 142 | x[t+1] = q 143 | return x 144 | 145 | 146 | ################################################################################ 147 | # # 148 | # main area # 149 | # # 150 | ################################################################################ 151 | 152 | def get_asymmetric_double_well_data(nstep, x0 = 0., nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0): 153 | r"""wrapper for the asymmetric double well generator""" 154 | adw = AsymmetricDoubleWell(dt, kT, mass=mass, damping=damping) 155 | return adw.sample(x0, nstep, nskip=nskip) 156 | 157 | def get_folding_model_data( 158 | nstep, rvec0 = np.zeros((5)), nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0, rcut=3.0): 159 | r"""wrapper for the folding model generator""" 160 | fm = FoldingModel(dt, kT, mass=mass, damping=damping, rcut=rcut) 161 | return fm.sample(rvec0, nstep, nskip=nskip) 162 | 163 | def get_prinz_pot(nstep, x0 = 0., 
nskip=1, dt=0.01, kT=10.0, mass=1.0, damping=1.0): 164 | r"""wrapper for the Prinz model generator""" 165 | pw = PrinzModel(dt, kT, mass=mass, damping=damping) 166 | return pw.sample(x0, nstep, nskip=nskip) 167 | 168 | def get_alanine_data(input_type = 'coordinates', return_dihedrals = True): 169 | 170 | import mdshare 171 | 172 | retval = [] 173 | 174 | if input_type == 'distances': 175 | 176 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-distances.npz') 177 | 178 | traj_whole = np.load(local_filename)['arr_0'] 179 | 180 | elif input_type == 'coordinates': 181 | 182 | local_filename = mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-positions.npz') 183 | 184 | traj_whole = np.load(local_filename)['arr_0'] 185 | 186 | retval.append(traj_whole) 187 | 188 | if return_dihedrals: 189 | dihedral = np.load(mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz'))['arr_0'] 190 | retval.append(dihedral) 191 | 192 | 193 | return retval 194 | 195 | 196 | def build_generator_on_source(data_source, batch_size, lag, output_size): 197 | '''Function used to create a generator that will fetch data from a data source through an iterator. 198 | This can be passed as parameter to a keras fit_generator method. 199 | 200 | Parameters 201 | ---------- 202 | data_source: pyemma source object. 203 | Data files source. This has to be initialized with chunksize = batch_size 204 | 205 | batch_size: int 206 | Batch size to be used for the training 207 | 208 | lag: int 209 | time frames lag to be used in the training of the VAMPnets 210 | 211 | output_size: int 212 | How many output nodes the network has 213 | ''' 214 | 215 | counter_batches = 0 216 | 217 | 218 | # How many batches before the iterator has to be reinitialized 219 | steps_epoch = np.sum(np.ceil((data_source.trajectory_lengths()-lag)/batch_size)) 220 | 221 | data_iterator = data_source.iterator(chunk = batch_size, 222 | lag = lag, 223 | return_trajindex=False) 224 | 225 | while True: 226 | 227 | input_data = list(data_iterator.next()) 228 | 229 | # Create empty labels to accomodate keras' interface requirements 230 | labels = np.empty((input_data[0].shape[0],2*output_size)).astype('float32') 231 | data = input_data, labels 232 | counter_batches += 1 233 | 234 | if counter_batches == steps_epoch: 235 | data_iterator = data_source.iterator(chunk = batch_size, 236 | lag = lag, 237 | return_trajindex=False) 238 | counter_batches = 0 239 | 240 | yield data 241 | 242 | 243 | 244 | def build_generator_on_source_shuffle(data_source, batch_size, lag, output_size, preloaded_batches = 1): 245 | '''Function used to create a generator that will randomly access data and fetch them from a data 246 | source through an iterator. This can be passed as parameter to a keras fit_generator method. 247 | 248 | Parameters 249 | ---------- 250 | data_source: pyemma source object. 251 | Data files source. 
This has to be initialized with chunksize = batch_size 252 | 253 | batch_size: int 254 | Batch size to be used for the training 255 | 256 | lag: int 257 | time frames lag to be used in the training of the VAMPnets 258 | 259 | output_size: int 260 | How many output nodes the network has 261 | 262 | preloaded_batches: int 263 | How many batches of data should be loaded at once; higher values will improve 264 | execution speed but also memory consumption 265 | ''' 266 | 267 | counter_batches = 0 268 | 269 | 270 | # How many batches before the iterator has to be reinitialized 271 | steps_epoch = np.ceil(np.sum((data_source.trajectory_lengths()-lag)/ (batch_size* preloaded_batches))) 272 | input_size = data_source.dimension() 273 | 274 | 275 | traj_lengths = data_source.trajectory_lengths() 276 | remaining_frames = np.concatenate([[index_traj*np.ones((traj_len - lag)), np.arange(traj_len - lag)] for index_traj, traj_len in enumerate(traj_lengths)], axis = 1).T.astype('int') 277 | indexes = np.arange(remaining_frames.shape[0]) 278 | np.random.shuffle(indexes) 279 | 280 | while True: 281 | 282 | start = counter_batches * batch_size * preloaded_batches 283 | end = min(start + batch_size * preloaded_batches, remaining_frames.shape[0]) 284 | 285 | frames = remaining_frames[indexes[start:end]] 286 | 287 | fake_ind = frames[:,0]*(traj_lengths.sum()) + frames[:,1] 288 | arg_sort = np.argsort(fake_ind) 289 | sort_arg_sort = np.argsort(arg_sort) 290 | 291 | frames_tau = frames + np.array([np.zeros((frames.shape[0])), np.ones((frames.shape[0]))*lag], dtype = 'int').T 292 | 293 | 294 | data_iterator_t = data_source.iterator(stride=frames[arg_sort], 295 | return_trajindex=False) 296 | data_iterator_tau = data_source.iterator(stride=frames_tau[arg_sort], 297 | return_trajindex=False) 298 | 299 | data = np.empty((2, batch_size * preloaded_batches, input_size)) 300 | start_iter = 0 301 | for iter_data, iter_data_tau in zip(data_iterator_t, data_iterator_tau): 302 | temp_frames = iter_data.shape[0] 303 | end_iter = start_iter + temp_frames 304 | data[0, start_iter:end_iter] = iter_data 305 | data[1, start_iter:end_iter] = iter_data_tau 306 | start_iter = end_iter 307 | 308 | 309 | data = data[:, sort_arg_sort] 310 | 311 | index_preloaded = 0 312 | labels = np.empty((batch_size,2*output_size)).astype('float32') 313 | 314 | while index_preloaded < preloaded_batches: 315 | 316 | start_batch = index_preloaded * batch_size 317 | end_batch = start_batch + batch_size 318 | index_preloaded += 1 319 | 320 | if end_batch > data.shape[1]: 321 | end_batch = data.shape[1] 322 | index_preloaded = preloaded_batches 323 | labels = np.empty((end_batch - start_batch,2*output_size)).astype('float32') 324 | 325 | output_data = [data[0, start_batch:end_batch], data[1, start_batch:end_batch]], labels 326 | 327 | yield output_data 328 | 329 | 330 | counter_batches += 1 331 | 332 | if counter_batches == steps_epoch: 333 | 334 | counter_batches = 0 335 | indexes = np.arange(remaining_frames.shape[0]) 336 | np.random.shuffle(indexes) -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/utils.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 
2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | Tools to handle datasets, transformations, and statistics. 20 | ''' 21 | 22 | import numpy as _np 23 | import torch as _torch 24 | from torch import nn as _nn 25 | from torch.utils.data import Dataset as _Dataset 26 | from torch.utils.data import TensorDataset as _TensorDataset 27 | from torch.utils.data import ConcatDataset as _ConcatDataset 28 | from torch.utils.data import DataLoader as _DataLoader 29 | 30 | __all__ = [ 31 | 'LaggedDataset', 32 | 'MaskedDataset', 33 | 'create_dataset', 34 | 'stride_split', 35 | 'random_split', 36 | 'random_block_split', 37 | 'get_mean', 38 | 'get_covariance', 39 | 'get_sqrt_inverse', 40 | 'whiten_data', 41 | 'cca', 42 | 'Transform'] 43 | 44 | ################################################################################ 45 | # 46 | # DATASETS 47 | # 48 | ################################################################################ 49 | 50 | class LaggedDataset(_Dataset): 51 | '''Dataset for wrapping time-lagged data from a single stored time series. 52 | 53 | Each sample will contain the data_tensor at index t and the (not explicitly 54 | stored) target_tensor via data_tensor at index t+lag. We need this for 55 | training the time-lagged autoencoder and TICA. 56 | 57 | Arguments: 58 | data_tensor (Tensor): contains time series data 59 | lag (int): specifies the lag in time steps 60 | ''' 61 | def __init__(self, data_tensor, lag=1): 62 | assert data_tensor.size(0) > lag, 'you need more samples than lag' 63 | assert lag >= 0, 'you need a non-negative lagtime' 64 | self.data_tensor = data_tensor 65 | self.lag = lag 66 | def __getitem__(self, index): 67 | return self.data_tensor[index], self.data_tensor[index + self.lag] 68 | def __len__(self): 69 | return self.data_tensor.size(0) - self.lag 70 | 71 | class MaskedDataset(_Dataset): 72 | '''Dataset for wrapping a specified subset of another dataset. 73 | 74 | This helps to separate a dataset into two or more subsets, e.g., for 75 | training and testing. 
76 | 77 | Arguments: 78 | dataset (Dataset): the dataset you want to wrap 79 | active (sequence of int): indices of the active elements 80 | ''' 81 | def __init__(self, dataset, active): 82 | assert len(dataset) >= len(active), \ 83 | 'you cannot have less total samples than active' 84 | assert _np.all(0 <= active) and _np.all(active < len(dataset)), \ 85 | 'you must use only valid indices' 86 | assert len(active) == len(_np.unique(active)), \ 87 | 'you must use every active index only once' 88 | self.dataset = dataset 89 | self.active = active 90 | def __getitem__(self, index): 91 | return self.dataset[self.active[index]] 92 | def __len__(self): 93 | return len(self.active) 94 | 95 | def ensure_traj_format(data, dtype=_np.float32): 96 | data = _np.asarray(data, dtype=dtype) 97 | if data.ndim == 2: 98 | return data 99 | elif data.ndim == 1: 100 | return data.reshape(-1, 1) 101 | else: 102 | raise ValueError('data has incompatible ndim: ' + str(data.ndim)) 103 | 104 | def create_dataset(data, lag=0, dtype=_np.float32): 105 | '''Create a (time-lagged) dataset from one or more numpy.ndarrays. 106 | 107 | Arguments: 108 | data (numpy.ndarray or list thereof): data to create the dataset from 109 | lag (int): specifies the lag in time steps 110 | dtype (numpy.dtype): dtype of the resulting dataset 111 | ''' 112 | if isinstance(data, _np.ndarray): 113 | return LaggedDataset( 114 | _torch.from_numpy(ensure_traj_format(data, dtype=dtype)), 115 | lag=lag) 116 | elif isinstance(data, (list, tuple)): 117 | return _ConcatDataset([LaggedDataset( 118 | _torch.from_numpy(ensure_traj_format(d, dtype=dtype)), 119 | lag=lag) for d in data]) 120 | else: 121 | raise ValueError( 122 | 'use a single or a list of numpy.ndarrays of dim 1 or 2') 123 | 124 | def stride_split(dataset, stride=2, offset=0): 125 | '''Split one dataset into two parts based on a stride. 126 | 127 | This helps to separate a dataset into two or more subsets, e.g., for 128 | training and testing. Every stride-th element starting from offset 129 | goes into the first MaskedDataset, everything else into the second. 130 | 131 | Arguments: 132 | dataset (Dataset): contains the data you want to split 133 | stride (int): specify the size of the stride 134 | offset (int): specify where to start counting 135 | ''' 136 | assert 0 < stride < len(dataset), \ 137 | 'use a positive stride smaller than the length of the dataset' 138 | assert 0 <= offset < stride, \ 139 | 'use a non-negative offset smaller than the stride' 140 | active = _np.arange(offset, len(dataset), stride) 141 | complement = _np.setdiff1d( 142 | _np.arange(len(dataset)), active, assume_unique=True) 143 | return MaskedDataset(dataset, active), MaskedDataset(dataset, complement) 144 | 145 | def random_split(dataset, active=None, n_active=None, f_active=None): 146 | '''Split one dataset into two parts based on a random selection. 147 | 148 | This helps to separate a dataset into two or more subsets, e.g., for 149 | training and testing. Specify the active set either by giving the frame 150 | indices, the number of active frames or the fraction of active frames.
151 | 152 | Arguments: 153 | dataset (Dataset): contains the data you want to split 154 | active (iterable of int): specify the active frames 155 | n_active (int): number of active frames 156 | f_active (float): fraction of active frames 157 | ''' 158 | if active is None: 159 | if n_active is None: 160 | if f_active is None: 161 | raise ValueError( 162 | 'specify either active, n_active or f_active') 163 | else: 164 | assert 0 < f_active < 1, \ 165 | 'f_active must be 0 < f_active < 1' 166 | n_active = int(_np.floor(0.5 + f_active * len(dataset))) 167 | else: 168 | assert 0 < n_active < len(dataset), \ 169 | 'n_active must be 0 < n_active < len(dataset)' 170 | if f_active is not None: 171 | raise ValueError( 172 | 'do not specify f_active if n_active is given') 173 | active = _np.random.choice(len(dataset), size=n_active, replace=False) 174 | else: 175 | active = _np.asarray(active) 176 | assert len(active) == len(_np.unique(active)), \ 177 | 'you must use every active index only once' 178 | assert _np.all(0 <= active) and _np.all(active < len(dataset)), \ 179 | 'you must use only valid indices' 180 | if f_active is not None: 181 | raise ValueError( 182 | 'do not specify f_active if active is given') 183 | if n_active is not None: 184 | raise ValueError( 185 | 'do not specify n_active if active is given') 186 | complement = _np.setdiff1d( 187 | _np.arange(len(dataset)), active, assume_unique=True) 188 | return MaskedDataset(dataset, active), MaskedDataset(dataset, complement) 189 | 190 | def random_block_split(dataset, lag, f_active=0.5): 191 | '''Split one dataset into two parts based on a random selection of blocks. 192 | 193 | This helps to separate a dataset into two or more subsets, e.g., for 194 | training and testing. Specify the active set either by giving the fraction 195 | of active blocks (the total number of transitions is conserved). 196 | 197 | Arguments: 198 | dataset (Dataset): contains the data you want to split 199 | lag (int): specifies the lag in time steps 200 | f_active (float): fraction of active blocks 201 | ''' 202 | active = [] 203 | n = 0 204 | nmax = len(dataset) 205 | n_blocks = int(_np.ceil(float(nmax) / float(lag))) 206 | n_active_blocks = int(_np.floor(0.5 + f_active * n_blocks)) 207 | active_blocks = _np.random.choice( 208 | n_blocks, size=n_active_blocks, replace=False) 209 | for n in active_blocks: 210 | active += _np.arange(n * lag, min((n + 1) * lag, nmax)).tolist() 211 | return random_split(dataset, active=active) 212 | 213 | ################################################################################ 214 | # 215 | # STATISTICS 216 | # 217 | ################################################################################ 218 | 219 | def get_mean(loader): 220 | '''Compute the mean value via minibatch summation using a loader. 221 | 222 | Arguments: 223 | loader (DataLoader): contains the data you want to analyze 224 | ''' 225 | x_mean, y_mean = None, None 226 | for x, y in loader: 227 | try: 228 | x_mean.add_(x.sum(dim=0)) 229 | except AttributeError: 230 | x_mean = x.sum(dim=0) 231 | try: 232 | y_mean.add_(y.sum(dim=0)) 233 | except AttributeError: 234 | y_mean = y.sum(dim=0) 235 | x_mean.div_(float(len(loader.dataset))) 236 | y_mean.div_(float(len(loader.dataset))) 237 | return x_mean, y_mean 238 | 239 | def get_covariance(loader, x_mean, y_mean): 240 | '''Compute the instantaneous and time-lagged covariance matrices via 241 | minibatch summation using a loader. 
242 | 243 | Arguments: 244 | loader (DataLoader): contains the data you want to analyze 245 | x_mean (Tensor): mean value for the data_tensor 246 | y_mean (Tensor): mean value for the target_tensor 247 | ''' 248 | cxx = _torch.zeros(len(x_mean), len(x_mean)) 249 | cxy = _torch.zeros(len(x_mean), len(y_mean)) 250 | cyy = _torch.zeros(len(y_mean), len(y_mean)) 251 | for x, y in loader: 252 | x.sub_(x_mean[None, :]) 253 | y.sub_(y_mean[None, :]) 254 | cxx.add_(_torch.mm(x.t(), x)) 255 | cxy.add_(_torch.mm(x.t(), y)) 256 | cyy.add_(_torch.mm(y.t(), y)) 257 | cxx.div_(float(len(loader.dataset))) 258 | cxy.div_(float(len(loader.dataset))) 259 | cyy.div_(float(len(loader.dataset))) 260 | return cxx, cxy, cyy 261 | 262 | ################################################################################ 263 | # 264 | # WHITENING 265 | # 266 | ################################################################################ 267 | 268 | def get_sqrt_inverse(matrix, bias=1.0e-5): 269 | '''Compute the sqrt-inverse of the supplied symmetric/real matrix. 270 | 271 | We need this step for whitening and TICA. 272 | 273 | Arguments: 274 | matrix (Tensor): contains the matrix you want to transform 275 | bias (float): assures numerical stability 276 | ''' 277 | e, v = _torch.symeig(matrix, eigenvectors=True) 278 | d = _torch.diag(1.0 / _torch.sqrt(_torch.abs(e) + bias)) 279 | return _torch.mm(_torch.mm(v, d), v.t()) 280 | 281 | def whiten_data(data_tensor, batch_size=100): 282 | '''Whiten a Tensor in the PCA basis. 283 | 284 | Arguments: 285 | data_tensor (Tensor): contains the data you want to whiten 286 | batch_size (int): specify a batch size for the whitening process 287 | ''' 288 | loader = _DataLoader( 289 | LaggedDataset(data_tensor, lag=0), batch_size=batch_size) 290 | x_mean, y_mean = get_mean(loader) 291 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 292 | ixx = get_sqrt_inverse(cxx) 293 | whitened_data = [] 294 | for x, _ in loader: 295 | x.sub_(x_mean[None, :]) 296 | whitened_data.append(x.mm(ixx)) 297 | return _torch.cat(whitened_data) 298 | 299 | ################################################################################ 300 | # 301 | # CCA 302 | # 303 | ################################################################################ 304 | 305 | def cca(data_tensor_x, data_tensor_y, batch_size=100): 306 | '''Perform canonical correlation analysis for two data tensors. 
307 | 308 | Arguments: 309 | data_tensor_x (Tensor): contains the first data tensor 310 | data_tensor_y (Tensor): contains the second data tensor 311 | batch_size (int): specify a batch size for the CCA calculation 312 | ''' 313 | loader = _DataLoader( 314 | _TensorDataset(data_tensor_x, data_tensor_y), 315 | batch_size=batch_size) 316 | x_mean, y_mean = get_mean(loader) 317 | cxx, cxy, cyy = get_covariance(loader, x_mean, y_mean) 318 | ixx = get_sqrt_inverse(cxx) 319 | iyy = get_sqrt_inverse(cyy) 320 | return _torch.svd(_torch.mm(_torch.mm(ixx, cxy), iyy)) 321 | 322 | ################################################################################ 323 | # 324 | # TRANSFORMER 325 | # 326 | ################################################################################ 327 | 328 | class BaseTransform(object): 329 | def __init__(self, mean=None, covariance=None): 330 | if mean is not None: 331 | self.sub = mean 332 | if covariance is not None: 333 | self.mul = get_sqrt_inverse(covariance) 334 | def __call__(self, x): 335 | try: 336 | x.sub_(self.sub[None, :]) 337 | except AttributeError: 338 | pass 339 | try: 340 | x = x.mm(self.mul) 341 | except AttributeError: 342 | pass 343 | return x 344 | 345 | class Transform(object): 346 | '''Apply whitening/centering transformations within a minibatch. 347 | 348 | As we do not want to preprocess and, thus, duplicate large datasets, 349 | we do the necessary whitening and centering operations on the fly while 350 | iterating over the datasets. 351 | 352 | Arguments: 353 | x_mean (Tensor): contains the mean of the data tensor 354 | x_covariance (Tensor): contains the covariance of the data tensor 355 | y_mean (Tensor): contains the mean of the target tensor 356 | y_covariance (Tensor): contains the covariance of the target tensor 357 | ''' 358 | def __init__( 359 | self, x_mean=None, x_covariance=None, y_mean=None, y_covariance=None): 360 | self.x = BaseTransform(mean=x_mean, covariance=x_covariance) 361 | self.y = BaseTransform(mean=y_mean, covariance=y_covariance) 362 | def __call__(self, x, y): 363 | return self.x(x), self.y(y) 364 | -------------------------------------------------------------------------------- /vampnet/examples/Folding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "%matplotlib inline\n", 19 | "import vampnet\n", 20 | "from vampnet import data_generator\n", 21 | "from keras.models import Model\n", 22 | "from keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 23 | "from keras import optimizers\n", 24 | "import tensorflow as tf\n", 25 | "from keras.backend import clear_session" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# generate 10^7 frames and energy values\n", 35 | "datapoints = int(1e6)\n", 36 | "stride = 10" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "x = data_generator.get_folding_model_data(datapoints, rvec0=2.0 * (np.random.rand(5) - 0.5), kT=1., dt = 0.1)\n", 46 | "r = np.linalg.norm(x, axis=-1)[::stride]" 47 | ] 48 
| }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "pot = np.zeros_like(r)\n", 56 | "for i in range(r.shape[0]):\n", 57 | " pot[i] = data_generator.folding_model_energy(r[i], 3)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "plt.plot(r[::stride], pot[::stride], '.')\n", 67 | "plt.show()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "traj_whole = x\n", 77 | "traj_data_points, input_size = traj_whole.shape" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# All Hyperparameters\n", 87 | "\n", 88 | "# Tau, how much is the timeshift of the two datasets\n", 89 | "tau = 10\n", 90 | "\n", 91 | "# Batch size for Stochastic Gradient descent\n", 92 | "batch_size = 2048\n", 93 | "\n", 94 | "# Which trajectory points percentage is used as training\n", 95 | "train_ratio = 0.9\n", 96 | "\n", 97 | "# How many hidden layers the network has\n", 98 | "network_depth = 4\n", 99 | "\n", 100 | "# Width of every layer\n", 101 | "layer_width = 20\n", 102 | "nodes = [layer_width]*network_depth\n", 103 | "# Learning rate used for the ADAM optimizer\n", 104 | "learning_rate = 0.0001\n", 105 | "\n", 106 | "# How many output states the network has\n", 107 | "output_size = 2\n", 108 | "\n", 109 | "# Iteration over the training set in the fitting process\n", 110 | "nb_epoch = 20\n", 111 | "\n", 112 | "plot_stride = 200" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "epsilon = 1e-5\n", 122 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# Shuffle trajectory and lagged trajectory together\n", 132 | "length_data = traj_data_points - tau\n", 133 | "\n", 134 | "traj_ord= traj_whole[:length_data]\n", 135 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n", 136 | "\n", 137 | "indexes = np.arange(length_data)\n", 138 | "np.random.shuffle(indexes)\n", 139 | "\n", 140 | "\n", 141 | "\n", 142 | "traj = traj_ord[indexes]\n", 143 | "traj_lag = traj_ord_lag[indexes]\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Prepare data for tensorflow usage\n", 153 | "length_train = int(np.floor(length_data * train_ratio))\n", 154 | "length_vali = length_data - length_train\n", 155 | "\n", 156 | "traj_data_train = traj[:length_train]\n", 157 | "traj_data_train_lag = traj_lag[:length_train]\n", 158 | "\n", 159 | "traj_data_valid = traj[length_train:]\n", 160 | "traj_data_valid_lag = traj_lag[length_train:]\n", 161 | "\n", 162 | "#Data used for states ordering\n", 163 | "X1 = traj_ord[:length_data].astype('float32')\n", 164 | "X2 = traj_ord_lag[:length_data].astype('float32')\n", 165 | "\n", 166 | "# Input of the first network\n", 167 | "X1_train = traj_data_train.astype('float32')\n", 168 | "X2_train = traj_data_train_lag.astype('float32')\n", 169 | "\n", 170 | "# Input for validation\n", 171 | "X1_vali = traj_data_valid.astype('float32')\n", 172 | "X2_vali = traj_data_valid_lag.astype('float32')\n", 173 | "\n", 174 | "# Needs a 
Y-train set which we dont have.\n", 175 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n", 176 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "if 'model' in globals():\n", 186 | " del model\n", 187 | " clear_session()\n", 188 | "\n", 189 | " \n", 190 | "# Build the model\n", 191 | "Data_X = Input(shape = (input_size,))\n", 192 | "Data_Y = Input(shape = (input_size,))\n", 193 | "\n", 194 | "# A batch normalization layer improves convergence speed\n", 195 | "# bn_layer = BatchNormalization()\n", 196 | "bn_layer = Activation('linear')\n", 197 | "\n", 198 | "# Instance layers and assign them to the two lobes of the network\n", 199 | "dense_layers = [Dense(node, activation = 'relu',)\n", 200 | " for node in nodes]\n", 201 | "\n", 202 | "lx_branch = bn_layer(Data_X)\n", 203 | "rx_branch = bn_layer(Data_Y)\n", 204 | "\n", 205 | "for i, layer in enumerate(dense_layers):\n", 206 | "\n", 207 | " lx_branch = dense_layers[i](lx_branch)\n", 208 | " rx_branch = dense_layers[i](rx_branch)\n", 209 | "\n", 210 | "\n", 211 | "# Add a softmax output layer.\n", 212 | "# Should be replaced with a linear activation layer if\n", 213 | "# the outputs of the network cannot be interpreted as states\n", 214 | "softmax = Dense(output_size, activation='softmax')\n", 215 | "\n", 216 | "lx_branch = softmax(lx_branch)\n", 217 | "rx_branch = softmax(rx_branch)\n", 218 | "\n", 219 | "# Merge both networks to train both at the same time\n", 220 | "merged = concatenate([lx_branch, rx_branch])\n", 221 | "\n", 222 | "# Initialize the model and the optimizer, and compile it with\n", 223 | "# the loss and metric functions from the VAMPnets package\n", 224 | "model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 225 | "# model.summary()\n", 226 | "# Compile it with our own loss-function\n", 227 | "adam = optimizers.adam(lr = learning_rate)\n", 228 | "\n", 229 | "\n", 230 | "# Pretraining with VAMP with 'symmetrized' matrices yields a bad approximation of the \n", 231 | "# eigenvectors per se, but improves the 'readability' of the states identified by VAMP-2\n", 232 | "# which would otherwise be difficult to interprete.\n", 233 | "\n", 234 | "\n", 235 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 236 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 237 | "\n", 238 | "losses = [\n", 239 | " vamp._loss_VAMP_sym,\n", 240 | " vamp.loss_VAMP2,\n", 241 | "]\n", 242 | "\n", 243 | "valid_metric = np.zeros((len(losses), nb_epoch))\n", 244 | "train_metric = np.zeros((len(losses), nb_epoch))\n", 245 | "\n", 246 | "for l_index, loss in enumerate(losses):\n", 247 | " \n", 248 | " model.compile(optimizer = 'adam', loss = loss, metrics = [vamp.metric_VAMP])\n", 249 | " \n", 250 | " hist = model.fit([X1_train, X2_train], Y_train ,batch_size=batch_size, epochs=nb_epoch, verbose=0,\n", 251 | " validation_data=([X1_vali, X2_vali], Y_vali))\n", 252 | " \n", 253 | " states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 254 | "\n", 255 | " # Order the output states based on their population\n", 256 | " coor_pred = np.argmax(states_prob, axis = 1)\n", 257 | " indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 258 | " states_num = [len(i[0]) for i in indexes]\n", 259 | " states_order = 
np.argsort(states_num).astype('int')[::-1]\n", 260 | "\n", 261 | " pred_ord = states_prob[:,states_order]\n", 262 | " \n", 263 | " X_Validation = np.linalg.norm(traj_ord, axis=1)\n", 264 | " for i in range(output_size):\n", 265 | " plt.plot(X_Validation[::plot_stride], pred_ord[::plot_stride,i], '.', label = 'state '+str(i))\n", 266 | " plt.legend()\n", 267 | " plt.title('States probabilites')\n", 268 | " plt.show()\n", 269 | " tau_msm = 20\n", 270 | " pred_ord_meanfree = pred_ord - pred_ord.mean(0)\n", 271 | " K_smt = vamp.estimate_koopman_op(pred_ord_meanfree, tau_msm)\n", 272 | "\n", 273 | " K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n", 274 | "\n", 275 | " index = np.argmax(np.real(K_eigvals))\n", 276 | " real_eigfunc = pred_ord_meanfree @ np.real(K_eigvec[:,index])\n", 277 | "\n", 278 | " plt.plot(X_Validation[::plot_stride], real_eigfunc[::plot_stride], '.')\n", 279 | " plt.title('Eigenvector')\n", 280 | " plt.show()\n", 281 | "\n", 282 | " valid_metric[l_index] = np.array(hist.history['val_metric_VAMP'])\n", 283 | " train_metric[l_index] = np.array(hist.history['metric_VAMP'])\n", 284 | "\n", 285 | "valid_metric = np.reshape(valid_metric, (-1))\n", 286 | "train_metric = np.reshape(train_metric, (-1))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# Training result visualization\n", 296 | "\n", 297 | "plt.plot(train_metric, label = 'Training')\n", 298 | "plt.legend()\n", 299 | "plt.plot(valid_metric, label = 'Validation')\n", 300 | "plt.legend()\n", 301 | "\n", 302 | "plt.show()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# Transform the input trajectory using the network\n", 312 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 313 | "\n", 314 | "# Order the output states based on their population\n", 315 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 316 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 317 | "states_num = [len(i[0]) for i in indexes]\n", 318 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 319 | "\n", 320 | "pred_ord = states_prob[:,states_order]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "X_Validation = np.linalg.norm(traj_ord, axis=1)\n", 330 | "for i in range(output_size):\n", 331 | " plt.plot(X_Validation[::plot_stride], pred_ord[::plot_stride,i], '.', label = 'state '+str(i))\n", 332 | " \n", 333 | "scaled_pot = (pot-pot.min())/(pot.max()-pot.min())\n", 334 | " \n", 335 | "plt.plot(r[::plot_stride], scaled_pot[::plot_stride], '.', label = 'Potential')\n", 336 | "plt.show()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "tau_msm = 20\n", 346 | "pred_ord_meanfree = pred_ord - pred_ord.mean(0)\n", 347 | "K_smt = vamp.estimate_koopman_op(pred_ord_meanfree, tau_msm)\n", 348 | "\n", 349 | "K_eigvals, K_eigvec = np.linalg.eig(np.real(K_smt))\n", 350 | "\n", 351 | "index = np.argmax(np.real(K_eigvals))\n", 352 | "real_eigfunc = pred_ord_meanfree @ np.real(K_eigvec[:,index])\n", 353 | "\n", 354 | "plt.plot(X_Validation[::plot_stride], real_eigfunc[::plot_stride], '.')\n", 355 | "plt.title('Eigenvector')\n", 356 | "plt.show()" 357 | ] 
358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "# Visualize the population of the states" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "def print_states_pie_chart():\n", 373 | " coors = []\n", 374 | " maxi = np.max(pred_ord, axis= 1)\n", 375 | "\n", 376 | " for i in range(output_size):\n", 377 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 378 | " \n", 379 | " fig1, ax1 = plt.subplots()\n", 380 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 381 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 382 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 383 | " plt.show()\n", 384 | "\n", 385 | "print_states_pie_chart()" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "# Estimate the implied timescales" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "max_tau = 200\n", 402 | "lag = np.arange(1, max_tau, 1)\n", 403 | "its = vamp.get_its(pred_ord, lag)\n", 404 | "vamp.plot_its(its, lag)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "# Chapman-Kolmogorov test for the estimated koopman operator" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "steps = 24\n", 421 | "tau_msm = 50\n", 422 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n", 423 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [] 432 | } 433 | ], 434 | "metadata": { 435 | "anaconda-cloud": {}, 436 | "kernelspec": { 437 | "display_name": "Python 3", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.6.4" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 1 456 | } 457 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/benchmarks.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. 
If not, see . 17 | 18 | ''' 19 | Automatized benchmarks. 20 | ''' 21 | 22 | import multiprocessing as mp 23 | import numpy as np 24 | import torch 25 | import tae 26 | import os 27 | 28 | import tae 29 | import torch 30 | import pyemma 31 | from time import time 32 | 33 | try: 34 | import pyemma 35 | except ImportError: 36 | print('running benchmarks requires the pyemma package') 37 | 38 | try: 39 | from mdshare import load as _load 40 | except ImportError: 41 | print('running benchmarks requires the mdshare package') 42 | 43 | ################################################################################ 44 | # 45 | # BENCHMARKING THE SQRT TOY MODEL 46 | # 47 | ################################################################################ 48 | 49 | def evaluate_sqrt_model( 50 | length=10000, 51 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 52 | msm_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 53 | use_cuda=True): 54 | '''A wrapper to run the sqrt model benchmarks 55 | 56 | Arguments: 57 | length (int): length of the sampled trajectory 58 | trns_lags (list of int): lag times for the transformers 59 | msm_lags (list of int): lag times for the MSM validation 60 | use_cuda (boolean): use a GPU to run the benchmarks 61 | ''' 62 | def analyse(lat_data, ref_data, msm_lags): 63 | cca = tae.utils.cca(torch.from_numpy(lat_data), ref_data)[1].numpy() 64 | centers = np.linspace(np.min(lat_data), np.max(lat_data), 101) 65 | centers = 0.5 * (centers[:-1] + centers[1:]).reshape(-1, 1) 66 | dtraj = pyemma.coordinates.assign_to_centers(lat_data, centers) 67 | its = pyemma.msm.its(dtraj, lags=msm_lags, nits=1).timescales 68 | return cca, its 69 | data, dtraj = tae.toymodels.sample_sqrt_model(length) 70 | ref_data = tae.utils.whiten_data( 71 | torch.from_numpy(dtraj.reshape(-1, 1).astype(np.float32))) 72 | ref_its = pyemma.msm.its(dtraj, lags=msm_lags, nits=1).timescales 73 | lat, trn, val = tae.pca( 74 | data, dim=1, validation_split=0.5, batch_size=100, whiten=True) 75 | cca, its = analyse(lat, ref_data, msm_lags) 76 | result = dict( 77 | trns_lags=np.asarray(trns_lags), 78 | msm_lags=np.asarray(msm_lags), 79 | ref_its=np.asarray(ref_its), 80 | pca_its=np.asarray(its), 81 | pca_cca=np.asarray(cca), 82 | pca_trn=np.asarray(trn), 83 | pca_val=np.asarray(val)) 84 | for lag in trns_lags: 85 | lat, trn, val = tae.tica( 86 | data, dim=1, lag=lag, kinetic_map=True, symmetrize=True, 87 | validation_split=0.5, batch_size=100, whiten=True) 88 | cca, its = analyse(lat, ref_data, msm_lags) 89 | result.update({ 90 | 'tica_%d_its' % lag: np.asarray(its), 91 | 'tica_%d_cca' % lag: np.asarray(cca), 92 | 'tica_%d_trn' % lag: np.asarray(trn), 93 | 'tica_%d_val' % lag: np.asarray(val)}) 94 | lat, trn, val = tae.ae( 95 | data, dim=1, lag=lag, n_epochs=200, validation_split=0.5, 96 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 97 | cuda=use_cuda, non_blocking=use_cuda) 98 | cca, its = analyse(lat, ref_data, msm_lags) 99 | result.update({ 100 | 'ae_%d_its' % lag: np.asarray(its), 101 | 'ae_%d_cca' % lag: np.asarray(cca), 102 | 'ae_%d_trn' % lag: np.asarray(trn), 103 | 'ae_%d_val' % lag: np.asarray(val)}) 104 | return result 105 | 106 | ################################################################################ 107 | # 108 | # BENCHMARKING THE SWISSROLL TOY MODEL 109 | # 110 | ################################################################################ 111 | 112 | def evaluate_swissroll_model( 113 | dim=None, 114 | length=30000, 115 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 116 | msm_lags=[1, 
2, 3, 4, 5, 6, 7, 8, 9, 10], 117 | use_cuda=True): 118 | '''A wrapper to run the swissroll model benchmarks 119 | 120 | Arguments: 121 | dim (int): specify the latent dimension (1 or 2) 122 | length (int): length of the sampled trajectory 123 | trns_lags (list of int): lag times for the transformers 124 | msm_lags (list of int): lag times for the MSM validation 125 | use_cuda (boolean): use a GPU to run the benchmarks 126 | ''' 127 | def analyse(lat_data, ref_data, msm_lags): 128 | cca = tae.utils.cca(torch.from_numpy(lat_data), ref_data)[1].numpy() 129 | if lat_data.shape[1] == 1: 130 | centers = np.linspace(np.min(lat_data), np.max(lat_data), 101) 131 | centers = 0.5 * (centers[:-1] + centers[1:]).reshape(-1, 1) 132 | dtraj = pyemma.coordinates.assign_to_centers(lat_data, centers) 133 | else: 134 | dtraj = pyemma.coordinates.cluster_regspace( 135 | lat_data, dmin=0.2, max_centers=400).dtrajs 136 | its = pyemma.msm.its(dtraj, lags=msm_lags, nits=3).timescales 137 | return cca, its 138 | data, dtraj = tae.toymodels.sample_swissroll_model(length) 139 | ref_data = tae.utils.whiten_data( 140 | torch.from_numpy(dtraj.reshape(-1, 1).astype(np.float32))) 141 | ref_its = pyemma.msm.its(dtraj, lags=msm_lags, nits=3).timescales 142 | lat, trn, val = tae.pca( 143 | data, dim=dim, validation_split=0.5, batch_size=100, whiten=True) 144 | cca, its = analyse(lat, ref_data, msm_lags) 145 | result = dict( 146 | trns_lags=np.asarray(trns_lags), 147 | msm_lags=np.asarray(msm_lags), 148 | ref_its=np.asarray(ref_its), 149 | pca_its=np.asarray(its), 150 | pca_cca=np.asarray(cca), 151 | pca_trn=np.asarray(trn), 152 | pca_val=np.asarray(val)) 153 | for lag in trns_lags: 154 | lat, trn, val = tae.tica( 155 | data, dim=dim, lag=lag, kinetic_map=True, symmetrize=True, 156 | validation_split=0.5, batch_size=100, whiten=True) 157 | cca, its = analyse(lat, ref_data, msm_lags) 158 | result.update({ 159 | 'tica_%d_its' % lag: np.asarray(its), 160 | 'tica_%d_cca' % lag: np.asarray(cca), 161 | 'tica_%d_trn' % lag: np.asarray(trn), 162 | 'tica_%d_val' % lag: np.asarray(val)}) 163 | lat, trn, val = tae.ae( 164 | data, dim=dim, lag=lag, n_epochs=200, validation_split=0.5, 165 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 166 | cuda=use_cuda, non_blocking=use_cuda) 167 | cca, its = analyse(lat, ref_data, msm_lags) 168 | result.update({ 169 | 'ae_%d_its' % lag: np.asarray(its), 170 | 'ae_%d_cca' % lag: np.asarray(cca), 171 | 'ae_%d_trn' % lag: np.asarray(trn), 172 | 'ae_%d_val' % lag: np.asarray(val)}) 173 | return result 174 | 175 | ################################################################################ 176 | # 177 | # BENCHMARKING THE ALANINE DIPEPTIDE MD SIMULATIONS 178 | # 179 | ################################################################################ 180 | 181 | def evaluate_ala2_md( 182 | n_trajs=5, 183 | length=50000, 184 | trns_lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 185 | msm_lags=[1, 2, 3, 5, 7, 10, 15, 20, 30, 40, 50], 186 | use_cuda=True): 187 | '''A wrapper to run the alanine dipeptide benchmarks 188 | 189 | Arguments: 190 | n_trajs (int): number of bootstrapped trajectories 191 | length (int): length of each bootstrapped trajectory 192 | trns_lags (list of int): lag times for the transformers 193 | msm_lags (list of int): lag times for the MSM validation 194 | use_cuda (boolean): use a GPU to run the benchmarks 195 | ''' 196 | def analyse(lat_data, ref_data, msm_lags): 197 | cca = tae.utils.cca( 198 | torch.cat([torch.from_numpy(array) for array in lat_data]), 199 | 
ref_data)[1].numpy() 200 | dtrajs = pyemma.coordinates.cluster_kmeans( 201 | lat_data, k=300, max_iter=50, stride=10).dtrajs 202 | its = pyemma.msm.its(dtrajs, lags=msm_lags, nits=3).timescales 203 | return cca, its 204 | with np.load(_load('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')) as fh: 205 | n_frames = [fh[key].shape[0] for key in sorted(fh.keys())] 206 | selection = [] 207 | for i in np.random.choice( 208 | len(n_frames), size=n_trajs, replace=True): 209 | selection.append( 210 | [i, np.random.randint(n_frames[i] - length)]) 211 | ref_data = [fh['arr_%d' % i][l:l+length] for i, l in selection] 212 | with np.load(_load('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')) as fh: 213 | data = [fh['arr_%d' % i][l:l+length] for i, l in selection] 214 | dtrajs = pyemma.coordinates.cluster_kmeans( 215 | ref_data, k=300, max_iter=50, stride=10).dtrajs 216 | ref_its = pyemma.msm.its(dtrajs, lags=msm_lags, nits=3).timescales 217 | ref_data = tae.utils.whiten_data( 218 | torch.cat([torch.from_numpy(array) for array in ref_data])) 219 | lat, trn, val = tae.pca( 220 | data, dim=2, validation_split=0.5, batch_size=100, whiten=True) 221 | cca, its = analyse(lat, ref_data, msm_lags) 222 | result = dict( 223 | trns_lags=np.asarray(trns_lags), 224 | msm_lags=np.asarray(msm_lags), 225 | ref_its=np.asarray(ref_its), 226 | pca_its=np.asarray(its), 227 | pca_cca=np.asarray(cca), 228 | pca_trn=np.asarray(trn), 229 | pca_val=np.asarray(val)) 230 | for lag in trns_lags: 231 | lat, trn, val = tae.tica( 232 | data, dim=2, lag=lag, kinetic_map=True, symmetrize=True, 233 | validation_split=0.5, batch_size=100, whiten=True) 234 | cca, its = analyse(lat, ref_data, msm_lags) 235 | result.update({ 236 | 'tica_%d_its' % lag: np.asarray(its), 237 | 'tica_%d_cca' % lag: np.asarray(cca), 238 | 'tica_%d_trn' % lag: np.asarray(trn), 239 | 'tica_%d_val' % lag: np.asarray(val)}) 240 | lat, trn, val = tae.ae( 241 | data, dim=2, lag=lag, n_epochs=200, validation_split=0.5, 242 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 243 | cuda=use_cuda, non_blocking=use_cuda) 244 | cca, its = analyse(lat, ref_data, msm_lags) 245 | result.update({ 246 | 'ae_%d_its' % lag: np.asarray(its), 247 | 'ae_%d_cca' % lag: np.asarray(cca), 248 | 'ae_%d_trn' % lag: np.asarray(trn), 249 | 'ae_%d_val' % lag: np.asarray(val)}) 250 | return result 251 | 252 | ################################################################################ 253 | # 254 | # BENCHMARKING THE VILLIN MD SIMULATIONS 255 | # 256 | ################################################################################ 257 | 258 | def evaluate_villin_md( 259 | data=None, 260 | n_blocks=10, 261 | trns_lags=[10, 20, 50, 100, 200, 500], 262 | msm_lags=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100, 125, 150, 175, 200, 250, 300, 400, 500, 700, 1000], 263 | use_cuda=True): 264 | '''An inner wrapper to run the villin benchmarks for a single featurization 265 | 266 | Arguments: 267 | data (numpy.ndarray): featurized md data 268 | n_blocks (int): number of blocks to divide the original trajectory in 269 | trns_lags (list of int): lag times for the transformers 270 | msm_lags (list of int): lag times for the MSM validation 271 | use_cuda (boolean): use a GPU to run the benchmarks 272 | ''' 273 | def analyse(lat_data, msm_lags): 274 | dtrajs = pyemma.coordinates.cluster_kmeans( 275 | lat_data, k=300, max_iter=50, stride=10).dtrajs 276 | return pyemma.msm.its(dtrajs, lags=msm_lags, nits=2).timescales 277 | nmax = len(data) 278 | length = int(np.floor(0.5 + 
float(nmax) / float(n_blocks))) 279 | active_blocks = np.random.choice(n_blocks, size=n_blocks, replace=True) 280 | _data = [data[n * length:min((n + 1) * length, nmax), :] for n in active_blocks] 281 | result = dict( 282 | trns_lags=np.asarray(trns_lags), 283 | msm_lags=np.asarray(msm_lags)) 284 | for lag in trns_lags: 285 | for dim in [2, 5]: 286 | lat, trn, val = tae.tica( 287 | _data, dim=2, lag=lag, kinetic_map=True, symmetrize=True, 288 | validation_split=0.5, batch_size=100, whiten=True) 289 | result.update({ 290 | 'tica_%d_%d_its' % (lag, dim): np.asarray(analyse(lat, msm_lags)), 291 | 'tica_%d_%d_trn' % (lag, dim): np.asarray(trn), 292 | 'tica_%d_%d_val' % (lag, dim): np.asarray(val)}) 293 | lat, trn, val = tae.ae( 294 | _data, dim=2, lag=lag, n_epochs=200, validation_split=0.5, 295 | batch_size=100, whiten=True, pin_memory=use_cuda, hid_size=[200, 100], 296 | cuda=use_cuda, non_blocking=use_cuda) 297 | result.update({ 298 | 'ae_%d_its' % lag: np.asarray(analyse(lat, msm_lags)), 299 | 'ae_%d_trn' % lag: np.asarray(trn), 300 | 'ae_%d_val' % lag: np.asarray(val)}) 301 | return result 302 | 303 | def evaluate_villin_md_wrapper( 304 | path_to_data=None, 305 | trns_lags=[10, 20, 50, 100, 200, 500], 306 | msm_lags=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100, 125, 150, 175, 200, 250, 300, 400, 500, 700, 1000], 307 | use_cuda=True): 308 | '''An outer wrapper to run the villin benchmarks for all featurizations 309 | 310 | Arguments: 311 | path_to_data (str): path to the villin data which we are not allowed to share 312 | n_blocks (int): number of blocks to divide the original trajectory in 313 | trns_lags (list of int): lag times for the transformers 314 | msm_lags (list of int): lag times for the MSM validation 315 | use_cuda (boolean): use a GPU to run the benchmarks 316 | ''' 317 | featurisations = dict({ 318 | 'bbt': 'villin-ff-1ns-backbone-torsions.npy', 319 | 'cap': 'villin-ff-1ns-ca-positions.npy', 320 | 'hap': 'villin-ff-1ns-heavy-atom-positions.npy', 321 | 'icad': 'villin-ff-1ns-inverse-ca-distances.npy'}) 322 | result = dict() 323 | for model in featurisations.keys(): 324 | data = np.load(os.path.join(path_to_data, featurisations[model])) 325 | model_result = evaluate_villin_md( 326 | data=data, trns_lags=trns_lags, 327 | msm_lags=msm_lags, use_cuda=use_cuda) 328 | for key in model_result.keys(): 329 | if key not in ['trns_lags', 'msm_lags']: 330 | result.update({'%s_%s' % (model, key): model_result[key]}) 331 | result.update(trns_lags=trns_lags, msm_lags=msm_lags) 332 | return result 333 | 334 | ################################################################################ 335 | # 336 | # MANUSCRIPT BENCHMARKS 337 | # 338 | ################################################################################ 339 | 340 | def worker(queue, gpu, seed, evaluate_func, evaluate_kwargs): 341 | with torch.cuda.device(gpu): 342 | np.random.seed(seed) 343 | torch.manual_seed(seed) 344 | torch.cuda.manual_seed(seed) 345 | try: 346 | result = evaluate_func(**evaluate_kwargs) 347 | except Exception as e: 348 | print(e) 349 | result = dict() 350 | queue.put(result) 351 | queue.task_done() 352 | 353 | def spawn( 354 | seed_generator, task_index, n_gpus, evaluate_func, evaluate_kwargs=dict()): 355 | processes = [] 356 | queue = mp.JoinableQueue() 357 | for gpu in range(n_gpus): 358 | seed = seed_generator(task_index, gpu, n_gpus=n_gpus) 359 | p = mp.Process( 360 | target=worker, 361 | args=[queue, gpu, seed, evaluate_func, evaluate_kwargs]) 362 | processes.append(p) 363 | print('Spawning task:%d on 
gpu:%d with seed:%d' % (task_index, gpu, seed)) 364 | for p in processes: 365 | p.start() 366 | queue.join() 367 | out = dict() 368 | for _ in processes: 369 | result = queue.get() 370 | for key in result.keys(): 371 | if key in ['trns_lags', 'msm_lags']: 372 | if key not in out: 373 | out.update({key: result[key]}) 374 | else: 375 | try: 376 | out[key].append(result[key]) 377 | except KeyError: 378 | out.update({key: [result[key]]}) 379 | return out 380 | -------------------------------------------------------------------------------- /vampnet/examples/Alanine_dipeptide.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "import vampnet\n", 22 | "from vampnet import data_generator as vamp_data_generator\n", 23 | "from tensorflow.contrib.keras.api.keras.models import Model\n", 24 | "from tensorflow.contrib.keras.api.keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 25 | "from tensorflow.contrib.keras.api.keras.optimizers import Adam\n", 26 | "import tensorflow as tf\n", 27 | "import matplotlib.gridspec as gridspec\n", 28 | "from tensorflow.contrib.keras.api.keras.backend import clear_session" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Necessary for downloading the trajectory data\n", 38 | "import mdshare" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Load Data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import pyemma.coordinates as pycoor" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "# Define Hyperparameters" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Tau, how much is the timeshift of the two datasets\n", 71 | "tau = 1\n", 72 | "\n", 73 | "# Batch size for Stochastic Gradient descent\n", 74 | "batch_size = 1000\n", 75 | "\n", 76 | "# Which trajectory points percentage is used as training\n", 77 | "train_ratio = 0.9\n", 78 | "\n", 79 | "# How many hidden layers the network has\n", 80 | "network_depth = 6\n", 81 | "\n", 82 | "# Width of every layer\n", 83 | "layer_width = 100\n", 84 | "\n", 85 | "# Learning rate used for the ADAM optimizer\n", 86 | "learning_rate = 1e-4\n", 87 | "\n", 88 | "# How many output states the network has\n", 89 | "output_size = 6\n", 90 | "\n", 91 | "# Iteration over the training set in the fitting process\n", 92 | "nb_epoch = 60\n", 93 | "\n", 94 | "epsilon = 1e-5" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "traj_whole, dihedral = vamp_data_generator.get_alanine_data()\n", 104 | "\n", 105 | "traj_data_points, input_size = traj_whole.shape" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "# Initialized the VAMPnets wrapper class" 113 | ] 114 | }, 115 | 
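# --- Editor's note (descriptive, not part of the original notebook code) ---
# The VampnetTools instance created in the next cell bundles the pieces of the
# vampnet package used across these example notebooks: the training losses
# (loss_VAMP2, loss_VAMP2_autograd), the Keras metrics (metric_VAMP,
# metric_VAMP2) and the Markov-model analysis helpers (estimate_koopman_op,
# get_its/plot_its, get_ck_test/plot_ck_test). The epsilon argument is assumed
# to act as a numerical regularizer for the covariance matrices entering the
# VAMP scores (assumption, not verified here).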
{ 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "# Shuffle trajectory and lagged trajectory together" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "length_data = traj_data_points - tau\n", 138 | "\n", 139 | "traj_ord = traj_whole[:length_data]\n", 140 | "traj_ord_lag = traj_whole[tau:length_data+tau]\n", 141 | "\n", 142 | "\n", 143 | "dihedral_init = dihedral[:length_data]\n", 144 | "\n", 145 | "indexes = np.arange(length_data)\n", 146 | "np.random.shuffle(indexes)\n", 147 | "\n", 148 | "traj = traj_ord[indexes]\n", 149 | "traj_lag = traj_ord_lag[indexes]\n", 150 | "dihedral_shuffle = dihedral_init[indexes]" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "# Prepare data for tensorflow usage" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "length_train = int(np.floor(length_data * train_ratio))\n", 167 | "length_vali = length_data - length_train\n", 168 | "\n", 169 | "traj_data_train = traj[:length_train]\n", 170 | "traj_data_train_lag = traj_lag[:length_train]\n", 171 | "\n", 172 | "traj_data_valid = traj[length_train:]\n", 173 | "traj_data_valid_lag = traj_lag[length_train:]\n", 174 | "\n", 175 | "# Input of the first network\n", 176 | "X1_train = traj_data_train.astype('float32')\n", 177 | "X2_train = traj_data_train_lag.astype('float32')\n", 178 | "\n", 179 | "# Input for validation\n", 180 | "X1_vali = traj_data_valid.astype('float32')\n", 181 | "X2_vali = traj_data_valid_lag.astype('float32')\n", 182 | "\n", 183 | "# Needs a Y-train set which we dont have.\n", 184 | "Y_train = np.zeros((length_train,2*output_size)).astype('float32')\n", 185 | "Y_vali = np.zeros((length_vali,2*output_size)).astype('float32')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Run several model iterations saving the best one, to help finding sparcely populated states" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "scrolled": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "max_vm = 0\n", 204 | "attempts = 10\n", 205 | "\n", 206 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 207 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 208 | "\n", 209 | "losses = [\n", 210 | " vamp.loss_VAMP2_autograd,\n", 211 | "]\n", 212 | "\n", 213 | "\n", 214 | "for i in range(attempts): \n", 215 | "\n", 216 | " # Clear the previous tensorflow session to prevent memory leaks\n", 217 | " clear_session()\n", 218 | "\n", 219 | " # Build the model\n", 220 | "\n", 221 | "\n", 222 | " nodes = [layer_width]*network_depth\n", 223 | "\n", 224 | " Data_X = Input(shape = (input_size,))\n", 225 | " Data_Y = Input(shape = (input_size,))\n", 226 | "\n", 227 | " # A batch normalization layer improves convergence speed\n", 228 | " bn_layer = BatchNormalization()\n", 229 | "\n", 230 | " # Instance layers and assign them to the two lobes of the network\n", 231 | " dense_layers = [Dense(node, activation = 'elu')# if index_layer < 3 else 'linear 
nodes')\n", 232 | " for index_layer,node in enumerate(nodes)]\n", 233 | "\n", 234 | " lx_branch = bn_layer(Data_X)\n", 235 | " rx_branch = bn_layer(Data_Y)\n", 236 | "\n", 237 | " for i, layer in enumerate(dense_layers):\n", 238 | "\n", 239 | " lx_branch = dense_layers[i](lx_branch)\n", 240 | " rx_branch = dense_layers[i](rx_branch)\n", 241 | "\n", 242 | "\n", 243 | " # Add a softmax output layer.\n", 244 | " # Should be replaced with a linear activation layer if\n", 245 | " # the outputs of the network cannot be interpreted as states\n", 246 | " softmax = Dense(output_size, activation='softmax')\n", 247 | "\n", 248 | " lx_branch = softmax(lx_branch)\n", 249 | " rx_branch = softmax(rx_branch)\n", 250 | "\n", 251 | " # Merge both networks to train both at the same time\n", 252 | " merged = concatenate([lx_branch, rx_branch])\n", 253 | "\n", 254 | " # Initialize the model and the optimizer, and compile it with\n", 255 | " # the loss and metric functions from the VAMPnets package\n", 256 | " model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 257 | " adam = Adam(lr = learning_rate/10)\n", 258 | "\n", 259 | " vm1 = np.zeros((len(losses), nb_epoch))\n", 260 | " tm1 = np.zeros_like(vm1)\n", 261 | " vm2 = np.zeros_like(vm1)\n", 262 | " tm2 = np.zeros_like(vm1)\n", 263 | " vm3 = np.zeros_like(vm1)\n", 264 | " tm3 = np.zeros_like(vm1)\n", 265 | " \n", 266 | " for l_index, loss_function in enumerate(losses):\n", 267 | "\n", 268 | " \n", 269 | " model.compile(optimizer = adam,\n", 270 | " loss = loss_function,\n", 271 | " metrics = [\n", 272 | " vamp.metric_VAMP,\n", 273 | " vamp.metric_VAMP2,\n", 274 | " ])\n", 275 | "\n", 276 | "\n", 277 | " # Train the model\n", 278 | " \n", 279 | " hist = model.fit([X1_train, X2_train], Y_train ,\n", 280 | " batch_size=batch_size,\n", 281 | " epochs=nb_epoch,\n", 282 | " validation_data=([X1_vali, X2_vali], Y_vali ),\n", 283 | " verbose=0)\n", 284 | "\n", 285 | "\n", 286 | " vm1[l_index] = np.array(hist.history['val_metric_VAMP'])\n", 287 | " tm1[l_index] = np.array(hist.history['metric_VAMP'])\n", 288 | " \n", 289 | " \n", 290 | " vm2[l_index] = np.array(hist.history['val_metric_VAMP2'])\n", 291 | " tm2[l_index] = np.array(hist.history['metric_VAMP2'])\n", 292 | " \n", 293 | " vm3[l_index] = np.array(hist.history['val_loss'])\n", 294 | " tm3[l_index] = np.array(hist.history['loss'])\n", 295 | " \n", 296 | " \n", 297 | " vm1 = np.reshape(vm1, (-1))\n", 298 | " tm1 = np.reshape(tm1, (-1))\n", 299 | " vm2 = np.reshape(vm2, (-1))\n", 300 | " tm2 = np.reshape(tm2, (-1))\n", 301 | " vm3 = np.reshape(vm3, (-1))\n", 302 | " tm3 = np.reshape(tm3, (-1))\n", 303 | "\n", 304 | " # Average the score obtained in the last part of the training process\n", 305 | " # in order to estabilish which model is better and thus worth saving\n", 306 | "\n", 307 | "\n", 308 | " score = vm1[-5:].mean()\n", 309 | " extra_msg = ''\n", 310 | " if score > max_vm:\n", 311 | " extra_msg = ' - Highest'\n", 312 | " best_weights = model.get_weights()\n", 313 | " max_vm = score\n", 314 | " vm1_max = vm1\n", 315 | " tm1_max = tm1\n", 316 | " vm2_max = vm2\n", 317 | " tm2_max = tm2\n", 318 | " vm3_max = vm3\n", 319 | " tm3_max = tm3\n", 320 | " \n", 321 | " print('Score: {0:.2f}'.format(score) + extra_msg)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "# Recover the saved model and its training history" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | 
"source": [ 337 | "model.set_weights(best_weights)\n", 338 | "\n", 339 | "tm1 = np.array(tm1_max)\n", 340 | "tm2 = np.array(tm2_max)\n", 341 | "tm3 = np.array(tm3_max)\n", 342 | "vm1 = np.array(vm1_max)\n", 343 | "vm2 = np.array(vm2_max)\n", 344 | "vm3 = np.array(vm3_max)\n" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "# Training result visualization" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "scrolled": false 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "plt.plot(vm1, label = 'VAMP')\n", 363 | "plt.plot(vm2, label = 'VAMP2')\n", 364 | "plt.plot(-vm3, label = 'loss')\n", 365 | "plt.plot(tm1, label = 'training VAMP')\n", 366 | "plt.plot(tm2, label = 'training VAMP2')\n", 367 | "plt.plot(-tm3, label = 'training loss')\n", 368 | "plt.legend()\n", 369 | "plt.show()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# Transform the input trajectory using the network\n", 379 | "states_prob = model.predict([traj_ord, traj_ord_lag])[:, :output_size]\n", 380 | "\n", 381 | "# Order the output states based on their population\n", 382 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 383 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 384 | "states_num = [len(i[0]) for i in indexes]\n", 385 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 386 | "\n", 387 | "pred_ord = states_prob[:,states_order]" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "# Visualize the population of the states" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "def print_states_pie_chart():\n", 404 | " coors = []\n", 405 | " maxi = np.max(pred_ord, axis= 1)\n", 406 | "\n", 407 | " for i in range(output_size):\n", 408 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 409 | " \n", 410 | " fig1, ax1 = plt.subplots()\n", 411 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 412 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 413 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 414 | " plt.show()\n", 415 | "\n", 416 | "print_states_pie_chart()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "# Visualize how the 4 states are placed on the Ramachandran plot" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "scrolled": false 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "maxi_train = np.max(pred_ord, axis= 1)\n", 435 | "coor_train = np.zeros_like(pred_ord)\n", 436 | "for i in range(output_size):\n", 437 | " coor_train = np.where(pred_ord[:,i]== maxi_train)[0]\n", 438 | " plt.scatter(dihedral_init[coor_train,0], dihedral_init[coor_train,1], s=5)\n", 439 | "plt.axes = [[-np.pi, np.pi],[-np.pi, np.pi]]" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "# For each state, visualize the probabilities the different trajectory points have to belong to it" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "scrolled": false 454 | }, 455 | "outputs": [], 456 | 
"source": [ 457 | "fig = plt.figure(figsize=(16, 16))\n", 458 | "\n", 459 | "gs1 = gridspec.GridSpec(2, int(np.ceil(output_size/2)))\n", 460 | "gs1.update(wspace=0.05, hspace = 0.05)\n", 461 | "\n", 462 | "for n in range(output_size):\n", 463 | " ax = plt.subplot(gs1[n])\n", 464 | " im = ax.scatter(dihedral_init[:,0], dihedral_init[:,1], s=30,\n", 465 | " c = pred_ord[:,n],\n", 466 | " alpha=0.5, edgecolor='',\n", 467 | " vmin = 0, vmax = 1\n", 468 | " )\n", 469 | " plt.axis('on')\n", 470 | " title = 'State '+str(n + 1)\n", 471 | "\n", 472 | " ax.text(.85, .15, title,\n", 473 | " horizontalalignment='center',\n", 474 | " transform=ax.transAxes, fontdict = {'size':36})\n", 475 | "\n", 476 | "\n", 477 | " if (n < 3):\n", 478 | " ax.set_xticks([-3, 0, 3])\n", 479 | " ax.set_xticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 480 | " ax.xaxis.set_tick_params(top='on', bottom='off', labeltop='on', labelbottom='off')\n", 481 | " ax.xaxis.set_tick_params(labelsize=40)\n", 482 | " else:\n", 483 | " ax.set_xticks([])\n", 484 | " if (n%3==0):\n", 485 | " ax.set_yticks([-3, 0, 3])\n", 486 | " ax.set_yticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 487 | " ax.yaxis.set_tick_params(labelsize=40)\n", 488 | " else:\n", 489 | " ax.set_yticks([])\n", 490 | "# ax.set_aspect('equal')\n", 491 | " ax.set_xlim([-np.pi, np.pi]);\n", 492 | " ax.set_ylim([-np.pi, np.pi]);\n", 493 | " \n", 494 | " if (n%3 == 0):\n", 495 | " ax.set_ylabel(r'$\\Psi$ [rad]', fontdict = {'size':40})\n", 496 | " if (n < 3):\n", 497 | " ax.set_xlabel(r'$\\Phi$ [rad]', fontdict = {'size':40}, position = 'top')\n", 498 | " ax.xaxis.set_label_coords(0.5,1.2)\n", 499 | "\n", 500 | "gs1.tight_layout(fig, rect=[0, 0.03, 0.95, 0.94])\n", 501 | "fig.show()\n", 502 | "\n", 503 | "cax = fig.add_axes([0.95, 0.05, 0.02, 0.8])\n", 504 | "cbar = fig.colorbar(im, cax=cax, ticks=[0, 1])\n", 505 | "cbar.ax.yaxis.set_tick_params(labelsize=40)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "# Markov Model Estimation" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "# Estimate the implied timescales" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "scrolled": false 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "max_tau = 200\n", 531 | "lag = np.arange(1, max_tau, 1)\n", 532 | "its = vamp.get_its(pred_ord, lag)\n", 533 | "vamp.plot_its(its, lag)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "# Chapman-Kolmogorov test for the estimated koopman operator" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "steps = 8\n", 550 | "tau_msm = 35\n", 551 | "predicted, estimated = vamp.get_ck_test(pred_ord, steps, tau_msm)\n", 552 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 553 | ] 554 | } 555 | ], 556 | "metadata": { 557 | "anaconda-cloud": {}, 558 | "kernelspec": { 559 | "display_name": "Python 3", 560 | "language": "python", 561 | "name": "python3" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.6.8" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 2 578 
| } 579 | -------------------------------------------------------------------------------- /vampnet/examples/Alanine_dipeptide_multiple_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import all the packages used" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "scrolled": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "import vampnet\n", 22 | "from vampnet import data_generator as vamp_data_loader\n", 23 | "from tensorflow.contrib.keras.api.keras.models import Model\n", 24 | "from tensorflow.contrib.keras.api.keras.layers import Dense, Activation, Flatten, Input, BatchNormalization, concatenate\n", 25 | "from tensorflow.contrib.keras.api.keras.optimizers import Adam\n", 26 | "import tensorflow as tf\n", 27 | "import matplotlib.gridspec as gridspec\n", 28 | "from tensorflow.contrib.keras.api.keras.backend import clear_session" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Necessary for downloading the trajectory data\n", 38 | "import mdshare\n", 39 | "import pyemma.coordinates as pycoor" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Define Hyperparameters" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Tau, how much is the timeshift of the two datasets\n", 56 | "tau = 1\n", 57 | "\n", 58 | "# Batch size for Stochastic Gradient descent\n", 59 | "batch_size = 1000\n", 60 | "\n", 61 | "# Which trajectory points percentage is used as training\n", 62 | "train_ratio = 0.9\n", 63 | "\n", 64 | "# How many hidden layers the network has\n", 65 | "network_depth = 6\n", 66 | "\n", 67 | "# Width of every layer\n", 68 | "layer_width = 100\n", 69 | "\n", 70 | "# Learning rate used for the ADAM optimizer\n", 71 | "learning_rate = 1e-4\n", 72 | "\n", 73 | "# How many output states the network has\n", 74 | "output_size = 6\n", 75 | "\n", 76 | "# Iteration over the training set in the fitting process\n", 77 | "nb_epoch = 40\n", 78 | "\n", 79 | "epsilon = 1e-5" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Initialized the VAMPnets wrapper class" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "vamp = vampnet.VampnetTools(epsilon = epsilon)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# Load Data" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# #Download alanine coordinates and dihedral angles data\n", 112 | "mdshare.fetch('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')\n", 113 | "mdshare.fetch('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')\n", 114 | "\n", 115 | "alanine_files = np.load('alanine-dipeptide-3x250ns-heavy-atom-positions.npz')\n", 116 | "\n", 117 | "# # Save the files separately\n", 118 | "np.save('traj0.npy', alanine_files['arr_0'])\n", 119 | "np.save('traj1.npy', alanine_files['arr_1'])\n", 120 | "np.save('traj2.npy', 
alanine_files['arr_2'])\n", 121 | "\n", 122 | "# Separate data files between training data and validation data\n", 123 | "\n", 124 | "train_data_files_list = [\n", 125 | " 'traj0.npy',\n", 126 | " 'traj1.npy',\n", 127 | "]\n", 128 | "\n", 129 | "valid_data_files_list = [\n", 130 | " 'traj2.npy',\n", 131 | "]\n", 132 | "\n", 133 | "total_data_files_list = train_data_files_list + valid_data_files_list" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "# Define the pyemma data sources and get basic info from the files, number of datapoints and system size\n", 143 | "\n", 144 | "train_data_source = pycoor.source(train_data_files_list,chunksize = batch_size)\n", 145 | "valid_data_source = pycoor.source(valid_data_files_list,chunksize = batch_size)\n", 146 | "total_data_source = pycoor.source(total_data_files_list,chunksize = batch_size)\n", 147 | "\n", 148 | "train_datapoints = train_data_source.n_frames_total()\n", 149 | "valid_datapoints = valid_data_source.n_frames_total()\n", 150 | "total_datapoints = total_data_source.n_frames_total()\n", 151 | " \n", 152 | "traj_lengths = total_data_source.trajectory_lengths()\n", 153 | "\n", 154 | "input_size = total_data_source.dimension()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "# Run several model iterations saving the best one, to help finding sparcely populated states" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "scrolled": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "max_vm = 0\n", 173 | "attempts_number = 10\n", 174 | "\n", 175 | "\n", 176 | "# IMPORTANT: the function vamp.loss_VAMP2_autograd can only be used with tensorflow 1.6 or more recent.\n", 177 | "# For older versions of TF, use the function vamp.loss_VAMP2\n", 178 | "\n", 179 | "losses = [\n", 180 | " vamp.loss_VAMP2_autograd,\n", 181 | "]\n", 182 | "\n", 183 | "\n", 184 | "for attempt in range(attempts_number):\n", 185 | " \n", 186 | "\n", 187 | " # Clear the previous tensorflow session to prevent memory leaks\n", 188 | " clear_session()\n", 189 | "\n", 190 | " # Build the model\n", 191 | "\n", 192 | "\n", 193 | " nodes = [layer_width]*network_depth\n", 194 | "\n", 195 | " Data_X = Input(shape = (input_size,))\n", 196 | " Data_Y = Input(shape = (input_size,))\n", 197 | "\n", 198 | " # A batch normalization layer improves convergence speed\n", 199 | " bn_layer = BatchNormalization()\n", 200 | "\n", 201 | " # Instance layers and assign them to the two lobes of the network\n", 202 | " dense_layers = [Dense(node, activation = 'elu',)\n", 203 | " for node in nodes]\n", 204 | "\n", 205 | " lx_branch = bn_layer(Data_X)\n", 206 | " rx_branch = bn_layer(Data_Y)\n", 207 | "\n", 208 | " for i, layer in enumerate(dense_layers):\n", 209 | "\n", 210 | " lx_branch = dense_layers[i](lx_branch)\n", 211 | " rx_branch = dense_layers[i](rx_branch)\n", 212 | "\n", 213 | "\n", 214 | " # Add a softmax output layer.\n", 215 | " # Should be replaced with a linear activation layer if\n", 216 | " # the outputs of the network cannot be interpreted as states\n", 217 | " softmax = Dense(output_size, activation='softmax')\n", 218 | "\n", 219 | " lx_branch = softmax(lx_branch)\n", 220 | " rx_branch = softmax(rx_branch)\n", 221 | "\n", 222 | " # Merge both networks to train both at the same time\n", 223 | " merged = concatenate([lx_branch, rx_branch])\n", 224 | "\n", 225 | " # 
Initialize the model and the optimizer, and compile it with\n", 226 | "    # the loss and metric functions from the VAMPnets package\n", 227 | "    model = Model(inputs = [Data_X, Data_Y], outputs = merged)\n", 228 | "    adam = Adam(lr = learning_rate)\n", 229 | "\n", 230 | "    vm1 = np.zeros((len(losses), nb_epoch))\n", 231 | "    tm1 = np.zeros_like(vm1)\n", 232 | "    vm2 = np.zeros_like(vm1)\n", 233 | "    tm2 = np.zeros_like(vm1)\n", 234 | "    \n", 235 | "    for l_index, loss_function in enumerate(losses):\n", 236 | "\n", 237 | "        \n", 238 | "        model.compile(optimizer = adam,\n", 239 | "                      loss = loss_function,\n", 240 | "                      metrics = [\n", 241 | "                          vamp.metric_VAMP,\n", 242 | "                          vamp.metric_VAMP2,\n", 243 | "                      ])\n", 244 | "\n", 245 | "\n", 246 | "        # Train the model\n", 247 | "        \n", 248 | "        steps_per_train_epoch = int(np.sum(np.ceil((train_data_source.trajectory_lengths()-tau)/batch_size)))\n", 249 | "        steps_per_valid_epoch = int(np.sum(np.ceil((valid_data_source.trajectory_lengths()-tau)/batch_size)))\n", 250 | "        \n", 251 | "        hist = model.fit_generator(generator = vamp_data_loader.build_generator_on_source_shuffle(train_data_source,\n", 252 | "                                                                                                  batch_size,\n", 253 | "                                                                                                  tau,\n", 254 | "                                                                                                  output_size,\n", 255 | "                                                                                                  ),\n", 256 | "                                   steps_per_epoch = steps_per_train_epoch,\n", 257 | "                                   epochs = nb_epoch,\n", 258 | "                                   verbose = 0,\n", 259 | "                                   validation_data = vamp_data_loader.build_generator_on_source_shuffle(valid_data_source,\n", 260 | "                                                                                                        batch_size,\n", 261 | "                                                                                                        tau,\n", 262 | "                                                                                                        output_size,\n", 263 | "                                                                                                        ),\n", 264 | "                                   validation_steps = steps_per_valid_epoch,\n", 265 | "                                   shuffle = True\n", 266 | "                                  )\n", 267 | "\n", 268 | "        vm1[l_index] = np.array(hist.history['val_metric_VAMP'])\n", 269 | "        tm1[l_index] = np.array(hist.history['metric_VAMP'])\n", 270 | "        \n", 271 | "        vm2[l_index] = np.array(hist.history['val_metric_VAMP2'])\n", 272 | "        tm2[l_index] = np.array(hist.history['metric_VAMP2'])\n", 273 | "    \n", 274 | "    \n", 275 | "    vm1 = np.reshape(vm1, (-1))\n", 276 | "    tm1 = np.reshape(tm1, (-1))\n", 277 | "    vm2 = np.reshape(vm2, (-1))\n", 278 | "    tm2 = np.reshape(tm2, (-1))\n", 279 | "\n", 280 | "    # Average the score obtained in the last part of the training process\n", 281 | "    # in order to establish which model is better and thus worth saving\n", 282 | "\n", 283 | "\n", 284 | "    score = vm1[-5:].mean()\n", 285 | "    t_score = tm1[-5:].mean()\n", 286 | "    extra_msg = ''\n", 287 | "    if score > max_vm:\n", 288 | "        extra_msg = ' - Highest'\n", 289 | "        best_weights = model.get_weights()\n", 290 | "        max_vm = score\n", 291 | "        vm1_max = vm1\n", 292 | "        tm1_max = tm1\n", 293 | "        vm2_max = vm2\n", 294 | "        tm2_max = tm2\n", 295 | "    \n", 296 | "    print('Attempt {0}, training score: {1:.2f}, validation score: {2:.2f}'.format(attempt+1, t_score, score) + extra_msg)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "# Recover the saved model and its training history" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "model.set_weights(best_weights)\n", 313 | "\n", 314 | "tm1 = np.array(tm1_max)\n", 315 | "tm2 = np.array(tm2_max)\n", 316 | "vm1 = np.array(vm1_max)\n", 317 | "vm2 = np.array(vm2_max)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "# Training result visualization" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "scrolled": false 332 | }, 333 | "outputs": [], 334 | 
"source": [ 335 | "plt.plot(vm1, label = 'VAMP')\n", 336 | "plt.plot(vm2, label = 'VAMP2')\n", 337 | "plt.plot(tm1, label = 'training VAMP')\n", 338 | "plt.plot(tm2, label = 'training VAMP2')\n", 339 | "plt.legend()\n", 340 | "plt.show()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "# Transform the input trajectory using the network\n", 350 | "states_prob_all = model.predict_generator(generator = vamp_data_loader.build_generator_on_source(total_data_source,\n", 351 | " batch_size,\n", 352 | " tau,\n", 353 | " output_size),\n", 354 | " steps = np.sum(np.ceil((total_data_source.trajectory_lengths()-tau)/batch_size)),\n", 355 | " verbose = 0)\n", 356 | "\n", 357 | "states_prob_t = states_prob_all[:,:output_size]\n", 358 | "states_prob_lag = states_prob_all[:,output_size:]\n", 359 | "\n", 360 | "# reorganize the output of the network in order to have every data point transformed by the network in one array\n", 361 | "start = 0\n", 362 | "states_prob = np.zeros((states_prob_t.shape[0]+len(traj_lengths)*tau, output_size))\n", 363 | "for l, length_i in enumerate(traj_lengths-tau):\n", 364 | " states_prob[start+l*tau:start+l*tau+length_i] = states_prob_t[start:start+length_i]\n", 365 | " states_prob[start+l*tau+length_i:start+l*tau+length_i+tau] = states_prob_lag[start+length_i-tau:start+length_i]\n", 366 | " start += length_i\n", 367 | "\n", 368 | "# Order the output states based on their population\n", 369 | "coor_pred = np.argmax(states_prob, axis = 1)\n", 370 | "indexes = [np.where(coor_pred == np.multiply(np.ones_like(coor_pred), n)) for n in range(output_size)]\n", 371 | "states_num = [len(i[0]) for i in indexes]\n", 372 | "states_order = np.argsort(states_num).astype('int')[::-1]\n", 373 | "\n", 374 | "pred_ord = states_prob[:,states_order]" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# Visualize the population of the states" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "def print_states_pie_chart():\n", 391 | " coors = []\n", 392 | " maxi = np.max(pred_ord, axis= 1)\n", 393 | "\n", 394 | " for i in range(output_size):\n", 395 | " coors.append(len(np.where(pred_ord[:,i] == maxi)[0]))\n", 396 | " \n", 397 | " fig1, ax1 = plt.subplots()\n", 398 | " ax1.pie(np.array(coors), autopct='%1.2f%%', startangle=90)\n", 399 | " ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n", 400 | " print('States population: '+str(np.array(coors)/len(maxi)*100)+'%')\n", 401 | " plt.show()\n", 402 | "\n", 403 | "print_states_pie_chart()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "# Visualize how the 4 states are placed on the Ramachandran plot" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "dihedral_file = np.load('alanine-dipeptide-3x250ns-backbone-dihedrals.npz')\n", 420 | "dihedral_init = np.concatenate([dihedral_file['arr_0'],\n", 421 | " dihedral_file['arr_1'],\n", 422 | " dihedral_file['arr_2'],\n", 423 | " ], axis = 0)\n" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "scrolled": false 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "maxi_train = np.max(pred_ord, axis= 1)\n", 435 | 
"coor_train = np.zeros_like(pred_ord)\n", 436 | "for i in range(output_size):\n", 437 | " coor_train = np.where(pred_ord[:,i]== maxi_train)[0]\n", 438 | " plt.scatter(dihedral_init[coor_train,0], dihedral_init[coor_train,1], s=1)\n", 439 | "plt.axes = [[-np.pi, np.pi],[-np.pi, np.pi]]" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "# For each state, visualize the probabilities the different trajectory points have to belong to it" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "scrolled": false 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "fig = plt.figure(figsize=(16, 16))\n", 458 | "\n", 459 | "gs1 = gridspec.GridSpec(int(np.ceil(output_size/2)), 2)\n", 460 | "gs1.update(wspace=0.05, hspace = 0.05)\n", 461 | "\n", 462 | "for n in range(output_size):\n", 463 | " ax = plt.subplot(gs1[n])\n", 464 | " im = ax.scatter(dihedral_init[:,0], dihedral_init[:,1], s=5,\n", 465 | " c = pred_ord[:,n],\n", 466 | " alpha=0.5, edgecolor='', vmin = 0, vmax = 1)\n", 467 | " plt.axis('on')\n", 468 | " title = 'State '+str(n + 1)\n", 469 | "\n", 470 | " ax.text(.85, .15, title,\n", 471 | " horizontalalignment='center',\n", 472 | " transform=ax.transAxes, fontdict = {'size':36})\n", 473 | "\n", 474 | "\n", 475 | " if (n < 3):\n", 476 | " ax.set_xticks([-3, 0, 3])\n", 477 | " ax.set_xticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 478 | " ax.xaxis.set_tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)\n", 479 | " ax.xaxis.set_tick_params(labelsize=40)\n", 480 | " else:\n", 481 | " ax.set_xticks([])\n", 482 | " if (n%3==0):\n", 483 | " ax.set_yticks([-3, 0, 3])\n", 484 | " ax.set_yticklabels([r'-$\\pi$', r'$0$', r'$\\pi$'])\n", 485 | " ax.yaxis.set_tick_params(labelsize=40)\n", 486 | " else:\n", 487 | " ax.set_yticks([])\n", 488 | "# ax.set_aspect('equal')\n", 489 | " ax.set_xlim([-np.pi, np.pi]);\n", 490 | " ax.set_ylim([-np.pi, np.pi]);\n", 491 | " \n", 492 | " if (n%3 == 0):\n", 493 | " ax.set_ylabel(r'$\\Psi$ [rad]', fontdict = {'size':40})\n", 494 | " if (n < 3):\n", 495 | " ax.set_xlabel(r'$\\Phi$ [rad]', fontdict = {'size':40}, position = 'top')\n", 496 | " ax.xaxis.set_label_coords(0.5,1.2)\n", 497 | "\n", 498 | "gs1.tight_layout(fig, rect=[0, 0.03, 0.95, 0.94])\n", 499 | "fig.show()\n", 500 | "\n", 501 | "cax = fig.add_axes([0.95, 0.05, 0.02, 0.8])\n", 502 | "cbar = fig.colorbar(im, cax=cax, ticks=[0, 1])\n", 503 | "cbar.ax.yaxis.set_tick_params(labelsize=40)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "# Markov Model Estimation" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "## Prepare multiple trajectories " 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "# separate the trajectories again as a list based on the length of them\n", 527 | "traj_list = []\n", 528 | "start = 0\n", 529 | "for length_i in traj_lengths:\n", 530 | " traj_list.append(pred_ord[start:start+length_i])\n", 531 | " start += length_i" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "# Estimate the implied timescales" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "scrolled": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "max_tau = 200\n", 550 | "lag 
= np.arange(1, max_tau, 1)\n", 551 | "its = vamp.get_its(traj_list, lag)\n", 552 | "vamp.plot_its(its, lag)" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "# Chapman-Kolmogorov test for the estimated koopman operator" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "steps = 8\n", 569 | "tau_msm = 35\n", 570 | "predicted, estimated = vamp.get_ck_test(traj_list, steps, tau_msm)\n", 571 | "vamp.plot_ck_test(predicted, estimated, output_size, steps, tau_msm)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [] 580 | } 581 | ], 582 | "metadata": { 583 | "anaconda-cloud": {}, 584 | "kernelspec": { 585 | "display_name": "Python 3", 586 | "language": "python", 587 | "name": "python3" 588 | }, 589 | "language_info": { 590 | "codemirror_mode": { 591 | "name": "ipython", 592 | "version": 3 593 | }, 594 | "file_extension": ".py", 595 | "mimetype": "text/x-python", 596 | "name": "python", 597 | "nbconvert_exporter": "python", 598 | "pygments_lexer": "ipython3", 599 | "version": "3.6.8" 600 | }, 601 | "varInspector": { 602 | "cols": { 603 | "lenName": 16, 604 | "lenType": 16, 605 | "lenVar": 40 606 | }, 607 | "kernels_config": { 608 | "python": { 609 | "delete_cmd_postfix": "", 610 | "delete_cmd_prefix": "del ", 611 | "library": "var_list.py", 612 | "varRefreshCmd": "print(var_dic_list())" 613 | }, 614 | "r": { 615 | "delete_cmd_postfix": ") ", 616 | "delete_cmd_prefix": "rm(", 617 | "library": "var_list.r", 618 | "varRefreshCmd": "cat(var_dic_list()) " 619 | } 620 | }, 621 | "types_to_exclude": [ 622 | "module", 623 | "function", 624 | "builtin_function_or_method", 625 | "instance", 626 | "_Feature" 627 | ], 628 | "window_display": false 629 | } 630 | }, 631 | "nbformat": 4, 632 | "nbformat_minor": 2 633 | } 634 | -------------------------------------------------------------------------------- /time-lagged-autoencoder/tae/models.py: -------------------------------------------------------------------------------- 1 | # This file is part of the markovmodel/deeptime repository. 2 | # Copyright (C) 2017, 2018 Computational Molecular Biology Group, 3 | # Freie Universitaet Berlin (GER) 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | ''' 19 | Implementations of PCA, TICA, AE, and VAE. 
20 | ''' 21 | 22 | from torch import svd as _svd 23 | from torch import nn as _nn 24 | from torch import optim as _optim 25 | from torch import diag as _diag 26 | from torch import cat as _cat 27 | from torch import randn as _randn 28 | from torch import sum as _sum 29 | from torch import mm as _mm 30 | from torch import symeig as _symeig 31 | from torch import abs as _abs 32 | from torch import arange as _arange 33 | from torch import sqrt as _sqrt 34 | from torch import zeros as _zeros 35 | from torch import no_grad as _no_grad 36 | from torch.autograd import Function as _Function 37 | from .utils import get_mean as _get_mean 38 | from .utils import get_covariance as _get_covariance 39 | from .utils import Transform as _Transform 40 | 41 | __all__ = ['PCA', 'TICA', 'AE', 'VAE', 'VAMPNet'] 42 | 43 | ################################################################################ 44 | # 45 | # PCA 46 | # 47 | ################################################################################ 48 | 49 | class PCA(object): 50 | '''Perform a principal component analysis for dimensionality reduction. 51 | 52 | We compute the first eigenvectors of the instantaneous covariance 53 | matrix and use them to rotate/project the data into a lower dimensional 54 | subspace. 55 | ''' 56 | def __init__(self): 57 | self.loss_function = _nn.MSELoss(size_average=False) 58 | def get_loss(self, loader): 59 | '''Train the model on the provided data loader. 60 | 61 | Arguments: 62 | loader (DataLoader): the data for loss calculation 63 | ''' 64 | if loader is None: 65 | return None 66 | loss = 0.0 67 | for x, y in loader: 68 | x, y = self.transformer(x, y) 69 | loss += self.loss_function(x.mm(self.score_matrix), y).item() 70 | return loss / float(len(loader.dataset)) 71 | def fit(self, train_loader, dim=None, test_loader=None): 72 | '''Train the model on the provided data loader. 73 | 74 | Arguments: 75 | train_loader (DataLoader): the training data 76 | dim (int): the target dimensionality 77 | test_loader (DataLoader): the data for validation 78 | ''' 79 | self.x_mean, y_mean = _get_mean(train_loader) 80 | self.cxx, cxy, cyy = _get_covariance( 81 | train_loader, self.x_mean, y_mean) 82 | self.transformer = _Transform( 83 | x_mean=self.x_mean, y_mean=self.x_mean) 84 | u, s, v = _svd(self.cxx) 85 | if dim is None: 86 | dim = s.size()[0] 87 | self.decoder_matrix = u[:, :dim] 88 | self.encoder_matrix = v.t()[:dim, :] 89 | self.score_matrix = self.decoder_matrix.mm(self.encoder_matrix) 90 | return self.get_loss(train_loader), self.get_loss(test_loader) 91 | def transform(self, loader): 92 | '''Apply the model on the provided data loader. 93 | 94 | Arguments: 95 | loader (DataLoader): the data you wish to transform 96 | ''' 97 | latent = [] 98 | for x, _ in loader: 99 | x = self.transformer.x(x) 100 | latent.append(x.mm(self.encoder_matrix.t())) 101 | return _cat(latent) 102 | 103 | ################################################################################ 104 | # 105 | # TICA 106 | # 107 | ################################################################################ 108 | 109 | class TICA(object): 110 | '''Perform a time-lagged independent component analysis for 111 | dimensionality reduction. 112 | 113 | We compute a rank-d approximation to the Koopman operator and use it to 114 | rotate/project the data into a lower dimensional subspace. 
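    In practice, the fit estimates the means and the covariance matrices
    cxx, cxy and cyy from the training pairs (symmetrizing them when
    symmetrize=True), whitens the data with cxx and cyy, and takes an SVD of
    the whitened time-lagged covariance; the singular vectors define the
    encoder/decoder matrices and, with kinetic_map=True, the latent
    coordinates are additionally scaled by the singular values.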
115 | 116 | Arguments: 117 | kinetic_map (boolean): use the kinetic map variant of TICA 118 | symmetrize (boolean): enforce symmetry and reversibility 119 | ''' 120 | def __init__(self, kinetic_map=True, symmetrize=False): 121 | self.loss_function = _nn.MSELoss(size_average=False) 122 | self.kinetic_map = kinetic_map 123 | self.symmetrize = symmetrize 124 | def get_loss(self, loader): 125 | '''Train the model on the provided data loader. 126 | 127 | Arguments: 128 | loader (DataLoader): the data for loss calculation 129 | ''' 130 | if loader is None: 131 | return None 132 | loss = 0.0 133 | for x, y in loader: 134 | x, y = self.transformer(x, y) 135 | loss += self.loss_function(x.mm(self.koopman_matrix), y).item() 136 | return loss / float(len(loader.dataset)) 137 | def fit(self, train_loader, dim=None, test_loader=None): 138 | '''Train the model on the provided data loader. 139 | 140 | Arguments: 141 | train_loader (DataLoader): the training data 142 | dim (int): the target dimensionality 143 | test_loader (DataLoader): the data for validation 144 | ''' 145 | self.x_mean, self.y_mean = _get_mean(train_loader) 146 | self.cxx, self.cxy, self.cyy = _get_covariance( 147 | train_loader, self.x_mean, self.y_mean) 148 | if self.symmetrize: 149 | self.cxx = 0.5 * (self.cxx + self.cyy) 150 | self.cyy.copy_(self.cxx) 151 | self.cxy = 0.5 * (self.cxy + self.cxy.t()) 152 | self.transformer = _Transform( 153 | x_mean=self.x_mean, x_covariance=self.cxx, 154 | y_mean=self.y_mean, y_covariance=self.cyy) 155 | self.ixx = self.transformer.x.mul 156 | self.iyy = self.transformer.y.mul 157 | u, s, v = _svd(self.ixx.mm(self.cxy.mm(self.iyy))) 158 | if dim is None: 159 | dim = s.size()[0] 160 | self.decoder_matrix = v[:, :dim] 161 | self.encoder_matrix = u.t()[:dim, :] 162 | if self.kinetic_map: 163 | self.encoder_matrix = _diag(s[:dim]).mm(self.encoder_matrix) 164 | else: 165 | self.decoder_matrix = self.decoder_matrix.mm(_diag(s[:dim])) 166 | self.koopman_matrix = self.decoder_matrix.mm(self.encoder_matrix) 167 | return self.get_loss(train_loader), self.get_loss(test_loader) 168 | def transform(self, loader): 169 | '''Apply the model on the provided data loader. 170 | 171 | Arguments: 172 | loader (DataLoader): the data you wish to transform 173 | ''' 174 | latent = [] 175 | for x, _ in loader: 176 | x = self.transformer.x(x) 177 | latent.append(x.mm(self.encoder_matrix.t())) 178 | return _cat(latent) 179 | 180 | ################################################################################ 181 | # 182 | # AUTOENCODER BASE CLASS 183 | # 184 | ################################################################################ 185 | 186 | class BaseNet(_nn.Module): 187 | '''Basic shape of a pytorch neural network model for dimension reduction. 188 | 189 | The BaseNet is the basis of more specialised dimension reduction networks 190 | and provides the full infrastructure for the setup and training process. 
191 | 192 | Arguments: 193 | inp_size (int): dimensionality of the full space 194 | lat_size (int): dimensionality of the desired latent space 195 | hid_size (sequence of int): sizes of the hidden layers 196 | normalize_batch (boolean): normalize over batches instead samples 197 | dropout (Dropout): dropout layer for each hidden layer 198 | alpha (float) activation parameter for the rectified linear units 199 | prelu (bool) use a learnable ReLU 200 | bias (boolean): specify usage of bias neurons 201 | lr (float): learning rate parameter for Adam 202 | cuda (boolean): use the GPU 203 | non_blocking (boolean): use asyncronous mode (GPU only) 204 | ''' 205 | def __init__( 206 | self, inp_size, lat_size, hid_size, normalize_batch, 207 | dropout, alpha, prelu, bias, lr, cuda, non_blocking): 208 | super(BaseNet, self).__init__() 209 | sizes = [inp_size] + list(hid_size) + [lat_size] 210 | self._last = len(sizes) - 2 211 | if isinstance(dropout, float): 212 | dropout = _nn.Dropout(p=dropout) 213 | self._setup(sizes, bias, alpha, prelu, dropout) 214 | self.optimizer = _optim.Adam(self.parameters(), lr=lr) 215 | self.normalize_batch = normalize_batch 216 | self.non_blocking = non_blocking 217 | if cuda: 218 | self.use_cuda = True 219 | self.cuda() # the non_blocking=... parameter is not accepted, here 220 | else: 221 | self.use_cuda = False 222 | def _setup(self, sizes, bias, alpha, prelu, dropout): 223 | '''Implement this in your derived class to create the necessary 224 | layers. 225 | ''' 226 | def _create_activation(self, key, idx, alpha, prelu, suffix=''): 227 | '''Helper function to create activations and initialize parameters.''' 228 | if alpha is None: 229 | activation = None 230 | elif alpha < 0.0: 231 | raise ValueError('alpha must be a non-negative number') 232 | elif alpha == 0.0: 233 | activation = _nn.ReLU() 234 | elif prelu: 235 | activation = _nn.PReLU(num_parameters=1, init=alpha) 236 | else: 237 | activation = _nn.LeakyReLU(negative_slope=alpha) 238 | if activation is not None: 239 | setattr(self, key + '_act_%d%s' % (idx, suffix), activation) 240 | layer = getattr(self, key + '_prm_%d%s' % (idx, suffix)) 241 | _nn.init.kaiming_normal_(layer.weight.data, a=alpha, mode='fan_in') 242 | try: 243 | layer.bias.data.uniform_(0.0, 0.1) 244 | except AttributeError: 245 | pass 246 | def _try_to_apply_module(self, key, value): 247 | '''Helper function to safely apply a module within the network.''' 248 | try: 249 | return getattr(self, key)(value) 250 | except AttributeError: 251 | return value 252 | def _apply_layer(self, key, idx, value): 253 | '''Helper function to safely apply a layer (module sequence) within 254 | the network. 
255 | ''' 256 | return self._try_to_apply_module( 257 | key + '_drp_%d' % idx, self._try_to_apply_module( 258 | key + '_act_%d' % idx, self._try_to_apply_module( 259 | key + '_prm_%d' % idx, value))) 260 | def forward_and_apply_loss_function(self, x, y): 261 | '''Implement this in your derived class''' 262 | raise NotImplementedError('Implement in child class') 263 | def train_step(self, loader): 264 | '''A single training epoch.''' 265 | self.train() 266 | train_loss = 0 267 | for x, y in loader: 268 | x, y = self.transformer(x, y) 269 | if self.use_cuda: 270 | x = x.cuda(non_blocking=self.non_blocking) 271 | y = y.cuda(non_blocking=self.non_blocking) 272 | self.optimizer.zero_grad() 273 | loss = self.forward_and_apply_loss_function(x, y) 274 | loss.backward() 275 | train_loss += loss.item() 276 | self.optimizer.step() 277 | if self.normalize_batch: 278 | return train_loss / float(len(loader)) 279 | return train_loss / float(len(loader.dataset)) 280 | def test_step(self, loader): 281 | '''A single validation epoch''' 282 | self.eval() 283 | test_loss = 0 284 | if loader is None: 285 | return None 286 | for x, y in loader: 287 | x, y = self.transformer(x, y) 288 | if self.use_cuda: 289 | x = x.cuda(non_blocking=self.non_blocking) 290 | y = y.cuda(non_blocking=self.non_blocking) 291 | test_loss += self.forward_and_apply_loss_function(x, y).item() 292 | if self.normalize_batch: 293 | return test_loss / float(len(loader)) 294 | return test_loss / float(len(loader.dataset)) 295 | def fit(self, train_loader, n_epochs, test_loader=None): 296 | '''Train the model on the provided data loader. 297 | 298 | Arguments: 299 | train_loader (DataLoader): the training data 300 | n_epochs (int): number of training epochs 301 | test_loader (DataLoader): the data for validation 302 | ''' 303 | x_mean, y_mean = _get_mean(train_loader) 304 | cxx, cxy, cyy = _get_covariance(train_loader, x_mean, y_mean) 305 | self.transformer = _Transform( 306 | x_mean=x_mean, x_covariance=cxx, y_mean=y_mean, y_covariance=cyy) 307 | train_loss, test_loss = [], [] 308 | for epoch in range(n_epochs): 309 | train_loss.append( 310 | self.train_step( 311 | train_loader)) 312 | with _no_grad(): 313 | test_loss.append( 314 | self.test_step(test_loader)) 315 | return train_loss, test_loss 316 | def transform(self, loader): 317 | '''Apply the model on the provided data loader. 318 | 319 | Arguments: 320 | loader (DataLoader): the data you wish to transform 321 | ''' 322 | self.eval() 323 | latent = [] 324 | for x, _ in loader: 325 | x = self.transformer.x(x) 326 | if self.use_cuda: 327 | x = x.cuda(non_blocking=self.non_blocking) 328 | y = self.encode(x) 329 | if self.cuda: 330 | y = y.cpu() 331 | latent.append(y) 332 | return _cat(latent).data 333 | 334 | ################################################################################ 335 | # 336 | # AUTOENCODER 337 | # 338 | ################################################################################ 339 | 340 | class AE(BaseNet): 341 | '''Use a time-lagged autoencoder model for dimensionality reduction. 342 | 343 | We train a time-lagged autoencoder type neural network. 
344 | 345 | Arguments: 346 | inp_size (int): dimensionality of the full space 347 | lat_size (int): dimensionality of the desired latent space 348 | hid_size (sequence of int): sizes of the hidden layers 349 | dropout (Dropout): dropout layer for each hidden layer 350 | alpha (float) activation parameter for the rectified linear units 351 | prelu (bool) use a learnable ReLU 352 | bias (boolean): specify usage of bias neurons 353 | lr (float): learning rate parameter for Adam 354 | cuda (boolean): use the GPU 355 | ''' 356 | def __init__( 357 | self, inp_size, lat_size, hid_size=[], 358 | dropout=0.5, alpha=0.01, prelu=False, 359 | bias=True, lr=0.001, cuda=False, non_blocking=False): 360 | super(AE, self).__init__( 361 | inp_size, lat_size, hid_size, False, 362 | dropout, alpha, prelu, bias, lr, cuda, non_blocking) 363 | self._mse_loss_function = _nn.MSELoss(size_average=False) 364 | def _setup(self, sizes, bias, alpha, prelu, dropout): 365 | '''Helper function to create al necessary layers.''' 366 | for c, idx in enumerate(range(1, len(sizes))): 367 | setattr( 368 | self, 369 | 'enc_prm_%d' % c, 370 | _nn.Linear(sizes[idx - 1], sizes[idx], bias=bias)) 371 | self._create_activation('enc', c, alpha, prelu) 372 | if c < self._last: 373 | if dropout is not None: 374 | setattr(self, 'enc_drp_%d' % c, dropout) 375 | for c, idx in enumerate(reversed(range(1, len(sizes)))): 376 | setattr( 377 | self, 378 | 'dec_prm_%d' % c, 379 | _nn.Linear(sizes[idx], sizes[idx - 1], bias=bias)) 380 | if c < self._last: 381 | self._create_activation('dec', c, alpha, prelu) 382 | if dropout is not None: 383 | setattr(self, 'dec_drp_%d' % c, dropout) 384 | else: 385 | self._create_activation('dec', c, None, None) 386 | def forward_and_apply_loss_function(self, x, y): 387 | '''Helper function to feed data through the network and compute the 388 | desired loss. 389 | ''' 390 | return self._mse_loss_function(self(x), y) 391 | def encode(self, x): 392 | '''Encode the given input.''' 393 | y = x 394 | for idx in range(self._last): 395 | y = self._apply_layer('enc', idx, y) 396 | return getattr(self, 'enc_prm_%d' % self._last)(y) 397 | def decode(self, z): 398 | '''Decode the given input.''' 399 | y = self._try_to_apply_module('enc_act_%d' % self._last, z) 400 | for idx in range(self._last): 401 | y = self._apply_layer('dec', idx, y) 402 | return getattr(self, 'dec_prm_%d' % self._last)(y) 403 | def forward(self, x): 404 | '''Forward the given input through the network.''' 405 | return self.decode(self.encode(x)) 406 | 407 | ################################################################################ 408 | # 409 | # VARIATIONAL AUTOENCODER 410 | # 411 | ################################################################################ 412 | 413 | class VAE(BaseNet): 414 | '''Use a time-lagged variational autoencoder model for dimensionality 415 | reduction. 416 | 417 | We train a time-lagged variational autoencoder type neural network. 
418 | 419 | Arguments: 420 | inp_size (int): dimensionality of the full space 421 | lat_size (int): dimensionality of the desired latent space 422 | hid_size (sequence of int): sizes of the hidden layers 423 | beta (float) : KLD weight for optimization 424 | dropout (Dropout): dropout layer for each hidden layer 425 | alpha (float) activation parameter for the rectified linear units 426 | prelu (bool) use a learnable ReLU 427 | bias (boolean): specify usage of bias neurons 428 | lr (float): learning rate parameter for Adam 429 | cuda (boolean): use the GPU 430 | ''' 431 | def __init__( 432 | self, inp_size, lat_size, hid_size=[], beta=1.0, 433 | dropout=0.5, alpha=0.01, prelu=False, 434 | bias=True, lr=0.001, cuda=False, non_blocking=False): 435 | super(VAE, self).__init__( 436 | inp_size, lat_size, hid_size, False, 437 | dropout, alpha, prelu, bias, lr, cuda, non_blocking) 438 | self.beta = beta 439 | self._mse_loss_function = _nn.MSELoss(size_average=False) 440 | def _setup(self, sizes, bias, alpha, prelu, dropout): 441 | '''Helper function to create al necessary layers.''' 442 | for c, idx in enumerate(range(1, len(sizes) - 1)): 443 | setattr( 444 | self, 445 | 'enc_prm_%d' % c, 446 | _nn.Linear(sizes[idx - 1], sizes[idx], bias=bias)) 447 | self._create_activation('enc', c, alpha, prelu) 448 | if dropout is not None: 449 | setattr(self, 'enc_drp_%d' % c, dropout) 450 | setattr( 451 | self, 452 | 'enc_prm_%d_mu' % self._last, 453 | _nn.Linear(sizes[-2], sizes[-1], bias=bias)) 454 | self._create_activation('enc', self._last, None, None, suffix='_mu') 455 | setattr( 456 | self, 457 | 'enc_prm_%d_lv' % self._last, 458 | _nn.Linear(sizes[-2], sizes[-1], bias=bias)) 459 | self._create_activation('enc', self._last, None, None, suffix='_lv') 460 | for c, idx in enumerate(reversed(range(1, len(sizes)))): 461 | setattr( 462 | self, 463 | 'dec_prm_%d' % c, 464 | _nn.Linear(sizes[idx], sizes[idx - 1], bias=bias)) 465 | if c < self._last: 466 | self._create_activation('dec', c, alpha, prelu) 467 | if dropout is not None: 468 | setattr(self, 'dec_drp_%d' % c, dropout) 469 | else: 470 | self._create_activation('dec', c, None, None) 471 | def forward_and_apply_loss_function(self, x, y): 472 | '''Helper function to feed data through the network and compute the 473 | desired loss. 
474 | ''' 475 | y_recon, mu, lv = self(x) 476 | mse = self._mse_loss_function(y_recon, y) 477 | kld = -0.5 * _sum(1.0 + lv - mu.pow(2) - lv.exp()) 478 | return mse + self.beta * kld / float(y.size(1)) 479 | def _encode(self, x): 480 | '''Encode the given input.''' 481 | y = x 482 | for idx in range(self._last): 483 | y = self._apply_layer('enc', idx, y) 484 | mu = getattr(self, 'enc_prm_%d_mu' % self._last)(y) 485 | lv = getattr(self, 'enc_prm_%d_lv' % self._last)(y) 486 | return mu, lv 487 | def _reparameterize(self, mu, lv): 488 | '''Reparametrize the given input.''' 489 | if self.training: 490 | std = lv.mul(0.5).exp_() 491 | eps = _randn(*std.size()) 492 | if self.use_cuda: 493 | eps = eps.cuda() 494 | return eps.mul(std).add_(mu) 495 | else: 496 | return mu 497 | def encode(self, x): 498 | '''Encode/reparametrize the given input.''' 499 | return self._reparameterize(*self._encode(x)) 500 | def decode(self, z): 501 | '''Decode the given input.''' 502 | y = z 503 | for idx in range(self._last): 504 | y = self._apply_layer('dec', idx, y) 505 | return getattr(self, 'dec_prm_%d' % self._last)(y) 506 | def forward(self, x): 507 | '''Forward the given input through the network.''' 508 | mu, lv = self._encode(x) 509 | return self.decode(self._reparameterize(mu, lv)), mu, lv 510 | 511 | ################################################################################ 512 | # 513 | # VAMPNET WORK IN PROGRESS 514 | # 515 | ################################################################################ 516 | 517 | class DecomposeRSPDMatrix(_Function): 518 | @staticmethod 519 | def forward(ctx, matrix): 520 | eigval, eigvec = _symeig(matrix, eigenvectors=True) 521 | eigval = _abs(eigval) + 1e-10 522 | ctx.eigval = eigval 523 | ctx.eigvec = eigvec 524 | return eigval, eigvec 525 | @staticmethod 526 | def backward(ctx, dval, dvec): 527 | eigval = ctx.eigval 528 | eigvec = ctx.eigvec 529 | n = len(eigval) 530 | eigval_dist = eigval[:, None] - eigval[None, :] 531 | idx = _arange(n).long().tolist() 532 | eigval_dist[idx, idx] = 1.0 533 | dval_out = eigvec[:, None, :] * eigvec[None, :, :] 534 | dvec_out = _zeros(n, n, n, n).type(eigval.type()) 535 | omega = _zeros(n, n).type(eigval.type()) 536 | for i in range(n): 537 | for j in range(n): 538 | omega[:, :] = eigvec[i, :, None] * eigvec[j, None, :] 539 | omega[idx, idx] = 0.0 540 | omega.div_(eigval_dist) 541 | dvec_out[i, j, :, :] = -_mm(eigvec, omega) 542 | dval = _sum(dval[None, None, :] * dval_out, -1) 543 | dvec = _sum(_sum(dvec[None, None, :, :] * dvec_out, -1), -1) 544 | return dval + dvec 545 | 546 | def covar(x, y): 547 | return _mm(x.t(), y).div_(len(x)) 548 | 549 | def sqrtinv(matrix): 550 | eigval, eigvec = DecomposeRSPDMatrix.apply(matrix) 551 | diag = _diag(1.0 / _sqrt(eigval)) 552 | return _mm(eigvec, _mm(diag, eigvec.t())) 553 | 554 | def get_koopman_matrix(x, y): 555 | ixx = sqrtinv(covar(x, x)) 556 | iyy = sqrtinv(covar(y, y)) 557 | cxy = covar(x, y) 558 | kmm = _mm(ixx, _mm(cxy, iyy)) 559 | return kmm.t() 560 | 561 | class VAMPNet(BaseNet): 562 | def __init__( 563 | self, inp_size, lat_size, hid_size=[], 564 | dropout=0.5, alpha=0.01, prelu=False, 565 | bias=True, lr=0.001, cuda=False, non_blocking=False): 566 | super(VAMPNet, self).__init__( 567 | inp_size, lat_size, hid_size, True, 568 | dropout, alpha, prelu, bias, lr, cuda, non_blocking) 569 | def _setup(self, sizes, bias, alpha, prelu, dropout): 570 | for c, idx in enumerate(range(1, len(sizes))): 571 | setattr( 572 | self, 573 | 'enc_prm_%d' % c, 574 | _nn.Linear(sizes[idx - 1], 
sizes[idx], bias=bias)) 575 | if c < self._last: 576 | self._create_activation('enc', c, alpha, prelu) 577 | if dropout is not None: 578 | setattr(self, 'enc_drp_%d' % c, dropout) 579 | self._create_activation('enc', self._last, None, None) 580 | self.max = _nn.Softmax(dim=1)  # softmax output layer: soft assignments to the output states 581 | def forward_and_apply_loss_function(self, x, y): 582 | koopman = self(x, y) 583 | return -_sum(koopman**2)  # negative VAMP-2 score (squared Frobenius norm of the Koopman matrix) 584 | def encode(self, x): 585 | y = x 586 | for idx in range(self._last): 587 | y = self._apply_layer('enc', idx, y) 588 | y = getattr(self, 'enc_prm_%d' % self._last)(y) 589 | return self.max(y) 590 | def forward(self, x, y): 591 | x_enc = self.encode(x) 592 | y_enc = self.encode(y) 593 | return get_koopman_matrix(x_enc, y_enc)  # Koopman matrix estimated from the encoded time-lagged pairs 594 | --------------------------------------------------------------------------------
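All models in `tae/models.py` share the same fit/transform workflow on PyTorch `DataLoader`s that yield time-lagged pairs `(x_t, x_{t+lag})`. The sketch below is not part of the repository; it is a minimal illustration of how the `AE` class defined above could be driven, assuming a PyTorch version contemporary with this code (the file still uses `size_average=` and `torch.symeig`) and illustrative choices for the toy trajectory, lag time, batch size, and layer sizes.

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

from tae.models import AE

# Toy trajectory: 1000 frames in a 10-dimensional feature space (illustrative only).
traj = torch.randn(1000, 10)

# Build time-lagged pairs (x_t, x_{t+lag}) by shifting the trajectory against itself.
lag = 1
pairs = TensorDataset(traj[:-lag], traj[lag:])
train_loader = DataLoader(pairs, batch_size=64, shuffle=True)

# Two-dimensional latent space, one hidden layer of 32 units; see AE.__init__ above.
model = AE(inp_size=10, lat_size=2, hid_size=[32])

# fit() returns per-epoch training and validation losses
# (the validation entries are None when no test_loader is given).
train_loss, test_loss = model.fit(train_loader, n_epochs=20)

# transform() concatenates the encoded batches into one tensor of latent coordinates.
latent = model.transform(DataLoader(pairs, batch_size=64, shuffle=False))
print(latent.shape)  # torch.Size([999, 2])
```

The linear estimators follow the same pattern with `fit(train_loader, dim=...)` in place of a number of epochs, and `VAE`/`VAMPNet` accept the same constructor arguments as `AE` (plus `beta` for the VAE), so switching estimators mostly amounts to changing the class used to build `model`.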