├── notebooks ├── _misc │ ├── chain.png │ ├── ifcs.png │ ├── Bpti.pyemma │ ├── Bpti_cl.pyemma │ ├── dist_10_34.npz │ ├── hmm_ck_6states.png │ └── hmm_its_4and6_states.png ├── img │ ├── alanine.png │ ├── prob_state1.png │ ├── prob_state2.png │ ├── prob_state3.png │ ├── prob_state4.png │ ├── prob_state5.png │ ├── prob_state6.png │ ├── timescales.png │ └── space_division.png ├── project-day-1.ipynb ├── project-day-2.ipynb ├── 01-io-features-presentation.ipynb ├── 09-tram-double-well.ipynb ├── 10-tram-alanine-dipeptide.ipynb ├── 07-troubleshooting.ipynb ├── 02-io-features-hands-on.ipynb ├── 08-vampnets-session.ipynb ├── 11-independent-markov-decomposition.ipynb ├── 03-msm-estimation-validation.ipynb └── 08-vampnets-session-solved.ipynb ├── .gitignore ├── README.md └── LICENSE /notebooks/_misc/chain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/_misc/chain.png -------------------------------------------------------------------------------- /notebooks/_misc/ifcs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/_misc/ifcs.png -------------------------------------------------------------------------------- /notebooks/img/alanine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/alanine.png -------------------------------------------------------------------------------- /notebooks/_misc/Bpti.pyemma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/_misc/Bpti.pyemma -------------------------------------------------------------------------------- /notebooks/_misc/Bpti_cl.pyemma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/_misc/Bpti_cl.pyemma -------------------------------------------------------------------------------- /notebooks/_misc/dist_10_34.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/_misc/dist_10_34.npz -------------------------------------------------------------------------------- /notebooks/img/prob_state1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/prob_state1.png -------------------------------------------------------------------------------- /notebooks/img/prob_state2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/prob_state2.png -------------------------------------------------------------------------------- /notebooks/img/prob_state3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/prob_state3.png -------------------------------------------------------------------------------- /notebooks/img/prob_state4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/prob_state4.png -------------------------------------------------------------------------------- /notebooks/img/prob_state5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/prob_state5.png -------------------------------------------------------------------------------- /notebooks/img/prob_state6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/prob_state6.png -------------------------------------------------------------------------------- /notebooks/img/timescales.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/timescales.png -------------------------------------------------------------------------------- /notebooks/img/space_division.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/img/space_division.png -------------------------------------------------------------------------------- /notebooks/_misc/hmm_ck_6states.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/_misc/hmm_ck_6states.png -------------------------------------------------------------------------------- /notebooks/_misc/hmm_its_4and6_states.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markovmodel/pyemma-workshop/HEAD/notebooks/_misc/hmm_its_4and6_states.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyemma-workshop 2 | 3 | ## Installation 4 | We strongly recommend installing the latest `pyemma` and `deeptime` releases from the Anaconda Python distribution. 5 | 6 | ### step 1: Miniconda 7 | If you do not have miniconda or anaconda, please follow the instructions here for Python 3.8: https://conda.io/miniconda.html 8 | 9 | We recommend creating a separate environment for the workshop, especially if you already have an anaconda/miniconda installation: 10 | ``` 11 | # these steps are optional but recommended 12 | conda create -n workshop 13 | conda activate workshop 14 | 15 | # this is not optional 16 | conda config --env --add channels conda-forge 17 | ``` 18 | 19 | --- 20 | **NOTE** 21 | 22 | For Windows users it also makes sense to install Git if it is not already available on the system: ``conda install git`` 23 | 24 | --- 25 | 26 | ### step 2: pyemma and deeptime 27 | Installation of all required software packages works by simply executing: 28 | 29 | ```bash 30 | conda install python=3.9 pyemma_tutorials networkx black 31 | ``` 32 | 33 | You can test your deeptime installation by running the following in your environment: 34 | 35 | ```bash 36 | python -c "import deeptime; print(deeptime.__version__)" 37 | ``` 38 | 39 | This should print the latest version of deeptime. 40 | 41 | ### step 3: activate some helpers 42 | In order to activate some features of the notebooks that we will be using, please also run 43 | ```bash 44 | jupyter contrib nbextension install --sys-prefix 45 | jupyter nbextension enable toc2/main 46 | jupyter nbextension enable exercise2/main 47 | jupyter nbextension enable nglview --py --sys-prefix 48 | ``` 49 | 50 | In case you are already a conda and jupyter notebook user with various environments, you can install your environment's Python kernel via 51 | ```bash 52 | python -m ipykernel install --user --name workshop 53 | ``` 54 | 55 | ## Sanity check 56 | 57 | You can check whether you installed the correct versions by calling 58 | ``` 59 | conda list 60 | ``` 61 | 62 | PyEMMA should show up with version `2.5.11` and deeptime with version `0.4.1`. 63 | 64 | ## Usage 65 | ### only on the first day 66 | Please clone (download) this repository to get local access to the worksheets.
67 | 68 | ```bash 69 | git clone https://github.com/markovmodel/pyemma-workshop.git 70 | ``` 71 | Please remember *where* on your local hard disk you have written it! 72 | 73 | ### every morning: 74 | 75 | #### activate environment (optional) 76 | Skip if you don't know what a conda environment is. Only if conda environment is used; name might differ. 77 | ``` bash 78 | conda activate workshop 79 | ``` 80 | 81 | #### navigate to the right folder 82 | Please navigate to the folder that you cloned from our github page. 83 | ```bash 84 | cd path/to/pyemma-workshop/notebooks 85 | ``` 86 | 87 | #### start the jupyter notebook server 88 | This command will start the notebook server: 89 | ```bash 90 | jupyter notebook 91 | ``` 92 | 93 | Your browser should pop up pointing to a list of notebooks. If it's the wrong browser, add for example `--browser=firefox` or copy and paste the URL into the browser of your choice. 94 | 95 | ### getting updates 96 | Once you have a local clone of this repository, you can easily obtain updates with `git pull`. 97 | We'll let you know once we have published anything new. 98 | If you work directly in the notebooks that we provide, you might have to use the sequence (`git pull` will raise an error): 99 | ```bash 100 | git stash 101 | git pull 102 | git stash pop 103 | ``` 104 | -------------------------------------------------------------------------------- /notebooks/project-day-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Mini project I\n", 8 | "\n", 9 | "Content:\n", 10 | "- I/O\n", 11 | "- Featurisation\n", 12 | "- Dimension reduction\n", 13 | "- MSM estimation\n", 14 | "- MSM validation" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "ExecuteTime": { 22 | "end_time": "2021-02-26T09:01:30.524610Z", 23 | "start_time": "2021-02-26T09:01:22.241360Z" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "%matplotlib inline\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import numpy as np\n", 31 | "import mdshare\n", 32 | "import pyemma\n", 33 | "import deeptime as dt\n", 34 | "\n", 35 | "pdb = mdshare.fetch('pentapeptide-impl-solv.pdb', working_directory='data')\n", 36 | "files = mdshare.fetch('pentapeptide-*-500ns-impl-solv.xtc', working_directory='data')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Load two different molecular features:\n", 44 | "- backbone torsions (with `cossin=True` and `periodic=False`)\n", 45 | "- backbone atom positions (with `periodic=False`)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "torsions_feat = pyemma.coordinates.featurizer(pdb)\n", 55 | "torsions_feat. #FIXME\n", 56 | "torsions_data = #FIXME\n", 57 | "\n", 58 | "positions_feat = pyemma.coordinates.featurizer(pdb)\n", 59 | "positions_feat. 
#FIXME\n", 60 | "positions_data = #FIXME" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Compute VAMP scores for each molecular feature at a lag time of five steps and a dimensionality of four:" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "torsions_scores = #FIXME\n", 77 | "print(f'Torsions: {np.mean(torsions_scores):.2f}±{np.std(torsions_scores):.2f}')\n", 78 | "\n", 79 | "positions_scores = #FIXME\n", 80 | "print(f'Positions: {np.mean(positions_scores):.2f}±{np.std(positions_scores):.2f}')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Perform a dimension reduction of the torsion data with TICA at a lag time of five steps and four independent components:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "tica = #FIXME\n", 97 | "tica_output = #FIXME\n", 98 | "\n", 99 | "fig, axes = plt.subplots(1, 2, figsize=(10, 4))\n", 100 | "pyemma.plots.plot_feature_histograms(\n", 101 | " np.concatenate(tica_output),\n", 102 | " ax=axes[0],\n", 103 | " feature_labels=['IC1', 'IC2', 'IC3', 'IC4'],\n", 104 | " ylog=True)\n", 105 | "pyemma.plots.plot_density(*np.concatenate(tica_output)[:, :2].T, ax=axes[1], logscale=True)\n", 106 | "axes[1].set_xlabel('IC 1')\n", 107 | "axes[1].set_ylabel('IC 2')\n", 108 | "fig.tight_layout()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Discretise with 75 $k$-means centers and a stride of 10 and show the free energy surface for the first two independent components:" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "cluster = #FIXME\n", 125 | "dtrajs = #FIXME\n", 126 | "\n", 127 | "fig, ax = plt.subplots(figsize=(4, 4))\n", 128 | "pyemma.plots. #FIXME\n", 129 | "ax.scatter(*cluster.cluster_centers[:, :2].T, s=15, c='k')\n", 130 | "ax.set_xlabel('IC 1')\n", 131 | "ax.set_ylabel('IC 2')\n", 132 | "fig.tight_layout()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "Plot the first 10 implied timescales with errorbars up to a lag time of 50 steps (trajectory spacing is $0.1\\frac{\\mathrm{ns}}{\\mathrm{step}}$):" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "its = #FIXME\n", 149 | "pyemma.plots. #FIXME" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Estimate a Bayesian MSM at a lag time of five steps and show a CK test for five metastable states:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "msm = #FIXME\n", 166 | "\n", 167 | "pyemma.plots. 
#FIXME" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.8.8" 188 | }, 189 | "toc": { 190 | "base_numbering": 1, 191 | "nav_menu": {}, 192 | "number_sections": true, 193 | "sideBar": true, 194 | "skip_h1_title": false, 195 | "title_cell": "Table of Contents", 196 | "title_sidebar": "Contents", 197 | "toc_cell": false, 198 | "toc_position": {}, 199 | "toc_section_display": true, 200 | "toc_window_display": false 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 2 205 | } 206 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /notebooks/project-day-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n", 10 | "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Mini project II\n", 18 | "\n", 19 | "Content:\n", 20 | "- Stationary distribution and free energies\n", 21 | "- Eigenvectors\n", 22 | "- Metastable states\n", 23 | "- Mean firts passage times\n", 24 | "- Committors" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "%matplotlib inline\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import matplotlib as mpl\n", 36 | "import numpy as np\n", 37 | "import mdshare\n", 38 | "import pyemma\n", 39 | "import deeptime as dt\n", 40 | "\n", 41 | "\n", 42 | "def visualize_metastable(samples, cmap, selection='not element H'):\n", 43 | " \"\"\" visualize metastable states\n", 44 | " Parameters\n", 45 | " ----------\n", 46 | " samples: list of mdtraj.Trajectory objects\n", 47 | " each element contains all samples for one metastable state.\n", 48 | " cmap: matplotlib.colors.ListedColormap\n", 49 | " color map used to visualize metastable states before.\n", 50 | " selection: str\n", 51 | " which part of the molecule to selection for visualization. For details have a look here:\n", 52 | " http://mdtraj.org/latest/examples/atom-selection.html#Atom-Selection-Language\n", 53 | " \"\"\"\n", 54 | " import nglview\n", 55 | " from matplotlib.colors import to_hex\n", 56 | "\n", 57 | " widget = nglview.NGLWidget()\n", 58 | " widget.clear_representations()\n", 59 | " ref = samples[0]\n", 60 | " for i, s in enumerate(samples):\n", 61 | " s = s.superpose(ref, atom_indices=s.top.select('resid 2 3 and mass > 2'))\n", 62 | " s = s.atom_slice(s.top.select(selection))\n", 63 | " comp = widget.add_trajectory(s)\n", 64 | " comp.add_licorice()\n", 65 | "\n", 66 | " # this has to be done in a separate loop for whatever reason...\n", 67 | " x = np.linspace(0, 1, num=len(samples))\n", 68 | " for i, x_ in enumerate(x):\n", 69 | " c = to_hex(cmap(x_))\n", 70 | " widget.update_licorice(color=c, component=i, repr_index=i)\n", 71 | " widget.remove_cartoon(component=i)\n", 72 | " return widget\n", 73 | "\n", 74 | "\n", 75 | "pdb = mdshare.fetch('pentapeptide-impl-solv.pdb', working_directory='data')\n", 76 | "files = mdshare.fetch('pentapeptide-*-500ns-impl-solv.xtc', working_directory='data')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Data preparation and MSM estimation/validation\n", 84 | "\n", 85 | "We load backbone torsions into memory, use a 4D TICA projection at lag time five steps, cluster with 75 $k$-means centers, and show the first 10 implied timescales with errorbars:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "features = pyemma.coordinates.featurizer(pdb)\n", 95 | "features.add_backbone_torsions(cossin=True, periodic=False)\n", 96 | "data = pyemma.coordinates.#FIXME\n", 97 | "\n", 98 | "tica_estimator = dt.decomposition.TICA #FIXME\n", 99 | "tica = tica_estimator.fit(data).fetch_model()\n", 100 | "tica_output = [tica.transform(x) for x in data]\n", 101 | "\n", 102 | "cluster = pyemma.coordinates.cluster_kmeans(\n", 103 | " tica_output, k=75, max_iter=50, stride=10, fixed_seed=1)\n", 104 | "dtrajs_concatenated = np.concatenate(cluster.dtrajs)\n", 105 | "\n", 106 | "fig, axes = plt.subplots(1, 3, figsize=(12, 4))\n", 107 | "pyemma.plots.plot_feature_histograms(\n", 108 | " tica_concatenated,\n", 109 | " ax=axes[0],\n", 110 | " 
feature_labels=['IC1', 'IC2', 'IC3', 'IC4'],\n", 111 | " ylog=True)\n", 112 | "pyemma.plots.plot_free_energy(\n", 113 | " *tica_concatenated[:, :2].T, ax=axes[1], legacy=False)\n", 114 | "axes[1].scatter(*cluster.clustercenters[:, :2].T, s=15, c='k')\n", 115 | "axes[1].set_xlabel('IC 1')\n", 116 | "axes[1].set_ylabel('IC 2')\n", 117 | "pyemma.plots.plot_implied_timescales(\n", 118 | " pyemma.msm.its(cluster.dtrajs, lags=50, nits=10, errors='bayes'),\n", 119 | " units='ns',\n", 120 | " dt=0.1,\n", 121 | " ax=axes[2])\n", 122 | "fig.tight_layout()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Then, we estimate a Bayesian MSM at lag time five steps and do a CK test with five metastable states:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "counts = dt.markov.TransitionCountEstimator(5, 'effective').fit(dtrajs).fetch_model().submodel_largest()\n", 139 | "msm_estimator = dt.markov.msm.BayesianMSM()\n", 140 | "msm = msm_estimator.fit(counts).fetch_model()\n", 141 | "\n", 142 | "nstates = 5\n", 143 | "validator = msm_estimator.chapman_kolmogorov_validator(nstates, mlags=6)\n", 144 | "cktest = validator.fit(dtrajs).fetch_model()\n", 145 | "\n", 146 | "pyemma.plots.plot_cktest(cktest, dt=0.1, units='ns');" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "Visualise the MSM stationary distribution and the reweighted free energy surface in the first two ICs:" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharex=True, sharey=True)\n", 163 | "pyemma.plots.plot_contour(\n", 164 | " *np.concatenate(tica_output)[:, :2].T,\n", 165 | " msm.prior.stationary_distribution[np.concatenate(dtrajs)],\n", 166 | " ax=axes[0],\n", 167 | " mask=True,\n", 168 | " cbar_label='stationary distribution')\n", 169 | "pyemma.plots.plot_free_energy(\n", 170 | " *np.concatenate(tica_output)[:, :2].T,\n", 171 | " weights=np.concatenate( ### FIXME ### ),\n", 172 | " ax=axes[1],\n", 173 | " legacy=False)\n", 174 | "for ax in axes.flat:\n", 175 | " ax.set_xlabel('IC 1')\n", 176 | "axes[0].set_ylabel('IC 2')\n", 177 | "axes[0].set_title('Stationary distribution', fontweight='bold')\n", 178 | "axes[1].set_title('Reweighted free energy surface', fontweight='bold')\n", 179 | "fig.tight_layout()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Show the first four nontrivial right eigenvectors projected into the first two ICs:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "eigvec = #FIXME\n", 196 | "print('The first eigenvector is one: {} (min={}, max={})'.format(\n", 197 | " np.allclose(eigvec[:, 0], 1, atol=1e-15), eigvec[:, 0].min(), eigvec[:, 0].max()))\n", 198 | "\n", 199 | "fig, axes = plt.subplots(1, 4, figsize=(15, 3), sharex=True, sharey=True)\n", 200 | "for i, ax in enumerate(axes.flat):\n", 201 | " pyemma.plots.plot_contour(\n", 202 | " *np.concatenate(tica_output)[:, :2].T,\n", 203 | " #FIXME\n", 204 | " ax=ax,\n", 205 | " cmap='PiYG',\n", 206 | " cbar_label='{}. 
right eigenvector'.format(i + 2),\n", 207 | " mask=True)\n", 208 | " ax.set_xlabel('IC 1')\n", 209 | "axes[0].set_ylabel('IC 2')\n", 210 | "fig.tight_layout()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "Show the metastable state assigments projected into the first two ICs:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "pcca = msm.prior.pcca(5)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "metastable_traj = pcca.assignments[np.concatenate(dtrajs)]\n", 236 | "\n", 237 | "fig, ax = plt.subplots(figsize=(5, 4))\n", 238 | "_, _, misc = pyemma.plots.####FIXME####(\n", 239 | " *np.concatenate(tica_output)[:, :2].T, metastable_traj, ax=ax)\n", 240 | "ax.set_xlabel('IC 1')\n", 241 | "ax.set_ylabel('IC 2')\n", 242 | "misc['cbar'].set_ticklabels([r'$\\mathcal{S}_%d$' % (i + 1)\n", 243 | " for i in range(nstates)])\n", 244 | "fig.tight_layout()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Sample 50 frames from the five metastable distributions and visualise using nglview:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "indices = dt.markov.sample.compute_index_states(dtrajs)\n", 261 | "sample_indices = dt.markov.sample.indices_by_distribution(indices, pcca.metastable_distributions, 50)\n", 262 | "\n", 263 | "my_samples = [pyemma.coordinates.save_traj(files, idist, outfile=None, top=pdb)\n", 264 | " for idist in sample_indices]\n", 265 | "\n", 266 | "cmap = mpl.cm.get_cmap('viridis', nstates)\n", 267 | "visualize_metastable(my_samples, cmap)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "Compute the stationary probabilities and free energies for the five metastable states:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "print('state\\tπ\\t\\tG/kT')\n", 284 | "for i, s in enumerate(pcca.sets):\n", 285 | " p = #FIXME\n", 286 | " print('{}\\t{:f}\\t{:f}'.format(i + 1, p, -np.log(p)))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "Compute mean first passage times (MFPTs) between all five metastable states:" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "from itertools import product\n", 303 | "\n", 304 | "mfpt = np.zeros((nstates, nstates))\n", 305 | "for i, j in product(range(nstates), repeat=2):\n", 306 | " mfpt[i, j] = #FIXME\n", 307 | "\n", 308 | "from pandas import DataFrame\n", 309 | "print('MFPT / ns:')\n", 310 | "DataFrame(np.round(mfpt, decimals=2), index=range(1, nstates + 1), columns=range(1, nstates + 1))" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "Compute the coarse-grained flux from metastable state 1 to metastable state 3 and visualise the commitor projected into the first two ICs:" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "start, final = 1, 3\n", 327 | "A = pcca.sets[start]\n", 328 | 
"B = pcca.sets[final]\n", 329 | "flux = msm.prior.reactive_flux(A, B)\n", 330 | "\n", 331 | "cg, cgflux = flux.coarse_grain(pcca.sets)\n", 332 | "\n", 333 | "fig, ax = plt.subplots(figsize=(5, 4))\n", 334 | "pyemma.plots.plot_contour(\n", 335 | " *np.concatenate(tica_output)[:, :2].T,\n", 336 | " ### FIXME ###\n", 337 | " cmap='brg',\n", 338 | " ax=ax,\n", 339 | " mask=True,\n", 340 | " cbar_label=r'committor $\\mathcal{S}_%d \\to \\mathcal{S}_%d$' % (\n", 341 | " start + 1, final + 1))\n", 342 | "fig.tight_layout()" 343 | ] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.8.8" 363 | }, 364 | "toc": { 365 | "base_numbering": 1, 366 | "nav_menu": {}, 367 | "number_sections": true, 368 | "sideBar": true, 369 | "skip_h1_title": false, 370 | "title_cell": "Table of Contents", 371 | "title_sidebar": "Contents", 372 | "toc_cell": true, 373 | "toc_position": {}, 374 | "toc_section_display": true, 375 | "toc_window_display": true 376 | } 377 | }, 378 | "nbformat": 4, 379 | "nbformat_minor": 2 380 | } 381 | -------------------------------------------------------------------------------- /notebooks/01-io-features-presentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data I/O, featurization and coordinate transforms in PyEMMA\n", 8 | "## Loading MD example data from our FTP server\n", 9 | "Ingredients:\n", 10 | "- Topology file: PDB\n", 11 | "- Trajectory data: List of .XTC files" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from mdshare import fetch" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "topfile = fetch('alanine-dipeptide-nowater.pdb', working_directory='data')\n", 30 | "traj_list = [fetch('alanine-dipeptide-%d-250ns-nowater.xtc' % i, working_directory='data') for i in range(3)]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The `fetch` function fetches the data from our servers. **Do not use `mdshare` for your own data!**\n", 38 | "\n", 39 | "## Import PyEMMA & friends" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import pyemma\n", 49 | "import deeptime\n", 50 | "import numpy as np\n", 51 | "import matplotlib.pyplot as plt\n", 52 | "plt.matplotlib.rcParams.update({'font.size': 16})\n", 53 | "\n", 54 | "print(f\"PyEMMA {pyemma.__version__}, deeptime {deeptime.__version__}\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## The featurizer\n", 62 | "All information for data processing (raw MD -> observable) is stored in a `Featurizer` object." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "featurizer = pyemma.coordinates.featurizer(topfile)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Features are simply added like this:\n", 79 | "```python\n", 80 | "featurizer.add_my_feature()\n", 81 | "```\n", 82 | "For example, we will add all heavy atom distances by first selecting heavy atoms" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "heavy_atom_indices = featurizer.select_Heavy()\n", 92 | "heavy_atom_indices" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "... and by adding distances between them:" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "featurizer.add_distances(heavy_atom_indices, periodic=False)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "We can add several different features; to find out which ones have been added, simply use `featurizer.describe()`." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "featurizer.describe()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "There are some more handy methods that come with the featurizer:" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "featurizer.dimension()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "featurizer.select(\"element C\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "featurizer.pairs([1, 8, 18])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## loading featurized data\n", 166 | "When dealing with datasets that fit into memory, we preferably load the data directly with\n", 167 | "#### `load`" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "Y = pyemma.coordinates.load(traj_list, featurizer)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "Alternatively, for high memory demands, the data can be streamed with\n", 184 | "#### `source`" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "source = pyemma.coordinates.source(traj_list, featurizer)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "The source object has some useful properties. e.g." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "source.trajectory_lengths()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "We go on with the data in our memory, `Y`. 
Let's do a component-wise histogram plot of the loaded data:" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "fig, ax = plt.subplots(figsize=(10, 14))\n", 226 | "pyemma.plots.plot_feature_histograms(np.concatenate(Y), \n", 227 | " feature_labels=featurizer, \n", 228 | " ax=ax)\n", 229 | "ax.set_xlabel('heavy atom distance')\n", 230 | "ax.set_title('distance histograms per dimension (normalized)');" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "## Dimension reduction\n", 238 | "The very high dimensional space can be transformed into a lower dimensional representation of the dynamics e.g. with TICA:\n", 239 | "\n", 240 | "First, we create an estimator of type TICA which allows us to fit data to it and then retrieve one or multiple TICA models." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "from deeptime.decomposition import TICA\n", 250 | "tica_estimator = TICA(lagtime=10, var_cutoff=0.95)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "With the estimator, we fit data (``Y``) and then ``fetch_model``." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "tica = tica_estimator.fit(Y).fetch_model()\n", 267 | "# alternatively: tica_estimator.fit_fetch(Y)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "We obtain a TICA model python object. We get the transformed data (`tics`) from it:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "tics = tica.transform(Y)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "The TICA object contains useful properties such as the number of dimension that explain `var_cutoff` of the kinetic variance. The output data has the same shape." 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "tica.output_dimension" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Let's visualize these two dimensions in a 2D histogram:" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "pyemma.plots.plot_free_energy(np.concatenate(tics)[:, 0], np.concatenate(tics)[:, 1])\n", 316 | "plt.xlabel('TIC 1') \n", 317 | "plt.ylabel('TIC 2');" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "## VAMP-scoring\n", 325 | "We can use the VAMP-2 score e.g. to assess how many dimensions we should ideally take. 
We check for" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "dims = [1, 2, 3, 5]" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "To avoid overfitting, we perform cross validation:" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "from deeptime.decomposition import vamp_score_cv\n", 351 | "\n", 352 | "fig, ax = plt.subplots(1, 1, figsize=(12, 3), sharey=True)\n", 353 | "\n", 354 | "scores = []\n", 355 | "errors = []\n", 356 | "\n", 357 | "tica_estimator.var_cutoff = None # this takes precedence over dim if it is set\n", 358 | "for dim in dims:\n", 359 | " tica_estimator.dim = dim\n", 360 | " torsions_scores = vamp_score_cv(tica_estimator, trajs=Y, blocksplit=False, n=3)\n", 361 | " scores.append(torsions_scores.mean())\n", 362 | " errors.append(torsions_scores.std())\n", 363 | "\n", 364 | "ax.bar([str(d) for d in dims], scores, yerr=errors)\n", 365 | "\n", 366 | "ax.set_ylabel('VAMP2 score\\n @ {} ps'.format(tica_estimator.lagtime))\n", 367 | "ax.set_xlabel('# dimensions')" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "We note that the VAMP-2 score is converged at 2 dimensions.\n", 375 | "\n", 376 | "## Discretization / clustering\n", 377 | "There are different ways of clustering the data, we use $k$-means here.\n", 378 | "\n", 379 | "Same as with TICA, we first create an estimator and then obtain a clustering model from it." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "from tqdm.notebook import tqdm\n", 389 | "from deeptime.clustering import KMeans\n", 390 | "\n", 391 | "kmeans_estimator = KMeans(n_clusters=75, progress=tqdm)\n", 392 | "clustering = kmeans_estimator.fit(np.concatenate(tics)[::50]).fetch_model()" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "As before, the clustering routine returns an object with several useful properties and methods. For example, let us visualize the cluster centers stored in `clustering.cluster_centers`:" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "fig, ax = plt.subplots()\n", 409 | "ax.plot(*clustering.cluster_centers.T, 'ko')\n", 410 | "pyemma.plots.plot_free_energy(*np.concatenate(tics).T, ax=ax)\n", 411 | "ax.set_xlabel('$\\Phi$ / rad') \n", 412 | "ax.set_ylabel('$\\Psi$ / rad');" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "Most importantly, the clustering object contains the discrete trajectories that we need for later MSM estimation. Each frame in each trajectory gets assigned to one of the cluster centers here." 
420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "dtrajs = []\n", 429 | "for projected_trajectory in tics:\n", 430 | " dtrajs.append(clustering.transform(projected_trajectory))" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "fig, ax = plt.subplots(2, 1, figsize=(15, 8), sharex=True)\n", 440 | "b, e = 20400, 21100\n", 441 | "ax[0].plot(tics[0][b:e, 0], alpha=.75, label='TIC 1')\n", 442 | "ax[0].plot(tics[0][b:e, 1], alpha=.75, label='TIC 2')\n", 443 | "ax[0].set_ylabel('TICA transformed data')\n", 444 | "ax[0].legend()\n", 445 | "ax[1].step(range(dtrajs[0][b:e].shape[0]), dtrajs[0][b:e])\n", 446 | "ax[1].set_xlabel('time (steps)')\n", 447 | "ax[1].set_ylabel('state')\n", 448 | "fig.tight_layout()" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "print(dtrajs[0][:25])" 458 | ] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "deeptime", 464 | "language": "python", 465 | "name": "deeptime" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.9.10" 478 | }, 479 | "toc": { 480 | "base_numbering": 1, 481 | "nav_menu": {}, 482 | "number_sections": true, 483 | "sideBar": true, 484 | "skip_h1_title": false, 485 | "title_cell": "Table of Contents", 486 | "title_sidebar": "Contents", 487 | "toc_cell": false, 488 | "toc_position": {}, 489 | "toc_section_display": true, 490 | "toc_window_display": false 491 | } 492 | }, 493 | "nbformat": 4, 494 | "nbformat_minor": 4 495 | } 496 | -------------------------------------------------------------------------------- /notebooks/09-tram-double-well.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b1e6f196", 6 | "metadata": {}, 7 | "source": [ 8 | "# Analysing a 1D double well using TRAM" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "6c7aa563", 14 | "metadata": {}, 15 | "source": [ 16 | "Import deeptime and other preliminaries..." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "b5031313", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "import scipy\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "from tqdm.notebook import tqdm\n", 30 | "\n", 31 | "from deeptime.clustering import KMeans\n", 32 | "from deeptime.markov.msm import TRAM, TRAMDataset" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "cbe3ea65", 38 | "metadata": {}, 39 | "source": [ 40 | "## The system\n", 41 | "We define the potential to be a simple double well." 
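The minima of this potential sit at x = ±1, where it vanishes, and the barrier at x = 0 has height 10, so at β = 1 the barrier region carries a Boltzmann weight of roughly exp(-10) ≈ 4.5e-5. A quick numerical check, using the same functional form as the cell below:

```python
import numpy as np

# same double-well form as defined in the next cell
def potential(x):
    return 10 * (x - 1) ** 4 * (x + 1) ** 4

print(potential(-1.0), potential(1.0))  # minima: 0.0 at x = -1 and x = +1
print(potential(0.0))                   # barrier height at x = 0: 10.0
print(np.exp(-potential(0.0)))          # Boltzmann weight of the barrier at beta = 1: ~4.5e-05
```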
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "bbab87e3", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def potential(x):\n", 52 | " return 10 * (x - 1) ** 4 * (x + 1) ** 4" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "95e3e1f8", 58 | "metadata": {}, 59 | "source": [ 60 | "Plot the potential over the range of interest" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "523a52bb", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "x_min = -1.5\n", 71 | "x_max = 1.5\n", 72 | "\n", 73 | "xs = np.linspace(x_min, x_max, num=100) # We will sample 100 discrete bins\n", 74 | "plt.plot(xs, potential(xs));" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "4009e151", 80 | "metadata": {}, 81 | "source": [ 82 | "## Sampling\n", 83 | "\n", 84 | "The following is a simple Markov-chain Monte Carlo (MCMC) sampling algorithm that samples our potential `U`." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "2fb29be5", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# this is a simple markov-chain monte carlo (MCMC) sampler that samples a trajectory\n", 95 | "# in an n-dimensional space given a potential function U.\n", 96 | "def sample_MCMC(U, d, beta=1.0, n_steps=1000, max_stepsize=1.):\n", 97 | " trajectory = np.zeros((n_steps, d), dtype=np.float32)\n", 98 | " p = lambda u: np.exp(-beta * u)\n", 99 | " r_prev = np.random.uniform(0, 8, size=d)\n", 100 | "\n", 101 | " for n in range(n_steps):\n", 102 | "\n", 103 | " r = r_prev + 2 * max_stepsize * (np.random.uniform(size=d)) - max_stepsize\n", 104 | "\n", 105 | " delta = U(r) - U(r_prev)\n", 106 | " if delta > 0:\n", 107 | " if p(delta) < np.random.uniform():\n", 108 | " r = r_prev\n", 109 | " else:\n", 110 | " r_prev = r\n", 111 | " else:\n", 112 | " r_prev = r\n", 113 | "\n", 114 | " trajectory[n] = r\n", 115 | " return trajectory" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "2e9bc28a", 121 | "metadata": {}, 122 | "source": [ 123 | "### An example trajectory\n", 124 | "We sample an example trajectory. As we will very likely see in the histogram, we get stuck in one of the wells, and don't cross the barrier." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "dd018fac", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "traj = sample_MCMC(potential, 1, n_steps=1000)\n", 135 | "\n", 136 | "plt.hist(traj, bins=10, density=True, color='C1');\n", 137 | "plt.plot(xs, potential(xs))\n", 138 | "plt.xlim(x_min, x_max);" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "27ff73cd", 144 | "metadata": {}, 145 | "source": [ 146 | "### Sampling at multiple temperatures\n", 147 | "To solve this sampling problem we will sample at multiple temperatures. We define six thermodynamic states by their temperatures. The unnormalized probabilities for each temperature are plotted. As we can see, the higher the temperature, the more flattened the shape of the probability distribution becomes." 
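The flattening can be made quantitative: the weight of the barrier top relative to a minimum is exp(-U(0)/T), which grows from about 5e-5 at T = 1 to about 0.8 at T = 50. A small check along those lines, reusing the potential and the temperature ladder from the surrounding cells:

```python
import numpy as np

temperatures = [1, 10, 20, 30, 40, 50]  # same ladder as in the next cell
barrier = 10.0                          # U(0) for the double well defined above

for T in temperatures:
    # Boltzmann weight of the barrier top relative to the minima at temperature T
    print(f"T = {T:2d}: exp(-U(0)/T) = {np.exp(-barrier / T):.3g}")
```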
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "id": "3ad712d8", 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "temperatures = [1, 10, 20, 30, 40, 50] # six temperatures to perform parallel tempering.\n", 158 | "\n", 159 | "for T in temperatures:\n", 160 | " plt.plot(xs, np.exp(-(1/T) * potential(xs)))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "3b4b2619", 166 | "metadata": {}, 167 | "source": [ 168 | "Now we will sample a trajectory for each of the the temperatures." 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "ca2843df", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "N_SAMPLES_PER_TRAJ = 10000\n", 179 | "trajectories = []\n", 180 | "\n", 181 | "for T in temperatures:\n", 182 | " print(f\"sampling for T={T} ...\")\n", 183 | " traj = sample_MCMC(potential, 1, beta=1/T, n_steps=N_SAMPLES_PER_TRAJ, max_stepsize=2.)\n", 184 | " trajectories.append(traj.squeeze())\n", 185 | "print('done!')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "id": "c124f6ff", 191 | "metadata": {}, 192 | "source": [ 193 | "We can plot a histogram of all trajectories to see the distribution we sample. By sampling at multiple temperatures, we are now able to sample in the transition region." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "3f72b2d8", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "plt.hist(np.concatenate(trajectories), bins=100, density=True, color='C1',\n", 204 | " label='Histogram over visited states');" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "e9ac48f1", 210 | "metadata": {}, 211 | "source": [ 212 | "## Analysing the data with TRAM\n", 213 | "Now we want to recombine our data to obtain the original potential function. We do this by analysing our data using TRAM. \n", 214 | "\n", 215 | "### Bias matrices\n", 216 | "First, we construct the bias energy matrices by computing the bias potential for each sample in each state. The bias energy for a temperature biased simulation is given by $b^k(x) = (\\beta^k - \\beta^0)\\; U^0(x)$. In this example, we assume all quantities are unitless.\n", 217 | "\n", 218 | "When we print the shape of one of the bias matrices we see it has shape `(N, S)`, `N` being the number of samples, and `S` the number of thermodynamic states. Thus, for each sample we have computed the energy of that sample in _all_ states." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "e103cf89", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "bias_matrices = []\n", 229 | "\n", 230 | "betas = np.asarray(temperatures, dtype=float)**-1\n", 231 | "bias_factors = betas - betas[0]\n", 232 | "\n", 233 | "for traj in trajectories:\n", 234 | " energies = potential(traj)\n", 235 | " bias_potentials = energies[:, None] * bias_factors\n", 236 | " bias_matrices.append(bias_potentials)\n", 237 | " \n", 238 | "print(bias_matrices[0].shape)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "cd19b290", 244 | "metadata": {}, 245 | "source": [ 246 | "### The discrete trajectories\n", 247 | "Now we discretize our samples into 10 Markov states along the x-axis. The discretized trajectories (`dtrajs`) are what allow `TRAM` to compute state counts and transition counts." 
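The binning convention matters here: with `right=False`, `np.digitize` returns for each sample the number of bin edges lying at or below it, so values left of the first edge map to 0 and values at or beyond the last edge map to the number of edges. A tiny illustration with hypothetical sample values (same edges as in the next cell):

```python
import numpy as np

# same 10 bin edges as used for the Markov states below
edges = np.linspace(-1.5, 1.5, 10)

# hypothetical sample values, chosen only to illustrate the index mapping
samples = np.array([-1.6, -1.0, 0.0, 1.49, 1.5])
print(np.digitize(samples, edges, right=False))  # bin indices: 0, 2, 5, 9, 10
```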
248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "716157f8", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# define the markov states\n", 258 | "n_bins = 10\n", 259 | "markov_states = np.linspace(x_min, x_max, n_bins, endpoint=True)\n", 260 | "print(markov_states)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "id": "5fe516a7", 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# construct the discretized trajectories\n", 271 | "dtrajs = [np.digitize(traj, markov_states, right=False) for traj in trajectories]\n", 272 | "\n", 273 | "print(trajectories[0][:10])\n", 274 | "print(dtrajs[0][:10])" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "6d164679", 280 | "metadata": {}, 281 | "source": [ 282 | "### Applying TRAM\n", 283 | "Now, we are ready to analyse the data using TRAM. We set `connectivity='summed_count_matrix'` because we assume all states are connected since they are distributed along a one-dimensional axis. In a realistic scenario, you should reduce your data to the largest connected set after clustering." 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "df247d8b", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "# first construct the dataset...\n", 294 | "dataset = TRAMDataset(dtrajs=dtrajs, bias_matrices=bias_matrices, lagtime=10)\n", 295 | "dataset.restrict_to_largest_connected_set(connectivity='summed_count_matrix', progress=tqdm)\n", 296 | "\n", 297 | "# ... and use that run TRAM \n", 298 | "tram = TRAM(maxiter=1000, progress=tqdm, callback_interval=3)\n", 299 | "model = tram.fit_fetch(dataset)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "id": "233a21df", 305 | "metadata": {}, 306 | "source": [ 307 | "We can have a look at the free energies per thermodynamic state and per Markov state (i.e. the $f_i^k$) that we estimated" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "3984adb2", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "plt.contourf(model.biased_conf_energies);" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "id": "098cbdd1", 323 | "metadata": {}, 324 | "source": [ 325 | "And plot the free energies per Markov state, $f_i$, which already give us an impression of the actual shape of our potential:" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "id": "5c4efb07", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "plt.plot(np.linspace(-1.5, 1.5, len(model.markov_state_energies)), model.markov_state_energies)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "id": "715c8dee", 341 | "metadata": {}, 342 | "source": [ 343 | "The underlying MEMM is a property of the model, namely the `msm_collection`. This contains all transition matrices for each thermodynamic state." 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "283460b3", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "MEMM = model.msm_collection\n", 354 | "MEMM.transition_matrix[1,2]" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "2f5e0b2c", 360 | "metadata": {}, 361 | "source": [ 362 | "The MEMM behaves as an MSM, namely the MSM of the currently selected thermodynamic state. By default, state 0 is selected. 
We can select the Markov model for state $k$ from the MSM collection by calling `select`:" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "id": "17ed46fc", 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "MEMM.select(2)\n", 373 | "MEMM.transition_matrix[1,2]" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "id": "25365e43", 379 | "metadata": {}, 380 | "source": [ 381 | "## Estimating the PMF\n", 382 | "Now we want to recover the potential of mean force (PMF). To do this, we define a number of bins (in this case 25) that we want to calculate the unbiased PMF over. We essentially calculate a probability distribution over all bins, and then take the negative logarithm to recover the PMF. " 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "id": "32c9b6f2", 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "n_bins = 25\n", 393 | "bins = np.linspace(-1.5, 1.5, n_bins)\n", 394 | "\n", 395 | "# the bin number for each sample. Samples are binned into 25 bins\n", 396 | "bin_numbers = [np.digitize(traj, bins, right=True) for traj in trajectories]\n", 397 | "\n", 398 | "# pass samples to the model with their corresponding bin numbers\n", 399 | "pmf = model.compute_PMF(dtrajs, bias_matrices, bin_numbers, n_bins = n_bins)\n", 400 | "plt.plot(bins, pmf - pmf.min())\n", 401 | "\n", 402 | "# compare with the original distribution, shifted so that the minimum lies at zero.\n", 403 | "plt.plot(bins, potential(bins) - potential(bins).min(), 'k--');" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "id": "a8cb6ffe", 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [] 413 | } 414 | ], 415 | "metadata": { 416 | "kernelspec": { 417 | "display_name": "Python 3 (ipykernel)", 418 | "language": "python", 419 | "name": "python3" 420 | }, 421 | "language_info": { 422 | "codemirror_mode": { 423 | "name": "ipython", 424 | "version": 3 425 | }, 426 | "file_extension": ".py", 427 | "mimetype": "text/x-python", 428 | "name": "python", 429 | "nbconvert_exporter": "python", 430 | "pygments_lexer": "ipython3", 431 | "version": "3.9.10" 432 | }, 433 | "toc": { 434 | "base_numbering": 1, 435 | "nav_menu": {}, 436 | "number_sections": true, 437 | "sideBar": true, 438 | "skip_h1_title": false, 439 | "title_cell": "Table of Contents", 440 | "title_sidebar": "Contents", 441 | "toc_cell": false, 442 | "toc_position": {}, 443 | "toc_section_display": true, 444 | "toc_window_display": false 445 | } 446 | }, 447 | "nbformat": 4, 448 | "nbformat_minor": 5 449 | } 450 | -------------------------------------------------------------------------------- /notebooks/10-tram-alanine-dipeptide.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "49edcefc", 6 | "metadata": {}, 7 | "source": [ 8 | "# Analysing alanine dipeptide with TRAM\n", 9 | "As an excercise, find the potential of mean force (PMF) with respect to the torsion angles of alanine dipeptide.\n", 10 | "\n", 11 | "Alanine dipeptide is a small peptide which is often used as a model system. 
It consists of 21 atoms, and we are interested in two backbone torsion angles $\\phi$ and $\\psi$.\n", 12 | "\n", 13 | "![Alanine dipeptide](img/alanine.png)\n", 14 | "(image source: https://www.cp2k.org/)\n", 15 | "\n", 16 | "We want to know how alanine dipeptide is structured, specifically, what combinations of these two torsion angles are energetically favourable, and which are unfavourable.\n", 17 | "\n", 18 | "To do this, simulations have been performed at 21 different temperatures between 300K and 500K. Each simulation corresponds to one thermodynamic state, and 10000 samples were taken during each simulation (energies and torsion angles have been stored).\n", 19 | "\n", 20 | "Use TRAM to combine the data from these different simulations, and estimate the free energy of each state. Then use those free energies to estimate the free energy surface as a function of the two torsion angles." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "ff2b1023", 26 | "metadata": {}, 27 | "source": [ 28 | "## Input data\n", 29 | "The temperatures of the different simulations (i.e. replicas, i.e. thermodynamic states) are given, as well as some useful imports and constants:" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "13e290fc", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import numpy as np\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "from tqdm.notebook import tqdm\n", 42 | "from deeptime.clustering import KMeans\n", 43 | "from deeptime.markov.msm import TRAMDataset, TRAM\n", 44 | "import mdshare\n", 45 | "\n", 46 | "\n", 47 | "N_REPLICAS = 21 # total number of temperature replicas (=simulations)\n", 48 | "SAMPLES_PER_T = 10000 # number of samples that were taken per simulation\n", 49 | "\n", 50 | "temperatures = np.arange(300, 501, 10) # the temperatures of each simulation, in steps of 10 K\n", 51 | "\n", 52 | "# kB in kJ/(mol K)\n", 53 | "kB_kJ = 0.00831446261815324 \n", 54 | "\n", 55 | "# kB in kcal/(mol K)\n", 56 | "kB_kcal = 0.0019872042586408316" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "c1117bcd", 62 | "metadata": {}, 63 | "source": [ 64 | "The input data consists of energies and angles. These are loaded into lists, each of length `N_REPLICAS`. The `i`-th element in each list contains the data for the temperature at index `i`. In other words:\n", 65 | "\n", 66 | "* `angles[i][n]` is of shape `(2,)` and contains angles $\\phi$ and $\\psi$ of the `n`-th sample taken in simulation `i` (i.e. at temperature `i`), in degrees.\n", 67 | "\n", 68 | "* `energies[i][n]` is the potential energy belonging to that same sample, in kcal/mol. 
" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "95a5fd5c", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "angles_file_name = mdshare.fetch('alanine_dipeptide_parallel_tempering_dihedrals.npz', working_directory='data')\n", 79 | "energies_file_name = mdshare.fetch('alanine_dipeptide_parallel_tempering_energies.npz', working_directory='data')\n", 80 | "\n", 81 | "angles = []\n", 82 | "energies = []\n", 83 | "\n", 84 | "for T in temperatures:\n", 85 | " angles.append(np.load(angles_file_name)[f't{T}'])\n", 86 | " energies_T = np.load(energies_file_name)[f't{T}']\n", 87 | " energies_T -= energies_T.min()\n", 88 | " energies.append(energies_T / 1000)\n", 89 | "print(f\"angles - length: {len(angles)}, shape: {angles[0].shape}\")\n", 90 | "print(f\"energies - length: {len(energies)}, shape: {energies[0].shape}\")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "959aa196", 96 | "metadata": {}, 97 | "source": [ 98 | "## Construct the bias matrix\n", 99 | "The energies are used to fill the bias matrix. For each sample, the bias needs to be computed in each thermodynamic state. In other words: for each sample, compute the bias energy $b^k(x) = U^k(x) - U^0(x)$ for every thermodynamic state $k$. " 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "a7129ee0", 105 | "metadata": {}, 106 | "source": [ 107 | "First compute the inverse temperature, $\\beta$ for each thermodynamic state. Note: the energies are stored in kcal/mol, but the bias energies will need to be non-dimensional! Choose $\\beta$ accordingly. See section 0 for some useful constants." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "aef998e8", 114 | "metadata": { 115 | "solution2": "hidden", 116 | "solution2_first": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "betas =" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "6d629eb7", 127 | "metadata": { 128 | "solution2": "hidden" 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "betas = (kB_kJ * temperatures.astype(float))**(-1)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "c3834ebb", 138 | "metadata": {}, 139 | "source": [ 140 | "Now compute the bias matrices and add them to the list. 
You should obtain a list of bias matrices of length `N_REPLICAS`, with each bias matrix of shape `(SAMPLES_PER_T, N_REPLICAS)`." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "42336736", 147 | "metadata": { 148 | "solution2": "hidden", 149 | "solution2_first": true 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "bias_matrices = []" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "668703fb", 160 | "metadata": { 161 | "solution2": "hidden" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "bias_matrices = []\n", 166 | "\n", 167 | "for k, T in enumerate(temperatures):\n", 168 | " # apply the bias factors to the potential energies to produce bias energies\n", 169 | " bias_matrices.append((betas - betas[0]) * energies[k][:, None])\n", 170 | " \n", 171 | "print(bias_matrices[0].mean())" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "cba9174b", 177 | "metadata": {}, 178 | "source": [ 179 | "## Discretize the trajectories\n", 180 | "The torsion angles $\\phi$ and $\\psi$ need to be transformed into discrete trajectories from which the transition counts are computed.\n", 181 | "\n", 182 | "Discretize the angles into Markov states using an appropriate clustering method (for example k-means++: https://deeptime-ml.github.io/latest/notebooks/clustering.html#k-means++-initialization)." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "71b704f7", 189 | "metadata": { 190 | "solution2": "hidden", 191 | "solution2_first": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "estimator =" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "fd930c38", 202 | "metadata": { 203 | "solution2": "hidden" 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "estimator = KMeans(\n", 208 | " n_clusters=20, # we will cluster data to 20 Markov states\n", 209 | " init_strategy='kmeans++',\n", 210 | " max_iter=10,\n", 211 | " fixed_seed=13,\n", 212 | " n_jobs=8\n", 213 | ")" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "80205e28", 219 | "metadata": {}, 220 | "source": [ 221 | "Use the estimator to obtain a clustering model." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "2e59ecd1", 228 | "metadata": { 229 | "solution2": "hidden", 230 | "solution2_first": true 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "clustering =" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "ec67c678", 241 | "metadata": { 242 | "solution2": "hidden" 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "clustering = estimator.fit(angles).fetch_model()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "453b5019", 252 | "metadata": {}, 253 | "source": [ 254 | "Now compute the dtrajs by applying the clustering transformation." 
255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "id": "0179179d", 261 | "metadata": { 262 | "solution2": "hidden", 263 | "solution2_first": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "dtrajs = " 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "4a5434f4", 274 | "metadata": { 275 | "solution2": "hidden" 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "dtrajs = []\n", 280 | "\n", 281 | "for A in angles:\n", 282 | " dtrajs.append(np.asarray(clustering.transform(A)))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "24fd2d28", 288 | "metadata": {}, 289 | "source": [ 290 | "## Analyse the data with TRAM\n", 291 | "Now use TRAM to estimate the free energies. First construct a TRAMDataset, and use this to restrict the data to the largest connected set." 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "id": "625b2283", 298 | "metadata": { 299 | "solution2": "hidden", 300 | "solution2_first": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "dataset = " 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "id": "80fe494a", 311 | "metadata": { 312 | "solution2": "hidden" 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "dataset = TRAMDataset(dtrajs, bias_matrices, lagtime=10)\n", 317 | "dataset.restrict_to_largest_connected_set(connectivity='BAR_variance')" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "id": "1da14811", 323 | "metadata": {}, 324 | "source": [ 325 | "Now create the TRAM estimator and fit the model.\n", 326 | "\n", 327 | "Convergence can take a while (you will need at least a few 1000 iterations). Use the `MBAR` initialization strategy to speed up the initial convergence, and pass a tqdm progress bar to the TRAM object to visualize the progress.\n", 328 | "\n", 329 | "It may help to run only a few TRAM iterations first, and plot the `TRAMModel.therm_state_energies` (the free energies of the thermodynamic states) as a sanity check, and once everything behaves as you would expect, run TRAM until convergence. In this example, the free energies should increase with increased temperature." 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "ef093083", 336 | "metadata": { 337 | "solution2": "hidden", 338 | "solution2_first": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "tram_estimator =\n", 343 | "model = " 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "928a9f36", 350 | "metadata": { 351 | "solution2": "hidden" 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "tram_estimator = TRAM(lagtime=10, maxiter=10000, progress=tqdm, maxerr=1e-8, \n", 356 | " init_strategy=\"MBAR\", init_maxerr=1e-10, init_maxiter=1000)\n", 357 | "model = tram_estimator.fit_fetch(dataset)\n", 358 | "\n", 359 | "plt.plot(model.therm_state_energies)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "id": "d130cbda", 365 | "metadata": {}, 366 | "source": [ 367 | "## Recover the PMF\n", 368 | "Recover the free energy surface as a function of the torsion angles. For this, you will need to discretize the angles into a one-dimensional set of bins over the space (-180, 180). Choose a number of bins and use numpy's digitize to discretize each angle." 
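Before doing so, it may help to recall how `np.digitize` assigns bin indices (a standalone illustration added here; the example values are arbitrary):

```python
# Standalone np.digitize illustration (added; not part of the original exercise).
import numpy as np

edges = np.linspace(-180, 180, 5)            # [-180., -90., 0., 90., 180.]
samples = np.array([-170.0, -5.0, 0.0, 95.0])
print(np.digitize(samples, edges, right=False))
# -> [1 2 3 4]; with right=False, index i means edges[i-1] <= sample < edges[i]
```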
369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "69532610", 375 | "metadata": { 376 | "solution2": "hidden", 377 | "solution2_first": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "n_bins = 20\n", 382 | "bins = np.linspace(-180, 180, n_bins, endpoint=True)\n", 383 | "binned_angles =" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "e7610ec8", 390 | "metadata": { 391 | "solution2": "hidden" 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "n_bins = 20\n", 396 | "bins = np.linspace(-180, 180, n_bins, endpoint=True)\n", 397 | "binned_angles = np.digitize(angles, np.linspace(-180, 180, n_bins, endpoint=False), right=False) - 1" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "id": "dd012721", 403 | "metadata": {}, 404 | "source": [ 405 | "Turn the 2-dimensional angle indices into a 1-dimensional index." 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "id": "896ee377", 412 | "metadata": { 413 | "solution2": "hidden", 414 | "solution2_first": true 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "binned_trajectories =" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "id": "d52b6051", 425 | "metadata": { 426 | "solution2": "hidden" 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "binned_trajectories = n_bins * binned_angles[:, :, 0] + binned_angles[:, :, 1]" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "id": "ffa227a9", 436 | "metadata": {}, 437 | "source": [ 438 | "Use the `compute_PMF` method of `TRAMModel` to compute the PMF over the bins. Since we are interested in free energy differences, shift the PMF so that the minimum is at 0." 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "id": "7e8d3233", 445 | "metadata": { 446 | "solution2": "hidden", 447 | "solution2_first": true 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "pmf =" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "id": "c7aa1a40", 458 | "metadata": { 459 | "solution2": "hidden" 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "pmf = model.compute_PMF(dtrajs, bias_matrices, binned_trajectories) * kB_kcal * 300\n", 464 | "pmf -= pmf.min()" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "id": "28eb9120", 470 | "metadata": {}, 471 | "source": [ 472 | "The plot of the free energy surface with on the x- and y-axes torsion angles is called a Ramachandran plot. Make such a plot for alanine dipeptide, showing the energy surface in kcal/mol at T=300K (recall that TRAM operates on unitless quantities). You can use matplotlibs `contourf` for visualization, and numpy's `meshgrid` to construct 2D coordinates from the bins.\n", 473 | "\n", 474 | "* Have you recovered the meta-stable states?\n", 475 | "* Can you identify the transition path between the different states?\n", 476 | "* What are the free energy differences?" 
477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "id": "b145acac", 483 | "metadata": { 484 | "solution2": "hidden", 485 | "solution2_first": true 486 | }, 487 | "outputs": [], 488 | "source": [] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "id": "70c77959", 494 | "metadata": { 495 | "solution2": "hidden" 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "XS, YS = np.meshgrid(bins, bins)\n", 500 | "im = plt.contourf(XS, YS, np.reshape(pmf, [n_bins, n_bins]).T, cmap='jet', levels=50)\n", 501 | "plt.colorbar(im);" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "id": "ed8d09dc", 507 | "metadata": {}, 508 | "source": [ 509 | "What else?\n", 510 | "* the underlying Markov Models of the states that you clustered the data in, are stored in `model.msm_collection`. Use these to analyse kinetic properties\n", 511 | "* What about the lagtime dependence of the model?" 512 | ] 513 | } 514 | ], 515 | "metadata": { 516 | "kernelspec": { 517 | "display_name": "Python 3 (ipykernel)", 518 | "language": "python", 519 | "name": "python3" 520 | }, 521 | "language_info": { 522 | "codemirror_mode": { 523 | "name": "ipython", 524 | "version": 3 525 | }, 526 | "file_extension": ".py", 527 | "mimetype": "text/x-python", 528 | "name": "python", 529 | "nbconvert_exporter": "python", 530 | "pygments_lexer": "ipython3", 531 | "version": "3.9.10" 532 | }, 533 | "toc": { 534 | "base_numbering": 1, 535 | "nav_menu": {}, 536 | "number_sections": true, 537 | "sideBar": true, 538 | "skip_h1_title": false, 539 | "title_cell": "Table of Contents", 540 | "title_sidebar": "Contents", 541 | "toc_cell": false, 542 | "toc_position": {}, 543 | "toc_section_display": true, 544 | "toc_window_display": false 545 | } 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 5 549 | } 550 | -------------------------------------------------------------------------------- /notebooks/07-troubleshooting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Markov models: What could go wrong?\n", 8 | "\"FU\n", 9 | "\n", 10 | "```\n", 11 | "# Tim Hempel\n", 12 | "# Markov Winterschool 2022\n", 13 | "# Noe Group, FU Berlin\n", 14 | "```" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pyemma\n", 25 | "import deeptime as dt\n", 26 | "import mdshare\n", 27 | "import matplotlib.pyplot as plt" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "plt.matplotlib.rcParams.update({'font.size': 16})" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## well-sampled double-well potential" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "file = mdshare.fetch('hmm-doublewell-2d-100k.npz', working_directory='data')\n", 53 | "with np.load(file) as fh:\n", 54 | " trjs = [fh['trajectory'][:, 1]]\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "def implied_timescales_msm(dtrajs, lagtimes):\n", 64 | " \"\"\"\n", 65 | " Estimate implied timescales based on the largest connected set of Markov 
states.\n", 66 | " \n", 67 | " Parameters\n", 68 | " ----------\n", 69 | " dtrajs: (list of) np.ndarray, dtype int \n", 70 | " discrete trajectories or observations\n", 71 | " lagtimes: iterable of integers\n", 72 | " lag times for model evaluation\n", 73 | " \n", 74 | " Returns:\n", 75 | " -------\n", 76 | " deeptime.util.validation.ImpliedTimescales\n", 77 | " Object containing implied timescales of estimated MSMs\n", 78 | " \"\"\"\n", 79 | " \n", 80 | " models = []\n", 81 | " for lag in lagtimes:\n", 82 | " count_model = dt.markov.TransitionCountEstimator(lag, 'sliding').fit_fetch(dtrajs)\n", 83 | " msm = dt.markov.msm.MaximumLikelihoodMSM().fit_fetch(count_model.submodel_largest())\n", 84 | " \n", 85 | " models.append(msm)\n", 86 | " \n", 87 | " return dt.util.validation.implied_timescales(models)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n", 97 | "\n", 98 | "ax[0].hist(np.concatenate(trjs), bins=50, range=(-3, 3), alpha=1, density=True);\n", 99 | "ax[0].plot(trjs[0][:200], np.linspace(0, 1, 200), color='k', alpha=.7)\n", 100 | "ax[0].annotate('time', xy=(2.2, .8), xytext=(2.2, .1), rotation=90, \n", 101 | " arrowprops=dict(arrowstyle='->'), ha='center',)\n", 102 | "\n", 103 | "\n", 104 | "cl = dt.clustering.RegularSpace(dmin=0.05).fit(np.concatenate(trjs)).fetch_model()\n", 105 | "dtrajs = [cl.transform(x) for x in trjs]\n", 106 | "its = implied_timescales_msm(dtrajs, range(1, 11))\n", 107 | "\n", 108 | "#its = implied_timescales_msm(dtrajs, lagtimes=range(1, 11))\n", 109 | "dt.plots.plot_implied_timescales(its, marker='o', ax=ax[1], n_its=2)\n", 110 | "\n", 111 | "fig.tight_layout()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "def plot_dw(trjs, hist=False, cl=None, ax=None, no_timeseries=False):\n", 121 | " \"\"\"\n", 122 | " Plot double well as histogram / time-series overlay\n", 123 | " \"\"\"\n", 124 | " if ax is None:\n", 125 | " fig, ax = plt.subplots(1, 1)\n", 126 | " if hist:\n", 127 | " ax.hist(trjs[0], bins=30, range=(-2, 2), alpha=.5, density=True);\n", 128 | " ax.hist(trjs[1], bins=30, range=(-2, 2), alpha=.5, density=True);\n", 129 | " \n", 130 | " if not no_timeseries:\n", 131 | " ax.annotate('time', xy=(1.8, .9), xytext=(1.8, .1), rotation=90, \n", 132 | " arrowprops=dict(arrowstyle='->'), ha='center',)\n", 133 | " if cl is None:\n", 134 | " ax.plot(trjs[0], np.linspace(0, 1, trjs[0].shape[0]), color='C0')\n", 135 | " ax.plot(trjs[1], np.linspace(0, 1, trjs[1].shape[0]), color='C1')\n", 136 | " else:\n", 137 | " dtrajs = [cl.transform(x) for x in trjs]\n", 138 | " ax.plot(cl.cluster_centers[dtrajs[0], 0], np.linspace(0, 1, dtrajs[0].shape[0]), color='C0')\n", 139 | " ax.plot(cl.cluster_centers[dtrajs[1], 0], np.linspace(0, 1, dtrajs[1].shape[0]), color='C1')\n", 140 | " \n", 141 | " ax.set_xlim(-2, 2)\n", 142 | " return ax" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## different kinds of almost well-sampled double well potentials\n", 150 | "### 1. 
irreversibly connected" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "file = mdshare.fetch('doublewell_oneway.npy', working_directory='data')\n", 160 | "trjs = [trj for trj in np.load(file)]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "plot_dw(trjs, cl=cl, hist=True);" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# cl = dt.clustering.RegularSpace(dmin=0.7).fit(np.concatenate(trjs)).fetch_model()\n", 179 | "cl = dt.clustering.RegularSpace(dmin=0.1).fit(np.concatenate(trjs)).fetch_model()\n", 180 | "print(cl.n_clusters)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "plot_dw(trjs, cl=cl, hist=True);" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "dtrajs = [cl.transform(x) for x in trjs]\n", 199 | "lagtimes = [1, 10, 100, 200, 300, 500, 800, 1000]\n", 200 | "\n", 201 | "its = implied_timescales_msm(dtrajs, lagtimes)\n", 202 | "\n", 203 | "ax = dt.plots.plot_implied_timescales(its, marker='o', n_its=2)\n", 204 | "ax.semilogy()\n", 205 | "ax.set_ylim(1e1, 3e5);" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "c = dt.markov.TransitionCountEstimator(lagtimes[3], \"sliding\").fit(dtrajs).fetch_model()\n", 215 | "m = dt.markov.msm.MaximumLikelihoodMSM().fit(c.submodel_largest()).fetch_model()\n", 216 | "\n", 217 | "ax = plot_dw(trjs, hist=True, cl=cl)\n", 218 | "ax.set_yticks([])\n", 219 | "tx = ax.twinx()\n", 220 | "tx.plot(cl.cluster_centers[m.count_model.state_symbols, 0], m.eigenvectors_right()[:, 1], \n", 221 | " 'ko:', label='first eigvec')\n", 222 | "tx.set_ylabel('eigenvector')\n", 223 | "ax.figure.legend()\n", 224 | "ax.set_xlim(-2, 2)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "m.count_model.state_symbols" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### 2. 
disconnected with crossovers" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "file = mdshare.fetch('doublewell_disconnected.npy', working_directory='data')\n", 250 | "trjs = [trj for trj in np.load(file)]" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "plot_dw(trjs, cl=cl, hist=True);" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "scrolled": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "cl = dt.clustering.RegularSpace(dmin=0.7).fit(np.concatenate(trjs)).fetch_model()\n", 271 | "# cl = dt.clustering.RegularSpace(dmin=0.1).fit(np.concatenate(trjs)).fetch_model()\n", 272 | "print(cl.n_clusters)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "scrolled": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "plot_dw(trjs, cl=cl, hist=True);" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "scrolled": true 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "dtrajs = [cl.transform(x) for x in trjs]\n", 295 | "lagtimes = [1, 10, 100, 200, 300, 500, 800, 1000]\n", 296 | "its = implied_timescales_msm(dtrajs, lagtimes)\n", 297 | "ax = dt.plots.plot_implied_timescales(its, marker='o', n_its=2)\n", 298 | "ax.semilogy()\n", 299 | "ax.set_ylim(1e1, 3e5);" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "c = dt.markov.TransitionCountEstimator(200, \"sliding\").fit(dtrajs).fetch_model()\n", 309 | "m = dt.markov.msm.MaximumLikelihoodMSM().fit(c.submodel_largest()).fetch_model()\n", 310 | "pcca = m.pcca(2)\n", 311 | "\n", 312 | "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n", 313 | "index_order = np.argsort(cl.cluster_centers[:, 0])\n", 314 | "for n, metastable_distribution in enumerate(pcca.metastable_distributions):\n", 315 | " ax[0].step(cl.cluster_centers[index_order, 0], metastable_distribution[index_order], ':', linewidth=3,\n", 316 | " label='metastable distr state {}'.format(n), where='mid')\n", 317 | "ax[0].set_title('metastable distributions')\n", 318 | "for _ax in ax:\n", 319 | " tx = _ax.twinx()\n", 320 | " tx.set_yticklabels([]); tx.set_yticks([])\n", 321 | " plot_dw(trjs, cl=cl, hist=True, ax=tx, no_timeseries=True)\n", 322 | " \n", 323 | "ax[1].step(cl.cluster_centers[index_order, 0], -np.log(m.stationary_distribution[index_order]), 'k--', linewidth=3,\n", 324 | " label='potential from MSM $\\pi$', where='mid')\n", 325 | "fig.legend(loc='center right')\n", 326 | "ax[1].set_title('stat dist');" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "## some \"real world data\"" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "import mdshare" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "pdb = mdshare.fetch('alanine-dipeptide-nowater.pdb', working_directory='data')\n", 352 | "files = mdshare.fetch('alanine-dipeptide-*-250ns-nowater.xtc', working_directory='data')\n", 353 | "feat = pyemma.coordinates.featurizer(pdb)\n", 354 | "\n", 355 
| "feat.add_all()\n", 356 | "data = pyemma.coordinates.load(files, features=feat)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### ill-conducted TICA analysis" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "tica = dt.decomposition.TICA(lagtime=data[0].shape[0]-100, dim=2).fit(data).fetch_model()\n", 373 | "tica_output = [tica.transform(x) for x in data]\n", 374 | "\n", 375 | "pyemma.plots.plot_free_energy(*np.concatenate(tica_output).T, legacy=False);" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "### discretization and MSM estimation" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "cluster = dt.clustering.KMeans(200, max_iter=50).fit(np.concatenate(tica_output)[::100]).fetch_model()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "lagtimes = [1, 5, 10, 20, 30, 50]\n", 401 | "dtrajs = [cluster.transform(x) for x in tica_output]\n", 402 | "its = implied_timescales_msm(dtrajs, lagtimes)\n", 403 | "ax = dt.plots.plot_implied_timescales(its, marker='o', n_its=3);\n", 404 | "ax.semilogy()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "##### -> \"converged\"\n", 412 | "### coarse graining into two states" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "counts = dt.markov.TransitionCountEstimator(20, 'sliding').fit(dtrajs).fetch_model().submodel_largest()\n", 422 | "msm_estimator = dt.markov.msm.MaximumLikelihoodMSM()\n", 423 | "msm = msm_estimator.fit(counts).fetch_model()\n", 424 | "\n", 425 | "nstates = 2\n", 426 | "pcca = msm.pcca(nstates);" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "stride = 10\n", 436 | "metastable_trajs_strided = [pcca.assignments[dtraj[::stride]] for dtraj in dtrajs]\n", 437 | "tica_output_strided = [y[::stride] for y in tica_output]\n", 438 | "_, _, misc = pyemma.plots.plot_state_map(*np.concatenate(tica_output_strided).T, \n", 439 | " np.concatenate(metastable_trajs_strided));\n", 440 | "misc['cbar'].set_ticklabels(range(1, nstates + 1)) # set state numbers 1 ... 
nstates" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "ck_lagtimes = np.arange(1, 11) * msm.lagtime\n", 450 | "test_models = [dt.markov.msm.MaximumLikelihoodMSM(lagtime=lag).fit_fetch(dtrajs) for lag in ck_lagtimes]" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "ck_test = msm.ck_test(test_models, nstates)\n", 460 | "dt.plots.plot_ck_test(ck_test)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "# 👍 AWESOME !\n", 468 | "\n", 469 | "\n", 470 | "What could be wrong?\n", 471 | "\n", 472 | "\n", 473 | "### let's have a look at the trajectories as assigned to PCCA coarse states" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "fig, ax = plt.subplots(1, 1, figsize=(15, 6), sharey=True, sharex=True)\n", 483 | "ax_yticks_labels = []\n", 484 | "for n, pcca_traj in enumerate(metastable_trajs_strided):\n", 485 | " ax.plot(range(len(pcca_traj)), pcca.n_metastable * n + pcca_traj, color='k', linewidth=0.3)\n", 486 | " ax.scatter(range(len(pcca_traj)), pcca.n_metastable * n + pcca_traj, c=pcca_traj, s=0.1)\n", 487 | " ax_yticks_labels.append(((pcca.n_metastable * (2 * n + 1) - 1) / 2, n + 1))\n", 488 | "ax.set_yticks([l[0] for l in ax_yticks_labels])\n", 489 | "ax.set_yticklabels([str(l[1]) for l in ax_yticks_labels])\n", 490 | "ax.set_ylabel('Trajectory #')\n", 491 | "ax.set_xlabel('time / {} ps'.format(stride))\n", 492 | "fig.tight_layout()" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "### histogram view doesn't show connectedness nor metastability\n", 500 | "### Take-away: Stay as close as possible to the trajectories" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "fig, axes = plt.subplots(2, 3, figsize=(12, 6), sharex=True, sharey='row')\n", 510 | "\n", 511 | "for n, trj in enumerate(tica_output):\n", 512 | " for dim, traj1d in enumerate(trj.T):\n", 513 | " axes[dim, n].plot(traj1d[::stride], linewidth=.5)\n", 514 | "for ax in axes[1]:\n", 515 | " ax.set_xlabel('time / {} ps'.format(stride))\n", 516 | "for dim, ax in enumerate(axes[:, 0]):\n", 517 | " ax.set_ylabel('IC {}'.format(dim + 1))\n", 518 | "for n, ax in enumerate(axes[0]):\n", 519 | " ax.set_title('Trajectory # {}'.format(n + 1))\n", 520 | "fig.tight_layout()" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "## Thanks for your attention.\n", 528 | "\n", 529 | "An extended version of this notebook with more details & explanations can be found in the our PyEMMA tutorials\n", 530 | "\n", 531 | "> Wehmeyer, C.; Scherer, M. K.; Hempel, T.; Husic, B. E.; Olsson, S.; Noé, F. Introduction to Markov State Modeling with the PyEMMA Software [Article v1.0]. LiveCoMS 2018, 1 (1), 5965. https://doi.org/10.33011/livecoms.1.1.5965.\n", 532 | "\n", 533 | "or, more specifically, in \n", 534 | "[this notebook](\n", 535 | "https://github.com/markovmodel/pyemma_tutorials/blob/master/notebooks/08-common-problems.ipynb)." 
536 | ] 537 | } 538 | ], 539 | "metadata": { 540 | "kernelspec": { 541 | "display_name": "Python 3 (ipykernel)", 542 | "language": "python", 543 | "name": "python3" 544 | }, 545 | "language_info": { 546 | "codemirror_mode": { 547 | "name": "ipython", 548 | "version": 3 549 | }, 550 | "file_extension": ".py", 551 | "mimetype": "text/x-python", 552 | "name": "python", 553 | "nbconvert_exporter": "python", 554 | "pygments_lexer": "ipython3", 555 | "version": "3.9.10" 556 | }, 557 | "toc": { 558 | "base_numbering": 1, 559 | "nav_menu": {}, 560 | "number_sections": true, 561 | "sideBar": true, 562 | "skip_h1_title": false, 563 | "title_cell": "Table of Contents", 564 | "title_sidebar": "Contents", 565 | "toc_cell": false, 566 | "toc_position": {}, 567 | "toc_section_display": true, 568 | "toc_window_display": false 569 | } 570 | }, 571 | "nbformat": 4, 572 | "nbformat_minor": 2 573 | } 574 | -------------------------------------------------------------------------------- /notebooks/02-io-features-hands-on.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data input, featurization and coordinate transforms in PyEMMA\n", 8 | "**Remember**:\n", 9 | "- to run the currently highlighted cell, hold ⇧ Shift and press ⏎ Enter;\n", 10 | "- to get help for a specific function, place the cursor within the function's brackets, hold ⇧ Shift, and press ⇥ Tab;\n", 11 | "- you can find the full documentation at [PyEMMA.org](http://www.pyemma.org).\n", 12 | "\n", 13 | "## Loading MD example data from our FTP server\n", 14 | "Ingredients:\n", 15 | "- Topology file: PDB\n", 16 | "- Trajectory data: List of .XTC files" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from mdshare import fetch" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "topfile = fetch('alanine-dipeptide-nowater.pdb', working_directory='data')\n", 35 | "traj_list = [fetch('alanine-dipeptide-%d-250ns-nowater.xtc' % i, working_directory='data') for i in range(3)]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "The `fetch` function fetches the data from our servers. **Do not use `mdshare` for your own data!**\n", 43 | "\n", 44 | "## Import PyEMMA & friends" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import pyemma\n", 54 | "import numpy as np\n", 55 | "import matplotlib.pyplot as plt\n", 56 | "plt.matplotlib.rcParams.update({'font.size': 16})" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Several ways of processing the same data\n", 64 | "### Backbone torsions\n", 65 | "- The best possible description for Ala2\n", 66 | "- Two dimensions that describe the full dynamics\n", 67 | "- A priori known\n", 68 | "\n", 69 | "#### Exercise: Define the featurizer and add backbone torsions." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "solution2": "hidden", 77 | "solution2_first": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "bbtorsion_feat = # FIXME\n", 82 | "# FIXME" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "solution2": "hidden" 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "bbtorsion_feat = pyemma.coordinates.featurizer(topfile)\n", 94 | "bbtorsion_feat.add_backbone_torsions()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "#### Exercise: Load the data into memory" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "solution2": "hidden", 109 | "solution2_first": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "bbtorsions = # FIXME" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "solution2": "hidden" 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "bbtorsions = pyemma.coordinates.load(traj_list, bbtorsion_feat)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "pyemma.plots.plot_free_energy(np.concatenate(bbtorsions)[:, 0], np.concatenate(bbtorsions)[:, 1])\n", 134 | "plt.xlabel('$\\Phi$ / rad') \n", 135 | "plt.ylabel('$\\Psi$ / rad');" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### heavy atom distances\n", 143 | "- without prior knowledge usually a good choice\n", 144 | "- very high dimensional even for this system\n", 145 | "\n", 146 | "#### Exercise: define a second featurizer object and add heavy atom distances:" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "solution2": "hidden", 154 | "solution2_first": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "heavy_atom_dist_feat = # FIXME\n", 159 | "heavy_atom_indices = # FIXME\n", 160 | "# FIXME" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "solution2": "hidden" 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "heavy_atom_dist_feat = pyemma.coordinates.featurizer(topfile)\n", 172 | "heavy_atom_indices = heavy_atom_dist_feat.select_Heavy()\n", 173 | "\n", 174 | "heavy_atom_dist_feat.add_distances(heavy_atom_indices, periodic=False)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "print(heavy_atom_indices)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "heavy_atom_dist_feat.dimension()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "heavy_atom_distances = pyemma.coordinates.load(traj_list, heavy_atom_dist_feat)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "solution2": "hidden", 208 | "solution2_first": true 209 | }, 210 | "source": [ 211 | "#### Exercise: Visualize the heavy atom distances." 
212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "solution2": "hidden" 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "fig, ax = plt.subplots(figsize=(10, 14))\n", 223 | "pyemma.plots.plot_feature_histograms(np.concatenate(heavy_atom_distances), feature_labels=heavy_atom_dist_feat, ax=ax)\n", 224 | "ax.set_xlabel('heavy atom distance')\n", 225 | "ax.set_title('distance histograms per dimension (normalized)');" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## VAMP-scoring: Which features are best?\n", 233 | "We already learned that two dimensions are a good choice for our data. Now, we want to compare different input features with the VAMP-2 score.\n", 234 | "Please complete the next task at the following lag times:" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "dim = 2\n", 244 | "lags = [10, 100, 1000] # ps" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "solution2": "hidden", 251 | "solution2_first": true 252 | }, 253 | "source": [ 254 | "#### Exercise: Perform cross-validated VAMP-scoring for backbone torsions and heavy-atom distances." 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "solution2": "hidden" 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "from deeptime.decomposition import TICA, vamp_score_cv\n", 266 | "\n", 267 | "fig, axes = plt.subplots(1, 3, figsize=(12, 3), sharey=True)\n", 268 | "\n", 269 | "labels = ['backbone\\ntorsions', 'heavy Atom\\ndistances']\n", 270 | "\n", 271 | "tica_estimator = TICA(lagtime=lags[0], dim=dim)\n", 272 | "\n", 273 | "for ax, lag in zip(axes.flat, lags):\n", 274 | " tica_estimator.lagtime = lag\n", 275 | " torsions_scores = vamp_score_cv(tica_estimator, trajs=bbtorsions, blocksplit=False, n=3)\n", 276 | " scores = [torsions_scores.mean()]\n", 277 | " errors = [torsions_scores.std()]\n", 278 | " distances_scores = vamp_score_cv(tica_estimator, trajs=heavy_atom_distances, blocksplit=False, n=3)\n", 279 | " scores += [distances_scores.mean()]\n", 280 | " errors += [distances_scores.std()]\n", 281 | " ax.bar(labels, scores, yerr=errors, color=['C0', 'C1', 'C2'])\n", 282 | " ax.set_title(r'lag time $\\tau$={}ps'.format(lag))\n", 283 | "\n", 284 | "axes[0].set_ylabel('VAMP2 score')\n", 285 | "fig.tight_layout()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "#### Discussion:\n", 293 | "Which feature looks best and why?\n", 294 | "\n", 295 | "## TICA projection of heavy atom distances\n", 296 | "#### Exercise: Do a TICA projection of the heavy atom distances" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "solution2": "hidden", 304 | "solution2_first": true 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "tica = # FIXME" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": { 315 | "solution2": "hidden" 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "tica_estimator = TICA(lagtime=10, var_cutoff=0.95)\n", 320 | "tica = tica_estimator.fit_fetch(heavy_atom_distances)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "tica.output_dimension" 330 | ] 331 | 
}, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "tics = tica.transform(heavy_atom_distances)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "scrolled": true 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "pyemma.plots.plot_free_energy(np.concatenate(tics)[:, 0], np.concatenate(tics)[:, 1])\n", 350 | "plt.xlabel('TIC 1') \n", 351 | "plt.ylabel('TIC 2');" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "#### Exercise: Perform a PCA projection of heavy atom distances" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "solution2": "hidden", 366 | "solution2_first": true 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "pca = pyemma.coordinates.pca() # FIXME" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "solution2": "hidden" 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "pca = pyemma.coordinates.pca(heavy_atom_distances, dim=2)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "pcs = [pca.transform(traj) for traj in heavy_atom_distances]" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "pyemma.plots.plot_free_energy(np.concatenate(pcs)[:, 0], np.concatenate(pcs)[:, 1])\n", 400 | "plt.xlabel('IC 1') \n", 401 | "plt.ylabel('IC 2');" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n", 411 | "t = ['backbone torsions', 'TICs', 'PCs']\n", 412 | "for n, _y in enumerate([bbtorsions, tics, pcs]):\n", 413 | " pyemma.plots.plot_free_energy(np.concatenate(_y)[:, 0], np.concatenate(_y)[:, 1], ax=axes[n], cbar=False)\n", 414 | " axes[n].set_title(t[n])" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "#### Discussion:\n", 422 | "What do you think are the differences between the plots in terms of the dynamics they describe?\n", 423 | "\n", 424 | "## Different ways of discretizing the output" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "y = bbtorsions # if you want, you can change this later and try e.g. 
the TICA transformed data" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "#### Exercise: Perform k-means clustering and plot the cluster centers into the free energy landscape" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "solution2": "hidden", 448 | "solution2_first": true 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "clustering_kmeans = # FIXME" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": { 459 | "solution2": "hidden" 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "from deeptime.clustering import KMeans\n", 464 | "from tqdm.notebook import tqdm\n", 465 | "\n", 466 | "kmeans_estimator = KMeans(75, max_iter=30, progress=tqdm)\n", 467 | "stride = 10\n", 468 | "clustering_kmeans = kmeans_estimator.fit_fetch(np.concatenate(y)[::stride])\n", 469 | "# different k, stride, max_iter can be used!" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "solution2": "hidden", 477 | "solution2_first": true 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "fig, ax = plt.subplots()\n", 482 | "# FIXME\n", 483 | "pyemma.plots.plot_free_energy(*np.concatenate(y).T, ax=ax)\n", 484 | "ax.set_xlabel('$\\Phi$ / rad') \n", 485 | "ax.set_ylabel('$\\Psi$ / rad');" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": { 492 | "solution2": "hidden" 493 | }, 494 | "outputs": [], 495 | "source": [ 496 | "fig, ax = plt.subplots()\n", 497 | "ax.plot(*clustering_kmeans.cluster_centers.T, 'ko')\n", 498 | "pyemma.plots.plot_free_energy(*np.concatenate(y).T, ax=ax)\n", 499 | "ax.set_xlabel('$\\Phi$ / rad') \n", 500 | "ax.set_ylabel('$\\Psi$ / rad');" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "#### Exercise: Do the same with regular space clustering" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "solution2": "hidden", 515 | "solution2_first": true 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "clustering_regspace = # FIXME\n", 520 | "clustering_regspace.n_clusters" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": { 527 | "solution2": "hidden" 528 | }, 529 | "outputs": [], 530 | "source": [ 531 | "from deeptime.clustering import RegularSpace\n", 532 | "\n", 533 | "regspace_estimator = RegularSpace(dmin=0.4)\n", 534 | "clustering_regspace = regspace_estimator.fit_fetch(np.concatenate(y))\n", 535 | "clustering_regspace.n_clusters" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "fig, ax = plt.subplots()\n", 545 | "ax.plot(*clustering_regspace.cluster_centers.T, 'ko')\n", 546 | "pyemma.plots.plot_free_energy(*np.concatenate(y).T, ax=ax)\n", 547 | "ax.set_xlabel('$\\Phi$ / rad') \n", 548 | "ax.set_ylabel('$\\Psi$ / rad');" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "#### Discussion:\n", 556 | "In your group, discuss the differences between the two clustering algorithms. Which one do you think is better? Which one is faster?" 
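One way to approach the speed part of the question empirically is to time both estimators on the same data. The sketch below is an added starting point only (it uses synthetic data; the cluster parameters mirror the cells above):

```python
# Hedged timing sketch with synthetic data (added; not part of the original exercise).
import time
import numpy as np
from deeptime.clustering import KMeans, RegularSpace

data = np.random.default_rng(0).normal(size=(50_000, 2))

t0 = time.perf_counter()
KMeans(75, max_iter=30).fit(data[::10])        # k-means on a strided subset, as above
t_kmeans = time.perf_counter() - t0

t0 = time.perf_counter()
RegularSpace(dmin=0.4).fit(data)               # regular-space clustering is a single pass
t_regspace = time.perf_counter() - t0

print(f"k-means: {t_kmeans:.2f} s, regular space: {t_regspace:.2f} s")
```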
557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "## Add-on: A quick MSM estimate to check our work\n", 564 | "If you are already familiar with Markov state modeling, have a look at the following plots. It tells us which combination of features/projection/clustering conserves the slowest process in the system. Further, we might find that in some cases, MSM implied timescales converge faster than in others." 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "from deeptime.markov.msm import MaximumLikelihoodMSM\n", 574 | "from deeptime.util.validation import implied_timescales\n", 575 | "from deeptime.plots import plot_implied_timescales" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "fig, axes = plt.subplots(3, 3, figsize=(15, 13))\n", 585 | "t = ['backbone torsions', 'TICs', 'PCs']\n", 586 | "for n, _y in enumerate([bbtorsions, tics, pcs]):\n", 587 | " pyemma.plots.plot_free_energy(*np.concatenate(_y).T, ax=axes[0][n], cbar=False)\n", 588 | " axes[0][n].set_title(t[n], fontweight='bold')\n", 589 | "\n", 590 | " data = np.concatenate(_y)[::100]\n", 591 | " clusterings = [\n", 592 | " KMeans(75, max_iter=30).fit(data).fetch_model(),\n", 593 | " RegularSpace(dmin=0.4 if n==0 else .4 / (2.2 * n)).fit(data).fetch_model()\n", 594 | " ]\n", 595 | " for cl_n, cl_obj in enumerate(clusterings):\n", 596 | " axes[0][n].plot(*cl_obj.cluster_centers.T, 'ko' if cl_n == 0 else 'rs', alpha=.8)\n", 597 | " dtrajs = [cl_obj.transform(traj) for traj in _y]\n", 598 | " models = []\n", 599 | " for lag in [1, 2, 4, 6, 8]:\n", 600 | " models.append(MaximumLikelihoodMSM(lagtime=lag).fit_fetch(dtrajs))\n", 601 | " its = implied_timescales(models)\n", 602 | " plot_implied_timescales(its, n_its=4, ax=axes[cl_n+1][n])\n", 603 | " axes[cl_n+1][n].set_yscale('log')\n", 604 | " # its = implied_timescales_msm(dtrajs, lagtimes=[1, 2, 4, 6, 8], nits=4, bayesian=False)\n", 605 | " # pyemma.plots.plot_implied_timescales(its, ax=axes[cl_n+1][n])\n", 606 | " axes[cl_n+1][n].set_ylim(1e-1, 3e3)\n", 607 | " axes[cl_n+1][n].set_ylabel('')\n", 608 | "axes[1][0].set_ylabel('k-means clustering', fontweight='bold')\n", 609 | "axes[2][0].set_ylabel('regspace clustering', fontweight='bold')\n", 610 | "\n", 611 | "fig.tight_layout()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [] 620 | } 621 | ], 622 | "metadata": { 623 | "kernelspec": { 624 | "display_name": "deeptime", 625 | "language": "python", 626 | "name": "deeptime" 627 | }, 628 | "language_info": { 629 | "codemirror_mode": { 630 | "name": "ipython", 631 | "version": 3 632 | }, 633 | "file_extension": ".py", 634 | "mimetype": "text/x-python", 635 | "name": "python", 636 | "nbconvert_exporter": "python", 637 | "pygments_lexer": "ipython3", 638 | "version": "3.9.10" 639 | }, 640 | "toc": { 641 | "base_numbering": 1, 642 | "nav_menu": {}, 643 | "number_sections": true, 644 | "sideBar": true, 645 | "skip_h1_title": false, 646 | "title_cell": "Table of Contents", 647 | "title_sidebar": "Contents", 648 | "toc_cell": false, 649 | "toc_position": {}, 650 | "toc_section_display": true, 651 | "toc_window_display": false 652 | } 653 | }, 654 | "nbformat": 4, 655 | "nbformat_minor": 4 656 | } 657 | 
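For reference (an editor's addition, not part of the workshop notebook): the implied timescales plotted above follow from the MSM transition-matrix eigenvalues as t_i(tau) = -tau / ln|lambda_i(tau)|, and a process counts as converged once this value is flat in tau. A one-line check:

```python
# Implied-timescale formula as a tiny helper (added for reference).
import numpy as np

def implied_timescale(eigenvalue, lagtime):
    return -lagtime / np.log(abs(eigenvalue))

print(implied_timescale(0.95, lagtime=10))  # ~195 steps
```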
-------------------------------------------------------------------------------- /notebooks/08-vampnets-session.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# VAMPnets\n", 8 | "\n", 9 | "\"Creative

\n", 10 | "\n", 11 | "In this session we will see an example of how to use VAMPnets to extract a coarse-grained model from raw data using a n unsupervised deep learning approach. We will load data from a 2D toy model with xxx states, and build and train a neural network that assigns each datapoint to a separate state, and finally visualize the information we extracted from the dataset. \n", 12 | "After this, we will follow the same process to analyse a trajectory of the molecule Alanine Dipeptide, since it is a 30D system whose dynamics can be easily visualized in a 2D space.\n", 13 | "\n", 14 | "\n", 15 | "Here you can find literature on the used method.\n", 16 | "\n", 17 | "**Remember**:\n", 18 | "- to run the currently highlighted cell, hold ⇧ Shift and press ⏎ Enter;\n", 19 | "- to get help for a specific function, place the cursor within the function's brackets, hold ⇧ Shift, and press ⇥ Tab;" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Import the required packages\n", 27 | "\n", 28 | "In case you haven't installed pytorch: [Installation instructions](https://pytorch.org/get-started/locally/)." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2021-03-03T12:46:34.416876Z", 37 | "start_time": "2021-03-03T12:46:32.954616Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "%matplotlib inline\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "import numpy as np\n", 45 | "import mdshare\n", 46 | "import pyemma\n", 47 | "import deeptime as dt\n", 48 | "import torch\n", 49 | "import torch.nn as nn\n", 50 | "\n", 51 | "from tqdm.notebook import tqdm\n", 52 | "from deeptime.plots import plot_implied_timescales\n", 53 | "from deeptime.util.validation import implied_timescales" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "ExecuteTime": { 61 | "end_time": "2021-03-03T12:56:00.476348Z", 62 | "start_time": "2021-03-03T12:56:00.473075Z" 63 | } 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# this is optional if you have CUDA/GPU support\n", 68 | "device = torch.device(\"cuda\")\n", 69 | "torch.backends.cudnn.benchmark = True\n", 70 | "\n", 71 | "torch.set_num_threads(12)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Guided example: 2D toy model\n", 79 | "We start by loading the data for the 2D model by using the package `mdshare`. The `fetch` function fetches the data from our servers. 
**Do not use `mdshare` for your own data!**" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "ExecuteTime": { 87 | "end_time": "2021-03-03T12:56:53.208408Z", 88 | "start_time": "2021-03-03T12:56:53.165072Z" 89 | } 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "file = mdshare.fetch(\"hmm-doublewell-2d-100k.npz\", working_directory=\"data\")\n", 94 | "with np.load(file) as fh:\n", 95 | " data = fh[\"trajectory\"]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Next we want to visualize how the datas are distributed in the 2D space.\n", 103 | "\n", 104 | "#### Exercise\n", 105 | "Plot the density of the data using a function from the `pyemma` package" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "ExecuteTime": { 113 | "end_time": "2021-03-02T09:49:50.626838Z", 114 | "start_time": "2021-03-02T09:49:50.299384Z" 115 | } 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "pyemma. ##FIXME\n", 120 | "plt.show()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Hyperparameter selection\n", 128 | "The next step is a bit tricky, as hyperparameter selection requires some experience to be done correctly. We provided some default values that will allow for a smooth training of our model. The meaning of every hyperparameter is explained in the next cell." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "ExecuteTime": { 136 | "end_time": "2021-03-03T13:00:57.908023Z", 137 | "start_time": "2021-03-03T13:00:57.901950Z" 138 | } 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "# Tau, how much is the timeshift of the two datasets\n", 143 | "tau = 1\n", 144 | "\n", 145 | "# Batch size for Stochastic Gradient descent\n", 146 | "batch_size = 3000\n", 147 | "\n", 148 | "# Which trajectory points percentage is used as validation\n", 149 | "val_ratio = 0.1\n", 150 | "\n", 151 | "# How many hidden layers the network has\n", 152 | "network_depth = 4\n", 153 | "\n", 154 | "# \"Width\" of every layer\n", 155 | "layer_width = 20\n", 156 | "\n", 157 | "# Learning rate used for the ADAM optimizer\n", 158 | "learning_rate = 5e-3\n", 159 | "\n", 160 | "# How many output states the network has\n", 161 | "output_size = 2\n", 162 | "\n", 163 | "# List of nodes of each layer\n", 164 | "nodes = [data.shape[1]] + [layer_width for _ in range(network_depth)] + [output_size]\n", 165 | "\n", 166 | "# Iteration over the training set in the fitting process;\n", 167 | "# basically how many iterations our training algorithm will do\n", 168 | "nb_epoch = 20" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Data preprocessing\n", 176 | "\n", 177 | "Now we can to prepare our data so that it can be used for training our VAMPnets model. We want two arrays made of coupled datapoints, which are selected from the main trajectory at indexes $i, i+\\tau$. We want the two trajectories to be shuffled, but to maintain the correspondence between the non-time-lagged and the time-lagged datapoints. 
Finally, we want to split our data into a training set and a validation set, the former being used for training the algorithm and the latter being needed to test whether the network is overfitting (i.e., the learned transformation works only on the training set but not on new data from the same distribution).\n",
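Conceptually, this preprocessing amounts to the following plain-numpy sketch (illustrative only; in the next cells we let deeptime's `TrajectoryDataset` and PyTorch's `random_split` do the same job, and `data`, `tau`, `val_ratio` are the variables defined above):

    X_t   = data[:-tau]                      # instantaneous frames x(t)
    X_tau = data[tau:]                       # time-lagged frames x(t + tau)
    perm = np.random.permutation(len(X_t))   # shuffle, keeping each pair aligned
    X_t, X_tau = X_t[perm], X_tau[perm]
    n_val = int(len(X_t) * val_ratio)
    train_pairs = (X_t[:-n_val], X_tau[:-n_val])
    val_pairs   = (X_t[-n_val:], X_tau[-n_val:])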
\n", 179 | "
" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "ExecuteTime": { 187 | "end_time": "2021-03-03T13:01:39.533105Z", 188 | "start_time": "2021-03-03T13:01:39.529145Z" 189 | } 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "dataset = dt.util.data.TrajectoryDataset(lagtime=tau, trajectory=data.astype(np.float32))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "ExecuteTime": { 201 | "end_time": "2021-03-03T13:02:22.870270Z", 202 | "start_time": "2021-03-03T13:02:22.813803Z" 203 | } 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "n_val = int(len(dataset)*val_ratio)\n", 208 | "train_data, val_data = torch.utils.data.random_split(dataset, [len(dataset) - n_val, n_val])" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "ExecuteTime": { 216 | "end_time": "2021-03-03T13:03:21.445813Z", 217 | "start_time": "2021-03-03T13:03:21.402966Z" 218 | } 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "from deeptime.util.torch import MLP\n", 223 | "lobe = MLP(units=nodes, nonlinearity=nn.ELU, output_nonlinearity=nn.Softmax)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "ExecuteTime": { 231 | "end_time": "2021-03-03T13:03:50.486407Z", 232 | "start_time": "2021-03-03T13:03:50.467174Z" 233 | } 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "vampnet = dt.decomposition.deep.VAMPNet(lobe=lobe, learning_rate=learning_rate)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "ExecuteTime": { 245 | "end_time": "2021-03-03T13:05:00.536266Z", 246 | "start_time": "2021-03-03T13:05:00.532250Z" 247 | } 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "from torch.utils.data import DataLoader\n", 252 | "\n", 253 | "loader_train = DataLoader(train_data, batch_size=batch_size, shuffle=True)\n", 254 | "loader_val = DataLoader(val_data, batch_size=len(val_data), shuffle=False)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2021-03-03T13:08:09.890298Z", 263 | "start_time": "2021-03-03T13:05:24.421742Z" 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "model = vampnet.fit(loader_train, n_epochs=nb_epoch, validation_loader=loader_val, progress=tqdm).fetch_model()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Model validation\n", 276 | "\n", 277 | "When the previous cell will finish running, we have successfully (🤞) trained our VAMPnets. We can plot the training information to visualize how well our training proceeded, and by plotting both training and validation information we can make sure that our model didn't overfit. Before running the next cell, consider that the our network's training and validation scores should converge to a value slightly lower than $2$, since the score is calculated as the norm of the singular values of the estimated Koopman operator. We only have 2 output nodes and the largest singular value is always $=1$." 
278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "ExecuteTime": { 285 | "end_time": "2021-03-03T13:29:46.151845Z", 286 | "start_time": "2021-03-03T13:29:46.147276Z" 287 | } 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "print(model)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "ExecuteTime": { 299 | "end_time": "2021-03-03T13:08:51.254553Z", 300 | "start_time": "2021-03-03T13:08:50.321345Z" 301 | } 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "plt.loglog(*vampnet.train_scores.T, label='training')\n", 306 | "plt.loglog(*vampnet.validation_scores.T, label='validation')\n", 307 | "plt.xlabel('step')\n", 308 | "plt.ylabel('score')\n", 309 | "plt.legend();" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "Now we can finally reap the results of our work: if the network was properly trained it should automatically separate the two wells in our system. We can verify this hypothesis by first transforming our dataset with the network using the `model.predict` method." 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "ExecuteTime": { 324 | "end_time": "2021-03-03T13:10:50.666275Z", 325 | "start_time": "2021-03-03T13:10:50.611652Z" 326 | } 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "transformed_data = model.transform(data)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "Now we can visualize to which states the network assigns every point; we do so in the following cell by calculating to which state every datapoint is most likely to be assigned by the network:" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "ExecuteTime": { 345 | "end_time": "2021-03-03T13:12:11.413042Z", 346 | "start_time": "2021-03-03T13:12:09.244695Z" 347 | } 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "plt.scatter(*data.T, c=transformed_data[:,1])" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "If you are looking at an yellow and a blue ball, your network reached its optimal state during the training. 
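Note that the scatter plot above colors each frame by the network's output probability for one of the two states. If you prefer a hard assignment, an equivalent (illustrative) variant is to take the argmax over the output nodes:

    state_assignment = transformed_data.argmax(axis=1)   # most probable state per frame
    plt.scatter(*data.T, c=state_assignment, s=1)
    plt.xlabel('x')
    plt.ylabel('y');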
\n", 359 | "\n", 360 | "We can further analyze the output of the network by visualizing the decision landscape:" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "ExecuteTime": { 368 | "end_time": "2021-03-03T13:12:34.622238Z", 369 | "start_time": "2021-03-03T13:12:34.320797Z" 370 | } 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "xmax = np.max(np.abs(data[:, 0]))\n", 375 | "ymin = np.min(data[:, 1])\n", 376 | "ymax = np.max(data[:, 1])\n", 377 | "grid = np.meshgrid(np.linspace(-xmax-1, xmax+1, 150), np.linspace(ymin-1, ymax+1, 50))\n", 378 | "xy = np.dstack(grid).reshape(-1, 2)\n", 379 | "z = model.transform(xy)[:,0]\n", 380 | "\n", 381 | "cb = plt.contourf(grid[0], grid[1], z.reshape(grid[0].shape), levels=15, cmap='coolwarm')\n", 382 | "plt.colorbar(cb);" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "Since this is a very simple system, the network should enforce a very sharp classification, with most of the points belonging to either `state 1` or `state 2`, with only a few points in between having a mixed value.\n", 390 | "\n", 391 | "As a last step, we can verify that the network preserves the slow information in the system by plotting the implied timescales present in our transformed data:" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "ExecuteTime": { 399 | "end_time": "2021-03-03T13:13:19.250031Z", 400 | "start_time": "2021-03-03T13:13:18.351691Z" 401 | } 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "lagtimes = np.arange(1, 11)\n", 406 | "its = implied_timescales([dt.decomposition.VAMP(lagtime=lag, observable_transform=model).fit(data).fetch_model() for lag in lagtimes])\n", 407 | "fig, axes = plt.subplots(1, 1, figsize=(6, 4))\n", 408 | "\n", 409 | "plot_implied_timescales(its, ax=axes)\n", 410 | "axes.set_yscale('log')\n", 411 | "axes.set_xlabel('lagtime (steps)')\n", 412 | "axes.set_ylabel('timescale (steps)')\n", 413 | "fig.tight_layout()" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "## Hands-on session: Alanine Dipeptide\n", 421 | "In the following three cells, you are given the loading function for the alanine-dipeptide trajectories (along with its 2 dihedral values), a plot that shows how to visualize information about the molecule using the dihedral data, and a set of hyperparameters. Build and train a network that classifies alanine samples, and set the number of epochs so that your network converges to a stable score. Plot your results and confront them to the provided examples." 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "#### Cell 1: Loading\n", 429 | "**NOTE: do NOT use the dihedral information for the training! 
It would be easier to do so, but the interesting aspect of this exercise lies in seeing how easily the network extracts a low level representation from a highly dimensional space**" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": { 436 | "ExecuteTime": { 437 | "end_time": "2021-03-03T13:14:43.258385Z", 438 | "start_time": "2021-03-03T13:14:43.100731Z" 439 | } 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "ala_coords_file = mdshare.fetch(\n", 444 | " \"alanine-dipeptide-3x250ns-heavy-atom-positions.npz\", working_directory=\"data\"\n", 445 | ")\n", 446 | "with np.load(ala_coords_file) as fh:\n", 447 | " data = fh[\"arr_0\"]\n", 448 | "\n", 449 | "dihedral_file = mdshare.fetch(\n", 450 | " \"alanine-dipeptide-3x250ns-backbone-dihedrals.npz\", working_directory=\"data\"\n", 451 | ")\n", 452 | "with np.load(dihedral_file) as fh:\n", 453 | " dihedral = fh[\"arr_0\"]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "#### Cell 2: Visualization\n", 461 | "Since the dynamics of the molecule are completely described by its position in the dihedral plane, we can use these two variables every time we need to pass an x-axis and y-axis to a plotting function" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "ExecuteTime": { 469 | "end_time": "2021-03-03T13:15:14.047608Z", 470 | "start_time": "2021-03-03T13:15:13.626735Z" 471 | } 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "pyemma.plots.plot_density(*dihedral.T, cmap=\"viridis\")\n", 476 | "plt.show()" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "#### Cell 3: Hyperparameters\n", 484 | "The `nb_epochs` variable is missing a value. 
Experiment with the training and find a number of epochs that ensures that your network will converge every time you train it" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "ExecuteTime": { 492 | "end_time": "2021-03-02T09:54:06.123542Z", 493 | "start_time": "2021-03-02T09:54:06.120163Z" 494 | } 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "tau = 1\n", 499 | "\n", 500 | "batch_size = 10000\n", 501 | "\n", 502 | "train_ratio = 0.9\n", 503 | "\n", 504 | "network_depth = 6\n", 505 | "\n", 506 | "layer_width = 30\n", 507 | "\n", 508 | "learning_rate = 5e-3\n", 509 | "\n", 510 | "output_size = 6\n", 511 | "\n", 512 | "nodes = [data.shape[1]] + [layer_width for _ in range(network_depth)] + [output_size]\n", 513 | "\n", 514 | "nb_epoch = ## FIXME" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "## Your network code goes here" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "When you are done, the results should look like this:\n", 531 | "\n", 532 | "#### Dihedral space separation\n", 533 | "" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "#### Output values for each node\n", 541 | "\n", 542 | "\n", 543 | "\n", 544 | "\n", 545 | "\n", 546 | "" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "#### Timescales\n", 554 | "" 555 | ] 556 | } 557 | ], 558 | "metadata": { 559 | "kernelspec": { 560 | "display_name": "Python 3 (ipykernel)", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.8.12" 575 | }, 576 | "toc": { 577 | "base_numbering": 1, 578 | "nav_menu": {}, 579 | "number_sections": false, 580 | "sideBar": true, 581 | "skip_h1_title": true, 582 | "title_cell": "Table of Contents", 583 | "title_sidebar": "Contents", 584 | "toc_cell": false, 585 | "toc_position": {}, 586 | "toc_section_display": true, 587 | "toc_window_display": true 588 | } 589 | }, 590 | "nbformat": 4, 591 | "nbformat_minor": 2 592 | } 593 | -------------------------------------------------------------------------------- /notebooks/11-independent-markov-decomposition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Independent Markov decomposition\n", 8 | "\n", 9 | "In this notebook, we will explain how to split a global system into weakly coupled subsystems with independent Markov decomposition (IMD) [1,2]. Using a test system, we will show how to find an optimal partition into Markov-independent subsystems and how to model them independently.\n", 10 | "\n", 11 | "\n", 12 | "\n", 13 | "**Remember**:\n", 14 | "- to run the currently highlighted cell, hold ⇧ Shift and press ⏎ Enter;\n", 15 | "- to get help for a specific function, place the cursor within the function's brackets, hold ⇧ Shift, and press ⇥ Tab;\n", 16 | "- you can find the full documentation at [PyEMMA.org](http://www.pyemma.org)." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from tqdm.notebook import tqdm\n", 27 | "from matplotlib import pyplot as plt\n", 28 | "\n", 29 | "import itertools\n", 30 | "import networkx as nx\n", 31 | "\n", 32 | "import mdshare\n", 33 | "from deeptime.markov.msm import MaximumLikelihoodMSM, MarkovStateModel" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## state mapping\n", 41 | "We first have to look into different representations of a global system state. Imagine a system that consists of 2 subsystems, that each can exist in 3 states. On the one hand, we can write the system's state as a tuple, e.g. `(0, 2)` for the first sub-system being in state `0` and the second one in state `2`. On the other hand, we can also write the tuple as an integer, much like compressing the information into a single number. For the example system, the table of all possible states would be like this:\n", 42 | "\n", 43 | "| | | | | | | | | | | \n", 44 | "| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |\n", 45 | "| **state integer** | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n", 46 | "| **state tuple** | (0, 0) | (0, 1) | (0, 2) | (1, 0) | (1, 1) | (1, 2) | (2, 0) | (2, 1) | (2, 2) |\n", 47 | "\n", 48 | "Of course, mapping between these two representations can be generalized to arbitrary numbers of sub-systems with arbitrary states numbers.\n", 49 | "The notion here is that the **integer** describes the global system's state, whereas the **tuple** encodes each local system's state individually.\n", 50 | "\n", 51 | "In practise, system states can be converted between the tuple (local states) and integer (global state) using numpy. We only have to provide a corresponding *shape* for the system, `(3, 3)` in our case. Here's our example:" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "n_systems = 2 # number of local systems (tuple length)\n", 61 | "n_states = 3 # number of states per local system\n", 62 | "integer_trajectory = np.arange(9) # global states (cf. first line of above table)\n", 63 | "# this could be a time series!\n", 64 | "\n", 65 | "shape = tuple((n_states for _ in range(n_systems)))\n", 66 | "print('shape for unravaling: ', shape)\n", 67 | "\n", 68 | "tuple_trajectory = np.vstack(\n", 69 | " np.unravel_index(integer_trajectory, shape)\n", 70 | ")\n", 71 | "print('unraveled states:')\n", 72 | "print(tuple_trajectory)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "We see that numpy has converted our `integer_trajectory` into two separate trajectories, each representing the state of a local agent." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "print('int \\t tuple')\n", 89 | "for t in range(9):\n", 90 | " int_state = integer_trajectory[t]\n", 91 | " subsys0_state = tuple_trajectory[0][t]\n", 92 | " subsys1_state = tuple_trajectory[1][t]\n", 93 | " print(f'{int_state} \\t ({subsys0_state}, {subsys1_state})')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Looks familiar?\n", 101 | "\n", 102 | "**Task:**\n", 103 | "Please do the inverse operation: Map back from the tuple trajectories into the space of full system integers. 
There is a numpy function for this task." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "solution2": "hidden", 111 | "solution2_first": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "remapped_int_traj = #FIXME\n", 116 | "\n", 117 | "np.all(integer_trajectory == remapped_int_traj)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "solution2": "hidden" 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "# solution\n", 129 | "remapped_int_traj = np.ravel_multi_index(tuple_trajectory, \n", 130 | " tuple((n_states for _ in range(n_systems))))\n", 131 | "\n", 132 | "np.all(integer_trajectory == remapped_int_traj)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## A system of unkown structure\n", 140 | "\n", 141 | "You are now given discrete data for a system of unknown structure. The task is a) to identify weakly coupled sub-systems and b) to approximate such a subsystem using an independent MSM.\n", 142 | "\n", 143 | "**Hint:** The system consists of ten 2-state subsystems, i.e., has a total of $2^{10}=1024$ states. Some of the subsystems are strongly coupled, others have weak couplings only.\n", 144 | "\n", 145 | "**Task:** Please define the number of subsystems and the number of subsystem states" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "solution2": "hidden", 153 | "solution2_first": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "n_systems = #FIXME\n", 158 | "n_states = #FIXME" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "solution2": "hidden" 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "# solution\n", 170 | "n_systems = 10 # number of local systems\n", 171 | "n_states = 2 # number of states per local system" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Data\n", 179 | "First, we load the data. The trajectories were obtained by first defining a *global* transition matrix. Subsequently, a Markov chain sampler was used to create a time series from that matrix (saved every 20 steps). The *global* transition matrix uses (global) state integers to enumerate its states, therefore the trajectory that is loaded uses them as well. " 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "file = mdshare.fetch('imd_full_system_trajectory.npy', working_directory='data')\n", 189 | "full_sys_traj = np.load(file)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "In order to check *dependencies* between subsystems, we first need to retrieve the subsystem time series.\n", 197 | "\n", 198 | "**Task:** Compute the individual subsystem state trajectories as done above." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "solution2": "hidden", 206 | "solution2_first": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "subsys_trajs = # FIXME" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "solution2": "hidden" 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# solution\n", 222 | "subsys_trajs = np.vstack(\n", 223 | " np.unravel_index(full_sys_traj, tuple((n_states for _ in range(n_systems))))\n", 224 | ")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Define dependency score\n", 232 | "We now define the *dependency* score:" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "def compute_dependency(tmat12, tmat1, tmat2, score='frobenius'):\n", 242 | " \"\"\"\n", 243 | " compute dependency score between two systems\n", 244 | " :param tmat12: np.ndarray, transition matrix in joint space\n", 245 | " :param tmat1: np.ndarray, transition matrix in subsystem 1\n", 246 | " :param tmat2: np.ndarray, transition matrix in subsystem 2\n", 247 | " :param score: str, optional, matrix norm. one of frobenius, tracenorm.\n", 248 | " :return: float, dependency score\n", 249 | " \"\"\"\n", 250 | " if score == 'frobenius':\n", 251 | " d = np.linalg.norm(tmat12, ord='fro')**2 - \\\n", 252 | " np.linalg.norm(tmat1, ord='fro')**2 * np.linalg.norm(tmat2, ord='fro')**2\n", 253 | " elif score == 'tracenorm':\n", 254 | " d = np.linalg.norm(tmat12, ord='nuc') - \\\n", 255 | " np.linalg.norm(tmat1, ord='nuc') * np.linalg.norm(tmat2, ord='nuc')\n", 256 | " else:\n", 257 | " raise NotImplementedError('score must be one of frobenius, tracenorm.')\n", 258 | " \n", 259 | " return abs(d)\n" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "To compute the score on a pair of subsystems, we need to evaluate \n", 267 | "- the transition matrix of subsystem 1\n", 268 | "\n", 269 | "- the transition matrix of subsystem 2\n", 270 | "\n", 271 | "- the transition matrix in the joint space\n", 272 | "\n", 273 | "Let's start with the single sub-systems." 
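As a quick aside, a sanity check of the score just defined, using small made-up 2-state matrices (purely illustrative): if two subsystems were exactly independent, their joint transition matrix would be the Kronecker product of the marginals, both matrix norms factorize, and the dependency score vanishes.

    T1 = np.array([[0.9, 0.1],
                   [0.2, 0.8]])
    T2 = np.array([[0.7, 0.3],
                   [0.4, 0.6]])
    T12 = np.kron(T1, T2)   # joint transition matrix of two independent subsystems
    print(compute_dependency(T12, T1, T2, score='frobenius'))   # ~0 up to round-off
    print(compute_dependency(T12, T1, T2, score='tracenorm'))   # ~0 up to round-off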
274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# we will store the results in numpy arrays.\n", 283 | "single_tmats = np.empty((n_systems, n_states, n_states))" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "**Task:** Compute each system's transition matrix and store it in the above array" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "solution2": "hidden", 298 | "solution2_first": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "for n in range(n_systems):\n", 303 | " single_tmats[n] = #FIXME" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "solution2": "hidden" 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "# solution:\n", 315 | "for n in range(n_systems):\n", 316 | " msm = MaximumLikelihoodMSM(lagtime=1).fit_fetch(subsys_trajs[n])\n", 317 | " single_tmats[n] = msm.transition_matrix" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "We will now compute all pairs of systems or joint transition matrices. " 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "joint_tmats = np.empty((n_systems, n_systems, 2**n_states, 2**n_states))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "scrolled": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "# compute pairwise transition matrices\n", 345 | "for n1, n2 in itertools.combinations(range(n_systems), 2):\n", 346 | " dtraj_system1 = subsys_trajs[n1]\n", 347 | " dtraj_system2 = subsys_trajs[n2]\n", 348 | " \n", 349 | " # combine both system states into a global number\n", 350 | " # note that the number of systems in the *pair* is 2.\n", 351 | " combined_dtraj = np.ravel_multi_index((dtraj_system1, dtraj_system2), \n", 352 | " tuple((n_states for _ in range(2))))\n", 353 | " \n", 354 | " msm = MaximumLikelihoodMSM(lagtime=1).fit_fetch(combined_dtraj)\n", 355 | " joint_tmats[n1, n2] = msm.transition_matrix" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "### graph analysis\n", 363 | "We now compute dependencies for all pairs of systems and store them in a `networkx` graph. \n", 364 | "\n", 365 | "**Task**: Compute the dependency for all edges using the above defined function." 
366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "solution2": "hidden", 373 | "solution2_first": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "# compute different scores and store in a networkx graph object\n", 378 | "graph_fronorm = nx.Graph()\n", 379 | "graph_trace = nx.Graph()\n", 380 | "\n", 381 | "# for all pairs of subsystems, compute dependency scores with Frobenius and trace norm\n", 382 | "for n1, n2 in itertools.combinations(range(n_systems), 2):\n", 383 | " # compute with trace norm\n", 384 | " d = # FIXME\n", 385 | " graph_trace.add_edge(n1, n2, weight=d)\n", 386 | " \n", 387 | " # compute with frobenius norm\n", 388 | " d = #FIXME\n", 389 | " graph_fronorm.add_edge(n1, n2, weight=d)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "solution2": "hidden" 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "# solution\n", 401 | "\n", 402 | "# compute different scores and store in a networkx graph object\n", 403 | "graph_fronorm = nx.Graph()\n", 404 | "graph_trace = nx.Graph()\n", 405 | "\n", 406 | "# for all pairs of subsystems, compute dependency scores with Frobenius and trace norm\n", 407 | "for n1, n2 in itertools.combinations(range(n_systems), 2):\n", 408 | " d = compute_dependency(joint_tmats[n1, n2], \n", 409 | " single_tmats[n1], \n", 410 | " single_tmats[n2], \n", 411 | " score='tracenorm')\n", 412 | " graph_trace.add_edge(n1, n2, weight=d)\n", 413 | " \n", 414 | "\n", 415 | " d = compute_dependency(joint_tmats[n1, n2], \n", 416 | " single_tmats[n1], \n", 417 | " single_tmats[n2], \n", 418 | " score='frobenius')\n", 419 | " graph_fronorm.add_edge(n1, n2, weight=d)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### Draw the graph\n", 427 | "We now have an edge-weight graph, i.e., a network of subsystems (nodes) that are connected by their *dependency* (edges). We can use that graph to identify clusters of strongly coupled subsystems." 
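Besides visual inspection, the grouping can also be read off programmatically. A minimal sketch, assuming a simple relative cutoff on the edge weights (the 0.5 factor is a heuristic choice, not part of the method):

    w = nx.get_edge_attributes(graph_fronorm, 'weight')
    threshold = 0.5 * max(w.values())             # heuristic cutoff: keep only strong couplings
    strong = nx.Graph()
    strong.add_nodes_from(graph_fronorm.nodes)
    strong.add_edges_from(edge for edge, weight in w.items() if weight > threshold)
    print(list(nx.connected_components(strong)))  # the strongly coupled groups of subsystems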
428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# some plot properties\n", 437 | "_c = (0., 0., 0.)\n", 438 | "nodesize = 35\n", 439 | "edge_cmap = plt.matplotlib.colors.LinearSegmentedColormap.from_list(\"uwe\", [(*_c, 0.025), (*_c, 1)])\n", 440 | "font = plt.matplotlib.font_manager.FontProperties(size=12)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "fig, axes = plt.subplots(1, 2, figsize=(7.5, 3), gridspec_kw={'hspace':.25})\n", 450 | "names = ['trace norm', 'frobenius norm']\n", 451 | "for n_graph, graph in enumerate([graph_trace, graph_fronorm]):\n", 452 | " \n", 453 | " ax = axes[n_graph]\n", 454 | " # positions by Fruchterman-Reingold\n", 455 | " pos_dict = nx.spring_layout(graph, k=0.75 if n_graph == 0 else 0.4)\n", 456 | " ax.set_title(names[n_graph])\n", 457 | "\n", 458 | " weights = np.array(list(nx.get_edge_attributes(graph, 'weight').values()))\n", 459 | " \n", 460 | " # draw nodes\n", 461 | " nx.draw_networkx_nodes(graph, node_shape='s',\n", 462 | " node_size=nodesize, \n", 463 | " pos=pos_dict,\n", 464 | " ax=ax)\n", 465 | " nx.draw_networkx_labels(graph, pos=pos_dict, ax=ax, \n", 466 | " font_color='red', font_weight='bold', font_size=15)\n", 467 | " # draw all edges\n", 468 | " pc = nx.draw_networkx_edges(graph, edge_cmap=edge_cmap,\n", 469 | " edge_color=weights, width=2.4,\n", 470 | " pos=pos_dict, node_size=nodesize,\n", 471 | " ax=ax,\n", 472 | " )\n", 473 | " \n", 474 | " # define colormap\n", 475 | " pc.set_array(weights)\n", 476 | " pc.set_cmap(edge_cmap)\n", 477 | "\n", 478 | " cb = fig.colorbar(pc, ax=ax,\n", 479 | " aspect=25, pad=.15)\n", 480 | " cb.set_label(r'$d$')\n", 481 | " cbarticks = cb.ax.yaxis.get_ticklabels()\n", 482 | " \n", 483 | " # set font properties\n", 484 | " for _t in list(cbarticks):\n", 485 | " _t.set_font_properties(font)\n", 486 | " ax.axis('off');" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "### Interpretation:\n", 494 | "You should see a grouping of your nodes into 2 strongly coupled clusters. Within these clusters, the *dependency* is large - it is low between different clusters. The node node numbers tell you which of them belong to a certain cluster. They can be used to extract the given systems for individual modeling. " 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "## Modeling a single cluster independently\n", 502 | "\n", 503 | "Now that we have found an optimal partition, we can retrieve the model of one of the clusters, ignoring weak coupling between them. (Note that one would probably like to model both parts independently, for the sake of time we only look at one here - in this particular example, they are the same anyways.)\n", 504 | "\n", 505 | "**Task:** Please choose a set of subsystems to be modeled independently of the rest." 
506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "solution2": "hidden", 513 | "solution2_first": true 514 | }, 515 | "outputs": [], 516 | "source": [ 517 | "system_nodes = #FIXME" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": { 524 | "solution2": "hidden" 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "# solution\n", 529 | "system_nodes = [0, 2, 3, 4, 5] # or [1, 6, 7, 8, 9]" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "Now, the trajectories of these subsystems are extracted from the data. It will is re-written to an integer that describes the full state of that set of subsystems." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "# subsystem indexing ordered to match resulting matrices\n", 546 | "subsystem_trajectory = np.ravel_multi_index(\n", 547 | " np.array(subsys_trajs)[system_nodes], \n", 548 | " tuple((n_states for _ in range(len(system_nodes))))\n", 549 | ")" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "**Task:** Fit a maximum likelihood MSM to the subsystem-cluster trajectory; use a lag time of 1 steps." 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "solution2": "hidden", 564 | "solution2_first": true 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "msm = # FIXME" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": { 575 | "solution2": "hidden" 576 | }, 577 | "outputs": [], 578 | "source": [ 579 | "msm = MaximumLikelihoodMSM(lagtime=1).fit_fetch(subsystem_trajectory)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "Note that this transition matrix effectively models a lagtime of 20 because the trajectory was generated with that lag time.\n", 587 | "\n", 588 | "### compare transition matrices & implied timescales" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "# reference transition matrix (does not include weak couplings between the two clusters!)\n", 598 | "channel_tmat = np.load(mdshare.fetch('imd_channel_transitionmatrix.npy', working_directory='data'))\n", 599 | "dt = 20 # time step used for generating the data\n", 600 | "\n", 601 | "# adjust lag time of generating matrix\n", 602 | "ref_msm = MarkovStateModel(np.linalg.matrix_power(channel_tmat, dt))" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "fig, ax = plt.subplots(1, 2)\n", 612 | "ax[0].imshow(msm.transition_matrix, norm=plt.matplotlib.colors.LogNorm())\n", 613 | "ax[0].set_title('estimated')\n", 614 | "ax[1].imshow(ref_msm.transition_matrix, norm=plt.matplotlib.colors.LogNorm())\n", 615 | "ax[1].set_title('reference');" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "The transition matrices look very similar, however few pixels are empty (white) due to the fact that even with 1,000,000 steps, not all states of the chosen set of subsystems were sampled." 
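To make that last point explicit, one can count which joint microstates of the selected cluster never show up in the data (a small illustrative check with the variables defined above):

    n_possible = n_states ** len(system_nodes)              # 2**5 = 32 joint microstates
    visited = np.unique(subsystem_trajectory)
    missing = np.setdiff1d(np.arange(n_possible), visited)
    print(f'{visited.size} of {n_possible} states visited; never observed: {missing}')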
623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "fig, ax = plt.subplots(1, 2, figsize=(8, 4))\n", 632 | "\n", 633 | "ax[0].plot(msm.transition_matrix.flat, ref_msm.transition_matrix.flat, '.')\n", 634 | "ax[0].loglog()\n", 635 | "\n", 636 | "its_ref = ref_msm.timescales()\n", 637 | "its_est = msm.timescales()\n", 638 | "\n", 639 | "ax[1].plot(its_ref, 'r.', label='reference')\n", 640 | "ax[1].plot(its_est, 'b.', label='estimate')\n", 641 | "ax[1].semilogy()\n", 642 | "fig.legend()" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": {}, 648 | "source": [ 649 | "The spectrum is well-approximated. As this model does not incoporate the weak coupling between the two large node clusters, it is only an approximation." 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "## References:\n", 657 | "\n", 658 | "[1] Hempel, T.; del Razo, M. J.; Lee, C. T.; Taylor, B. C.; Amaro, R. E.; Noé, F. _Independent Markov Decomposition: Toward Modeling Kinetics of Biomolecular Complexes._ Proc Natl Acad Sci USA 2021, 118 (31), e2105230118. https://doi.org/10.1073/pnas.2105230118.\n", 659 | ".\n", 660 | "\n", 661 | "[2] Hempel, T.; Plattner, N.; Noé, F. _Coupling of Conformational Switches in Calcium Sensor Unraveled with Local Markov Models and Transfer Entropy._ J. Chem. Theory Comput. 2020, 16 (4), 2584–2593. https://doi.org/10.1021/acs.jctc.0c00043.\n" 662 | ] 663 | } 664 | ], 665 | "metadata": { 666 | "kernelspec": { 667 | "display_name": "Python 3 (ipykernel)", 668 | "language": "python", 669 | "name": "python3" 670 | }, 671 | "language_info": { 672 | "codemirror_mode": { 673 | "name": "ipython", 674 | "version": 3 675 | }, 676 | "file_extension": ".py", 677 | "mimetype": "text/x-python", 678 | "name": "python", 679 | "nbconvert_exporter": "python", 680 | "pygments_lexer": "ipython3", 681 | "version": "3.9.10" 682 | }, 683 | "toc": { 684 | "base_numbering": 1, 685 | "nav_menu": {}, 686 | "number_sections": true, 687 | "sideBar": true, 688 | "skip_h1_title": false, 689 | "title_cell": "Table of Contents", 690 | "title_sidebar": "Contents", 691 | "toc_cell": false, 692 | "toc_position": {}, 693 | "toc_section_display": true, 694 | "toc_window_display": false 695 | } 696 | }, 697 | "nbformat": 4, 698 | "nbformat_minor": 2 699 | } 700 | -------------------------------------------------------------------------------- /notebooks/03-msm-estimation-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MSM estimation and validation\n", 8 | "\n", 9 | "\"Creative\n", 10 | "\n", 11 | "In this notebook, we will cover how to estimate a Markov state model (MSM) and do model validation;\n", 12 | "we also show how to save and restore model and estimator objects.\n", 13 | "For this notebook, you need to know how to do data loading/visualization as well as dimension reduction.\n", 14 | "\n", 15 | "\n", 16 | "**Remember**:\n", 17 | "- to run the currently highlighted cell, hold ⇧ Shift and press ⏎ Enter;\n", 18 | "- to get help for a specific function, place the cursor within the function's brackets, hold ⇧ Shift, and press ⇥ Tab;\n", 19 | "- you can find the full documentation for PyEMMA at [PyEMMA.org](http://www.pyemma.org) and for deeptime at [deeptime-ml.github.io](https://deeptime-ml.github.io/)." 
20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "%matplotlib inline\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "from tqdm.notebook import tqdm\n", 31 | "import numpy as np\n", 32 | "import mdshare\n", 33 | "import pyemma" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Loading MD data and repeating the clustering step\n", 41 | "\n", 42 | "Let's load alanine dipeptide backbone torsions and discretise with 200 $k$-means centers..." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "pdb = mdshare.fetch('alanine-dipeptide-nowater.pdb', working_directory='data')\n", 52 | "files = mdshare.fetch('alanine-dipeptide-*-250ns-nowater.xtc', working_directory='data')\n", 53 | "\n", 54 | "feat = pyemma.coordinates.featurizer(pdb)\n", 55 | "feat.add_backbone_torsions(periodic=False)\n", 56 | "\n", 57 | "data = pyemma.coordinates.load(files, features=feat)\n", 58 | "\n", 59 | "from deeptime.clustering import KMeans\n", 60 | "cluster = KMeans(200, max_iter=50, progress=tqdm)\\\n", 61 | " .fit_fetch(np.concatenate(data)[::10])" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "... and plot the free energy along with the cluster centers:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "fig, ax = plt.subplots()\n", 78 | "pyemma.plots.plot_free_energy(*np.concatenate(data).T, ax=ax, legacy=False)\n", 79 | "ax.scatter(*cluster.cluster_centers.T, s=15, c='k')\n", 80 | "ax.set_xlabel('$\\Phi$ / rad') \n", 81 | "ax.set_ylabel('$\\Psi$ / rad')\n", 82 | "fig.tight_layout()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Implied time scales and lag time selection\n", 90 | "\n", 91 | "The first step after obtaining the discretized dynamics is finding a suitable lag time.\n", 92 | "The systematic approach is to estimate MSMs at various lag times and observe how the implied timescales (ITSs) of these models behave.\n", 93 | "In particular, we are looking for lag time ranges in which the implied timescales are constant.\n", 94 | "\n", 95 | "To that end we iterate over a range of lagtimes and estimate a Markov state model for each of them, subsequently computing the four slowest (``k=4``) timescales from it." 
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from deeptime.markov import TransitionCountEstimator\n", 105 | "from deeptime.markov.msm import MaximumLikelihoodMSM\n", 106 | "\n", 107 | "dtrajs = [cluster.transform(traj) for traj in data]\n", 108 | "\n", 109 | "lags = [1, 2, 5, 10, 20, 50]\n", 110 | "models = []\n", 111 | "for lag in tqdm(lags, leave=False):\n", 112 | " counts_estimator = TransitionCountEstimator(lag, \"sliding\")\n", 113 | " counts = counts_estimator.fit_fetch(dtrajs)\n", 114 | " counts = counts.submodel_largest()\n", 115 | " \n", 116 | " msm_estimator = MaximumLikelihoodMSM()\n", 117 | " models.append(msm_estimator.fit_fetch(counts))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "$$\\begin{eqnarray*}\n", 125 | "T(n \\tau) & = & (T(\\tau))^n\\\\[0.75em]\n", 126 | "\\lambda(n \\tau) & = & (\\lambda(\\tau))^n\\\\[0.75em]\n", 127 | "\\mathrm{ITS}(n \\tau) & = & - \\frac{n \\tau}{\\ln \\lambda(n \\tau)} = - \\frac{n \\tau}{\\ln (\\lambda(\\tau))^n} = - \\frac{\\tau}{\\ln \\lambda(\\tau)} = \\mathrm{ITS}(\\tau)\n", 128 | "\\end{eqnarray*}$$\n", 129 | "\n", 130 | "We can pass the returned estimated timescales as \"lagtime-timescale\"-tuple to the `pyemma.plots.plot_implied_timescales()` function:" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "from deeptime.util.validation import implied_timescales\n", 140 | "from deeptime.plots import plot_implied_timescales\n", 141 | "\n", 142 | "its = implied_timescales(models)\n", 143 | "ax = plot_implied_timescales(its, n_its=4)\n", 144 | "ax.set_yscale('log')\n", 145 | "ax.set_xlabel('lagtime (ps)')\n", 146 | "ax.set_ylabel('timescales (ps)');" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "The above plot tells us that there are three resolved processes (blue, orange, green) which are largely invariant to the MSM lag time.\n", 154 | "The fourth ITS (red) is smaller than the lag time (black line, grey-shaded area);\n", 155 | "it corresponds to a process which is faster than the lag time and, thus, is not resolved.\n", 156 | "Since the implied timescales are, like the corresponding eigenvalues, sorted in decreasing order,\n", 157 | "we know that all other remaining processes must be even faster.\n", 158 | "\n", 159 | "## Error bars for the timescales\n", 160 | "\n", 161 | "Error bars can be obtained with Bayesian sampling:" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "from deeptime.markov.msm import BayesianMSM\n", 171 | "\n", 172 | "def its_bayesian_msm(data, lagtimes):\n", 173 | " models = [BayesianMSM(n_samples=50, lagtime=lag).fit_fetch(data) for lag in tqdm(lagtimes)]\n", 174 | " return implied_timescales(models)\n", 175 | "\n", 176 | "its = its_bayesian_msm(dtrajs, [1, 2, 5, 10, 20, 50])\n", 177 | "\n", 178 | "ax = plot_implied_timescales(its, n_its=4)\n", 179 | "ax.set_yscale('log')\n", 180 | "ax.set_xlabel('lagtime (ps)')\n", 181 | "ax.set_ylabel('timescales (ps)');" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Effect of the discretization on the implied timescales\n", 189 | "\n", 190 | "Let's look at the discretisation's influence on the ITSs:" 191 | ] 192 
| }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "def its_msm(data, lagtimes):\n", 200 | " models = [MaximumLikelihoodMSM(lagtime=lag).fit_fetch(data) for lag in lagtimes]\n", 201 | " return implied_timescales(models)\n", 202 | "\n", 203 | "lags = [1, 2, 5, 10, 20, 50]\n", 204 | "\n", 205 | "cluster_20 = KMeans(20, max_iter=50).fit_fetch(np.concatenate(data)[::10])\n", 206 | "its_20 = its_msm([cluster_20.transform(x) for x in data], lags)\n", 207 | "\n", 208 | "cluster_50 = KMeans(50, max_iter=50).fit_fetch(np.concatenate(data)[::10])\n", 209 | "its_50 = its_msm([cluster_50.transform(x) for x in data], lags)\n", 210 | "\n", 211 | "cluster_100 = KMeans(100, max_iter=50).fit_fetch(np.concatenate(data)[::10])\n", 212 | "its_100 = its_msm([cluster_100.transform(x) for x in data], lags);" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "fig, axes = plt.subplots(2, 3, figsize=(12, 6))\n", 222 | "\n", 223 | "pyemma.plots.plot_free_energy(*np.concatenate(data).T, ax=axes[0, 0], cbar=False)\n", 224 | "axes[0, 0].scatter(*cluster_20.cluster_centers.T, s=15, c='k')\n", 225 | "ax_its = plot_implied_timescales(its_20, ax=axes[1, 0], n_its=4)\n", 226 | "ax_its.set_yscale('log')\n", 227 | "ax_its.set_xlabel('lagtime (ps)')\n", 228 | "\n", 229 | "pyemma.plots.plot_free_energy(*np.concatenate(data).T, ax=axes[0, 1], cbar=False)\n", 230 | "axes[0, 1].scatter(*cluster_50.cluster_centers.T, s=15, c='k')\n", 231 | "ax_its = plot_implied_timescales(its_50, ax=axes[1, 1], n_its=4)\n", 232 | "ax_its.set_yscale('log')\n", 233 | "ax_its.set_xlabel('lagtime (ps)')\n", 234 | "\n", 235 | "pyemma.plots.plot_free_energy(*np.concatenate(data).T, ax=axes[0, 2], cbar=False)\n", 236 | "axes[0, 2].scatter(*cluster_100.cluster_centers.T, s=15, c='k')\n", 237 | "ax_its = plot_implied_timescales(its_100, ax=axes[1, 2], n_its=4)\n", 238 | "ax_its.set_yscale('log')\n", 239 | "ax_its.set_xlabel('lagtime (ps)')\n", 240 | "\n", 241 | "fig.tight_layout()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Estimating the maximum likelihood Markov model" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "counts_estimator = TransitionCountEstimator(lagtime=10, count_mode='sliding')\n", 258 | "counts = counts_estimator.fit_fetch(dtrajs).submodel_largest()\n", 259 | "\n", 260 | "msm_estimator = MaximumLikelihoodMSM()\n", 261 | "msm = msm_estimator.fit_fetch(counts)\n", 262 | "\n", 263 | "print(f'fraction of states used = {msm.state_fraction}')\n", 264 | "print(f'fraction of counts used = {msm.count_fraction}')" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "msm.timescales(k=4)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "The state space can be restricted to largest connected set (`submodel_largest()`) or any other selection of states:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "counts = counts.submodel([0, 1, 3, 7])\n", 290 | "print(f\"States: {counts.states}, state symbols: {counts.state_symbols}\")\n", 291 | "msm = 
MaximumLikelihoodMSM().fit_fetch(counts)\n", 292 | "\n", 293 | "print(f'fraction of states used = {msm.state_fraction}')\n", 294 | "print(f'fraction of counts used = {msm.count_fraction}')" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "And restricted even further, always based on the _states_ of the current count model." 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "counts = counts.submodel([0, 3])\n", 311 | "print(f\"States: {counts.states}, state symbols: {counts.state_symbols}\")\n", 312 | "msm = MaximumLikelihoodMSM().fit(counts).fetch_model()\n", 313 | "\n", 314 | "print(f'fraction of states used = {msm.state_fraction}')\n", 315 | "print(f'fraction of counts used = {msm.count_fraction}')" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "## Estimating the Bayesian Markov model" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "count_estimator = TransitionCountEstimator(lagtime=10, count_mode='effective')\n", 332 | "counts = count_estimator.fit_fetch(dtrajs).submodel_largest()\n", 333 | "bayesian_msm_estimator = BayesianMSM()\n", 334 | "bayesian_msm = bayesian_msm_estimator.fit_fetch(counts)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Shortcut:" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "bayesian_msm = BayesianMSM(lagtime=10).fit_fetch(dtrajs)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "stats = bayesian_msm.gather_stats('timescales', k=3)\n", 360 | "stats.L, stats.R" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## The Chapman-Kolmogorov test\n", 368 | "\n", 369 | "To see whether our model satisfies Markovianity, we perform (and visualize) a Chapman-Kolmogorow (CK) test.\n", 370 | "Since we aim at modeling the dynamics between metastable states rather than between microstates, this will be conducted in the space of metastable states.\n", 371 | "The latter are identified automatically using PCCA++ (which is explained later).\n", 372 | "We usually choose the number of metastable states according to the implied timescales plot by identifying a gap between the ITS." 
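Schematically, the test checks the Markov property

$$ T(n\tau) \approx \left[ T(\tau) \right]^{n} $$

on the level of the metastable sets: the set-to-set transition probabilities predicted by propagating the lag-$\tau$ model are compared with those re-estimated directly at the longer lag times $n\tau$.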
373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "test_model = MaximumLikelihoodMSM(lagtime=10).fit_fetch(dtrajs);" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "models = []\n", 391 | "for lagtime in [10, 20, 30, 40, 50, 80, 100]:\n", 392 | " models.append(MaximumLikelihoodMSM(lagtime=lagtime).fit_fetch(dtrajs))\n", 393 | "ck_test = test_model.ck_test(models, n_metastable_sets=4)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "from deeptime.plots import plot_ck_test\n", 403 | "\n", 404 | "plot_ck_test(ck_test, xlabel='lagtime (ps)', sharey=True);" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "counts = TransitionCountEstimator(lagtime=10, count_mode='effective')\\\n", 414 | " .fit_fetch(dtrajs).submodel_largest()\n", 415 | "test_bmsm = BayesianMSM().fit_fetch(counts)\n", 416 | "\n", 417 | "models = [BayesianMSM(lagtime=lagtime, n_samples=20).fit_fetch(dtrajs) for lagtime in tqdm([10, 20, 30, 40, 50, 80, 100])]" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "ck_test = test_bmsm.ck_test(models, n_metastable_sets=4)\n", 427 | "plot_ck_test(ck_test, xlabel='lagtime (ps)');" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "## Persisting and restoring estimators" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "import pickle\n", 444 | "\n", 445 | "with open('cluster_50.pkl', 'wb') as f:\n", 446 | " pickle.dump(cluster_50, f)\n", 447 | " \n", 448 | "with open('cluster_50.pkl', 'rb') as f:\n", 449 | " cluster_50_restored = pickle.load(f)\n", 450 | " \n", 451 | "print(cluster_50_restored.n_clusters)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "with open('msm.pkl', 'wb') as f:\n", 461 | " pickle.dump(msm, f)\n", 462 | "with open('msm.pkl', 'rb') as f:\n", 463 | " msm_restored = pickle.load(f)\n", 464 | " \n", 465 | "print(f\"Timescales {msm.timescales()}, restored {msm_restored.timescales()}\")" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "## Hands-on\n", 473 | "\n", 474 | "#### Exercise 1\n", 475 | "\n", 476 | "Load the heavy atom distances into memory, perform PCA and TICA (`lag=3`) with `dim=2`,\n", 477 | "then discretize with $100$ $k$-means centers and a stride of $10$. Compare the two discretizations be generating implied timescale plots for both of them." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "solution2": "hidden", 485 | "solution2_first": true 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "feat = #FIXME\n", 490 | "feat. 
#FIXME\n", 491 | "data = #FIXME\n", 492 | "\n", 493 | "from sklearn.decomposition import PCA\n", 494 | "pca = PCA(n_components=2).fit(np.concatenate(data))\n", 495 | "tica = #FIXME\n", 496 | "\n", 497 | "pca_output = #FIXME\n", 498 | "tica_output = [tica.transform(traj) for traj in data]\n", 499 | "\n", 500 | "cls_pca_estimator = KMeans(100, max_iter=50)\n", 501 | "cls_pca = #FIXME\n", 502 | "cls_tica = #FIXME\n", 503 | "\n", 504 | "dtrajs_pca = [cls_pca.transform(pca.transform(traj)) for traj in data]\n", 505 | "dtrajs_tica = # FIXME\n", 506 | "\n", 507 | "its_pca = implied_timescales_msm(dtrajs_pca, lags=[1, 2, 5, 10, 20, 50])\n", 508 | "its_tica = #FIXME" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": { 514 | "solution2": "hidden" 515 | }, 516 | "source": [ 517 | "###### Solution" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": { 524 | "solution2": "hidden" 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "feat = pyemma.coordinates.featurizer(pdb)\n", 529 | "pairs = feat.pairs(feat.select_Heavy())\n", 530 | "feat.add_distances(pairs, periodic=False)\n", 531 | "data = pyemma.coordinates.load(files, features=feat)\n", 532 | "\n", 533 | "from sklearn.decomposition import PCA\n", 534 | "pca = PCA(n_components=2).fit(np.concatenate(data))\n", 535 | "\n", 536 | "from deeptime.decomposition import TICA\n", 537 | "tica_estimator = TICA(lagtime=3, dim=2)\n", 538 | "tica = tica_estimator.fit_fetch(data)\n", 539 | "\n", 540 | "pca_output = [pca.transform(traj) for traj in data]\n", 541 | "tica_output = [tica.transform(traj) for traj in data]\n", 542 | "\n", 543 | "cls_pca = KMeans(100, max_iter=50).fit(np.concatenate(pca_output)[::10]).fetch_model()\n", 544 | "cls_tica = KMeans(100, max_iter=50).fit(np.concatenate(tica_output)[::10]).fetch_model()\n", 545 | "\n", 546 | "dtrajs_pca = [cls_pca.transform(pca.transform(traj)) for traj in data]\n", 547 | "dtrajs_tica = [cls_tica.transform(tica.transform(traj)) for traj in data]\n", 548 | "\n", 549 | "lags = [1, 2, 5, 10, 20, 50]\n", 550 | "its_pca = implied_timescales([MaximumLikelihoodMSM(lagtime=lag).fit_fetch(dtrajs_pca) for lag in lags])\n", 551 | "its_tica = implied_timescales([MaximumLikelihoodMSM(lagtime=lag).fit_fetch(dtrajs_tica) for lag in lags])" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "Let's visualize the ITS convergence for both projections:" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "fig, axes = plt.subplots(2, 3, figsize=(12, 6))\n", 568 | "pyemma.plots.plot_feature_histograms(np.concatenate(pca_output), ax=axes[0, 0])\n", 569 | "pyemma.plots.plot_feature_histograms(np.concatenate(tica_output), ax=axes[1, 0])\n", 570 | "axes[0, 0].set_title('PCA')\n", 571 | "axes[1, 0].set_title('TICA')\n", 572 | "pyemma.plots.plot_density(*np.concatenate(pca_output).T, ax=axes[0, 1], cbar=False, alpha=0.1)\n", 573 | "axes[0, 1].scatter(*cls_pca.cluster_centers.T, s=15, c='C1')\n", 574 | "axes[0, 1].set_xlabel('PC 1')\n", 575 | "axes[0, 1].set_ylabel('PC 2')\n", 576 | "pyemma.plots.plot_density(*np.concatenate(tica_output).T, ax=axes[1, 1], cbar=False, alpha=0.1)\n", 577 | "axes[1, 1].scatter(*cls_tica.cluster_centers.T, s=15, c='C1')\n", 578 | "axes[1, 1].set_xlabel('IC 1')\n", 579 | "axes[1, 1].set_ylabel('IC 2')\n", 580 | "ax_its = plot_implied_timescales(its_pca, ax=axes[0, 2], n_its=4)\n", 581 | 
"ax_its.set_yscale('log')\n", 582 | "ax_its.set_xlabel('lagtime (ps)')\n", 583 | "ax_its = plot_implied_timescales(its_tica, ax=axes[1, 2], n_its=4)\n", 584 | "ax_its.set_yscale('log')\n", 585 | "ax_its.set_xlabel('lagtime (ps)')\n", 586 | "axes[0, 2].set_ylim(1, 2000)\n", 587 | "axes[1, 2].set_ylim(1, 2000)\n", 588 | "fig.tight_layout()" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "Despite the fact that PCA yields a projection with some defined basins,\n", 596 | "the ITS plot shows that only one \"slow\" process is resolved which is more than one order of magnitude too fast.\n", 597 | "\n", 598 | "TICA does find three slow processes which agree (in terms of the implied timescales) with the backbone torsions example above.\n", 599 | "\n", 600 | "We conclude that this PCA projection is not suitable to resolve the slow dynamics of alanine dipeptide and we will continue to estimate/validate the TICA-based projection.\n", 601 | "\n", 602 | "#### Exercise 2\n", 603 | "\n", 604 | "Estimate a Bayesian MSM at lag time $10$ ps and perform/show a CK test for four metastable states." 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "solution2": "hidden", 612 | "solution2_first": true 613 | }, 614 | "outputs": [], 615 | "source": [ 616 | "counts_estimator = TransitionCountEstimator(lagtime=10, count_mode=\"effective\")\n", 617 | "counts = counts_estimator.fit_fetch(dtrajs_tica).submodel_largest()\n", 618 | "bayesian_msm = # FIXME\n", 619 | "pyemma.plots. #FIXME" 620 | ] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": { 625 | "solution2": "hidden" 626 | }, 627 | "source": [ 628 | "###### Solution" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": { 635 | "solution2": "hidden" 636 | }, 637 | "outputs": [], 638 | "source": [ 639 | "test_model = BayesianMSM(n_samples=50, lagtime=10).fit_fetch(dtrajs_tica)\n", 640 | "\n", 641 | "models = []\n", 642 | "for i in tqdm(range(1, 10)):\n", 643 | " models.append(BayesianMSM(n_samples=50, lagtime=i*10).fit_fetch(dtrajs_tica))\n", 644 | "\n", 645 | "ck_test = test_model.ck_test(models, n_metastable_sets=4)\n", 646 | "plot_ck_test(ck_test, xlabel='lagtime (ps)', sharey=True);" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "We again see a good agreement between model prediction and re-estimation.\n", 654 | "\n", 655 | "## Wrapping up\n", 656 | "In this notebook, we have learned how to estimate a regular or Bayesian MSM from discretized molecular simulation data with `deeptime` and `pyemma`, also how to perform basic model validation.\n", 657 | "\n", 658 | "In detail, we have selected a suitable lag time by\n", 659 | "- computing timescales from MSMs and Bayesian MSMs\n", 660 | "- `pyemma.plots.plot_implied_timescales()` to visualize the convergence of the implied timescales.\n", 661 | "\n", 662 | "We then have used\n", 663 | "- `dt.markov.TransitionCountEstimator()` to estimate transition counts\n", 664 | "- `dt.markov.msm.MaximumLikelihoodMSM()` to estimate a regular MSM,\n", 665 | "- `dt.markov.msm.BayesianMSM()` to estimate a Bayesian MSM,\n", 666 | "- the `timescales()` method of an estimated MSM object to access its implied timescales,\n", 667 | "- the `chapman_kolmogorov_validator()` method of an estimated MSM estiamator to perform a Chapman-Kolmogorow test, and\n", 668 | "- `pyemma.plots.plot_cktest()` to visualize the 
latter." 669 | ] 670 | } 671 | ], 672 | "metadata": { 673 | "kernelspec": { 674 | "display_name": "deeptime", 675 | "language": "python", 676 | "name": "deeptime" 677 | }, 678 | "language_info": { 679 | "codemirror_mode": { 680 | "name": "ipython", 681 | "version": 3 682 | }, 683 | "file_extension": ".py", 684 | "mimetype": "text/x-python", 685 | "name": "python", 686 | "nbconvert_exporter": "python", 687 | "pygments_lexer": "ipython3", 688 | "version": "3.9.10" 689 | }, 690 | "toc": { 691 | "base_numbering": 1, 692 | "nav_menu": {}, 693 | "number_sections": true, 694 | "sideBar": true, 695 | "skip_h1_title": false, 696 | "title_cell": "Table of Contents", 697 | "title_sidebar": "Contents", 698 | "toc_cell": false, 699 | "toc_position": {}, 700 | "toc_section_display": true, 701 | "toc_window_display": true 702 | } 703 | }, 704 | "nbformat": 4, 705 | "nbformat_minor": 4 706 | } 707 | -------------------------------------------------------------------------------- /notebooks/08-vampnets-session-solved.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# VAMPnets\n", 8 | "\n", 9 | "\"Creative

\n", 10 | "\n", 11 | "In this session we will see an example of how to use VAMPnets to extract a coarse-grained model from raw data using a n unsupervised deep learning approach. We will load data from a 2D toy model with xxx states, and build and train a neural network that assigns each datapoint to a separate state, and finally visualize the information we extracted from the dataset. \n", 12 | "After this, we will follow the same process to analyse a trajectory of the molecule Alanine Dipeptide, since it is a 30D system whose dynamics can be easily visualized in a 2D space.\n", 13 | "\n", 14 | "\n", 15 | "Here you can find literature on the used method.\n", 16 | "\n", 17 | "**Remember**:\n", 18 | "- to run the currently highlighted cell, hold ⇧ Shift and press ⏎ Enter;\n", 19 | "- to get help for a specific function, place the cursor within the function's brackets, hold ⇧ Shift, and press ⇥ Tab;" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Import the required packages\n", 27 | "\n", 28 | "In case you haven't installed pytorch: [Installation instructions](https://pytorch.org/get-started/locally/)." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2022-02-17T10:00:18.448464Z", 37 | "start_time": "2022-02-17T10:00:16.828959Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "%matplotlib inline\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "import numpy as np\n", 45 | "import mdshare\n", 46 | "import pyemma\n", 47 | "import deeptime as dt\n", 48 | "import torch\n", 49 | "import torch.nn as nn\n", 50 | "\n", 51 | "from tqdm.notebook import tqdm\n", 52 | "from deeptime.plots import plot_implied_timescales\n", 53 | "from deeptime.util.validation import implied_timescales" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "ExecuteTime": { 61 | "end_time": "2022-02-17T10:00:18.454416Z", 62 | "start_time": "2022-02-17T10:00:18.450757Z" 63 | } 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# this is optional if you have CUDA/GPU support\n", 68 | "device = torch.device(\"cuda\")\n", 69 | "torch.backends.cudnn.benchmark = True\n", 70 | "\n", 71 | "torch.set_num_threads(12)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Guided example: 2D toy model\n", 79 | "We start by loading the data for the 2D model by using the package `mdshare`. The `fetch` function fetches the data from our servers. 
**Do not use `mdshare` for your own data!**" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "ExecuteTime": { 87 | "end_time": "2022-02-17T10:00:18.888328Z", 88 | "start_time": "2022-02-17T10:00:18.456161Z" 89 | } 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "file = mdshare.fetch(\"hmm-doublewell-2d-100k.npz\", working_directory=\"data\")\n", 94 | "with np.load(file) as fh:\n", 95 | " data = fh[\"trajectory\"]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Next we want to visualize how the data are distributed in the 2D space.\n", 103 | "\n", 104 | "#### Exercise\n", 105 | "Plot the density of the data using a function from the `pyemma` package." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "ExecuteTime": { 113 | "end_time": "2022-02-17T10:00:19.481092Z", 114 | "start_time": "2022-02-17T10:00:18.890480Z" 115 | } 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "pyemma.plots.plot_density(data[:,0], data[:,1]) ##FIXME\n", 120 | "plt.show()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Hyperparameter selection\n", 128 | "The next step is a bit tricky, as hyperparameter selection requires some experience to be done correctly. We provide some default values that allow for a smooth training of our model. The meaning of every hyperparameter is explained in the next cell." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "ExecuteTime": { 136 | "end_time": "2022-02-17T10:07:20.118868Z", 137 | "start_time": "2022-02-17T10:07:20.112073Z" 138 | } 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "# Tau, the time shift between the instantaneous and the time-lagged dataset\n", 143 | "tau = 1\n", 144 | "\n", 145 | "# Batch size for stochastic gradient descent\n", 146 | "batch_size = 3000\n", 147 | "\n", 148 | "# Fraction of trajectory points used for validation\n", 149 | "val_ratio = 0.1\n", 150 | "\n", 151 | "# How many hidden layers the network has\n", 152 | "network_depth = 4\n", 153 | "\n", 154 | "# \"Width\" of every layer\n", 155 | "layer_width = 20\n", 156 | "\n", 157 | "# Learning rate used for the ADAM optimizer\n", 158 | "learning_rate = 5e-3\n", 159 | "\n", 160 | "# How many output states the network has\n", 161 | "output_size = 2\n", 162 | "\n", 163 | "# List of nodes of each layer\n", 164 | "nodes = [data.shape[1]] + [layer_width for _ in range(network_depth)] + [output_size]\n", 165 | "\n", 166 | "# Number of passes over the training set in the fitting process;\n", 167 | "# i.e., how many epochs the training algorithm will run\n", 168 | "nb_epoch = 20" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Data preprocessing\n", 176 | "\n", 177 | "Now we can prepare our data so that it can be used for training our VAMPnets model. We want two arrays made of coupled datapoints, which are selected from the main trajectory at indexes $i, i+\tau$. We want the two trajectories to be shuffled, but to maintain the correspondence between the non-time-lagged and the time-lagged datapoints. 
Finally, we want to split our data into a training set and a validation set: the former is used to train the network, while the latter is needed to check whether the network is overfitting (i.e., the learned transformation works only on the training set but not on other data from the same distribution).\n", 178 | "
\n", 179 | "
" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "ExecuteTime": { 187 | "end_time": "2022-02-17T10:07:20.833305Z", 188 | "start_time": "2022-02-17T10:07:20.829666Z" 189 | } 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "dataset = dt.util.data.TrajectoryDataset(lagtime=tau, trajectory=data.astype(np.float32))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "ExecuteTime": { 201 | "end_time": "2022-02-17T10:07:21.087694Z", 202 | "start_time": "2022-02-17T10:07:21.061429Z" 203 | } 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "n_val = int(len(dataset)*val_ratio)\n", 208 | "train_data, val_data = torch.utils.data.random_split(dataset, [len(dataset) - n_val, n_val])" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "ExecuteTime": { 216 | "end_time": "2022-02-17T10:07:21.261012Z", 217 | "start_time": "2022-02-17T10:07:21.247655Z" 218 | } 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "from deeptime.util.torch import MLP\n", 223 | "lobe = MLP(units=nodes, nonlinearity=nn.ELU, output_nonlinearity=nn.Softmax)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "ExecuteTime": { 231 | "end_time": "2022-02-17T10:07:22.043651Z", 232 | "start_time": "2022-02-17T10:07:22.040554Z" 233 | } 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "vampnet = dt.decomposition.deep.VAMPNet(lobe=lobe, learning_rate=learning_rate)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "ExecuteTime": { 245 | "end_time": "2022-02-17T10:07:22.705168Z", 246 | "start_time": "2022-02-17T10:07:22.702307Z" 247 | } 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "from torch.utils.data import DataLoader\n", 252 | "\n", 253 | "loader_train = DataLoader(train_data, batch_size=batch_size, shuffle=True)\n", 254 | "loader_val = DataLoader(val_data, batch_size=len(val_data), shuffle=False)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2022-02-17T10:12:19.176451Z", 263 | "start_time": "2022-02-17T10:07:23.393483Z" 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "model = vampnet.fit(loader_train, n_epochs=nb_epoch, validation_loader=loader_val, progress=tqdm).fetch_model()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Model validation\n", 276 | "\n", 277 | "When the previous cell will finish running, we have successfully (🤞) trained our VAMPnets. We can plot the training information to visualize how well our training proceeded, and by plotting both training and validation information we can make sure that our model didn't overfit. Before running the next cell, consider that the our network's training and validation scores should converge to a value slightly lower than $2$, since the score is calculated as the norm of the singular values of the estimated Koopman operator. We only have 2 output nodes and the largest singular value is always $=1$." 
278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "ExecuteTime": { 285 | "end_time": "2022-02-17T10:12:19.887907Z", 286 | "start_time": "2022-02-17T10:12:19.179367Z" 287 | } 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "plt.loglog(*vampnet.train_scores.T, label='training')\n", 292 | "plt.loglog(*vampnet.validation_scores.T, label='validation')\n", 293 | "plt.xlabel('step')\n", 294 | "plt.ylabel('score')\n", 295 | "plt.legend();" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "Now we can finally reap the results of our work: if the network was properly trained it should automatically separate the two wells in our system. We can verify this hypothesis by first transforming our dataset with the network using the `model.predict` method." 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": { 309 | "ExecuteTime": { 310 | "end_time": "2022-02-17T10:17:25.523840Z", 311 | "start_time": "2022-02-17T10:17:25.250759Z" 312 | } 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "transformed_data = model.transform(data)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "Now we can visualize to which states the network assigns every point; we do so in the following cell by calculating to which state every datapoint is most likely to be assigned by the network:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "ExecuteTime": { 331 | "end_time": "2022-02-17T10:17:29.444753Z", 332 | "start_time": "2022-02-17T10:17:26.202519Z" 333 | } 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "plt.scatter(*data.T, c=transformed_data[:,0])" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "If you are looking at an orange and a blue ball, your network reached its optimal state during the training. 
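As an optional numerical companion to the scatter plot, one can also check how crisp the soft state assignments are; this is just a sketch using the `transformed_data` array from above, and the threshold of 0.95 is an arbitrary illustrative choice:

```python
import numpy as np

# softmax memberships, shape (n_frames, 2)
memberships = transformed_data

# fraction of frames assigned to one state with > 95% membership
crisp_fraction = np.mean(np.max(memberships, axis=1) > 0.95)
print(f"fraction of crisply assigned frames: {crisp_fraction:.3f}")

# population of each state under a hard (argmax) assignment
print("state populations:", np.bincount(np.argmax(memberships, axis=1)))
```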
\n", 345 | "\n", 346 | "We can further analyze the output of the network by visualizing the decision landscape:" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "ExecuteTime": { 354 | "end_time": "2022-02-17T10:17:33.209614Z", 355 | "start_time": "2022-02-17T10:17:32.789421Z" 356 | } 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "xmax = np.max(np.abs(data[:, 0]))\n", 361 | "ymin = np.min(data[:, 1])\n", 362 | "ymax = np.max(data[:, 1])\n", 363 | "grid = np.meshgrid(np.linspace(-xmax-1, xmax+1, 150), np.linspace(ymin-1, ymax+1, 50))\n", 364 | "xy = np.dstack(grid).reshape(-1, 2)\n", 365 | "z = model.transform(xy)[:,0]\n", 366 | "\n", 367 | "cb = plt.contourf(grid[0], grid[1], z.reshape(grid[0].shape), levels=15, cmap='coolwarm')\n", 368 | "plt.colorbar(cb);" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "Since this is a very simple system, the network should enforce a very sharp classification, with most of the points belonging to either `state 1` or `state 2`, with only a few points in between having a mixed value.\n", 376 | "\n", 377 | "As a last step, we can verify that the network preserves the slow information in the system by plotting the implied timescales present in our transformed data:" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "ExecuteTime": { 385 | "end_time": "2022-02-17T10:17:43.337174Z", 386 | "start_time": "2022-02-17T10:17:36.340910Z" 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "lagtimes = np.arange(1, 11)\n", 392 | "its = implied_timescales([dt.decomposition.VAMP(lagtime=lag, observable_transform=model).fit(data).fetch_model() for lag in lagtimes])\n", 393 | "fig, axes = plt.subplots(1, 1, figsize=(6, 4))\n", 394 | "\n", 395 | "plot_implied_timescales(its, ax=axes)\n", 396 | "axes.set_yscale('log')\n", 397 | "axes.set_xlabel('lagtime (steps)')\n", 398 | "axes.set_ylabel('timescale (steps)')\n", 399 | "fig.tight_layout()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "## Hands-on session: Alanine Dipeptide\n", 407 | "In the following three cells, you are given the loading function for the alanine-dipeptide trajectories (along with its 2 dihedral values), a plot that shows how to visualize information about the molecule using the dihedral data, and a set of hyperparameters. Build and train a network that classifies alanine samples, and set the number of epochs so that your network converges to a stable score. Plot your results and confront them to the provided examples." 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "#### Cell 1: Loading\n", 415 | "**NOTE: do NOT use the dihedral information for the training! 
It would be easier to do so, but the interesting aspect of this exercise lies in seeing how easily the network extracts a low-dimensional representation from a high-dimensional space**" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "ExecuteTime": { 423 | "end_time": "2022-02-17T09:42:23.006552Z", 424 | "start_time": "2022-02-17T09:42:22.940646Z" 425 | } 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "ala_coords_file = mdshare.fetch(\n", 430 | " \"alanine-dipeptide-3x250ns-heavy-atom-positions.npz\", working_directory=\"data\"\n", 431 | ")\n", 432 | "with np.load(ala_coords_file) as fh:\n", 433 | " data = fh[\"arr_0\"]\n", 434 | "\n", 435 | "dihedral_file = mdshare.fetch(\n", 436 | " \"alanine-dipeptide-3x250ns-backbone-dihedrals.npz\", working_directory=\"data\"\n", 437 | ")\n", 438 | "with np.load(dihedral_file) as fh:\n", 439 | " dihedral = fh[\"arr_0\"]" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "#### Cell 2: Visualization\n", 447 | "Since the dynamics of the molecule are completely described by its position in the dihedral plane, we can use these two variables whenever we need to pass an x-axis and a y-axis to a plotting function." 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "ExecuteTime": { 455 | "end_time": "2022-02-17T09:42:27.352272Z", 456 | "start_time": "2022-02-17T09:42:26.990397Z" 457 | } 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "pyemma.plots.plot_density(*dihedral.T, cmap=\"viridis\")\n", 462 | "plt.show()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "#### Cell 3: Hyperparameters\n", 470 | "The `nb_epoch` variable is missing a value. 
Experiment with the training and find a number of epochs that ensures that your network will converge every time you train it" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "ExecuteTime": { 478 | "end_time": "2022-02-17T09:51:17.283372Z", 479 | "start_time": "2022-02-17T09:51:17.277646Z" 480 | } 481 | }, 482 | "outputs": [], 483 | "source": [ 484 | "tau = 1\n", 485 | "\n", 486 | "batch_size = 10000\n", 487 | "\n", 488 | "train_ratio = 0.9\n", 489 | "\n", 490 | "network_depth = 6\n", 491 | "\n", 492 | "layer_width = 30\n", 493 | "\n", 494 | "learning_rate = 5e-3\n", 495 | "\n", 496 | "output_size = 6\n", 497 | "\n", 498 | "nodes = [data.shape[1]] + [layer_width for _ in range(network_depth)] + [output_size]\n", 499 | "\n", 500 | "nb_epoch = 30## FIXME" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "ExecuteTime": { 508 | "end_time": "2022-02-17T09:42:30.220102Z", 509 | "start_time": "2022-02-17T09:42:30.206208Z" 510 | } 511 | }, 512 | "outputs": [], 513 | "source": [ 514 | "dataset = dt.util.data.TrajectoryDataset(lagtime=tau, trajectory=data.astype(np.float32))" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": { 521 | "ExecuteTime": { 522 | "end_time": "2022-02-17T09:42:31.123941Z", 523 | "start_time": "2022-02-17T09:42:31.078446Z" 524 | } 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "n_val = int(len(dataset)*val_ratio)\n", 529 | "train_data, val_data = torch.utils.data.random_split(dataset, [len(dataset) - n_val, n_val])" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": { 536 | "ExecuteTime": { 537 | "end_time": "2022-02-17T09:51:14.732232Z", 538 | "start_time": "2022-02-17T09:51:14.724872Z" 539 | } 540 | }, 541 | "outputs": [], 542 | "source": [ 543 | "from deeptime.util.torch import MLP\n", 544 | "lobe = MLP(units=nodes, nonlinearity=nn.ELU, output_nonlinearity=nn.Softmax)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "ExecuteTime": { 552 | "end_time": "2022-02-17T09:51:15.169200Z", 553 | "start_time": "2022-02-17T09:51:15.163638Z" 554 | } 555 | }, 556 | "outputs": [], 557 | "source": [ 558 | "vampnet = dt.decomposition.deep.VAMPNet(lobe=lobe, learning_rate=learning_rate)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": { 565 | "ExecuteTime": { 566 | "end_time": "2022-02-17T09:51:19.257364Z", 567 | "start_time": "2022-02-17T09:51:19.252728Z" 568 | } 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "from torch.utils.data import DataLoader\n", 573 | "\n", 574 | "loader_train = DataLoader(train_data, batch_size=batch_size, shuffle=True)\n", 575 | "loader_val = DataLoader(val_data, batch_size=len(val_data), shuffle=False)" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": { 582 | "ExecuteTime": { 583 | "end_time": "2022-02-17T09:57:57.168327Z", 584 | "start_time": "2022-02-17T09:51:20.226057Z" 585 | } 586 | }, 587 | "outputs": [], 588 | "source": [ 589 | "model = vampnet.fit(loader_train, n_epochs=nb_epoch, validation_loader=loader_val, progress=tqdm).fetch_model()" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": { 596 | "ExecuteTime": { 597 | "end_time": "2022-02-17T09:58:00.304365Z", 598 | "start_time": "2022-02-17T09:57:59.706153Z" 
599 | } 600 | }, 601 | "outputs": [], 602 | "source": [ 603 | "plt.loglog(*vampnet.train_scores.T, label='training')\n", 604 | "plt.loglog(*vampnet.validation_scores.T, label='validation')\n", 605 | "plt.xlabel('step')\n", 606 | "plt.ylabel('score')\n", 607 | "plt.legend();" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": { 614 | "ExecuteTime": { 615 | "end_time": "2022-02-17T09:58:07.768794Z", 616 | "start_time": "2022-02-17T09:58:06.620620Z" 617 | } 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "transformed_data = model.transform(data)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": { 628 | "ExecuteTime": { 629 | "end_time": "2022-02-17T09:58:37.567956Z", 630 | "start_time": "2022-02-17T09:58:08.445781Z" 631 | } 632 | }, 633 | "outputs": [], 634 | "source": [ 635 | "lagtimes = np.arange(1, 11)\n", 636 | "its = implied_timescales([dt.decomposition.VAMP(lagtime=lag, observable_transform=model).fit(data).fetch_model() for lag in lagtimes])\n", 637 | "fig, axes = plt.subplots(1, 1, figsize=(6, 4))\n", 638 | "\n", 639 | "plot_implied_timescales(its, ax=axes)\n", 640 | "axes.set_yscale('log')\n", 641 | "axes.set_xlabel('lagtime (steps)')\n", 642 | "axes.set_ylabel('timescale (steps)')\n", 643 | "fig.tight_layout()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": { 650 | "ExecuteTime": { 651 | "end_time": "2022-02-17T09:58:55.913229Z", 652 | "start_time": "2022-02-17T09:58:37.570595Z" 653 | } 654 | }, 655 | "outputs": [], 656 | "source": [ 657 | "for i in range(output_size):\n", 658 | " plt.scatter(*dihedral.T, c=transformed_data[:,i], s=0.5)\n", 659 | " plt.show()" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": { 666 | "ExecuteTime": { 667 | "end_time": "2022-02-17T09:58:55.921705Z", 668 | "start_time": "2022-02-17T09:58:55.915544Z" 669 | } 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "colorcode = np.argmax(transformed_data, axis=1)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": { 680 | "ExecuteTime": { 681 | "end_time": "2022-02-17T09:58:59.401302Z", 682 | "start_time": "2022-02-17T09:58:55.925013Z" 683 | } 684 | }, 685 | "outputs": [], 686 | "source": [ 687 | "plt.scatter(*dihedral.T, c=colorcode, s=0.5)\n", 688 | "plt.show()" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [ 697 | "## Your network code goes here" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": {}, 703 | "source": [ 704 | "When you are done, the results should look like this:\n", 705 | "\n", 706 | "#### Dihedral space separation\n", 707 | "" 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": {}, 713 | "source": [ 714 | "#### Output values for each node\n", 715 | "\n", 716 | "\n", 717 | "\n", 718 | "\n", 719 | "\n", 720 | "" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "#### Timescales\n", 728 | "" 729 | ] 730 | } 731 | ], 732 | "metadata": { 733 | "kernelspec": { 734 | "display_name": "Python 3 (ipykernel)", 735 | "language": "python", 736 | "name": "python3" 737 | }, 738 | "language_info": { 739 | "codemirror_mode": { 740 | "name": "ipython", 741 | "version": 3 742 | }, 743 | "file_extension": ".py", 744 | "mimetype": "text/x-python", 745 | 
"name": "python", 746 | "nbconvert_exporter": "python", 747 | "pygments_lexer": "ipython3", 748 | "version": "3.9.10" 749 | }, 750 | "toc": { 751 | "base_numbering": 1, 752 | "nav_menu": {}, 753 | "number_sections": false, 754 | "sideBar": true, 755 | "skip_h1_title": true, 756 | "title_cell": "Table of Contents", 757 | "title_sidebar": "Contents", 758 | "toc_cell": false, 759 | "toc_position": {}, 760 | "toc_section_display": true, 761 | "toc_window_display": true 762 | } 763 | }, 764 | "nbformat": 4, 765 | "nbformat_minor": 2 766 | } 767 | --------------------------------------------------------------------------------