├── .github
│   └── workflows
│       └── pythonpackage.yml
├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── 404error.png
│   ├── colab.png
│   ├── conda_env_name.png
│   ├── kaggle_upload.png
│   ├── kernel_example.png
│   ├── kernel_mapping.png
│   ├── mof_building_principle.png
│   ├── racs.png
│   ├── result.gif
│   ├── save_copy_colab.png
│   └── spheres.png
├── data
│   ├── .gitkeep
│   ├── data.csv
│   ├── features.csv
│   └── submission.csv
├── environment.yml
├── molsim_ml.ipynb
└── test
    └── test.py
/.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | pull_request: 8 | branches: 9 | - "*" 10 | 11 | jobs: 12 | test_conda: 13 | name: Ex1 (${{ matrix.python-version }}, ${{ matrix.os }}) 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: ["ubuntu-latest", "macos-latest", "windows-latest"] 19 | python-version: ["3.7", "3.8", "3.9"] 20 | steps: 21 | - uses: actions/checkout@v1 22 | - uses: conda-incubator/setup-miniconda@v2 23 | with: 24 | auto-update-conda: true 25 | python-version: ${{ matrix.python-version }} 26 | - name: Conda bash 27 | shell: bash -l {0} 28 | run: export CONDA_ALWAYS_YES="true" && conda env create --file environment.yml --name molsim_ml && conda activate molsim_ml && python test/test.py 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/vim,macos,python,jupyternotebooks,visualstudiocode 3 | # Edit at https://www.gitignore.io/?templates=vim,macos,python,jupyternotebooks,visualstudiocode 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .ipynb_checkpoints 10 | */.ipynb_checkpoints/* 11 | 12 | # IPython 13 | profile_default/ 14 | ipython_config.py 15 | 16 | # Remove previous ipynb_checkpoints 17 | # git rm -r .ipynb_checkpoints/ 18 | 19 | ### macOS ### 20 | # General 21 | .DS_Store 22 | .AppleDouble 23 | .LSOverride 24 | 25 | # Icon must end with two \r 26 | Icon 27 | 28 | # Thumbnails 29 | ._* 30 | 31 | # Files that might appear in the root of a volume 32 | .DocumentRevisions-V100 33 | .fseventsd 34 | .Spotlight-V100 35 | .TemporaryItems 36 | .Trashes 37 | .VolumeIcon.icns 38 | .com.apple.timemachine.donotpresent 39 | 40 | # Directories potentially created on remote AFP share 41 | .AppleDB 42 | .AppleDesktop 43 | Network Trash Folder 44 | Temporary Items 45 | .apdisk 46 | 47 | ### Python ### 48 | # Byte-compiled / optimized / DLL files 49 | __pycache__/ 50 | *.py[cod] 51 | *$py.class 52 | 53 | # C extensions 54 | *.so 55 | 56 | # Distribution / packaging 57 | .Python 58 | build/ 59 | develop-eggs/ 60 | dist/ 61 | downloads/ 62 | eggs/ 63 | .eggs/ 64 | lib/ 65 | lib64/ 66 | parts/ 67 | sdist/ 68 | var/ 69 | wheels/ 70 | pip-wheel-metadata/ 71 | share/python-wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .nox/ 91 | .coverage 92 | .coverage.* 93 | .cache 94 | nosetests.xml 95 | coverage.xml 96 | *.cover 97 | .hypothesis/ 98 | .pytest_cache/ 99 | 100 | # Translations 101 | *.mo 102 | *.pot 103 | 104 | # Scrapy stuff: 105 | .scrapy 106 | 107 | # Sphinx documentation 108 | docs/_build/ 109 | 110 | # PyBuilder 111 | target/ 112 | 113 | # pyenv 114 | .python-version 115 | 116 | # pipenv 117 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 118 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 119 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 120 | # install all needed dependencies. 121 | #Pipfile.lock 122 | 123 | # celery beat schedule file 124 | celerybeat-schedule 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # Mr Developer 137 | .mr.developer.cfg 138 | .project 139 | .pydevproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | ### Vim ### 153 | # Swap 154 | [._]*.s[a-v][a-z] 155 | [._]*.sw[a-p] 156 | [._]s[a-rt-v][a-z] 157 | [._]ss[a-gi-z] 158 | [._]sw[a-p] 159 | 160 | # Session 161 | Session.vim 162 | Sessionx.vim 163 | 164 | # Temporary 165 | .netrwhist 166 | *~ 167 | 168 | # Auto-generated tag files 169 | tags 170 | 171 | # Persistent undo 172 | [._]*.un~ 173 | 174 | # Coc configuration directory 175 | .vim 176 | 177 | ### VisualStudioCode ### 178 | .vscode/* 179 | !.vscode/settings.json 180 | !.vscode/tasks.json 181 | !.vscode/launch.json 182 | !.vscode/extensions.json 183 | 184 | ### VisualStudioCode Patch ### 185 | # Ignore all local history of files 186 | .history 187 | 188 | # End of https://www.gitignore.io/api/vim,macos,python,jupyternotebooks,visualstudiocode 189 | molsim_solution.ipynb -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kevin Jablonka 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML workshop for MolSim 2024 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kjappelbaum/ml_molsim/blob/2024/molsim_ml.ipynb) 4 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/kjappelbaum/ml_molsim/2024?filepath=molsim_ml.ipynb) 5 | [![](https://img.shields.io/badge/python-3.7+-blue.svg)](https://www.python.org/download/releases/3.7.0/) 6 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 7 | [![Actions Status](https://github.com/kjappelbaum/ml_molsim/workflows/Python%20package/badge.svg)](https://github.com/kjappelbaum/ml_molsim/actions) 8 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3605363.svg)](https://doi.org/10.5281/zenodo.3605363) 9 | 10 | In this exercise, we will build a simple model that can predict the carbon dioxide uptake of MOFs. The goal is to get familiar with the tools that are used for machine learning and to develop an understanding of the workflow, tricks, and pitfalls (e.g., why baselines are important). More of the theory can be found [in our review](https://pubs.acs.org/doi/abs/10.1021/acs.chemrev.0c00004). 11 | 12 | ![Parity plot result](assets/result.gif) 13 | 14 | If you find errors, typos, or other issues, feel free to [open an issue](https://help.github.com/en/github/managing-your-work-on-github/about-issues) or directly make a [pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests). 15 | 16 | # How to run it 17 | 18 | If you have a modern laptop, we recommend that you run the exercises 19 | locally. If you do not want to use your machine or the cluster, you 20 | can also run the exercises on Google Colab. 21 | 22 | ## Some tips 23 | 24 | - If you are not familiar with the Python data science stack, we can recommend [some cheatsheets](https://www.utc.fr/~jlaforet/Suppl/python-cheatsheets.pdf). 25 | - If you are not familiar with a function, you can get help in a Jupyter notebook by going into the parentheses of the function and hitting SHIFT + TAB; alternatively, you can prepend a variable/function/library with `?`, e.g., `?str.replace` 26 | - The errors you'll run into have most likely been encountered by someone else before. If you copy/paste the error message into a search engine like Google, you will often find the solution to your problem on a site like StackOverflow 27 | - [Here](https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/) are some nice tips/tricks for using Jupyter notebooks 28 | - For plotting, we use the [holoviews library](https://holoviews.org/) as it is one of the simplest ways to create interactive figures in Python (it is a high-level interface to the [bokeh library](https://bokeh.org/)). You might find the [Getting Started section of the documentation](https://holoviews.org/getting_started/) useful if you want to understand it better; a minimal example is also sketched at the end of this README. I also found [this guide from Caltech](http://bebi103.caltech.edu.s3-website-us-east-1.amazonaws.com/2019a/content/lessons/lesson_03/intro_to_holoviews.html) useful. 29 | 30 | ## Run it locally (recommended) 31 | 32 | The following steps assume that you use macOS or some Linux flavor.
If you use Windows, we recommend that you first install the [Windows Subsystem for Linux (WSL)](https://docs.microsoft.com/en-us/windows/wsl/install-win10). 33 | 34 | Clone this repository and check out the 2024 branch (you need `git` for this; if you get a `command not found` error for `git`, you can install it with `sudo apt-get install git`) 35 | 36 | ```bash 37 | git clone https://github.com/kjappelbaum/ml_molsim.git 38 | cd ml_molsim 39 | git checkout 2024 40 | ``` 41 | 42 | We recommend that you create a virtual conda environment on your computer in which you install the dependencies for this exercise. To do so, head over to [Miniconda](https://docs.conda.io/en/latest/miniconda.html) and follow the installation instructions there. 43 | 44 | Then, use 45 | 46 | ```bash 47 | conda env create -f environment.yml -n ml_molsim 48 | ``` 49 | 50 | You can activate this environment using 51 | 52 | ```bash 53 | conda activate ml_molsim 54 | ``` 55 | 56 | After this, you can start Jupyter Lab and select the `molsim_ml.ipynb` file from the file browser. 57 | 58 | ```bash 59 | jupyter lab 60 | ``` 61 | 62 | Make sure that the notebook kernel runs in the correct environment: 63 | 64 | ![Environment name](assets/conda_env_name.png) 65 | 66 | If the environment name that is shown is different from "ml_molsim", you can click on it and select the correct one. 67 | 68 | ## Use it on Google Colab 69 | 70 | ![Screenshot of the Colab environment](assets/colab.png) 71 | 72 | Here, you can use relatively powerful computing resources (like [GPUs](https://en.wikipedia.org/wiki/Graphics_processing_unit) and [TPUs](https://en.wikipedia.org/wiki/Tensor_Processing_Unit)) from Google for free. 73 | Click the "Open in Colab" button on the top, then make a copy of the notebook into your Google Drive, and run the first three cells to 74 | install the dependencies. 75 | Then you should be able to use the notebook in Colab. 76 | 77 | ![Making a copy in Colab](assets/save_copy_colab.png) 78 | 79 | **Make sure to make a copy into your Google Drive and work on this copy, 80 | not on the shared notebook!** 81 | 82 | _Note:_ If you have a Google Account from your organization, e.g., a university, you might 83 | need to log out and use your personal account, as many organizations block 84 | third-party applications. 85 | 86 | _Note:_ Google Colab also requires that you reload the JavaScript of holoviews in each plotting cell. 87 | So, you have to start every cell that contains a holoviews plot with `hv.extension('bokeh')` 88 | 89 | ## Acknowledgements 90 | 91 | We want to thank [Leopold Talirz](https://github.com/ltalirz) for incredibly valuable feedback and input during the initial phases of development. 92 | We also want to thank Peter Alexander Knudsen for spotting typos, as well as [Prof. Tristan Bereau](https://github.com/tbereau) and all MolSim participants and TAs for feedback.
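
## A minimal holoviews example

Since holoviews may be new to you, here is a minimal, self-contained sketch of the plotting pattern the notebook relies on. The synthetic data and all option values below are illustrative assumptions, not taken from the exercise itself:

```python
import numpy as np
import holoviews as hv

hv.extension('bokeh')  # load the bokeh backend (on Colab, repeat this in every plotting cell)

# hypothetical example data; the notebook uses MOF descriptors instead
x = np.random.rand(100)
y = 2 * x + np.random.normal(0, 0.1, 100)

# an interactive scatter plot with hover tooltips
scatter = hv.Scatter((x, y), kdims='x', vdims='y').opts(
    width=400, height=400, tools=['hover']
)
scatter  # in a notebook, the last expression in a cell is rendered
```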
93 | -------------------------------------------------------------------------------- /assets/404error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/404error.png -------------------------------------------------------------------------------- /assets/colab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/colab.png -------------------------------------------------------------------------------- /assets/conda_env_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/conda_env_name.png -------------------------------------------------------------------------------- /assets/kaggle_upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/kaggle_upload.png -------------------------------------------------------------------------------- /assets/kernel_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/kernel_example.png -------------------------------------------------------------------------------- /assets/kernel_mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/kernel_mapping.png -------------------------------------------------------------------------------- /assets/mof_building_principle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/mof_building_principle.png -------------------------------------------------------------------------------- /assets/racs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/racs.png -------------------------------------------------------------------------------- /assets/result.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/result.gif -------------------------------------------------------------------------------- /assets/save_copy_colab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/save_copy_colab.png -------------------------------------------------------------------------------- /assets/spheres.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/assets/spheres.png -------------------------------------------------------------------------------- /data/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjappelbaum/ml_molsim/0ebed176b972c9ba9ddf7b6e3a4d04eaf3f32409/data/.gitkeep -------------------------------------------------------------------------------- /data/submission.csv: -------------------------------------------------------------------------------- 1 | id,prediction 2 | 0,4568 3 | 1, 4 | 2, 5 | 3, [... ids 4 through 1999 follow in the same pattern, one per line, each with an empty prediction field ...] 2002 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | - pip 6 | - ipywidgets 7 | - hyperopt 8 | - kaggle 9 | - plotly 10 | - scikit-learn 11 | - scipy 12 | - numpy 13 | - python 14 | - py-xgboost 15 | - jupyterlab 16 | - nb_conda 17 | - bokeh 18 | - pandas 19 | - holoviews 20 | - seaborn 21 | - notebook<7.0.0 22 | - pymatviz -------------------------------------------------------------------------------- /molsim_ml.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ML for Gas Adsorption" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## -1. Only if you run this notebook on Colab" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "If you use this notebook on Colab, please uncomment the lines below (remove the `#`) and execute the cell." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#import sys\n", 31 | "#!{sys.executable} -m pip install -U pandas-profiling[notebook]\n", 32 | "#!jupyter nbextension enable --py widgetsnbextension\n", 33 | "#!pip install --upgrade pandas scikit-learn holoviews bokeh plotly matplotlib\n", 34 | "#!wget https://raw.githubusercontent.com/kjappelbaum/ml_molsim/2022/descriptornames.py\n", 35 | "#!mkdir data\n", 36 | "#!cd data && wget https://github.com/kjappelbaum/ml_molsim/raw/2022/data/data.csv\n", 37 | "#!cd data && wget https://github.com/kjappelbaum/ml_molsim/raw/2022/data/features.csv\n", 38 | "# import os, holoviews as hv\n", 39 | "# os.environ['HV_DOC_HTML'] = 'true'" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Import packages we will need" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 7, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# basics\n", 56 | "import os\n", 57 | "import numpy as np\n", 58 | "import pprint as pp\n", 59 | "\n", 60 | "# pandas is used to read/process data\n", 61 | "import pandas as pd\n", 62 | "\n", 63 | "# machine learning dependencies\n", 64 | "# scaling of data\n", 65 | "from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler\n", 66 | "# train/test split\n", 67 | "from sklearn.model_selection import train_test_split\n", 68 | "# model selection\n", 69 | "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", 70 | "# the KRR model\n", 71 | "from sklearn.kernel_ridge import KernelRidge\n", 72 | "# linear model\n", 73 | "from sklearn.linear_model import LinearRegression\n", 74 | "# pipeline to streamline modeling pipelines\n", 75 | "from sklearn.pipeline import Pipeline\n", 76 | "# principal component analysis\n", 77 | "from sklearn.decomposition import PCA\n", 78 | "# polynomial kernel\n", 79 | "from sklearn.metrics.pairwise import polynomial_kernel\n", 80 | "# Dummy model as baseline\n", 81 | "from sklearn.dummy import DummyClassifier, DummyRegressor\n", 82 | "# Variance Threshold for feature selection\n", 83 | "from sklearn.feature_selection import VarianceThreshold, SelectFromModel\n", 84 | "# metrics to measure model performance\n", 85 | "from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,\n", 86 | " mean_absolute_error, mean_squared_error, max_error)\n", 87 | "\n", 88 | "# save/load models\n", 89 | "import joblib\n", 90 | "\n", 91 | "# For the permutation importance implementation\n", 92 | "from joblib import Parallel\n", 93 | "from joblib import delayed\n", 94 | "from sklearn.metrics import check_scoring\n", 95 | "from sklearn.utils import Bunch\n", 96 | "from sklearn.utils import check_random_state\n", 97 | "from sklearn.utils import check_array\n", 98 | "\n", 99 | "# plotting\n", 100 | "import matplotlib.pyplot as plt\n", 101 | "%matplotlib inline\n", 102 | "from
pymatviz.parity import hist_density\n", 103 | "\n", 104 | "RANDOM_SEED = 4242424242\n", 105 | "DATA_DIR = 'data'\n", 106 | "DATA_FILE = os.path.join(DATA_DIR, 'data.csv')\n", 107 | "\n", 108 | "np.random.seed(RANDOM_SEED)\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "\n", 118 | "other_descriptors = [\"CellV [A^3]\"]\n", 119 | "\n", 120 | "geometric_descriptors = [\n", 121 | " \"Di\",\n", 122 | " \"Df\",\n", 123 | " \"Dif\",\n", 124 | " \"density [g/cm^3]\",\n", 125 | " \"total_SA_volumetric\",\n", 126 | " \"total_SA_gravimetric\",\n", 127 | " \"total_POV_volumetric\",\n", 128 | " \"total_POV_gravimetric\",\n", 129 | "]\n", 130 | "\n", 131 | "linker_descriptors = [\n", 132 | " \"f-lig-chi-0\",\n", 133 | " \"f-lig-chi-1\",\n", 134 | " \"f-lig-chi-2\",\n", 135 | " \"f-lig-chi-3\",\n", 136 | " \"f-lig-Z-0\",\n", 137 | " \"f-lig-Z-1\",\n", 138 | " \"f-lig-Z-2\",\n", 139 | " \"f-lig-Z-3\",\n", 140 | " \"f-lig-I-0\",\n", 141 | " \"f-lig-I-1\",\n", 142 | " \"f-lig-I-2\",\n", 143 | " \"f-lig-I-3\",\n", 144 | " \"f-lig-T-0\",\n", 145 | " \"f-lig-T-1\",\n", 146 | " \"f-lig-T-2\",\n", 147 | " \"f-lig-T-3\",\n", 148 | " \"f-lig-S-0\",\n", 149 | " \"f-lig-S-1\",\n", 150 | " \"f-lig-S-2\",\n", 151 | " \"f-lig-S-3\",\n", 152 | " \"lc-chi-0-all\",\n", 153 | " \"lc-chi-1-all\",\n", 154 | " \"lc-chi-2-all\",\n", 155 | " \"lc-chi-3-all\",\n", 156 | " \"lc-Z-0-all\",\n", 157 | " \"lc-Z-1-all\",\n", 158 | " \"lc-Z-2-all\",\n", 159 | " \"lc-Z-3-all\",\n", 160 | " \"lc-I-0-all\",\n", 161 | " \"lc-I-1-all\",\n", 162 | " \"lc-I-2-all\",\n", 163 | " \"lc-I-3-all\",\n", 164 | " \"lc-T-0-all\",\n", 165 | " \"lc-T-1-all\",\n", 166 | " \"lc-T-2-all\",\n", 167 | " \"lc-T-3-all\",\n", 168 | " \"lc-S-0-all\",\n", 169 | " \"lc-S-1-all\",\n", 170 | " \"lc-S-2-all\",\n", 171 | " \"lc-S-3-all\",\n", 172 | " \"lc-alpha-0-all\",\n", 173 | " \"lc-alpha-1-all\",\n", 174 | " \"lc-alpha-2-all\",\n", 175 | " \"lc-alpha-3-all\",\n", 176 | " \"D_lc-chi-0-all\",\n", 177 | " \"D_lc-chi-1-all\",\n", 178 | " \"D_lc-chi-2-all\",\n", 179 | " \"D_lc-chi-3-all\",\n", 180 | " \"D_lc-Z-0-all\",\n", 181 | " \"D_lc-Z-1-all\",\n", 182 | " \"D_lc-Z-2-all\",\n", 183 | " \"D_lc-Z-3-all\",\n", 184 | " \"D_lc-I-0-all\",\n", 185 | " \"D_lc-I-1-all\",\n", 186 | " \"D_lc-I-2-all\",\n", 187 | " \"D_lc-I-3-all\",\n", 188 | " \"D_lc-T-0-all\",\n", 189 | " \"D_lc-T-1-all\",\n", 190 | " \"D_lc-T-2-all\",\n", 191 | " \"D_lc-T-3-all\",\n", 192 | " \"D_lc-S-0-all\",\n", 193 | " \"D_lc-S-1-all\",\n", 194 | " \"D_lc-S-2-all\",\n", 195 | " \"D_lc-S-3-all\",\n", 196 | " \"D_lc-alpha-0-all\",\n", 197 | " \"D_lc-alpha-1-all\",\n", 198 | " \"D_lc-alpha-2-all\",\n", 199 | " \"D_lc-alpha-3-all\",\n", 200 | "]\n", 201 | "\n", 202 | "metalcenter_descriptors = [\n", 203 | " \"mc_CRY-chi-0-all\",\n", 204 | " \"mc_CRY-chi-1-all\",\n", 205 | " \"mc_CRY-chi-2-all\",\n", 206 | " \"mc_CRY-chi-3-all\",\n", 207 | " \"mc_CRY-Z-0-all\",\n", 208 | " \"mc_CRY-Z-1-all\",\n", 209 | " \"mc_CRY-Z-2-all\",\n", 210 | " \"mc_CRY-Z-3-all\",\n", 211 | " \"mc_CRY-I-0-all\",\n", 212 | " \"mc_CRY-I-1-all\",\n", 213 | " \"mc_CRY-I-2-all\",\n", 214 | " \"mc_CRY-I-3-all\",\n", 215 | " \"mc_CRY-T-0-all\",\n", 216 | " \"mc_CRY-T-1-all\",\n", 217 | " \"mc_CRY-T-2-all\",\n", 218 | " \"mc_CRY-T-3-all\",\n", 219 | " \"mc_CRY-S-0-all\",\n", 220 | " \"mc_CRY-S-1-all\",\n", 221 | " \"mc_CRY-S-2-all\",\n", 222 | " \"mc_CRY-S-3-all\",\n", 223 | " \"D_mc_CRY-chi-0-all\",\n", 224 | " 
\"D_mc_CRY-chi-1-all\",\n", 225 | " \"D_mc_CRY-chi-2-all\",\n", 226 | " \"D_mc_CRY-chi-3-all\",\n", 227 | " \"D_mc_CRY-Z-0-all\",\n", 228 | " \"D_mc_CRY-Z-1-all\",\n", 229 | " \"D_mc_CRY-Z-2-all\",\n", 230 | " \"D_mc_CRY-Z-3-all\",\n", 231 | " \"D_mc_CRY-I-0-all\",\n", 232 | " \"D_mc_CRY-I-1-all\",\n", 233 | " \"D_mc_CRY-I-2-all\",\n", 234 | " \"D_mc_CRY-I-3-all\",\n", 235 | " \"D_mc_CRY-T-0-all\",\n", 236 | " \"D_mc_CRY-T-1-all\",\n", 237 | " \"D_mc_CRY-T-2-all\",\n", 238 | " \"D_mc_CRY-T-3-all\",\n", 239 | " \"D_mc_CRY-S-0-all\",\n", 240 | " \"D_mc_CRY-S-1-all\",\n", 241 | " \"D_mc_CRY-S-2-all\",\n", 242 | " \"D_mc_CRY-S-3-all\",\n", 243 | "]\n", 244 | "\n", 245 | "functionalgroup_descriptors = [\n", 246 | " \"func-chi-0-all\",\n", 247 | " \"func-chi-1-all\",\n", 248 | " \"func-chi-2-all\",\n", 249 | " \"func-chi-3-all\",\n", 250 | " \"func-Z-0-all\",\n", 251 | " \"func-Z-1-all\",\n", 252 | " \"func-Z-2-all\",\n", 253 | " \"func-Z-3-all\",\n", 254 | " \"func-I-0-all\",\n", 255 | " \"func-I-1-all\",\n", 256 | " \"func-I-2-all\",\n", 257 | " \"func-I-3-all\",\n", 258 | " \"func-T-0-all\",\n", 259 | " \"func-T-1-all\",\n", 260 | " \"func-T-2-all\",\n", 261 | " \"func-T-3-all\",\n", 262 | " \"func-S-0-all\",\n", 263 | " \"func-S-1-all\",\n", 264 | " \"func-S-2-all\",\n", 265 | " \"func-S-3-all\",\n", 266 | " \"func-alpha-0-all\",\n", 267 | " \"func-alpha-1-all\",\n", 268 | " \"func-alpha-2-all\",\n", 269 | " \"func-alpha-3-all\",\n", 270 | " \"D_func-chi-0-all\",\n", 271 | " \"D_func-chi-1-all\",\n", 272 | " \"D_func-chi-2-all\",\n", 273 | " \"D_func-chi-3-all\",\n", 274 | " \"D_func-Z-0-all\",\n", 275 | " \"D_func-Z-1-all\",\n", 276 | " \"D_func-Z-2-all\",\n", 277 | " \"D_func-Z-3-all\",\n", 278 | " \"D_func-I-0-all\",\n", 279 | " \"D_func-I-1-all\",\n", 280 | " \"D_func-I-2-all\",\n", 281 | " \"D_func-I-3-all\",\n", 282 | " \"D_func-T-0-all\",\n", 283 | " \"D_func-T-1-all\",\n", 284 | " \"D_func-T-2-all\",\n", 285 | " \"D_func-T-3-all\",\n", 286 | " \"D_func-S-0-all\",\n", 287 | " \"D_func-S-1-all\",\n", 288 | " \"D_func-S-2-all\",\n", 289 | " \"D_func-S-3-all\",\n", 290 | " \"D_func-alpha-0-all\",\n", 291 | " \"D_func-alpha-1-all\",\n", 292 | " \"D_func-alpha-2-all\",\n", 293 | " \"D_func-alpha-3-all\",\n", 294 | "]\n", 295 | "\n", 296 | "\n", 297 | "summed_linker_descriptors = [\n", 298 | " \"sum-f-lig-chi-0\",\n", 299 | " \"sum-f-lig-chi-1\",\n", 300 | " \"sum-f-lig-chi-2\",\n", 301 | " \"sum-f-lig-chi-3\",\n", 302 | " \"sum-f-lig-Z-0\",\n", 303 | " \"sum-f-lig-Z-1\",\n", 304 | " \"sum-f-lig-Z-2\",\n", 305 | " \"sum-f-lig-Z-3\",\n", 306 | " \"sum-f-lig-I-0\",\n", 307 | " \"sum-f-lig-I-1\",\n", 308 | " \"sum-f-lig-I-2\",\n", 309 | " \"sum-f-lig-I-3\",\n", 310 | " \"sum-f-lig-T-0\",\n", 311 | " \"sum-f-lig-T-1\",\n", 312 | " \"sum-f-lig-T-2\",\n", 313 | " \"sum-f-lig-T-3\",\n", 314 | " \"sum-f-lig-S-0\",\n", 315 | " \"sum-f-lig-S-1\",\n", 316 | " \"sum-f-lig-S-2\",\n", 317 | " \"sum-f-lig-S-3\",\n", 318 | " \"sum-lc-chi-0-all\",\n", 319 | " \"sum-lc-chi-1-all\",\n", 320 | " \"sum-lc-chi-2-all\",\n", 321 | " \"sum-lc-chi-3-all\",\n", 322 | " \"sum-lc-Z-0-all\",\n", 323 | " \"sum-lc-Z-1-all\",\n", 324 | " \"sum-lc-Z-2-all\",\n", 325 | " \"sum-lc-Z-3-all\",\n", 326 | " \"sum-lc-I-0-all\",\n", 327 | " \"sum-lc-I-1-all\",\n", 328 | " \"sum-lc-I-2-all\",\n", 329 | " \"sum-lc-I-3-all\",\n", 330 | " \"sum-lc-T-0-all\",\n", 331 | " \"sum-lc-T-1-all\",\n", 332 | " \"sum-lc-T-2-all\",\n", 333 | " \"sum-lc-T-3-all\",\n", 334 | " \"sum-lc-S-0-all\",\n", 335 | " \"sum-lc-S-1-all\",\n", 336 | " 
\"sum-lc-S-2-all\",\n", 337 | " \"sum-lc-S-3-all\",\n", 338 | " \"sum-lc-alpha-0-all\",\n", 339 | " \"sum-lc-alpha-1-all\",\n", 340 | " \"sum-lc-alpha-2-all\",\n", 341 | " \"sum-lc-alpha-3-all\",\n", 342 | " \"sum-D_lc-chi-0-all\",\n", 343 | " \"sum-D_lc-chi-1-all\",\n", 344 | " \"sum-D_lc-chi-2-all\",\n", 345 | " \"sum-D_lc-chi-3-all\",\n", 346 | " \"sum-D_lc-Z-0-all\",\n", 347 | " \"sum-D_lc-Z-1-all\",\n", 348 | " \"sum-D_lc-Z-2-all\",\n", 349 | " \"sum-D_lc-Z-3-all\",\n", 350 | " \"sum-D_lc-I-0-all\",\n", 351 | " \"sum-D_lc-I-1-all\",\n", 352 | " \"sum-D_lc-I-2-all\",\n", 353 | " \"sum-D_lc-I-3-all\",\n", 354 | " \"sum-D_lc-T-0-all\",\n", 355 | " \"sum-D_lc-T-1-all\",\n", 356 | " \"sum-D_lc-T-2-all\",\n", 357 | " \"sum-D_lc-T-3-all\",\n", 358 | " \"sum-D_lc-S-0-all\",\n", 359 | " \"sum-D_lc-S-1-all\",\n", 360 | " \"sum-D_lc-S-2-all\",\n", 361 | " \"sum-D_lc-S-3-all\",\n", 362 | " \"sum-D_lc-alpha-0-all\",\n", 363 | " \"sum-D_lc-alpha-1-all\",\n", 364 | " \"sum-D_lc-alpha-2-all\",\n", 365 | " \"sum-D_lc-alpha-3-all\",\n", 366 | "]\n", 367 | "\n", 368 | "summed_metalcenter_descriptors = [\n", 369 | " \"sum-mc_CRY-chi-0-all\",\n", 370 | " \"sum-mc_CRY-chi-1-all\",\n", 371 | " \"sum-mc_CRY-chi-2-all\",\n", 372 | " \"sum-mc_CRY-chi-3-all\",\n", 373 | " \"sum-mc_CRY-Z-0-all\",\n", 374 | " \"sum-mc_CRY-Z-1-all\",\n", 375 | " \"sum-mc_CRY-Z-2-all\",\n", 376 | " \"sum-mc_CRY-Z-3-all\",\n", 377 | " \"sum-mc_CRY-I-0-all\",\n", 378 | " \"sum-mc_CRY-I-1-all\",\n", 379 | " \"sum-mc_CRY-I-2-all\",\n", 380 | " \"sum-mc_CRY-I-3-all\",\n", 381 | " \"sum-mc_CRY-T-0-all\",\n", 382 | " \"sum-mc_CRY-T-1-all\",\n", 383 | " \"sum-mc_CRY-T-2-all\",\n", 384 | " \"sum-mc_CRY-T-3-all\",\n", 385 | " \"sum-mc_CRY-S-0-all\",\n", 386 | " \"sum-mc_CRY-S-1-all\",\n", 387 | " \"sum-mc_CRY-S-2-all\",\n", 388 | " \"sum-mc_CRY-S-3-all\",\n", 389 | " \"sum-D_mc_CRY-chi-0-all\",\n", 390 | " \"sum-D_mc_CRY-chi-1-all\",\n", 391 | " \"sum-D_mc_CRY-chi-2-all\",\n", 392 | " \"sum-D_mc_CRY-chi-3-all\",\n", 393 | " \"sum-D_mc_CRY-Z-0-all\",\n", 394 | " \"sum-D_mc_CRY-Z-1-all\",\n", 395 | " \"sum-D_mc_CRY-Z-2-all\",\n", 396 | " \"sum-D_mc_CRY-Z-3-all\",\n", 397 | " \"sum-D_mc_CRY-I-0-all\",\n", 398 | " \"sum-D_mc_CRY-I-1-all\",\n", 399 | " \"sum-D_mc_CRY-I-2-all\",\n", 400 | " \"sum-D_mc_CRY-I-3-all\",\n", 401 | " \"sum-D_mc_CRY-T-0-all\",\n", 402 | " \"sum-D_mc_CRY-T-1-all\",\n", 403 | " \"sum-D_mc_CRY-T-2-all\",\n", 404 | " \"sum-D_mc_CRY-T-3-all\",\n", 405 | " \"sum-D_mc_CRY-S-0-all\",\n", 406 | " \"sum-D_mc_CRY-S-1-all\",\n", 407 | " \"sum-D_mc_CRY-S-2-all\",\n", 408 | " \"sum-D_mc_CRY-S-3-all\",\n", 409 | "]\n", 410 | "\n", 411 | "summed_functionalgroup_descriptors = [\n", 412 | " \"sum-func-chi-0-all\",\n", 413 | " \"sum-func-chi-1-all\",\n", 414 | " \"sum-func-chi-2-all\",\n", 415 | " \"sum-func-chi-3-all\",\n", 416 | " \"sum-func-Z-0-all\",\n", 417 | " \"sum-func-Z-1-all\",\n", 418 | " \"sum-func-Z-2-all\",\n", 419 | " \"sum-func-Z-3-all\",\n", 420 | " \"sum-func-I-0-all\",\n", 421 | " \"sum-func-I-1-all\",\n", 422 | " \"sum-func-I-2-all\",\n", 423 | " \"sum-func-I-3-all\",\n", 424 | " \"sum-func-T-0-all\",\n", 425 | " \"sum-func-T-1-all\",\n", 426 | " \"sum-func-T-2-all\",\n", 427 | " \"sum-func-T-3-all\",\n", 428 | " \"sum-func-S-0-all\",\n", 429 | " \"sum-func-S-1-all\",\n", 430 | " \"sum-func-S-2-all\",\n", 431 | " \"sum-func-S-3-all\",\n", 432 | " \"sum-func-alpha-0-all\",\n", 433 | " \"sum-func-alpha-1-all\",\n", 434 | " \"sum-func-alpha-2-all\",\n", 435 | " \"sum-func-alpha-3-all\",\n", 436 | " 
\"sum-D_func-chi-0-all\",\n", 437 | " \"sum-D_func-chi-1-all\",\n", 438 | " \"sum-D_func-chi-2-all\",\n", 439 | " \"sum-D_func-chi-3-all\",\n", 440 | " \"sum-D_func-Z-0-all\",\n", 441 | " \"sum-D_func-Z-1-all\",\n", 442 | " \"sum-D_func-Z-2-all\",\n", 443 | " \"sum-D_func-Z-3-all\",\n", 444 | " \"sum-D_func-I-0-all\",\n", 445 | " \"sum-D_func-I-1-all\",\n", 446 | " \"sum-D_func-I-2-all\",\n", 447 | " \"sum-D_func-I-3-all\",\n", 448 | " \"sum-D_func-T-0-all\",\n", 449 | " \"sum-D_func-T-1-all\",\n", 450 | " \"sum-D_func-T-2-all\",\n", 451 | " \"sum-D_func-T-3-all\",\n", 452 | " \"sum-D_func-S-0-all\",\n", 453 | " \"sum-D_func-S-1-all\",\n", 454 | " \"sum-D_func-S-2-all\",\n", 455 | " \"sum-D_func-S-3-all\",\n", 456 | " \"sum-D_func-alpha-0-all\",\n", 457 | " \"sum-D_func-alpha-1-all\",\n", 458 | " \"sum-D_func-alpha-2-all\",\n", 459 | " \"sum-D_func-alpha-3-all\",\n", 460 | "]\n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | " $\\color{DarkBlue}{\\textsf{Short question}}$\n", 468 | "- We declared a global variable to fix the random seed (`RANDOM_SEED`). Why did we do this? " 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "## Hands-on Project: Carbon-dioxide uptake in MOFs" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "In this exercise we will build a model that can predict the CO$_2$ uptake of metal-organic frameworks (MOFs), which are crystalline materials consisting of inorganic metal nodes linked by organic linkers.\n", 483 | "\n", 484 | "![MOF building principle](assets/mof_building_principle.png)\n", 485 | "\n", 486 | "There are two main **learning goals** for this exercise: \n", 487 | "\n", 488 | "1. Understand the typical workflow for machine learning in materials science. We will cover exploratory data analysis (EDA) and supervised learning (KRR).\n", 489 | "\n", 490 | "2. Get familiar with some Python packages that are useful for data analysis and visualization. \n", 491 | "\n", 492 | "At the end of the exercise, you will produce an interactive plot like the one below, comparing the predictions of your model against CO$_2$ computed with GCMC simulations.\n", 493 | "The histograms show the distributions of the errors on the training set (left) and on the test set (right).\n", 494 | "\n", 495 | "\n", 496 | "\n", 497 | "\"Parity\n", 498 | "\n", 499 | "This exercise requires a basic knowledge of Python, e.g. that you can write list comprehensions, and are able to read documentation of functions provided by Python packages.\n", 500 | "You will be asked to provide some function arguments (indicated by `#fillme` comments).\n", 501 | "\n", 502 | "You can execute all the following code cells by pressing SHIFT and ENTER and get informations about the functions by pressing TAB when you are between the parentheses (see the notes for more tips). \n", 503 | "\n", 504 | "Also the [sklearn documentation](https://scikit-learn.org/stable/user_guide.html) is a great source of reference with many explanations and examples.\n", 505 | "\n", 506 | "In pandas dataframe (df) you can select columns using their name by running `df[columnname]`. If at any point you think that the dataset is too large for your computer, you can select a subset using `df.sample()` or by making the test set larger in the train/test split (section 2). " 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "## 1. 
Import the data" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 3, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "df = pd.read_csv(DATA_FILE)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": {}, 528 | "source": [ 529 | "Let's take a look at the first few rows to see if everything seems reasonable ..." 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 4, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "data": { 539 | "text/html": [ 540 | "
\n", 541 | "\n", 554 | "\n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | "
ASA [m^2/cm^3]CellV [A^3]DfDiDifNASA [m^2/cm^3]POAV [cm^3/g]POAVFPONAV [cm^3/g]PONAVF...pure_methane_widomHOApure_uptake_CO2_298.00_15000pure_uptake_CO2_298.00_1600000pure_uptake_methane_298.00_580000pure_uptake_methane_298.00_6500000logKH_CO2logKH_CH4CH4DCCH4HPSTPCH4LPSTP
02329.011251.286.612568.876948.486680.00.8189190.688740.00.0...-8.1443170.11198114.2185951.6806409.163066-5.125451-5.511444175.569974215.00504439.435070
11983.811254.015.805667.134267.131540.00.4954930.580320.00.0...-10.2080050.4816259.3124241.5131525.908356-4.502967-5.505947143.616349193.05964449.443295
22259.131250.585.991318.016827.989330.00.7280360.657100.00.0...-8.4798010.40168314.7960711.5697147.933198-4.433968-5.525707160.238808199.76574439.526937
31424.541249.274.734777.058227.058220.00.4531570.473380.00.0...-12.6153820.82174710.8168802.1618336.710778-4.135434-5.297082132.576623195.58210763.005483
42228.311250.616.407838.359448.269460.00.7005390.650920.00.0...-8.7434040.25890514.1539991.6530138.272621-4.774301-5.515219171.601539214.45296642.851427
\n", 704 | "

5 rows × 343 columns

\n", 705 | "
" 706 | ], 707 | "text/plain": [ 708 | " ASA [m^2/cm^3] CellV [A^3] Df Di Dif NASA [m^2/cm^3] \\\n", 709 | "0 2329.01 1251.28 6.61256 8.87694 8.48668 0.0 \n", 710 | "1 1983.81 1254.01 5.80566 7.13426 7.13154 0.0 \n", 711 | "2 2259.13 1250.58 5.99131 8.01682 7.98933 0.0 \n", 712 | "3 1424.54 1249.27 4.73477 7.05822 7.05822 0.0 \n", 713 | "4 2228.31 1250.61 6.40783 8.35944 8.26946 0.0 \n", 714 | "\n", 715 | " POAV [cm^3/g] POAVF PONAV [cm^3/g] PONAVF ... pure_methane_widomHOA \\\n", 716 | "0 0.818919 0.68874 0.0 0.0 ... -8.144317 \n", 717 | "1 0.495493 0.58032 0.0 0.0 ... -10.208005 \n", 718 | "2 0.728036 0.65710 0.0 0.0 ... -8.479801 \n", 719 | "3 0.453157 0.47338 0.0 0.0 ... -12.615382 \n", 720 | "4 0.700539 0.65092 0.0 0.0 ... -8.743404 \n", 721 | "\n", 722 | " pure_uptake_CO2_298.00_15000 pure_uptake_CO2_298.00_1600000 \\\n", 723 | "0 0.111981 14.218595 \n", 724 | "1 0.481625 9.312424 \n", 725 | "2 0.401683 14.796071 \n", 726 | "3 0.821747 10.816880 \n", 727 | "4 0.258905 14.153999 \n", 728 | "\n", 729 | " pure_uptake_methane_298.00_580000 pure_uptake_methane_298.00_6500000 \\\n", 730 | "0 1.680640 9.163066 \n", 731 | "1 1.513152 5.908356 \n", 732 | "2 1.569714 7.933198 \n", 733 | "3 2.161833 6.710778 \n", 734 | "4 1.653013 8.272621 \n", 735 | "\n", 736 | " logKH_CO2 logKH_CH4 CH4DC CH4HPSTP CH4LPSTP \n", 737 | "0 -5.125451 -5.511444 175.569974 215.005044 39.435070 \n", 738 | "1 -4.502967 -5.505947 143.616349 193.059644 49.443295 \n", 739 | "2 -4.433968 -5.525707 160.238808 199.765744 39.526937 \n", 740 | "3 -4.135434 -5.297082 132.576623 195.582107 63.005483 \n", 741 | "4 -4.774301 -5.515219 171.601539 214.452966 42.851427 \n", 742 | "\n", 743 | "[5 rows x 343 columns]" 744 | ] 745 | }, 746 | "execution_count": 4, 747 | "metadata": {}, 748 | "output_type": "execute_result" 749 | } 750 | ], 751 | "source": [ 752 | "df.head()" 753 | ] 754 | }, 755 | { 756 | "cell_type": "markdown", 757 | "metadata": {}, 758 | "source": [ 759 | "
\n", 760 | " Click here for a hint\n", 761 | "\n", 764 | "
" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "Let's also get some basic information ..." 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 10, 777 | "metadata": {}, 778 | "outputs": [ 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "\n", 784 | "RangeIndex: 17379 entries, 0 to 17378\n", 785 | "Columns: 343 entries, ASA [m^2/cm^3] to CH4LPSTP\n", 786 | "dtypes: float64(342), object(1)\n", 787 | "memory usage: 45.5+ MB\n" 788 | ] 789 | } 790 | ], 791 | "source": [ 792 | "df.info()" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | " $\\color{DarkBlue}{\\textsf{Short question}}$\n", 800 | "- How many materials are in the dataset? \n", 801 | "- Which datatypes do we deal with?" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": {}, 807 | "source": [ 808 | "Below, we define three global variables (hence upper case), which are the *names* of our feature and target columns. We will use the `TARGET` for the actual regression and the `TARGET_BINARY` only for the stratified train/test split. The `FEATURES` variable is a list of column names of our dataframe." 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": 5, 814 | "metadata": {}, 815 | "outputs": [], 816 | "source": [ 817 | "TARGET = \"pure_uptake_CO2_298.00_1600000\"\n", 818 | "TARGET_BINARY = \"target_binned\" # will be created later\n", 819 | "FEATURES = (\n", 820 | " geometric_descriptors\n", 821 | " + summed_functionalgroup_descriptors\n", 822 | " + summed_linker_descriptors\n", 823 | " + summed_metalcenter_descriptors\n", 824 | ")\n" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "As descriptors we will use geometric properties such as density, pore volume, etc. and [revised autocorrelation functions](https://pubs.acs.org/doi/abs/10.1021/acs.jpca.7b08750) (RACs) that have been optimized for describing inorganic compounds ([and recently adapated for MOFs](https://www.nature.com/articles/s41467-020-17755-8))" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "Examples for pore geometry descriptors (in `geometric_descriptors`) include: $D_i$ (the size of the largest included sphere), $D_f$ (the largest free sphere), and $D_{if}$ (the largest included free sphere) along the pore $-$ three ways of characterizing pore size. \n", 839 | "\n", 840 | "![pore diameters](assets/spheres.png)\n", 841 | "\n", 842 | "Also included are the surface area (SA) of the pore, and the probe-occupiable pore volume (POV).\n", 843 | "More details on the description of pore geometries can be found in [Ongari et al.](https://pubs.acs.org/doi/abs/10.1021/acs.langmuir.7b01682)" 844 | ] 845 | }, 846 | { 847 | "cell_type": "markdown", 848 | "metadata": {}, 849 | "source": [ 850 | "RACs (in the lists starting with `summed_...`) operate on the structure graph and encode information about the metal center, linkers and the functional groups as differences or products of heuristics that are relevant for inorganic chemistry, such as electronegativity ($\\chi$), connectivity ($T$), identity ($I$), covalent radii ($S$), and nuclear charge ($Z$)." 
851 | ] 852 | }, 853 | { 854 | "cell_type": "markdown", 855 | "metadata": {}, 856 | "source": [ 857 | "\n", 858 | "[image: RACs]" 859 | ] 860 | }, 861 | { 862 | "cell_type": "markdown", 863 | "metadata": {}, 864 | "source": [ 865 | "The number in the descriptor names indicates the coordination shell that was considered in the calculation of the RACs." 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "metadata": {}, 871 | "source": [ 872 | "The target we use for this application is the high-pressure CO$_2$ uptake. This is the amount of CO$_2$ (mmol) the MOF can load per gram." 873 | ] 874 | }, 875 | { 876 | "cell_type": "markdown", 877 | "metadata": {}, 878 | "source": [ 879 | "## 2. Split the data" 880 | ] 881 | }, 882 | { 883 | "cell_type": "markdown", 884 | "metadata": {}, 885 | "source": [ 886 | "Next, we split our data into a training set and a test set.\n", 887 | "\n", 888 | "In order to prevent *any* information of the test set from leaking into our model, we split *before* starting to analyze or transform our data. For more details on why this matters, see [chapter 7.10.2 of Elements of Statistical Learning](https://web.stanford.edu/~hastie/ElemStatLearn//printings/ESLII_print10.pdf)." 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "### 2.1. Split with stratification" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "[Stratification](https://en.wikipedia.org/wiki/Stratified_sampling) ensures that the class distributions (ratio of \"good\" to \"bad\" materials) are the same in the training and test set.\n", 903 | "\n", 904 | " $\color{DarkBlue}{\textsf{Short question}}$\n", 905 | "\n", 906 | "- Why is this important? What could happen if we did not do this? \n", 907 | " \n", 908 | "\n", 909 | "For stratification to work, we need to define what makes a \"good\" or a \"bad\" material. We will use 15 mmol CO$_2$ / g as the threshold for the uptake, thus binarizing our continuous target variable. (You could also choose the threshold based on a histogram of the target variable.)" 910 | ] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "metadata": {}, 915 | "source": [ 916 | " $\color{DarkBlue}{\textsf{Short Exercise}}$\n", 917 | " - add a column `target_binned` (the name stored in `TARGET_BINARY`) that encodes whether a material is low performing (`0`) or high performing (`1`) by comparing the uptake with the `THRESHOLD`" 918 | ] 919 | }, 920 | { 921 | "cell_type": "markdown", 922 | "metadata": {}, 923 | "source": [ 924 | "
\n", 925 | " Click here for a hint\n", 926 | "\n", 931 | "
" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": null, 937 | "metadata": {}, 938 | "outputs": [], 939 | "source": [ 940 | "THRESHOLD = 15 # in units of mmol CO2/g\n", 941 | "df[TARGET_BINARY] = # add your code" 942 | ] 943 | }, 944 | { 945 | "cell_type": "markdown", 946 | "metadata": {}, 947 | "source": [ 948 | "Now, we can perform the actual split into training and test set." 949 | ] 950 | }, 951 | { 952 | "cell_type": "markdown", 953 | "metadata": {}, 954 | "source": [ 955 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 956 | "- select reasonable values for `XX` and `XY` and then perform the test/train splits. What do you consider when making this decision (think about what you would do with really small and really big datasets, what happens if you have only one test point, what happens to the model performance if you have more test points than training points)? \n", 957 | "- why do we need to perform the split into a training and test set? \n", 958 | "- would we use the test set to tune the hyperparameters of our model?\n", 959 | "\n", 960 | "
\n", 961 | " Click here for a hint\n", 962 | "\n", 967 | "
" 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": null, 973 | "metadata": {}, 974 | "outputs": [], 975 | "source": [ 976 | "df_train_stratified, df_test_stratified = train_test_split(\n", 977 | " df,\n", 978 | " train_size=XX,\n", 979 | " test_size=XY,\n", 980 | " random_state=RANDOM_SEED,\n", 981 | " stratify=df[TARGET_BINARY],\n", 982 | ")\n" 983 | ] 984 | }, 985 | { 986 | "cell_type": "markdown", 987 | "metadata": {}, 988 | "source": [ 989 | "## 3. Exploratory data analysis (EDA) " 990 | ] 991 | }, 992 | { 993 | "cell_type": "markdown", 994 | "metadata": {}, 995 | "source": [ 996 | "After we have put the test set aside, we can give the training set a closer look." 997 | ] 998 | }, 999 | { 1000 | "cell_type": "markdown", 1001 | "metadata": {}, 1002 | "source": [ 1003 | "### 3.1. Correlations" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": {}, 1009 | "source": [ 1010 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1011 | "- Plot some features against the target property and calculate the Pearson and Spearman correlation coefficient (what is the different between those correlation coefficients?) \n", 1012 | "- What are the strongest correlations? Did you expect them? \n", 1013 | "- What can be a problem when features are correlated?\n", 1014 | "- *Optional:* Do they change if you switch from CO$_2$ to CH$_4$ uptake as the target instead? Explain your observation." 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "markdown", 1019 | "metadata": {}, 1020 | "source": [ 1021 | "To get the correlation matrices, you can use the `df.corr(method=)`method on your dataframe (`df`). You might want to calculate not the full correlation matrix but just the correlation of the features with the targets" 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "markdown", 1026 | "metadata": {}, 1027 | "source": [ 1028 | "
\n", 1029 | " Click here for a hint\n", 1030 | "\n", 1035 | "
" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": null, 1041 | "metadata": {}, 1042 | "outputs": [], 1043 | "source": [ 1044 | "# add code here" 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "markdown", 1049 | "metadata": {}, 1050 | "source": [ 1051 | "## 4. Baselines" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "markdown", 1056 | "metadata": {}, 1057 | "source": [ 1058 | "For machine learning, it is important to have some *baselines* to which one then compares the results of a model. Think of a classification model for some rare disease where we only have 1% postives. A classification model that only predictes the negatives *all the time* will still have a amazingly high accuracy. To be able to understand if our model is really better than such a simple prediction we need to make the simple prediction first. This is what we call a baseline.\n", 1059 | "\n", 1060 | "A baseline could be a really simple model, a basic heuristic or the current state of the art.\n", 1061 | "this. We will use a heuristic.\n", 1062 | "\n", 1063 | "For this we use sklearn `Dummy` objects that simply calculate the mean, the median or the most frequent case of the training set, when you run the `fit()` method on them (which takes the features matrix $\\mathbf{X}$ and the labels $\\mathbf{y}$ as arguments.\n", 1064 | "This is, the prediction of a `DummyRegressor` with `mean` strategy will always be the mean, independent of the input (it will not look at the feature matrix!). \n", 1065 | "\n", 1066 | "Instead of using those `sklearn` objects you could also just manually compute the the mean or median of the dataset. But we will use those objects as we can learn in this way how to use estimators in `sklearn` and it is also allows you to test your full pipeline with different (baseline) models. \n", 1067 | "What does this mean? In practice this means that you can use all the regression and classification models shown in the figure below in the same way, they will all have a `fit()` method that accepts `X` and `y` and a predict method that accepts `X` and returns the predictions. \n", 1068 | "\n", 1069 | "\n", 1070 | "\"ML\n", 1071 | "\n", 1072 | "The estimator objects can be always used in the same way \n", 1073 | "\n", 1074 | "\"ML\n", 1075 | "\n", 1076 | "Using these objects, instead of the mean directly, allows you to easily swap them with other models in pipelines, where one chains many data transformation steps (see section 6)." 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "markdown", 1081 | "metadata": {}, 1082 | "source": [ 1083 | "### 4.1. Build dummy models" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "markdown", 1088 | "metadata": {}, 1089 | "source": [ 1090 | "$\\color{DarkBlue}{\\textsf{Short Question}}$\n", 1091 | "- If you call `.fit(X, y)` on a `DummyRegressor` does it actually use the `X`? If not, why is there still the place for the `X` in the function? If yes, how does it use it?\n", 1092 | "\n", 1093 | "$\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1094 | "- Create [`DummyRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html) instances for `mean`, `median`. (e.g. `dummyinstance = DummyRegressor(strategy='mean')`)\n", 1095 | "- Train them on the training data (`dummyinstance.fit(df_train[FEATURES], df_train[TARGET])`)" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "markdown", 1100 | "metadata": {}, 1101 | "source": [ 1102 | "
\n", 1103 | " Click here for hints\n", 1104 | "\n", 1108 | "
" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": null, 1114 | "metadata": {}, 1115 | "outputs": [], 1116 | "source": [ 1117 | "# Build DummyRegressors\n", 1118 | "dummyregressor_mean = DummyRegressor(strategy='mean')\n", 1119 | "dummyregressor_median = DummyRegressor(#fillme)" 1120 | ] 1121 | }, 1122 | { 1123 | "cell_type": "code", 1124 | "execution_count": null, 1125 | "metadata": {}, 1126 | "outputs": [], 1127 | "source": [ 1128 | "# Fit Dummy Regressors\n", 1129 | "dummyregressor_mean.fit(df_train_stratified[FEATURES], df_train_stratified[TARGET])\n", 1130 | "dummyregressor_median.fit(#fillme)" 1131 | ] 1132 | }, 1133 | { 1134 | "cell_type": "markdown", 1135 | "metadata": {}, 1136 | "source": [ 1137 | "#### Evaluate the performance of the dummy models" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1145 | "- Calculate maximum error, mean absolute error and mean square error for the dummy regressors on training and test set. What would you expect those numbers to be?\n", 1146 | "- Do the actual values surprise you? \n", 1147 | "- What does this mean in practice for reporting of metrics/the reasoning behind using baseline models\n", 1148 | "\n", 1149 | "It can be handy to store our metrics of choice in a nested dictionary ([Python dictionaries are key-value pairs](https://www.tutorialspoint.com/python/python_dictionary.htm)): \n", 1150 | "\n", 1151 | "```python\n", 1152 | "{\n", 1153 | " 'dummyestimator1': {\n", 1154 | " 'metric_a_key': metric_a_value, \n", 1155 | " 'metric_b_key': metric_b_value\n", 1156 | " },\n", 1157 | " 'dummyestimator2': {\n", 1158 | " 'metric_a_key': metric_a_value, \n", 1159 | " 'metric_b_key': metric_b_value\n", 1160 | " },\n", 1161 | " }\n", 1162 | "``` \n", 1163 | "\n", 1164 | "You will now write functions `get_regression_metrics(model, X, y_true)` that compute the metrics and return this dictionary for a given model. The `predict` method takes the feature matrix $\\mathbf{X}$ as input.\n", 1165 | "\n", 1166 | "In them, we calculate \n", 1167 | "\n", 1168 | "$\\mathrm {MAE} =\\frac{\\sum _{i=1}^{n}\\left|Y_{i}-\\hat{y}_{i}\\right|}{n}.$\n", 1169 | "\n", 1170 | "and \n", 1171 | "\n", 1172 | "$\\mathrm {MSE} = {\\frac {1}{n}}\\sum _{i=1}^{n}(Y_{i}-{\\hat {Y_{i}}})^{2}.$ \n", 1173 | "\n", 1174 | "where $\\hat{y}$ are the predictions and, $Y_{i}$ the true values.\n", 1175 | "\n", 1176 | "as well as the maximum error." 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "markdown", 1181 | "metadata": {}, 1182 | "source": [ 1183 | "
\n", 1184 | " Click here for hints\n", 1185 | "\n", 1189 | "
" 1190 | ] 1191 | }, 1192 | { 1193 | "cell_type": "code", 1194 | "execution_count": null, 1195 | "metadata": {}, 1196 | "outputs": [], 1197 | "source": [ 1198 | "def get_regression_metrics(model, X, y_true):\n", 1199 | " \"\"\"\n", 1200 | " Get a dicionary with regression metrics:\n", 1201 | "\n", 1202 | " model: sklearn model with predict method\n", 1203 | " X: feature matrix\n", 1204 | " y_true: ground truth labels\n", 1205 | " \"\"\"\n", 1206 | " y_predicted = model.predict(#fillme)\n", 1207 | "\n", 1208 | " mae = mean_absolute_error(#fillme)\n", 1209 | " mse = mean_squared_error(#fillme)\n", 1210 | " maximum_error = max_error(#fillme)\n", 1211 | "\n", 1212 | " metrics_dict = {\n", 1213 | " 'mae': mae,\n", 1214 | " 'mse': mse,\n", 1215 | " 'max_error': maximum_error\n", 1216 | " }\n", 1217 | "\n", 1218 | " return metrics_dict" 1219 | ] 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": null, 1224 | "metadata": {}, 1225 | "outputs": [], 1226 | "source": [ 1227 | "dummy_regressors = [\n", 1228 | " ('mean', dummyregressor_mean),\n", 1229 | " ('median', dummyregressor_median)\n", 1230 | "]" 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "execution_count": null, 1236 | "metadata": {}, 1237 | "outputs": [], 1238 | "source": [ 1239 | "dummy_regressor_results_test = {} # initialize empty dictionary\n", 1240 | "dummy_regressor_results_train = {}\n", 1241 | "\n", 1242 | "# loop over the dummy_regressor list\n", 1243 | "# if you have a tuple regressorname, regressor = (a, b) that is automatically expanded into the variables\n", 1244 | "# a = regressorname, b = regressor\n", 1245 | "for regressorname, regressor in dummy_regressors:\n", 1246 | " print(f\"Calculating metrics for {regressorname}\")\n", 1247 | " dummy_regressor_results_test[regressorname] = get_regression_metrics(#fillme)\n", 1248 | " dummy_regressor_results_train[regressorname] = get_regression_metrics(#fillme)" 1249 | ] 1250 | }, 1251 | { 1252 | "cell_type": "markdown", 1253 | "metadata": {}, 1254 | "source": [ 1255 | "## 5. Build actual regression models" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "markdown", 1260 | "metadata": {}, 1261 | "source": [ 1262 | "Let's build a simple [kernel ridge regression (KRR)](https://emtiyaz.github.io/pcml15/kernel-ridge-regression.pdf) machine learning model and train it with our raw data.\n", 1263 | "You can try different kernels, but we recommend to start with the Gaussian radial basis function ('rbf') kernel.\n", 1264 | " \n", 1265 | " $\\color{DarkBlue}{\\textsf{Short Question}}$\n", 1266 | "- Do you expect this model to perform better than the dummy models?\n", 1267 | "- Train it and then calculate the performance metrics on the training and test set. How do they compare to the performance of the dummy models?\n", 1268 | "- What is the shape of the Kernel and of the weights? (you can check your answer by looking at the `dual_coef_` attribute of the KRR instance. 
You can get the shapes of objects using the `shape` attribute.) " 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": null, 1274 | "metadata": {}, 1275 | "outputs": [], 1276 | "source": [ 1277 | "# Train the model with a Gaussian kernel\n", 1278 | "krr = KernelRidge(kernel='rbf')\n", 1279 | "krr.fit(#fillme)" 1280 | ] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": null, 1285 | "metadata": {}, 1286 | "outputs": [], 1287 | "source": [ 1288 | "# get the metrics on the train and the test set using the get_regression_metrics function (as above)" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "markdown", 1293 | "metadata": {}, 1294 | "source": [ 1295 | "## 6. Evaluate the model performance in detail" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "markdown", 1300 | "metadata": {}, 1301 | "source": [ 1302 | "We have trained our first machine learning model!\n", 1303 | "We'll first have a closer look at its performance, before learning how to improve it." 1304 | ] 1305 | }, 1306 | { 1307 | "cell_type": "markdown", 1308 | "metadata": {}, 1309 | "source": [ 1310 | " $\color{DarkBlue}{\textsf{Short Exercise}}$\n", 1311 | "- Create a parity plot (true values against predictions) for the training and test data\n", 1312 | "- Plot a histogram of the distribution of the training and test errors on the training and test set. Plot the errors also as a function of the true value.\n", 1313 | "- Let's assume we would like to use our model for pre-screening a library of millions of porous materials to zoom-in on those with the most promising gas uptake. Could you tolerate the errors of your model?\n", 1314 | "- Compare the parity plots for this model with the ones for the dummy models. \n", 1315 | "Use the plotting functions below to evaluate all the following models you train.\n", 1316 | "\n", 1317 | "For this exercise, it can be handy to save the results in a dictionary, e.g. \n", 1318 | "```python\n", 1319 | "res_train = {\n", 1320 | " 'y true': [],\n", 1321 | " 'y pred': []\n", 1322 | "}\n", 1323 | "```" 1324 | ] 1325 | }, 1326 | { 1327 | "cell_type": "markdown", 1328 | "metadata": {}, 1329 | "source": [ 1330 | "
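For the kernel/weight-shape question in section 5, a sketch of what to inspect after fitting (assuming `krr` was fit on `df_train_stratified[FEATURES]`): with $n$ training points, the RBF kernel matrix evaluated internally is $n \times n$, and there is one dual coefficient per training point.

```python
# After krr.fit(...) you can verify the shapes:
n_train = len(df_train_stratified)
print(n_train)
print(krr.dual_coef_.shape)  # one weight per training sample, e.g. (n_train,)
# the kernel matrix K(X_train, X_train) used in the fit has shape (n_train, n_train)
```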
\n", 1331 | " Click here for hints for plotting\n", 1332 | "\n", 1336 | "
" 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": null, 1342 | "metadata": {}, 1343 | "outputs": [], 1344 | "source": [ 1345 | "# Create dictionaries with training and test results to create parity plots\n", 1346 | "res_train = {\n", 1347 | " 'y true': # fillme using the dataframe,\n", 1348 | " 'y pred': # fillme using the model prediction\n", 1349 | "}\n", 1350 | "\n", 1351 | "res_test = {\n", 1352 | " 'y true': # fillme using the dataframe\n", 1353 | " 'y pred': # fillme using the model prediction\n", 1354 | "}" 1355 | ] 1356 | }, 1357 | { 1358 | "cell_type": "markdown", 1359 | "metadata": {}, 1360 | "source": [ 1361 | "Now, lets calculate the errors" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "code", 1366 | "execution_count": null, 1367 | "metadata": {}, 1368 | "outputs": [], 1369 | "source": [ 1370 | "res_train[\"error\"] = res_train[\"y true\"] - res_train[\"y pred\"]\n", 1371 | "res_test[\"error\"] = res_test[\"y true\"] - res_test[\"y pred\"]\n" 1372 | ] 1373 | }, 1374 | { 1375 | "cell_type": "markdown", 1376 | "metadata": {}, 1377 | "source": [ 1378 | "Now, plot the parity plots and error distributions" 1379 | ] 1380 | }, 1381 | { 1382 | "cell_type": "markdown", 1383 | "metadata": {}, 1384 | "source": [ 1385 | "
\n", 1386 | " Click here for hints for plotting\n", 1387 | "If you want interactive plots, you can use the following code:\n", 1388 | "
\n",
1389 |     "hv.extension(\"bokeh\")\n",
1390 |     "hex_train = hv.HexTiles(res_train, [\"y true\", \"y pred\"]).hist(\n",
1391 |     "    dimension=[\"y true\", \"y pred\"]\n",
1392 |     ")\n",
1393 |     "hex_test = hv.HexTiles(res_test, [\"y true\", \"y pred\"]).hist(\n",
1394 |     "    dimension=[\"y true\", \"y pred\"]\n",
1395 |     ")\n",
1396 |     "hex_train + hex_test\n",
1397 |     "\n",
1398 |     "
" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": null, 1404 | "metadata": {}, 1405 | "outputs": [], 1406 | "source": [ 1407 | "# plot it\n", 1408 | "hist_density(res_train['y true'], res_train['y pred'], xlabel='y true', ylabel='y pred', title='Train')" 1409 | ] 1410 | }, 1411 | { 1412 | "cell_type": "code", 1413 | "execution_count": null, 1414 | "metadata": {}, 1415 | "outputs": [], 1416 | "source": [ 1417 | "hist_density(res_test['y true'], res_test['y pred'], xlabel='y true', ylabel='y pred', title='Test')" 1418 | ] 1419 | }, 1420 | { 1421 | "cell_type": "markdown", 1422 | "metadata": {}, 1423 | "source": [ 1424 | "## 7. Improve the model " 1425 | ] 1426 | }, 1427 | { 1428 | "cell_type": "markdown", 1429 | "metadata": {}, 1430 | "source": [ 1431 | "Our training set still has a couple of issues you might have noticed:\n", 1432 | "- The feature values are not scaled (different features are measured in different units ...)\n", 1433 | "- Some features are basically constant, i.e. do not contain relevant information and just increase the dimensionality of the problem \n", 1434 | "- Some feature distributions are skewed (which is more relevant for some models than for others ...)" 1435 | ] 1436 | }, 1437 | { 1438 | "cell_type": "markdown", 1439 | "metadata": {}, 1440 | "source": [ 1441 | "$\\color{DarkBlue}{\\textsf{Short Question}}$\n", 1442 | "- Why might the scaling of the features be relevant for a machine learning model? " 1443 | ] 1444 | }, 1445 | { 1446 | "cell_type": "markdown", 1447 | "metadata": {}, 1448 | "source": [ 1449 | "### 7.1. Standard scaling and building a first pipeline " 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "markdown", 1454 | "metadata": {}, 1455 | "source": [ 1456 | "Given that we will now go beyond training a single model, we will build [Pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html), which are objects that can collect a selection of transformations and estimators. This makes it quite easy to apply the same set of operations to different datasets. A simple pipeline might be built as follows \n", 1457 | "\n", 1458 | "\"Pipeline\"\n", 1459 | "\n" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "markdown", 1464 | "metadata": {}, 1465 | "source": [ 1466 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1467 | "- Build a pipline that first performs standard scaling and then fits a KRR. Call it `pipe_w_scaling`. \n", 1468 | "- Fit it on the training set \n", 1469 | "- Make predictions, calculate the errors and make the parity plots" 1470 | ] 1471 | }, 1472 | { 1473 | "cell_type": "markdown", 1474 | "metadata": {}, 1475 | "source": [ 1476 | "
\n", 1477 | " Click here for hints\n", 1478 | "\n", 1481 | "
" 1482 | ] 1483 | }, 1484 | { 1485 | "cell_type": "code", 1486 | "execution_count": null, 1487 | "metadata": {}, 1488 | "outputs": [], 1489 | "source": [ 1490 | "pipe_w_scaling = Pipeline(\n", 1491 | " [\n", 1492 | " ('scaling', StandardScaler()),\n", 1493 | " ('krr', #fillme)\n", 1494 | " ]\n", 1495 | ")" 1496 | ] 1497 | }, 1498 | { 1499 | "cell_type": "markdown", 1500 | "metadata": {}, 1501 | "source": [ 1502 | "### 7.2. Hyperparameter optimization" 1503 | ] 1504 | }, 1505 | { 1506 | "cell_type": "markdown", 1507 | "metadata": {}, 1508 | "source": [ 1509 | "A key component we did not optimize so far are hyperparameters. Those are parameters of the model that we usually cannot learn from the data but have to fix before we train the model. \n", 1510 | "Since we cannot learn those parameters it is not trivial to select them. Hence, what we typically do in practice is to create another set, a \"validation set\", and use it to test models trained with different hyperparameters.\n", 1511 | "\n", 1512 | "The most common approach to hyperparameter optimization is to define a grid of all relevant parameters and to search over the grid for the best model performance." 1513 | ] 1514 | }, 1515 | { 1516 | "cell_type": "markdown", 1517 | "metadata": {}, 1518 | "source": [ 1519 | "$\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1520 | "- Think about which parameters you could optimize in the pipeline. Note that your KRR model has two parameters you can optimize. You can also switch off some steps by setting them to `None'.\n", 1521 | "- For each parameter you need to define a resonable grid to search over.\n", 1522 | "- Recall, what k-fold cross-validation does. Run the hyperparameter optimization using 5-fold cross-validation (you can adjust the number of folds according to your computational resources/impatience. It turns out at k=10 is the [best tradeoff between variance and bias](https://arxiv.org/abs/1811.12808)). \n", 1523 | "Tune the hyperparameters until you are statisfied (e.g., until you cannot improve the cross validated error any more)\n", 1524 | "- Why don't we use the test set for hyperparameter tuning but instead test on the validation set? \n", 1525 | "- Evaluate the model performance by calculating the performance metrics (MAE, MSE, max error) on the training and the test set.\n", 1526 | "- *Optional:* Instead of grid search, try to use random search on the same grid (`RandomizedSearchCV`) and fix the number of evaluations (`n_iter`) to a fraction of the number of evaluations of grid search. What do you observe and conclude?" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "markdown", 1531 | "metadata": {}, 1532 | "source": [ 1533 | " $\\color{DarkRed}{\\textsf{Tips}}$\n", 1534 | "- If you want to see what is happening, set the `verbosity` argument of the `GridSearchCV` object to a higher number.\n", 1535 | " \n", 1536 | "- If you want to speed up the optimization, you can run it in parallel by setting the `n_jobs` argument to the number of workers. If you set it to -1 it will use all available cores. *Using all cores might freeze your computer if you do not have enough memory*\n", 1537 | " \n", 1538 | "- If the optimization is too slow, reduce the number of data points in your set, the number of folds or the grid size. 
Note that it can also be a feasible strategy to first use a coarser grid and then a finer grid for fine-tuning.\n", 1539 | "\n", 1540 | "- For grid search, you need to define a parameter grid, which is a dictionary of the following form: \n", 1541 | "```python\n", 1542 | "param_grid = {\n", 1543 | " 'pipelinestage__parameter': np.logspace(-4,1,10),\n", 1544 | " 'pipelinestage': [None, TransformerA(), TransformerB()]\n", 1545 | " }\n", 1546 | "```\n", 1547 | "\n", 1548 | "- After the search, you can access the best model with `.best_estimator_` and the best parameters with `.best_params_` on the GridSearchCV instance. For example `grid_krr.best_estimator_`.\n", 1549 | "\n", 1550 | "- If you initialize the GridSearchCV instance with `refit=True` it will automatically train the model with all training data (and not only the training folds from cross-validation).\n", 1551 | "\n", 1552 | "The double underscore (dunder) notation works recursively and specifies the parameters for any pipeline stage. \n", 1553 | "For example, `ovasvm__estimator__cls__C` would specify the `C` parameter of the estimator in the one-versus-rest classifier `ovasvm`. \n", 1554 | "\n", 1555 | "You can print all parameters of the pipeline using `print(sorted(pipeline.get_params().keys()))`" 1556 | ] 1557 | }, 1558 | { 1559 | "cell_type": "markdown", 1560 | "metadata": {}, 1561 | "source": [ 1562 | "
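For instance, a grid for the scaling + KRR pipeline above could look like this (a sketch — the exact ranges are up to you, though they follow the hint further down):

```python
import numpy as np

# Inspect which parameter names the pipeline exposes
print(sorted(pipe_w_scaling.get_params().keys()))

# A possible (coarse) grid on a logarithmic scale
example_grid = {
    "krr__alpha": np.logspace(-3, 0, 4),  # regularization strength
    "krr__gamma": np.logspace(-3, 3, 7),  # width of the RBF kernel
}
```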
\n", 1563 | "Be aware that tight grids will drastically increase the number of experiments you will run! In some cases, it can be useful to perform the optimization in steps, i.e., first use a coarse grid and then refine in interesting regions. \n", 1564 | "Alternatively, there are approached like hyperband that dynamically adjust the number of data points.\n", 1565 | "
" 1566 | ] 1567 | }, 1568 | { 1569 | "cell_type": "markdown", 1570 | "metadata": {}, 1571 | "source": [ 1572 | "
\n", 1573 | " Click here for hints about pipelines and grid search\n", 1574 | "
    \n", 1575 | "
  • You can use the np.logspace function to generate a grid for values that you want to vary on a logarithmic scale
  • \n", 1576 | "
  • There are two hyperparameters for KRR: the regularization strength alpha and the Gaussian width gamma
  • \n", 1577 | "
  • For the regularization strength, values between 1 and 1e-3 can be reasonable. For gamma you can use the median heuristic, gamma = 1 / median, or values between 1e-3 and 1e3
  • \n", 1578 | "
\n", 1579 | "
" 1580 | ] 1581 | }, 1582 | { 1583 | "cell_type": "code", 1584 | "execution_count": null, 1585 | "metadata": {}, 1586 | "outputs": [], 1587 | "source": [ 1588 | "# Define the parameter grid and the grid search object\n", 1589 | "param_grid = {\n", 1590 | " 'scaling': [MinMaxScaler(), StandardScaler()], # test different scaling methods\n", 1591 | " 'krr__alpha': #fillme,\n", 1592 | " 'krr__#fillme': #fillme\n", 1593 | " }\n", 1594 | "\n", 1595 | "grid_krr = GridSearchCV(#your pipeline, param_grid=param_grid,\n", 1596 | " cv=#number of folds, verbose=2, n_jobs=2)\n", 1597 | "\n", 1598 | "# optional random search\n", 1599 | "#random_krr = RandomizedSearchCV(#your pipeline, param_distributions=param_grid, n_iter=#number of evaluations,\n", 1600 | "# cv=#number of folds, verbose=2, n_jobs=2\n", 1601 | ")" 1602 | ] 1603 | }, 1604 | { 1605 | "cell_type": "code", 1606 | "execution_count": null, 1607 | "metadata": {}, 1608 | "outputs": [], 1609 | "source": [ 1610 | "# run the grid search by calling the fit method\n", 1611 | "grid_krr.fit(#fillme)\n", 1612 | "# optional random search\n", 1613 | "# random_krr.fit(#fillme)" 1614 | ] 1615 | }, 1616 | { 1617 | "cell_type": "code", 1618 | "execution_count": null, 1619 | "metadata": {}, 1620 | "outputs": [], 1621 | "source": [ 1622 | "# get the performance metrics\n", 1623 | "get_regression_metrics(#fillme)" 1624 | ] 1625 | }, 1626 | { 1627 | "cell_type": "markdown", 1628 | "metadata": {}, 1629 | "source": [ 1630 | "
\n", 1631 | " Click here for some more information about hyperparameter optimization\n", 1632 | "Grid search is not the most efficient way to perform hyperparamter optimization. Even random search was shown to be more efficient. Really efficient though are Bayesian optimization approaches like TPE. This is implemented in the hyperopt library, which is also installed in your conda environment.\n", 1633 | "
" 1634 | ] 1635 | }, 1636 | { 1637 | "cell_type": "markdown", 1638 | "metadata": {}, 1639 | "source": [ 1640 | "
\n", 1641 | " Click here for hyperparameter optimization with hyperopt (advanded and optional outlook)\n", 1642 | " \n", 1643 | "Import the tools we need\n", 1644 | "\n", 1645 | "from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, mix, rand, anneal, space_eval\n", 1646 | "from functools import partial\n", 1647 | " \n", 1648 | "\n", 1649 | "Define the grid\n", 1650 | "\n", 1651 | "param_hyperopt = {\n", 1652 | " \"krr__alpha\": hp.loguniform(\"krr__alpha\", np.log(0.001), np.log(10)),\n", 1653 | " \"krr__gamma\": hp.loguniform(\"krr__gamma\", np.log(0.001), np.log(10)),\n", 1654 | "}\n", 1655 | " \n", 1656 | "\n", 1657 | "Define the objective function\n", 1658 | "\n", 1659 | "def objective_function(params):\n", 1660 | " pipe.set_params(\n", 1661 | " **{\n", 1662 | " \"krr__alpha\": params[\"krr__alpha\"],\n", 1663 | " \"krr__gamma\": params[\"krr__gamma\"],\n", 1664 | " }\n", 1665 | " )\n", 1666 | " score = cross_val_score(\n", 1667 | " pipe, X_train, y_train, cv=10, scoring=\"neg_mean_absolute_error\"\n", 1668 | " ).mean()\n", 1669 | " return {\"loss\": -score, \"status\": STATUS_OK} \n", 1670 | "\n", 1671 | "\n", 1672 | "We will use a search in which we mix random search, annealing and tpe\n", 1673 | "\n", 1674 | "trials = Trials()\n", 1675 | "mix_search = partial(\n", 1676 | " mix.suggest,\n", 1677 | " p_suggest=[(0.15, rand.suggest), (0.15, anneal.suggest), (0.7, tpe.suggest)],\n", 1678 | ")\n", 1679 | "\n", 1680 | "\n", 1681 | "Now, we can minimize the objective function.\n", 1682 | "\n", 1683 | "best_param = fmin(\n", 1684 | " objective_function,\n", 1685 | " param_hyperopt,\n", 1686 | " algo=mix_search,\n", 1687 | " max_evals=MAX_EVALES,\n", 1688 | " trials=trials,\n", 1689 | " rstate=np.random.RandomState(RANDOM_SEED),\n", 1690 | " )\n", 1691 | "\n", 1692 | "\n", 1693 | "
" 1694 | ] 1695 | }, 1696 | { 1697 | "cell_type": "markdown", 1698 | "metadata": {}, 1699 | "source": [ 1700 | "## 8. Feature Engineering " 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "markdown", 1705 | "metadata": {}, 1706 | "source": [ 1707 | "Finally, we would like to remove features with low variance. This can be done by setting a variance threshold." 1708 | ] 1709 | }, 1710 | { 1711 | "cell_type": "markdown", 1712 | "metadata": {}, 1713 | "source": [ 1714 | "$\\color{DarkBlue}{\\textsf{Short Question}}$\n", 1715 | " \n", 1716 | "- What is the reasoning behind doing this? \n", 1717 | "- When might it go wrong and why?" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "markdown", 1722 | "metadata": {}, 1723 | "source": [ 1724 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1725 | "- Add a variance threshold to the pipeline (select the correct function argument)\n", 1726 | "- Use random search for hyperparameter optimization, retrain the pipeline, and calculate the performance metrics (max error, MAE, MSE) on the training and test set\n", 1727 | "- If you could improve the predictive performance, do not forget to also run the model for the Kaggle competition!" 1728 | ] 1729 | }, 1730 | { 1731 | "cell_type": "code", 1732 | "execution_count": null, 1733 | "metadata": {}, 1734 | "outputs": [], 1735 | "source": [ 1736 | "# Define the pipeline\n", 1737 | "pipe_variance_threshold = Pipeline(\n", 1738 | " # fillme with the pipeline steps\n", 1739 | " [\n", 1740 | " ('variance_treshold', VarianceThreshold(#fillme with threshold)),\n", 1741 | " #fillme with remaining pipeline steps\n", 1742 | " ]\n", 1743 | ")" 1744 | ] 1745 | }, 1746 | { 1747 | "cell_type": "code", 1748 | "execution_count": null, 1749 | "metadata": {}, 1750 | "outputs": [], 1751 | "source": [ 1752 | "param_grid_variance_threshold = {\n", 1753 | " 'scaling': [None, StandardScaler()],\n", 1754 | " 'krr__alpha': #fillme,\n", 1755 | " 'krr__#fillme': #fillme,\n", 1756 | " 'variance_treshold__threshold': #fillme\n", 1757 | " }\n", 1758 | "\n", 1759 | "random_variance_treshold = RandomizedSearchCV(#your pipeline, param_distributions=param_grid, n_iter=#number of evaluations,\n", 1760 | " cv=#number of folds, verbose=2, n_jobs=2)" 1761 | ] 1762 | }, 1763 | { 1764 | "cell_type": "code", 1765 | "execution_count": null, 1766 | "metadata": {}, 1767 | "outputs": [], 1768 | "source": [ 1769 | "# Fit the pipeline and run the evaluation\n", 1770 | "random_variance_treshold.fit(#fillme)" 1771 | ] 1772 | }, 1773 | { 1774 | "cell_type": "markdown", 1775 | "metadata": {}, 1776 | "source": [ 1777 | "$\\color{DarkBlue}{\\textsf{Short Exercise (optional)}}$\n", 1778 | "- replace the variance threshold with a model-based feature selection \n", 1779 | "`('feature_selection', SelectFromModel(LinearSVC(penalty=\"l1\")))` or [any feature selection method that you would like to try](https://scikit-learn.org/stable/modules/feature_selection.html)" 1780 | ] 1781 | }, 1782 | { 1783 | "cell_type": "markdown", 1784 | "metadata": {}, 1785 | "source": [ 1786 | "## 9. Saving the model" 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "markdown", 1791 | "metadata": {}, 1792 | "source": [ 1793 | "Now, that we spent so much time in optimizing our model, we do not want to loose it. 
" 1794 | ] 1795 | }, 1796 | { 1797 | "cell_type": "markdown", 1798 | "metadata": {}, 1799 | "source": [ 1800 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1801 | "- use the [joblib library](https://scikit-learn.org/stable/modules/model_persistence.html) to save your model\n", 1802 | "- make sure you can load it again\n" 1803 | ] 1804 | }, 1805 | { 1806 | "cell_type": "code", 1807 | "execution_count": null, 1808 | "metadata": {}, 1809 | "outputs": [], 1810 | "source": [ 1811 | "# Dump your model\n", 1812 | "joblib.dump(model, filename)" 1813 | ] 1814 | }, 1815 | { 1816 | "cell_type": "code", 1817 | "execution_count": null, 1818 | "metadata": {}, 1819 | "outputs": [], 1820 | "source": [ 1821 | "# Try to load it again\n", 1822 | "model_loaded = joblib.load(filename)" 1823 | ] 1824 | }, 1825 | { 1826 | "cell_type": "markdown", 1827 | "metadata": {}, 1828 | "source": [ 1829 | "## 10. Influence of Regularization" 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "markdown", 1834 | "metadata": {}, 1835 | "source": [ 1836 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1837 | "- what happens if you set $\\alpha$ to a really small or to large value? Why is this the case explain what the parameter means using the equation derived in the lectures?\n", 1838 | "\n", 1839 | " To test this, fix this value in one of your pipelines, retrain the models (re-optimizing the other hyperparameters) and rerun the performance evaluation." 1840 | ] 1841 | }, 1842 | { 1843 | "cell_type": "markdown", 1844 | "metadata": {}, 1845 | "source": [ 1846 | "
\n", 1847 | " Click here for hints\n", 1848 | "\n", 1852 | "
" 1853 | ] 1854 | }, 1855 | { 1856 | "cell_type": "markdown", 1857 | "metadata": {}, 1858 | "source": [ 1859 | "## 11. Interpreting the model " 1860 | ] 1861 | }, 1862 | { 1863 | "cell_type": "markdown", 1864 | "metadata": {}, 1865 | "source": [ 1866 | "Now, that our model performs decently, we would like to know which features are mainly responsible for this, i.e. how the model performs its reasoning. \n", 1867 | "\n", 1868 | "One method to do so is the [permutation feature importance technique](https://christophm.github.io/interpretable-ml-book/feature-importance.html)." 1869 | ] 1870 | }, 1871 | { 1872 | "cell_type": "markdown", 1873 | "metadata": {}, 1874 | "source": [ 1875 | "$\\color{DarkBlue}{\\textsf{Short question}}$\n", 1876 | "\n", 1877 | "We use both descriptors that encode the pore geometry (density, pore diameters, surface areas) as well as some that describe the chemistry of the MOF (the RACs). \n", 1878 | "- Would you expect the relative importance of these features to be different for prediction of gas adsorption at high vs low gas pressure?" 1879 | ] 1880 | }, 1881 | { 1882 | "cell_type": "markdown", 1883 | "metadata": {}, 1884 | "source": [ 1885 | "
\n", 1886 | " Click here for a hint\n", 1887 | "\n", 1890 | "
" 1891 | ] 1892 | }, 1893 | { 1894 | "cell_type": "markdown", 1895 | "metadata": {}, 1896 | "source": [ 1897 | " $\\color{DarkBlue}{\\textsf{Short Exercise}}$\n", 1898 | "- Complete the function `_calculate_permutation_scores` (which we took from the `sklearn` package) and which is needed to calculate the permutation feature importance using the `permutation_importance` function. " 1899 | ] 1900 | }, 1901 | { 1902 | "cell_type": "code", 1903 | "execution_count": null, 1904 | "metadata": {}, 1905 | "outputs": [], 1906 | "source": [ 1907 | "def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,\n", 1908 | " n_repeats, scorer):\n", 1909 | " \"\"\"Calculate score when `col_idx` is permuted. Based on the sklearn implementation\n", 1910 | "\n", 1911 | " estimator: sklearn estimator object\n", 1912 | " X: pd.Dataframe or np.array\n", 1913 | " y: pd.Dataframe or np.array\n", 1914 | " col_idx: int\n", 1915 | " random_state: int\n", 1916 | " n_repeats: int\n", 1917 | " scorer: function that takes model, X and y_true as arguments\n", 1918 | " \"\"\"\n", 1919 | " random_state = check_random_state(random_state)\n", 1920 | "\n", 1921 | " X_permuted = X.copy()\n", 1922 | " scores = np.zeros(n_repeats)\n", 1923 | " # get the indices\n", 1924 | " shuffling_idx = np.arange(X.shape[0])\n", 1925 | " for n_round in range(n_repeats):\n", 1926 | " # FILL BELOW HERE\n", 1927 | " # shuffle them (fill in what you want to shuffle)\n", 1928 | " random_state.shuffle(#fillme)\n", 1929 | "\n", 1930 | " # Deal with dataframes\n", 1931 | " if hasattr(X_permuted, \"iloc\"):\n", 1932 | " # .iloc selects the indices from a dataframe and you give it [row, column]\n", 1933 | " col = X_permuted.iloc[shuffling_idx, col_idx]\n", 1934 | " col.index = X_permuted.index\n", 1935 | " X_permuted.iloc[:, col_idx] = col\n", 1936 | "\n", 1937 | " # Deal with numpy arrays\n", 1938 | " else:\n", 1939 | " # FILL BELOW HERE\n", 1940 | " # array indexing is [row, column]\n", 1941 | " X_permuted[:, col_idx] = X_permuted[#fillme]\n", 1942 | "\n", 1943 | " # Get the scores\n", 1944 | " feature_score = scorer(estimator, X_permuted, y)\n", 1945 | "\n", 1946 | " # record the scores in array\n", 1947 | " scores[n_round] = feature_score\n", 1948 | "\n", 1949 | " return scores" 1950 | ] 1951 | }, 1952 | { 1953 | "cell_type": "markdown", 1954 | "metadata": {}, 1955 | "source": [ 1956 | "Nothing to change in the function below, it just call the `_calculate_permutation_scores` function you just completed. " 1957 | ] 1958 | }, 1959 | { 1960 | "cell_type": "code", 1961 | "execution_count": null, 1962 | "metadata": {}, 1963 | "outputs": [], 1964 | "source": [ 1965 | "def permutation_importance(\n", 1966 | " estimator,\n", 1967 | " X,\n", 1968 | " y,\n", 1969 | " scoring=\"neg_mean_absolute_error\",\n", 1970 | " n_repeats=5,\n", 1971 | " n_jobs=2,\n", 1972 | " random_state=None,\n", 1973 | "):\n", 1974 | " \"\"\"Permutation importance for feature evaluation\n", 1975 | " estimator : object\n", 1976 | " An estimator that has already been :term:`fitted` and is compatible\n", 1977 | " with :term:`scorer`.\n", 1978 | " X : ndarray or DataFrame, shape (n_samples, n_features)\n", 1979 | " Data on which permutation importance will be computed.\n", 1980 | " y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n", 1981 | " Targets for supervised or `None` for unsupervised.\n", 1982 | " scoring : string, callable or None, default=None\n", 1983 | " Scorer to use. 
It can be a single\n", 1984 | " string (see :ref:`scoring_parameter`) or a callable (see\n", 1985 | " :ref:`scoring`). If None, the estimator's default scorer is used.\n", 1986 | " n_repeats : int, default=5\n", 1987 | " Number of times to permute a feature.\n", 1988 | " n_jobs : int or None, default=2\n", 1989 | " The number of jobs to use for the computation.\n", 1990 | " `None` means 1 unless in a :obj:`joblib.parallel_backend` context.\n", 1991 | " `-1` means using all processors. See :term:`Glossary `\n", 1992 | " for more details.\n", 1993 | " random_state : int, RandomState instance, or None, default=None\n", 1994 | " Pseudo-random number generator to control the permutations of each\n", 1995 | " feature. See :term:`random_state`.\n", 1996 | " \"\"\"\n", 1997 | " # Deal with dataframes\n", 1998 | " if not hasattr(X, \"iloc\"):\n", 1999 | " X = check_array(X, force_all_finite=\"allow-nan\", dtype=None)\n", 2000 | "\n", 2001 | " # Precompute random seed from the random state to be used\n", 2002 | " # to get a fresh independent RandomState instance for each\n", 2003 | " # parallel call to _calculate_permutation_scores, irrespective of\n", 2004 | " # the fact that variables are shared or not depending on the active\n", 2005 | " # joblib backend (sequential, thread-based or process-based).\n", 2006 | " random_state = check_random_state(random_state)\n", 2007 | " random_seed = random_state.randint(np.iinfo(np.int32).max + 1)\n", 2008 | "\n", 2009 | " # Determine scorer from user options.\n", 2010 | " scorer = check_scoring(estimator, scoring=scoring)\n", 2011 | " # get the performance score on the unpermuted data\n", 2012 | " baseline_score = scorer(estimator, X, y)\n", 2013 | "\n", 2014 | " # run the permuted evaluations in parallel for each column\n", 2015 | " scores = Parallel(n_jobs=n_jobs)(\n", 2016 | " delayed(_calculate_permutation_scores)(\n", 2017 | " estimator, X, y, col_idx, random_seed, n_repeats, scorer\n", 2018 | " )\n", 2019 | " for col_idx in range(X.shape[1])\n", 2020 | " )\n", 2021 | "\n", 2022 | " # the importance is the difference between the baseline and the permuted scores\n", 2023 | " importances = baseline_score - np.array(scores)\n", 2024 | "\n", 2025 | " # return the results (dictionary)\n", 2026 | " return Bunch(\n", 2027 | " importances_mean=np.mean(importances, axis=1),\n", 2028 | " importances_std=np.std(importances, axis=1),\n", 2029 | " importances=importances,\n", 2030 | " )\n" 2031 | ] 2032 | }, 2033 | { 2034 | "cell_type": "markdown", 2035 | "metadata": {}, 2036 | "source": [ 2037 | " $\color{DarkBlue}{\textsf{Short Exercise}}$\n", 2038 | "- Use your function to find the five most important features.\n", 2039 | "- Which are they? Did you expect this result?" 
2040 | ] 2041 | }, 2042 | { 2043 | "cell_type": "code", 2044 | "execution_count": null, 2045 | "metadata": {}, 2046 | "outputs": [], 2047 | "source": [ 2048 | "permutation_results = permutation_importance(#fillme)" 2049 | ] 2050 | }, 2051 | { 2052 | "cell_type": "code", 2053 | "execution_count": null, 2054 | "metadata": {}, 2055 | "outputs": [], 2056 | "source": [ 2057 | "permutation_results[\"features\"] = FEATURES\n", 2058 | "bars = hv.Bars(\n", 2059 | " permutation_results, \"features\", [\"importances_mean\", \"importances_std\"]\n", 2060 | ").sort(\"importances_mean\", reverse=True)\n", 2061 | "errors = hv.ErrorBars(\n", 2062 | " permutation_results, \"features\", vdims=[\"importances_mean\", \"importances_std\"]\n", 2063 | ").sort(\"importances_mean\", reverse=True)\n", 2064 | "\n", 2065 | "bars * errors\n" 2066 | ] 2067 | }, 2068 | { 2069 | "cell_type": "markdown", 2070 | "metadata": {}, 2071 | "source": [ 2072 | "
\n", 2073 | " Click here for hints\n", 2074 | "
    \n", 2075 | "
  • To get the top n indices of an array a, you can use np.argsort(a)[-n:]
  • \n", 2076 | "
  • Get the feature names from the FEATURES list
  • \n", 2077 | "
  • combined this might look like np.array(FEATURES)[np.argsort(a)[-n:]]
  • \n", 2078 | "
\n", 2079 | "
" 2080 | ] 2081 | }, 2082 | { 2083 | "cell_type": "markdown", 2084 | "metadata": {}, 2085 | "source": [ 2086 | "
\n", 2087 | " Click here for more information on model interpretation\n", 2088 | "The permutation feature importance technique is not a silver bullet, e.g. there are issues with correlated features.\n", 2089 | "However, it is likely a better choice than feature importance, like impurity decrease, derived from random forests).\n", 2090 | "
" 2091 | ] 2092 | }, 2093 | { 2094 | "cell_type": "markdown", 2095 | "metadata": {}, 2096 | "source": [ 2097 | "## 12. Submit your best model to Kaggle " 2098 | ] 2099 | }, 2100 | { 2101 | "attachments": {}, 2102 | "cell_type": "markdown", 2103 | "metadata": {}, 2104 | "source": [ 2105 | "Join the [Kaggle competition](https://www.kaggle.com/competitions/molsim-2024-ml-challenge/host/launch-checklist) for this course!\n", 2106 | "For this you can: \n", 2107 | "- try to continue optimizing your KRR model \n", 2108 | "- try to use a new model ([browse the sklearn documentation](https://scikit-learn.org/) for ideas or check out [xgboost](https://xgboost.readthedocs.io/en/stable/)\n", 2109 | "\n", 2110 | "The important parts for us here are: \n", 2111 | "- that you make an attempt to improve your model, discuss this attempt, and use proper models to measure potential improvement \n", 2112 | "- we will not grade you based on how \"fancy\" or model is or how well it performs but rather on whether you do something reasonable that is well motivated in your discussion\n", 2113 | "- you do not need to try both a model and continue optimizing your model. Doing one of them is, in principle, \"enough\"\n", 2114 | "\n", 2115 | "Use then your best model to create a `submission.csv` with your predictions to join the competition and upload it to the competition site.\n" 2116 | ] 2117 | }, 2118 | { 2119 | "cell_type": "code", 2120 | "execution_count": null, 2121 | "metadata": {}, 2122 | "outputs": [], 2123 | "source": [ 2124 | "kaggle_data = pd.read_csv('data/features.csv')\n", 2125 | "kaggle_predictions = #fillme.predict(kaggle_data[FEATURES])" 2126 | ] 2127 | }, 2128 | { 2129 | "cell_type": "code", 2130 | "execution_count": null, 2131 | "metadata": {}, 2132 | "outputs": [], 2133 | "source": [ 2134 | "submission = pd.DataFrame({\"id\": kaggle_data[\"id\"], \"prediction\": kaggle_predictions})\n", 2135 | "\n", 2136 | "submission.to_csv(\"submission.csv\", index=False)\n" 2137 | ] 2138 | } 2139 | ], 2140 | "metadata": { 2141 | "kernelspec": { 2142 | "display_name": "Python 3", 2143 | "language": "python", 2144 | "name": "python3" 2145 | }, 2146 | "language_info": { 2147 | "codemirror_mode": { 2148 | "name": "ipython", 2149 | "version": 3 2150 | }, 2151 | "file_extension": ".py", 2152 | "mimetype": "text/x-python", 2153 | "name": "python", 2154 | "nbconvert_exporter": "python", 2155 | "pygments_lexer": "ipython3", 2156 | "version": "3.10.13" 2157 | }, 2158 | "vscode": { 2159 | "interpreter": { 2160 | "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a" 2161 | } 2162 | } 2163 | }, 2164 | "nbformat": 4, 2165 | "nbformat_minor": 4 2166 | } 2167 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | # basics 2 | import os 3 | import numpy as np 4 | 5 | # data 6 | import pandas as pd 7 | 8 | # machine learning 9 | # scaling of data 10 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler 11 | # train/test split 12 | from sklearn.model_selection import train_test_split 13 | # model selection 14 | from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 15 | # model 16 | from sklearn.kernel_ridge import KernelRidge 17 | # pipeline 18 | from sklearn.pipeline import Pipeline 19 | # PCA 20 | from sklearn.decomposition import PCA 21 | # Dummy model 22 | from sklearn.dummy import DummyClassifier, DummyRegressor 23 | # Variance Threshold 
24 | from sklearn.feature_selection import VarianceThreshold, SelectFromModel 25 | # metrics 26 | from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 27 | mean_absolute_error, mean_squared_error, max_error) 28 | # feature names 29 | #from descriptornames import * 30 | 31 | # save/load models 32 | import joblib 33 | 34 | # For the permutation importance 35 | from joblib import Parallel 36 | from joblib import delayed 37 | from sklearn.metrics import check_scoring 38 | from sklearn.utils import Bunch 39 | from sklearn.utils import check_random_state 40 | from sklearn.utils import check_array 41 | 42 | # plotting 43 | import matplotlib.pyplot as plt 44 | import seaborn as sns 45 | 46 | # for interactive plots, you can try to use holoviews 47 | import holoviews as hv 48 | from holoviews import dim, opts 49 | hv.extension('plotly', 'bokeh', 'matplotlib') 50 | 51 | RANDOM_SEED = 4242424242 52 | DATA_DIR = 'data' 53 | DATA_FILE = os.path.join(DATA_DIR, 'data.csv') 54 | 55 | np.random.seed(RANDOM_SEED) 56 | 57 | df = pd.read_csv('data/data.csv') 58 | 59 | print(df.head()) 60 | 61 | --------------------------------------------------------------------------------