├── .github
│   └── workflows
│       ├── binder.yaml
│       └── deploy.yml
├── .gitignore
├── Contents.md
├── LICENSE
├── Learning.md
├── Lecture1.ipynb
├── Lecture2.ipynb
├── Lecture3.ipynb
├── Lecture4.ipynb
├── Lecture5.ipynb
├── Lecture6.ipynb
├── Lecture7.ipynb
├── Lecture8.ipynb
├── Lecture9.ipynb
├── Overview.md
├── README.md
├── Resources.md
├── _config.yml
├── _toc.yml
├── images
│   ├── 2_Cs2AgBiI6.png
│   ├── 2_Cs2AgBiI6.vesta
│   ├── 2_CsPbI3.png
│   ├── 2_CsPbI3.vesta
│   ├── 2_sum.png
│   ├── 5_bands.png
│   ├── 6_tem.png
│   └── ml-python.png
├── logo.png
├── ref.bib
├── requirements.txt
└── slides
    ├── MLforMaterials_Challenge_25.pdf
    ├── MLforMaterials_Lecture1_Intro_25.pdf
    ├── MLforMaterials_Lecture2_Basics_25.pdf
    ├── MLforMaterials_Lecture3_Data_25.pdf
    ├── MLforMaterials_Lecture4_Representations_25.pdf
    ├── MLforMaterials_Lecture5_Classical_25.pdf
    ├── MLforMaterials_Lecture6_NN_25.pdf
    ├── MLforMaterials_Lecture7_Build_25.pdf
    ├── MLforMaterials_Lecture8_Discovery_25.pdf
    └── MLforMaterials_Lecture9_GenAI_25.pdf

/.github/workflows/binder.yaml:
--------------------------------------------------------------------------------
name: Binder
on: [push]

jobs:
  Create-MyBinderOrg-Cache:
    runs-on: ubuntu-latest
    steps:
      - name: cache binder build on mybinder.org
        uses: jupyterhub/repo2docker-action@master
        with:
          NO_PUSH: true
          MYBINDERORG_TAG: ${{ github.event.ref }} # Build the container on mybinder.org for the branch that was pushed

--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
name: deploy-book

# Only run this when the 2025 branch changes
on:
  push:
    branches:
      - 2025
    # If your git repository has the Jupyter Book within some-subfolder next to
    # unrelated files, you can make this run only if a file within that specific
    # folder has been modified.
    #
    # paths:
    # - some-subfolder/**

# This job installs dependencies, builds the book, and pushes it to `gh-pages`
jobs:
  deploy-book:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      # Install dependencies
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          python-version: 3.8

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      # Build the book
      - name: Build the book
        run: |
          jupyter-book build .
36 | 37 | # Push the book's HTML to github-pages 38 | - name: GitHub Pages action 39 | uses: peaceiris/actions-gh-pages@v3.6.1 40 | with: 41 | github_token: ${{ secrets.GITHUB_TOKEN }} 42 | publish_dir: ./_build/html 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .DS_Store 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | _build/ 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | notebooks/CsPbI3.cif 134 | imagenet_labels.json 135 | -------------------------------------------------------------------------------- /Contents.md: -------------------------------------------------------------------------------- 1 | # Course Contents 2 | 3 | 1. **Introduction** 4 | * Overview 5 | * Expectations and assessments 6 | * _Exercise: Getting started_ 7 | 8 | 2. **Machine Learning Basics** 9 | * Terminology 10 | * Learning by example 11 | * Supervised 12 | * Unsupervised 13 | * Reinforcement 14 | * _Exercise: Crystal hardness_ 15 | 16 | 3. 
**Materials Data** 17 | * Data sources and formats 18 | * API queries 19 | * _Exercise: Data-driven thermoelectrics_ 20 | 21 | 4. **Crystal Representations** 22 | * Compositional 23 | * Structural 24 | * Graphs 25 | * _Exercise: Navigating crystal space_ 26 | 27 | 5. **Classical Learning** 28 | * _k_-nearest neighbours 29 | * _k_-means clustering 30 | * Decision trees and beyond 31 | * _Exercise: Metal or insulator?_ 32 | 33 | 6. **Artificial Neural Networks** 34 | * From neuron to perceptron 35 | * Network architecture and training 36 | * Convolutional neural networks 37 | * _Exercise: Learning microstructure_ 38 | 39 | 7. **Building a Model from Scratch** 40 | * Data preparation 41 | * Model choice 42 | * Training and testing 43 | * _Exercise: Crystal hardness II_ 44 | 45 | 8. **Accelerated Discovery** 46 | * Automated experiments 47 | * Bayesian optimisation 48 | * Reinforcement learning 49 | * _Exercise: Closed-loop optimisation_ 50 | 51 | 9. **Generative Artificial Intelligence** 52 | * Large language models 53 | * From latent space to diffusion 54 | * _Exercise: Research challenge_ 55 | 56 | 10. **Recent Advances** 57 | * Guest lecture 58 | * _Exercise: Research challenge_ 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 
39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /Learning.md: -------------------------------------------------------------------------------- 1 | # Learning Outcomes 2 | 3 | At the end of this course, you will be able to: 4 | 5 | - Specify and interpret the central concepts underpinning supervised, unsupervised, and reinforcement learning. 6 | 7 | - Describe approaches for materials representation including chemical composition and crystal structure. 8 | 9 | - Discover structure and property information from public databases using Python. 10 | 11 | - Compare a range of classical machine learning and deep learning approaches. 12 | 13 | - Train and evaluate machine learning models for chemical problems. 14 | -------------------------------------------------------------------------------- /Lecture1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "uDQYZDh0ciGP" 7 | }, 8 | "source": [ 9 | "# Introduction" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "
\n", 17 | " 💡 Ada Lovelace: The more I study, the more insatiable do I feel my genius for it to be.\n", 18 | "
" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "qlQmXeW3ciGS" 25 | }, 26 | "source": [ 27 | "\n", 28 | "\n", 29 | "[Lecture slides](https://speakerdeck.com/aronwalsh/machine-learning-for-materials-lecture-1)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": { 35 | "id": "aB6tYmdQciGS", 36 | "tags": [] 37 | }, 38 | "source": [ 39 | "## 👋 Getting started\n", 40 | "\n", 41 | "Welcome to our first practical session!\n", 42 | "\n", 43 | "This is a Jupyter Notebook loaded inside a Jupyter Book. They are part of [Project Jupyter](https://jupyter.org), a suite of open-source tools. A Jupyter Notebook also allows you to run and easily share computer code. This combination makes Jupyter notebooks a useful tool for analysing data.\n", 44 | "\n", 45 | "Unlike spreadsheets or combinations of separate data analysis codes, you can collect descriptions and notes for individual experiments, links to the raw data collected, the computer code that performs any necessary data analysis, and the final figures generated with these data, ready for use in a report or published paper.\n", 46 | "\n", 47 | "There are a few components to be aware of:\n", 48 | "\n", 49 | "### Python\n", 50 | "A working knowledge of the [Python](https://www.python.org) programming language is assumed for this course. If you are rusty, Chapters 1-4 of [Datacamp](https://www.datacamp.com/courses/intro-to-python-for-data-science) cover the base concepts, as do many other online resources including Imperial's [Introduction to Python](https://www.imperial.ac.uk/students/academic-support/graduate-school/professional-development/doctoral-students/research-computing-data-science/courses/python-for-researchers) course.\n", 51 | "\n", 52 | "\n", 61 | "\n", 62 | "
\n", 63 | "

Choose your degree programme:

\n", 64 | "
\n", 65 | " \n", 66 | " \n", 67 | "
\n", 68 | "
\n", 69 | " \n", 70 | " \n", 71 | "
\n", 72 | "
\n", 73 | "\n", 74 | "
\n", 75 | "

If MSc, have you completed the introductory Python course:

\n", 76 | "
\n", 77 | " \n", 78 | " \n", 79 | "
\n", 80 | "
\n", 81 | " \n", 82 | " \n", 83 | "
\n", 84 | "
\n", 85 | "\n", 86 | "
\n", 87 | "

Rate your current Python level:

\n", 88 | "
\n", 89 | " \n", 90 | " \n", 91 | "
\n", 92 | "
\n", 93 | " \n", 94 | " \n", 95 | "
\n", 96 | "
\n", 97 | " \n", 98 | " \n", 99 | "
\n", 100 | "
\n", 101 | "\n", 102 | "### Markdown\n", 103 | "Markdown is a markup language that allows easy formatting of text. It is widely used for creating and formatting online content. It is easier to read and write than html. A guide to the syntax can be found [here](https://www.markdownguide.org/basic-syntax/).\n", 104 | "\n", 105 | "```\n", 106 | "# Heading\n", 107 | "## Smaller heading\n", 108 | "### Even smaller heading\n", 109 | "```\n", 110 | "\n", 111 | "### Github\n", 112 | "[GitHub](https://github.com) is a platform for writing and sharing code. There are many materials science projects hosted there, which enable researchers from around the world to contribute to their development. These notebooks are hosted on GitHub too. If you find an error, you can raise an [issue](https://github.com/aronwalsh/MLforMaterials/issues) or even better fix it yourself with a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests).\n", 113 | "\n", 114 | "### Live coding\n", 115 | "The weekly notebooks are designed to be run online directly in your browser. You can activate the server by clicking the rocket icon on the top right and selecting `Live Code`. There is an option to open in [Binder](https://mybinder.org) or [Google Colab](https://colab.research.google.com). Colab is more powerful, but the formatting won't be as nice. You can opt to install Python on your own computer with [Anaconda](https://www.anaconda.com/products/distribution) and run the notebooks locally, but we do not offer support if things go wrong." 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "95tjL6dJciGU" 122 | }, 123 | "source": [ 124 | "## Analyse data with code\n", 125 | "\n", 126 | "By programming a series of instructions, researchers can consistently obtain the same results from a given dataset. This approach enables us to share datasets and code, allowing other scientists to review, repeat and reuse the analysis. The transparency and reproducibility of code-based analysis enhances research integrity and credibility, while minimising errors. It also enables efficient handling of large datasets and complex calculations, accelerating the exploration of different techniques.\n", 127 | "\n", 128 | "### Running code\n", 129 | "\n", 130 | "Different programming languages can be used in Jupyter notebooks. We will be using Python 3. The large scientific community for Python means that well-developed resources exist for data processing and specific prewritten tools for manipulating and plotting data.\n", 131 | "\n", 132 | "Any code typed into a code cell can be run (executed) by pressing the `run` button. You can also run the selected code block using `Shift-Enter` combination on your keyboard." 
133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "colab": { 140 | "base_uri": "https://localhost:8080/" 141 | }, 142 | "collapsed": false, 143 | "id": "wCimyGVFciGU", 144 | "jupyter": { 145 | "outputs_hidden": false 146 | }, 147 | "outputId": "6f46572f-a956-4342-def3-8713a99c224d" 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "2+3 # run this cell" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "colab": { 159 | "base_uri": "https://localhost:8080/" 160 | }, 161 | "collapsed": false, 162 | "id": "2VOKhE8pciGW", 163 | "jupyter": { 164 | "outputs_hidden": false 165 | }, 166 | "outputId": "c14bdab6-a0e1-4181-b0bb-dc43afb85865" 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "print(\"Beware of 小妖精\") # anything after '#' is a comment and ignored" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "colab": { 178 | "base_uri": "https://localhost:8080/" 179 | }, 180 | "collapsed": false, 181 | "id": "iRqw3mAwciGW", 182 | "jupyter": { 183 | "outputs_hidden": false 184 | }, 185 | "outputId": "e774b03f-36f0-420c-9d2d-d29426602fa3" 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "12*2.40*3737*12 # you get the idea" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "colab": { 197 | "base_uri": "https://localhost:8080/" 198 | }, 199 | "collapsed": false, 200 | "id": "unZ26LEociGW", 201 | "jupyter": { 202 | "outputs_hidden": false 203 | }, 204 | "outputId": "65ccf1d5-52a2-49c7-dec9-6999d12ddd8e" 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "2**1000 - 2 # a big number" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "colab": { 216 | "base_uri": "https://localhost:8080/" 217 | }, 218 | "collapsed": false, 219 | "id": "MyM32PMxciGW", 220 | "jupyter": { 221 | "outputs_hidden": false 222 | }, 223 | "outputId": "a53bd082-8a05-4dbf-c5cb-807809c725aa" 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "import math as m # import a math module\n", 228 | "m.pi" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "colab": { 236 | "base_uri": "https://localhost:8080/" 237 | }, 238 | "collapsed": false, 239 | "id": "P574cgsSciGX", 240 | "jupyter": { 241 | "outputs_hidden": false 242 | }, 243 | "outputId": "a1f16417-6f1d-417b-b6ad-5ab5321a5dfd" 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "20*m.atan(1/7)+8*m.atan(3/79) # Euler's approximation" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "5B698R2pciGX" 254 | }, 255 | "source": [ 256 | "### Plotting with Matplotlib" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "id": "neke0J4KifCW" 263 | }, 264 | "source": [ 265 | "Let's import the package [Matplotlib](https://matplotlib.org), which we will be using a lot for data visualisation." 
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "colab": { 273 | "base_uri": "https://localhost:8080/", 274 | "height": 448 275 | }, 276 | "collapsed": false, 277 | "id": "oyBEOTXociGX", 278 | "jupyter": { 279 | "outputs_hidden": false 280 | }, 281 | "outputId": "8b8a7522-08e3-4235-b064-f64adbf1b6b1", 282 | "tags": [] 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "# Imports\n", 287 | "import matplotlib.pyplot as plt # Plotting\n", 288 | "import numpy as np # Numerical operations\n", 289 | "%matplotlib inline\n", 290 | "\n", 291 | "x = np.arange(0, 10, 0.001) # x = 0 to 10 in steps of 0.001\n", 292 | "y = np.sin(x*x) # define your function\n", 293 | "plt.figure(figsize=(5, 3)) # create a new figure (5x3 inches)\n", 294 | "plt.plot(,y) # plot x against y" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": { 300 | "id": "lxLc8V4tb5zh" 301 | }, 302 | "source": [ 303 | "
\n", 304 | " Code hint \n", 305 | "You need to plot x vs y. Fix the plot command to (x,y).\n", 306 | "
" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "id": "Z8_nYqMH2MW9" 313 | }, 314 | "source": [ 315 | "### Using a DataFrame\n", 316 | "\n", 317 | "A DataFrame organises data into a 2-dimensional table of rows and columns, much like a spreadsheet. They are useful tools to store, access, and modify large sets of data. \n", 318 | "\n", 319 | "In this module, we'll make use of [Pandas](https://pandas.pydata.org) to process input and output data for our machine learning models." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "colab": { 327 | "base_uri": "https://localhost:8080/" 328 | }, 329 | "id": "UamDrzLn2LoS", 330 | "outputId": "47b9cf3b-9333-46d7-e785-d0b936bbc93e", 331 | "tags": [] 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "import pandas as pd # Data manipulation using DataFrames\n", 336 | "\n", 337 | "df = pd.DataFrame() # This instantiates an empty pandas DataFrame\n", 338 | "\n", 339 | "data = {\n", 340 | " \"Element\" : ['C', 'O', 'Fe', 'Mg', 'Xe'],\n", 341 | " \"Atomic Number\" : [6, 8, 26, 12, 54],\n", 342 | " \"Atomic Mass\" : [12, 16, 56, 24, 131]\n", 343 | "}\n", 344 | "\n", 345 | "# Let's try loading data into DataFrame df\n", 346 | "df = pd.DataFrame(data)\n", 347 | "\n", 348 | "# We can make the 'Element' column the index using the set_index function\n", 349 | "df = df.set_index(\"Element\")\n", 350 | "\n", 351 | "# Printing the values in the 'Atomic Number' column\n", 352 | "print(df[\"Atom Number\"])" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "id": "BOl6MmQuifCW", 359 | "tags": [] 360 | }, 361 | "source": [ 362 | "
\n", 363 | " Code hint \n", 364 | "Check you are printing the correct column name. Try out some of the other options.\n", 365 | "
" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "colab": { 373 | "base_uri": "https://localhost:8080/" 374 | }, 375 | "id": "gcUlJMzWb5zi", 376 | "outputId": "7bb8809f-3477-4593-f177-857e9bc1a1b4", 377 | "tags": [] 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "# Add a new column\n", 382 | "df[\"Energy (eV)\"] = [5.47, 5.14, 0.12, 4.34, 7.01]\n", 383 | "\n", 384 | "print(df[\"Energy (eV)\"])" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "colab": { 392 | "base_uri": "https://localhost:8080/" 393 | }, 394 | "id": "HxPmwuvub5zi", 395 | "outputId": "bbe23a6f-6569-40dc-d5ed-64431df3a9be", 396 | "tags": [] 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "# Print a row from the DataFrame\n", 401 | "\n", 402 | "# The df.loc[index] function to print the entry \"C\"\n", 403 | "print(df.loc[''])\n", 404 | "\n", 405 | "print('-----')\n", 406 | "\n", 407 | "# The df.iloc[index] function to print the first entry (counting starts at 0...)\n", 408 | "print(df.iloc[0])" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": { 414 | "id": "LsKd-p8Ob5zi" 415 | }, 416 | "source": [ 417 | "
\n", 418 | " Code hint \n", 419 | "You need to tell `df.loc` what to look for. Put an element name in between the quotes.\n", 420 | "
" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": { 426 | "id": "Ug7HnFwUciGX" 427 | }, 428 | "source": [ 429 | "### Write an equation" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": { 435 | "id": "tlakAjCMciGX" 436 | }, 437 | "source": [ 438 | "This equation is written in [LaTeX](https://www.overleaf.com/learn/latex/Learn_LaTeX_in_30_minutes) format. It's easy to learn and useful for complex expressions, e.g. `\\frac{x}{y}` writes x/y as a fraction $\\dfrac{x}{y}$.\n", 439 | "\n", 440 | "`$-\\frac{\\hslash^2}{2m} \\, \\frac{\\partial^2 \\psi}{\\partial x^2}$`\n", 441 | "\n", 442 | "renders as\n", 443 | "\n", 444 | "$-\\dfrac{\\hslash^2}{2m} \\, \\dfrac{\\partial^2 \\psi}{\\partial x^2}$" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": { 450 | "id": "6LT9mCDQciGX" 451 | }, 452 | "source": [ 453 | "### Link an image" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": { 459 | "id": "oteHuO9DciGY" 460 | }, 461 | "source": [ 462 | "The syntax employed here is Markdown. It can be used in notebooks, is popular on Github for documentation, and can even be a fast way to take notes during lectures.\n", 463 | "\n", 464 | "`![](https://media.giphy.com/media/cxk3z6nMhpf7a/giphy.gif)`\n", 465 | "\n", 466 | "which renders as\n", 467 | "\n", 468 | "![](https://media.giphy.com/media/cxk3z6nMhpf7a/giphy.gif)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": { 474 | "id": "8uepYP7rciGY" 475 | }, 476 | "source": [ 477 | "## Computational science\n", 478 | "\n", 479 | "### Thermally-actived diffusion\n", 480 | "\n", 481 | "Ion transport in crystals is a fundamental process that underpins various technological applications, from batteries to semiconductor devices. Understanding the kinetics of ion movement within and between materials is crucial for optimising device performance.\n", 482 | "\n", 483 | "Like many chemical processes, solid-state diffusion transport is thermally activated. We can describe ion motion in a crystal using a familiar Arrhenius relationship.\n", 484 | "\n", 485 | "The diffusion coefficient of a species is given by $D_{ion} = D_0 \\cdot e^{-(\\frac{\\Delta E_a}{k_BT})}$, where:\n", 486 | "- $D_{ion}$ is the diffusion coefficient for a particular ion,\n", 487 | "- $D_0$ is the temperature-independent prefactor (containing an attempt frequency),\n", 488 | "- $\\Delta E_a$ is the activation energy for diffusion,\n", 489 | "- $k_B$ is the Boltzmann constant, and\n", 490 | "- $T$ is the temperature.\n", 491 | "\n", 492 | "Let's write a function for it, which will take advantage of the wonderful [NumPy](https://numpy.org) package. It also uses the [physical constants](https://docs.scipy.org/doc/scipy/reference/constants.html#physical-constants) in [SciPy](https://scipy.org), and explains the function with a [docstring](https://en.wikipedia.org/wiki/Docstring)." 
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
     "import numpy as np\n",
     "from scipy.constants import physical_constants\n",
     "\n",
     "# Define constants\n",
     "k_B = physical_constants['Boltzmann constant in eV/K'][0]\n",
     "\n",
     "# Arrhenius function\n",
     "def arrhenius(activation_energy, temperature, D0=1):\n",
     "    \"\"\"\n",
     "    Calculates the diffusion coefficient using the Arrhenius equation.\n",
     "\n",
     "    Parameters:\n",
     "    activation_energy (float): the activation energy in eV.\n",
     "    temperature (float): the temperature in K (must be > 0).\n",
     "    D0 (float): the pre-exponential factor (default is 1).\n",
     "\n",
     "    Returns:\n",
     "    float: the diffusion coefficient (same units as D0).\n",
     "    \"\"\"\n",
     "    if np.any(temperature <= 0):\n",
     "        raise ValueError(\"Temperature must be greater than 0 K\")\n",
     "    return D0 * np.exp(-activation_energy / (k_B * temperature))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {
     "id": "R8aKxKtuciGY"
    },
    "source": [
     "This function takes `activation_energy` (eV) and `temperature` (K) as inputs and returns the corresponding diffusion coefficient. Recall that the units of the exponential term cancel out, so $D_{ion}$ takes the same units as $D_0$. Now let's use the function:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
     },
     "id": "gO22e47tciGY",
     "outputId": "7f1557d3-674b-45b3-e878-7de2021946ae",
     "tags": []
    },
    "outputs": [],
    "source": [
     "# Call the function for Ea = 0.12 eV; T = 1000 K\n",
     "arrhenius(0.12, 1000)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {
     "id": "7rMW0e9bciGY"
    },
    "source": [
     "This value tells us the likelihood of each attempt overcoming the activation barrier for ionic diffusion. Decrease the temperature to 100 K and see the difference.\n",
     "\n",
     "Now let's take advantage of the function to make a plot. We will use the numpy function `linspace`, which is documented [here](https://numpy.org/doc/stable/reference/generated/numpy.linspace.html). It is used here to generate 100 numbers evenly spaced between 100 and 5000 that represent the temperature range of our \"experiments\".\n",
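     "\n",
     "Before the plotting cell, here is a minimal sketch of `linspace` semantics (an added example, separate from the exercise below):\n",
     "\n",
     "```python\n",
     "import numpy as np\n",
     "\n",
     "# np.linspace(start, stop, num) returns num evenly spaced values,\n",
     "# including both endpoints\n",
     "print(np.linspace(100, 5000, 5))  # [ 100. 1325. 2550. 3775. 5000.]\n",
     "```"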
562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": { 568 | "colab": { 569 | "base_uri": "https://localhost:8080/", 570 | "height": 472 571 | }, 572 | "id": "WkWCrwDsb5zj", 573 | "outputId": "f4fade20-aa13-4ab5-9ba9-c6434e97e16b", 574 | "tags": [] 575 | }, 576 | "outputs": [], 577 | "source": [ 578 | "import matplotlib.pyplot as plt\n", 579 | "\n", 580 | "# Pre-exponential term in cm^2/s\n", 581 | "D0 = 0.5\n", 582 | "\n", 583 | "# Range of activation energies in eV\n", 584 | "activation_energies = np.linspace(0.1, 1, 0) # Range from 0.1 to 1 eV in n steps\n", 585 | "\n", 586 | "# Temperature range in K\n", 587 | "T = np.linspace(100, 5000, 100)\n", 588 | "\n", 589 | "# Calculate rates and plot curves\n", 590 | "plt.figure(figsize=(5, 3)) \n", 591 | "\n", 592 | "for activation_energy in activation_energies:\n", 593 | " rates = arrhenius(activation_energy, T, D0)\n", 594 | " plt.plot(T, rates, label=f'{activation_energy:.1f} eV')\n", 595 | "\n", 596 | "plt.xlabel('Temperature (K)')\n", 597 | "plt.ylabel('$D_{ion}$ (cm$^2$/s)') \n", 598 | "plt.title('Varying activation energy')\n", 599 | "plt.legend()\n", 600 | "plt.grid(True)\n", 601 | "plt.show()" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": { 607 | "id": "8DvhjGpCb5zk" 608 | }, 609 | "source": [ 610 | "
\n", 611 | " Code hint \n", 612 | "'np.linspace' requires three arguments (start, stop, number of points). 0 points won't work. Try changing it to 5.\n", 613 | "
" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": { 619 | "id": "uWlaJMBQciGZ" 620 | }, 621 | "source": [ 622 | "To better visualise the trends, we can make an Arrhenius plot by plotting the natural logarithm of $D$ versus the inverse temperature, 1/T. We use 1000/T to give a nicer range on the $x$-axis." 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "colab": { 630 | "base_uri": "https://localhost:8080/", 631 | "height": 472 632 | }, 633 | "id": "7Wgi_g2wciGZ", 634 | "outputId": "8b602c70-d78e-40c2-a00b-226582c57e43", 635 | "tags": [] 636 | }, 637 | "outputs": [], 638 | "source": [ 639 | "# Plotting ln(R) vs 1000/T\n", 640 | "plt.figure(figsize=(5, 3)) \n", 641 | "\n", 642 | "for activation_energy in activation_energies:\n", 643 | " rates = arrhenius(activation_energy, T, D0)\n", 644 | " plt.plot(1000/T, np.log(rates), label=f'{activation_energy:.1f} eV')\n", 645 | "\n", 646 | "plt.xlabel('1000 / Temperature (1/K)')\n", 647 | "plt.ylabel('ln($D_{ion}$)')\n", 648 | "plt.title('Arrhenius plot')\n", 649 | "plt.legend()\n", 650 | "plt.grid(True)\n", 651 | "plt.show()" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": { 657 | "id": "GzN2cRN0ciGZ" 658 | }, 659 | "source": [ 660 | "The last technique to pick up in this class is data fitting. Later in the module, we will use more complex functions in high dimensions, but let's start with linear regression. There is no need to code this by hand as we can use a [function](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) in the machine learning package [scikit-learn](https://scikit-learn.org). The real power of Python is the quality and quantity of available libraries such as this one." 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": { 667 | "colab": { 668 | "base_uri": "https://localhost:8080/" 669 | }, 670 | "id": "8eOomWxMb5zk", 671 | "outputId": "b3cf49b2-076a-42c0-a274-486f7270b0ad", 672 | "tags": [] 673 | }, 674 | "outputs": [], 675 | "source": [ 676 | "import numpy as np\n", 677 | "import pandas as pd\n", 678 | "\n", 679 | "num_points = # Number of data points to generate\n", 680 | "\n", 681 | "# Generate random x-y data points\n", 682 | "x_data = np.random.uniform(0, 10, num_points) # Adjust the range as needed\n", 683 | "y_data = np.random.uniform(0, 10, num_points)\n", 684 | "\n", 685 | "# Create a DataFrame\n", 686 | "data = {'X': x_data, 'Y': y_data}\n", 687 | "df = pd.DataFrame(data)\n", 688 | "\n", 689 | "# Print the DataFrame\n", 690 | "print(df)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": { 696 | "id": "Ewl5y52Hb5zk" 697 | }, 698 | "source": [ 699 | "
\n", 700 | " Code hint \n", 701 | "Again you need to choose the number of points. 50 should be fine, but you have the power to decide.\n", 702 | "
" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": { 709 | "colab": { 710 | "base_uri": "https://localhost:8080/", 711 | "height": 472 712 | }, 713 | "id": "eWwUdHjGciGZ", 714 | "outputId": "c733327e-7559-4f0b-c029-aaf1d843855f", 715 | "tags": [] 716 | }, 717 | "outputs": [], 718 | "source": [ 719 | "from sklearn.linear_model import LinearRegression\n", 720 | "from sklearn.metrics import r2_score, mean_squared_error\n", 721 | "\n", 722 | "# Perform linear regression\n", 723 | "X = df['X'].values.reshape(-1, 1) # Reshape X for compatibility with sklearn\n", 724 | "y = df['Y'].values\n", 725 | "model = LinearRegression().fit(X, y)\n", 726 | "y_pred = model.predict(X)\n", 727 | "\n", 728 | "# Calculate error bars\n", 729 | "residuals = y - y_pred\n", 730 | "error_bars = np.abs(residuals)\n", 731 | "\n", 732 | "# Plot the linear regression line\n", 733 | "plt.figure(figsize=(5, 3)) \n", 734 | "plt.errorbar(df['X'], df['Y'], yerr=error_bars, fmt='o', color='skyblue', label='Prediction errors')\n", 735 | "plt.scatter(df['X'], df['Y'])\n", 736 | "plt.plot(df['X'], y_pred, color='red', label='Regression line')\n", 737 | "plt.xlabel('X')\n", 738 | "plt.ylabel('Y')\n", 739 | "plt.title('Linear regression')\n", 740 | "plt.legend()\n", 741 | "plt.show()" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": { 747 | "id": "nCig-VAmciGZ" 748 | }, 749 | "source": [ 750 | "There are a number of useful analysis tools built into `sklearn`, which we can use to probe the model properties." 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": { 757 | "tags": [] 758 | }, 759 | "outputs": [], 760 | "source": [ 761 | "# Print the model parameters and performance\n", 762 | "try:\n", 763 | " print(f'Slope: {model2.coef_[0]:.2f}') # Assuming model.coef_ might be an array for multidimensional X\n", 764 | " print(f'Intercept: {model2.intercept_:.2f}')\n", 765 | " print(f'R^2 Score: {r2_score(y, y_pred):.3f}') # R^2 - coefficient of determination\n", 766 | " print(f'RMSE: {np.sqrt(mean_squared_error(y, y_pred)):.3f}') # Root Mean Squared Error\n", 767 | "except Exception as e:\n", 768 | " print(\"Error in calculating model parameters or performance metrics:\", e)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": { 774 | "id": "7K-w3ba2b5zs" 775 | }, 776 | "source": [ 777 | "
\n", 778 | " Code hint \n", 779 | "Your model is not called `model2`. Try changing the name.\n", 780 | "
" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": { 786 | "id": "2D92BAYzciGa" 787 | }, 788 | "source": [ 789 | "## 🚨 Exercise 1\n", 790 | "\n", 791 | "
\n", 792 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n", 793 | "
\n", 794 | "\n", 795 | "### Your details" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": null, 801 | "metadata": { 802 | "colab": { 803 | "base_uri": "https://localhost:8080/" 804 | }, 805 | "id": "xqgBbaSjb5zs", 806 | "outputId": "27965ae7-5d0b-40f2-f4ae-7757939dfb1d", 807 | "tags": [] 808 | }, 809 | "outputs": [], 810 | "source": [ 811 | "import numpy as np\n", 812 | "\n", 813 | "# Insert your values\n", 814 | "Name = \"No Name\" # Replace with your name\n", 815 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n", 816 | "\n", 817 | "# Set a random seed using the CID value\n", 818 | "CID = int(CID)\n", 819 | "np.random.seed(CID)\n", 820 | "\n", 821 | "# Print the message\n", 822 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": { 828 | "id": "DIia0_h9ciGa" 829 | }, 830 | "source": [ 831 | "### Problem\n", 832 | "\n", 833 | "Due to their importance in the electronics industry, the diffusion of atoms in semiconductors has been well studied for decades. Below is a set of data for impurity diffusion in crystalline Si [Source: [Casey and Pearson (1975)](https://link.springer.com/chapter/10.1007/978-1-4684-0904-8_2)]. It has been arranged into a DataFrame for your convenience.\n", 834 | "\n", 835 | "```python\n", 836 | "import pandas as pd\n", 837 | "\n", 838 | "data = {\n", 839 | " 'Impurity': ['B', 'Al', 'Ga', 'In', 'P', 'As', 'Sb', 'Bi'],\n", 840 | " 'Mass': [10.81, 26.98, 69.72, 114.82, 30.97, 74.92, 121.76, 208.98], # atomic mass in g/mol\n", 841 | " 'D0': [5.1, 8.0, 3.6, 16.5, 10.5, 60.0, 12.9, 1.03E3], # cm2/sec\n", 842 | " 'Eact': [3.70, 3.47, 3.51, 3.91, 3.69, 4.20, 3.98, 4.63] # eV\n", 843 | "}\n", 844 | "\n", 845 | "df = pd.DataFrame(data)\n", 846 | "print(df)\n", 847 | "```\n", 848 | "\n", 849 | "Two tasks will be given in class." 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": null, 855 | "metadata": { 856 | "colab": { 857 | "base_uri": "https://localhost:8080/", 858 | "height": 472 859 | }, 860 | "id": "g01sLM1xifCa", 861 | "outputId": "6d0c3d79-37f3-4dc4-d246-b069fc19e7ae", 862 | "tags": [] 863 | }, 864 | "outputs": [], 865 | "source": [ 866 | "#Empty block for your answers\n", 867 | "\n", 868 | "\n" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "metadata": { 875 | "id": "wh5CNdABifCa" 876 | }, 877 | "outputs": [], 878 | "source": [ 879 | "#Empty block for your answers\n", 880 | "\n", 881 | "\n" 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "metadata": {}, 887 | "source": [ 888 | "
\n", 889 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n", 890 | "
" 891 | ] 892 | }, 893 | { 894 | "cell_type": "markdown", 895 | "metadata": { 896 | "id": "BZfqPL6zifCa", 897 | "tags": [] 898 | }, 899 | "source": [ 900 | "## 🌊 Dive deeper\n", 901 | "\n", 902 | "* _Level 1:_ Read Chapter 1 of [Machine Learning Refined](https://github.com/neonwatty/machine_learning_refined) for a complementary introduction to the field.\n", 903 | "\n", 904 | "* _Level 2:_ Taylor Sparks has a collection of video lectures on [Python for Materials Engineers](https://www.youtube.com/watch?v=tn1wpfpLx6Y&list=PLL0SWcFqypCmkHClksnGlab3wglEVMqNN&index=2).\n", 905 | "\n", 906 | "* _Level 3:_ If you are a matplotlib pro user, try [plotly](https://plotly.com/python) and [bokeh](https://docs.bokeh.org/en/2.4.1/docs/gallery.html) for interactive visualisations." 907 | ] 908 | } 909 | ], 910 | "metadata": { 911 | "colab": { 912 | "provenance": [], 913 | "toc_visible": true 914 | }, 915 | "kernelspec": { 916 | "display_name": "vscode24", 917 | "language": "python", 918 | "name": "python3" 919 | }, 920 | "language_info": { 921 | "codemirror_mode": { 922 | "name": "ipython", 923 | "version": 3 924 | }, 925 | "file_extension": ".py", 926 | "mimetype": "text/x-python", 927 | "name": "python", 928 | "nbconvert_exporter": "python", 929 | "pygments_lexer": "ipython3", 930 | "version": "3.12.4" 931 | } 932 | }, 933 | "nbformat": 4, 934 | "nbformat_minor": 4 935 | } 936 | -------------------------------------------------------------------------------- /Lecture5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "zhHRr3VVOMFm" 7 | }, 8 | "source": [ 9 | "# Classical Learning" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "cWJD84-9OMFo" 16 | }, 17 | "source": [ 18 | "
\n", 19 | " 💡 Hugh Cartwright: The tools of science are changing; artificial intelligence has spread to the laboratory.\n", 20 | "
" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "id": "vN4ra0MROMFp" 27 | }, 28 | "source": [ 29 | "\n", 30 | "\n", 31 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture5-classical)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "IPrAgT4POMFp" 38 | }, 39 | "source": [ 40 | "## 🎲 Metal or insulator?\n", 41 | "\n", 42 | "In life, some decisions are difficult to make. We hope that our experience informs a choice that is better than a random guess. The same is true for machine learning models.\n", 43 | "\n", 44 | "There are many situations where we want to classify materials according to their properties. One fundamental characteristic is whether a material is a metal or insulator. For this exercise, we can refer to these as class `0` and class `1` materials, respectively. \n", 45 | "\n", 46 | "From our general knowledge, Cu should be `0` and MgO should be `1`, but what about Tl2O3 or Ni2Zn4?\n", 47 | "\n", 48 | "### Theoretical background\n", 49 | "\n", 50 | "Metals are characterised by their free electrons that facilitate the flow of electric current. This arises from a partially filled conduction band, allowing electrons to move easily when subjected to an electric field.\n", 51 | "\n", 52 | "Insulators are characterised by an occupied valence band and empty conduction band, impeding the flow of current. The absence of charge carriers hinders electrical conductivity, making them effective insulators of electricity. Understanding these fundamental differences is crucial for designing and optimising electronic devices.\n", 53 | "\n", 54 | "In this practical, we can use the electronic band gap of a material as a simple descriptor of whether it is a metal (Eg = 0) or an insulator (Eg > 0).\n", 55 | "\n", 56 | "$$\n", 57 | "E_g = E^{conduction-band}_{minimum} - E^{valence-band}_{maximum}\n", 58 | "$$\n", 59 | "\n", 60 | "This classification is coarse as we are ignoring the intermediate regime of semiconductors and more exotic behaviour such as superconductivity.\n", 61 | "\n", 62 | "![image](./images/5_bands.png)\n", 63 | "\n", 64 | "## $k$-means clustering\n", 65 | "\n", 66 | "Let's start by generating synthetic data for materials along with their class labels. To make the analysis faster and more illustrative, we can perform dimensionality reduction from a 10D to 2D feature space, and then cluster the data using $k$-means." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "id": "CLEjvAiAOMFp", 74 | "tags": [] 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# Installation of libraries\n", 79 | "!pip install elementembeddings --quiet\n", 80 | "!pip install matminer --quiet" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 2, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# Import of modules\n", 90 | "import numpy as np # Numerical operations\n", 91 | "import pandas as pd # DataFrames\n", 92 | "import matplotlib.pyplot as plt # Plotting\n", 93 | "import seaborn as sns # Visualisation\n", 94 | "from sklearn.decomposition import PCA # Principal component analysis (PCA)\n", 95 | "from sklearn.cluster import KMeans # k-means clustering\n", 96 | "from sklearn.metrics import accuracy_score, confusion_matrix # Model evaluation\n", 97 | "from sklearn.tree import DecisionTreeClassifier # Decision tree classifier" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "
\n", 105 | "Colab error solution\n", 106 | "If running the import module cell fails with an \"AttributeError\", click `Runtime` -> `Restart Session` and then simply rerun the cell.\n", 107 | "
" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### Uncorrelated data\n", 115 | "\n", 116 | "Pay attention to each step in the process:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# Step 0: Set the number of clusters\n", 126 | "n_clusters = 0\n", 127 | "\n", 128 | "# Step 1: Generating synthetic (random) data\n", 129 | "np.random.seed(42)\n", 130 | "num_materials = 200\n", 131 | "num_features = 10\n", 132 | "data = np.random.rand(num_materials, num_features)\n", 133 | "labels = np.random.randint(0, 2, num_materials)\n", 134 | "\n", 135 | "# Step 2: Reduce dimensions to 2 using PCA\n", 136 | "pca = PCA(n_components=2)\n", 137 | "reduced_data = pca.fit_transform(data)\n", 138 | "\n", 139 | "# Step 3: Cluster the data using k-means\n", 140 | "kmeans = KMeans(n_clusters=n_clusters, random_state=42)\n", 141 | "predicted_labels = kmeans.fit_predict(reduced_data)\n", 142 | "\n", 143 | "# Step 4: Create a plot to visualise the clusters and known labels\n", 144 | "plt.figure(figsize=(5, 4))\n", 145 | "\n", 146 | "# Plot the materials labeled as metal (label=1)\n", 147 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n", 148 | "# Plot the materials labeled as insulator (label=0)\n", 149 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n", 150 | "# Plot the cluster centres as stars\n", 151 | "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='gold', s=200, label='Cluster centres', marker='*')\n", 152 | "\n", 153 | "# Draw cluster boundaries\n", 154 | "h = 0.02 # step size for the meshgrid\n", 155 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n", 156 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n", 157 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", 158 | "Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])\n", 159 | "Z = Z.reshape(xx.shape)\n", 160 | "plt.contourf(xx, yy, Z, alpha=0.2, cmap='Pastel1')\n", 161 | "\n", 162 | "plt.xlabel('Principal Component 1')\n", 163 | "plt.ylabel('Principal Component 2')\n", 164 | "plt.title('$k$-means clustering of synthetic data')\n", 165 | "plt.legend()\n", 166 | "plt.show()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "id": "iyjuZJYmOMFq" 173 | }, 174 | "source": [ 175 | "
\n", 176 | " Code hint \n", 177 | "The algorithm fails for 0 clusters. \n", 178 | "Increase the value of `n_clusters` and look at the behaviour.\n", 179 | "
" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "id": "RWChaS_mOMFr" 186 | }, 187 | "source": [ 188 | "The cluster centres are shown by yellow stars. The model doesn't perform well, as we just generated this \"materials data\" from random numbers. There are no correlations for the algorithms to exploit. Nonetheless, this type of \"failed experiment\" is common in real research.\n", 189 | "\n", 190 | "Since we know the labels, we can quantify how bad the model using the classification accuracy. Is it better than flipping a coin? " 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "colab": { 198 | "base_uri": "https://localhost:8080/" 199 | }, 200 | "id": "XyShX2J-OMFr", 201 | "outputId": "c3c1425f-4354-4080-c42d-f025bedca416", 202 | "tags": [] 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "# Step 5: Quantify classification accuracy\n", 207 | "accuracy = accuracy_score(labels, predicted_labels)\n", 208 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n", 209 | "\n", 210 | "print(\"Accuracy:\", accuracy)\n", 211 | "print(\"\\nConfusion matrix:\")\n", 212 | "print(conf_matrix)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": { 218 | "id": "uSM_A4-ZOMFr" 219 | }, 220 | "source": [ 221 | "## Decision tree classifier\n", 222 | "\n", 223 | "Let's see if we can do better using a dedicated classifier. We will now train a decision tree to tackle the same problem and visualise the decision boundary." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "colab": { 231 | "base_uri": "https://localhost:8080/", 232 | "height": 564 233 | }, 234 | "id": "ZKbtozXuOMFr", 235 | "outputId": "1e23e4bb-e043-4c37-b74a-413a5e002bc1", 236 | "tags": [] 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "# Step 0: Set the depth of the decision tree\n", 241 | "max_tree_depth = 0\n", 242 | "\n", 243 | "# Step 1: Train a decision tree classifier\n", 244 | "def train_decision_tree(depth, reduced_data, labels):\n", 245 | " tree_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)\n", 246 | " tree_classifier.fit(reduced_data, labels)\n", 247 | " return tree_classifier\n", 248 | "\n", 249 | "tree_classifier = train_decision_tree(max_tree_depth, reduced_data, labels)\n", 250 | "predicted_labels = tree_classifier.predict(reduced_data)\n", 251 | "\n", 252 | "# Step 2: Create a plot to visualise the decision boundary of the decision tree\n", 253 | "plt.figure(figsize=(5, 4))\n", 254 | "\n", 255 | "# Plot the materials labeled as metal (label=1)\n", 256 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n", 257 | "# Plot the materials labeled as insulator (label=0)\n", 258 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n", 259 | "# Plot the decision boundary of the decision tree classifier\n", 260 | "h = 0.02 # step size for the meshgrid\n", 261 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n", 262 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n", 263 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", 264 | "Z = tree_classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n", 265 | "Z = Z.reshape(xx.shape)\n", 266 | "plt.contourf(xx, yy, Z, alpha=0.5, cmap='Pastel1')\n", 267 | "\n", 268 | "plt.xlabel('Principal 
Component 1')\n", 269 | "plt.ylabel('Principal Component 2')\n", 270 | "plt.title(f'Decision tree (max depth={max_tree_depth}) of synthetic data')\n", 271 | "plt.legend()\n", 272 | "\n", 273 | "plt.show()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "id": "SW0VbC_4OMFr" 280 | }, 281 | "source": [ 282 | "
\n", 283 | " Code hint \n", 284 | "With no nodes, you have made an indecisive tree 🥁.\n", 285 | " \n", 286 | "Increase the value of `max_tree_depth` and look at the behaviour.\n", 287 | "
" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": { 293 | "id": "sOqtQymnOMFs" 294 | }, 295 | "source": [ 296 | "There should be more structure in the decision boundary due to the more complex model, especially as you increase the tree depth.\n", 297 | "\n", 298 | "$k$-means clustering provides a simple way to group materials based on similarity, yielding a clear linear decision boundary. On the other hand, the decision tree classifier does better in handling non-linear separations. It constructs a boundary based on different feature thresholds, enabling it to capture fine-grained patterns. As always in ML, there is a balance of trade-offs between simplicity and accuracy.\n", 299 | "\n", 300 | "Is the decision tree more accurate? Let's see." 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "colab": { 308 | "base_uri": "https://localhost:8080/" 309 | }, 310 | "id": "PucrDphBOMFs", 311 | "outputId": "09ad3aa5-1ee9-4597-e2ca-c38b5d85057b", 312 | "tags": [] 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "# Step 3: Quantify classification accuracy\n", 317 | "accuracy = accuracy_score(labels, predicted_labels)\n", 318 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n", 319 | "\n", 320 | "print(\"Accuracy:\", accuracy)\n", 321 | "print(\"\\nConfusion Matrix:\")\n", 322 | "print(conf_matrix)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "id": "SNQRw2RhOMFs" 329 | }, 330 | "source": [ 331 | "If you choose a large value for the tree depth, the decision tree will approach a perfect accuracy of 1.0. It does this by memorising (overfitting) the training data but is unlikely to generalise well to new (unseen) data, i.e. overfitting. In contrast, the accuracy of $k$-means clustering is lower because it is an unsupervised algorithm designed for clustering, not classification. Its performance depends on the data structure and the presence of distinct clusters in that feature space.\n", 332 | "\n", 333 | "### Correlated data\n", 334 | "\n", 335 | "Let's try again, but this time we will (manually) add some correlations into the dataset." 
336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "# Modify dataset with correlation\n", 345 | "correlation_strength = 0.333\n", 346 | "for i in range(num_features):\n", 347 | " # For some features, add a linear correlation with the labels\n", 348 | " if i % 2 == 0: # Correlate every other feature\n", 349 | " data[:, i] = correlation_strength * labels + (1 - correlation_strength) * np.random.rand(num_materials)\n", 350 | "\n", 351 | "pca = PCA(n_components=2)\n", 352 | "reduced_data = pca.fit_transform(data)\n", 353 | "\n", 354 | "# Step 0: Set the depth of the decision tree\n", 355 | "max_tree_depth = 1\n", 356 | "\n", 357 | "# Step 1: Train a decision tree classifier\n", 358 | "def train_decision_tree(depth, reduced_data, labels):\n", 359 | " tree_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)\n", 360 | " tree_classifier.fit(reduced_data, labels)\n", 361 | " return tree_classifier\n", 362 | "\n", 363 | "tree_classifier = train_decision_tree(max_tree_depth, reduced_data, labels)\n", 364 | "predicted_labels = tree_classifier.predict(reduced_data)\n", 365 | "\n", 366 | "# Step 2: Create a plot to visualise the decision boundary of the decision tree\n", 367 | "plt.figure(figsize=(5, 4))\n", 368 | "\n", 369 | "# Plot the materials labeled as metal (label=1)\n", 370 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n", 371 | "# Plot the materials labeled as insulator (label=0)\n", 372 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n", 373 | "# Plot the decision boundary of the decision tree classifier\n", 374 | "h = 0.02 # step size for the meshgrid\n", 375 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n", 376 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n", 377 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", 378 | "Z = tree_classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n", 379 | "Z = Z.reshape(xx.shape)\n", 380 | "plt.contourf(xx, yy, Z, alpha=0.5, cmap='Pastel1')\n", 381 | "\n", 382 | "plt.xlabel('Principal Component 1')\n", 383 | "plt.ylabel('Principal Component 2')\n", 384 | "plt.title(f'Decision tree (max depth={max_tree_depth}) for artificial materials')\n", 385 | "plt.legend()\n", 386 | "\n", 387 | "plt.show()\n", 388 | "\n", 389 | "# Step 3: Quantify classification accuracy\n", 390 | "accuracy = accuracy_score(labels, predicted_labels)\n", 391 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n", 392 | "\n", 393 | "print(\"Accuracy:\", accuracy)\n", 394 | "print(\"\\nConfusion Matrix:\")\n", 395 | "print(conf_matrix)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "Now even a very simple tree can effectively draw a decision boundary. Machine learning models take advantage of such correlations in high dimensional feature spaces. You can modify the correlation strength on line 2 to see the effect." 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "id": "yAJreGhfOMFs", 409 | "tags": [] 410 | }, 411 | "source": [ 412 | "## Real materials\n", 413 | "\n", 414 | "We can save time again by making use of a pre-built dataset. 
We will return to [matminer](https://hackingmaterials.lbl.gov/matminer), which we used before, and load `matbench_expt_is_metal`.\n", 415 | "\n", 416 | "### Load dataset" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": { 423 | "tags": [] 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "import matminer\n", 428 | "from matminer.datasets.dataset_retrieval import load_dataset\n", 429 | "\n", 430 | "# Use matminer to download the dataset\n", 431 | "df = load_dataset('matbench_expt_is_metal')\n", 432 | "print(f'The full dataset contains {df.shape[0]} entries. \\n')\n", 433 | "\n", 434 | "# Display the first 10 entries\n", 435 | "df.head(10)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "id": "sXq9bXwGOMFs" 442 | }, 443 | "source": [ 444 | "
\n", 445 | " Code hint \n", 446 | "To load a different dataset, you simply change the name in 'load_dataset()'.\n", 447 | "
" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": { 453 | "id": "Y76d3NLhZADO" 454 | }, 455 | "source": [ 456 | "### Materials featurisation\n", 457 | "\n", 458 | "Revisiting concepts from earlier Notebooks, featurising the chemical compositions is necessary to create a useful set of input vectors. This allows the presence (or absence) of an element (or element combinations) to act as a feature that the classifier takes account for.\n", 459 | "\n", 460 | "We will use [ElementEmbeddings](https://wmd-group.github.io/ElementEmbeddings) to featurise the `composition` column. The importance of the pooling method can be tested by generating two sets of features. In the first, the mean of the atomic vectors is used, while in the second, a max pooling method takes the maximum value of each component across all the atomic vectors in the composition." 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": { 467 | "colab": { 468 | "base_uri": "https://localhost:8080/", 469 | "height": 559 470 | }, 471 | "id": "sTJg5-4yY9au", 472 | "outputId": "18f140fe-b6c1-41dc-b1d2-3adfc6c40e73", 473 | "tags": [] 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "# Featurise all chemical compositions\n", 478 | "from elementembeddings.composition import composition_featuriser\n", 479 | "\n", 480 | "# Compute element embeddings using mean and max pooling\n", 481 | "mean_df = composition_featuriser(df[\"composition\"], embedding=\"magpie\", stats=[\"mean\"])\n", 482 | "max_df = composition_featuriser(df[\"composition\"], embedding=\"magpie\", stats=[\"maxpool\"])\n", 483 | "\n", 484 | "# Convert \"is_metal\" column to integer labels (0, 1)\n", 485 | "df['is_metal'] = df['is_metal'].astype(int)\n", 486 | "mean_df['is_metal'] = df['is_metal']\n", 487 | "max_df['is_metal'] = df['is_metal']\n", 488 | "\n", 489 | "# Define feature matrices and target variable\n", 490 | "cols_to_drop = ['is_metal', 'formula']\n", 491 | "\n", 492 | "X_mean = mean_df.drop(columns=cols_to_drop, errors='ignore').values\n", 493 | "X_max = max_df.drop(columns=cols_to_drop, errors='ignore').values\n", 494 | "y = df['is_metal'].values # Target variable\n", 495 | "\n", 496 | "# Preview first two rows \n", 497 | "print(\"Mean pooling features (first two rows, first 4 columns):\")\n", 498 | "print(mean_df.iloc[:2, :4]) \n", 499 | "print(\"\\nMax pooling features (first two rows, first 4 columns):\")\n", 500 | "print(max_df.iloc[:2, :4]) " 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "In the output, you can see two numerical representations of the chemical compositions using different feature extraction techniques. Now let's see how they cluster." 
508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "### $k$-means clustering \n", 515 | "\n", 516 | "#### Mean pool" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "# Perform k-means clustering\n", 526 | "kmeans = KMeans(n_clusters=2, random_state=42)\n", 527 | "predicted_labels = kmeans.fit_predict(X_mean)\n", 528 | "\n", 529 | "# Adjust k-means output to match true labels\n", 530 | "if accuracy_score(y, predicted_labels) < 0.5:\n", 531 | " predicted_labels = 1 - predicted_labels\n", 532 | "\n", 533 | "# Assess performance\n", 534 | "accuracy = accuracy_score(y, predicted_labels)\n", 535 | "print(f\"Accuracy: {accuracy:.2f}\")\n", 536 | "\n", 537 | "conf_matrix = confusion_matrix(y, predicted_labels)\n", 538 | "\n", 539 | "plt.figure(figsize=(5, 4))\n", 540 | "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", \n", 541 | " xticklabels=['Predicted Insulator', 'Predicted Metal'], \n", 542 | " yticklabels=['True Insulator', 'True Metal'])\n", 543 | "plt.xlabel('Predicted label')\n", 544 | "plt.ylabel('True label')\n", 545 | "plt.show()" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "#### Max pool" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "# Perform k-means clustering\n", 562 | "kmeans = KMeans(n_clusters=2, random_state=42)\n", 563 | "predicted_labels = kmeans.fit_predict(X_max)\n", 564 | "\n", 565 | "# Adjust k-means output to match true labels\n", 566 | "if accuracy_score(y, predicted_labels) < 0.5:\n", 567 | " predicted_labels = 1 - predicted_labels\n", 568 | "\n", 569 | "# Assess performance\n", 570 | "accuracy = accuracy_score(y, predicted_labels)\n", 571 | "print(f\"Accuracy: {accuracy:.2f}\")\n", 572 | "\n", 573 | "conf_matrix = confusion_matrix(y, predicted_labels)\n", 574 | "\n", 575 | "plt.figure(figsize=(5, 4))\n", 576 | "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", \n", 577 | " xticklabels=['Predicted Insulator', 'Predicted Metal'], \n", 578 | " yticklabels=['True Insulator', 'True Metal'])\n", 579 | "plt.xlabel('Predicted label')\n", 580 | "plt.ylabel('True label')\n", 581 | "plt.show()" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "The difference in accuracy between the two methods for this simple example highlights the importance of choosing an appropriate pooling strategy when featurising materials data. In this case, mean pooling provides a more balanced representation, which better distinguishes between metals and insulators." 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": { 594 | "tags": [] 595 | }, 596 | "source": [ 597 | "## 🚨 Exercise 5\n", 598 | "\n", 599 | "
\n", 600 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n", 601 | "
\n", 602 | "\n", 603 | "### Your details" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "import numpy as np\n", 613 | "\n", 614 | "# Insert your values\n", 615 | "Name = \"No Name\" # Replace with your name\n", 616 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n", 617 | "\n", 618 | "# Set a random seed using the CID value\n", 619 | "CID = int(CID)\n", 620 | "np.random.seed(CID)\n", 621 | "\n", 622 | "# Print the message\n", 623 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": { 629 | "id": "4WAC3QJYOMFs", 630 | "tags": [] 631 | }, 632 | "source": [ 633 | "### Problem\n", 634 | "\n", 635 | "The choice of featurisation method can significantly impact the performance of machine learning models, particularly in decision trees, which rely on the features to make accurate splits. \n", 636 | "\n", 637 | "Tasks will be given in class focusing on comparing the impact of different featurisation methods on classification performance." 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "#Empty block for your answers\n", 647 | "\n", 648 | "\n" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "#Empty block for your answers\n", 658 | "\n", 659 | "\n" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "
\n", 667 | " Task hint \n", 668 | "For task 4, you can featurise a new composition using a command such as `new_material = composition_featuriser([\"AlGaN2\"], embedding=\"atomic\", stats=[\"sum\"])`\n", 669 | "
\n", 670 | "\n", 671 | "
\n", 672 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n", 673 | "
" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": { 679 | "tags": [] 680 | }, 681 | "source": [ 682 | "## 🌊 Dive deeper\n", 683 | "\n", 684 | "* _Level 1:_ Tackle Chapter 6 on Linear Two-Class Classification in [Machine Learning Refined](https://github.com/jermwatt/machine_learning_refined#what-is-new-in-the-second-edition).\n", 685 | "\n", 686 | "* _Level 2:_ Play [metal detection](http://palestrina.northwestern.edu/metal-detection/). Note, the website can be a little temperamental. \n", 687 | "\n", 688 | "* _Level 3:_ Dig deeper into the options for definitions decision trees and ensemble models in [scikit-learn](https://scikit-learn.org/stable/modules/tree.html)." 689 | ] 690 | } 691 | ], 692 | "metadata": { 693 | "colab": { 694 | "provenance": [] 695 | }, 696 | "kernelspec": { 697 | "display_name": "vscode24", 698 | "language": "python", 699 | "name": "python3" 700 | }, 701 | "language_info": { 702 | "codemirror_mode": { 703 | "name": "ipython", 704 | "version": 3 705 | }, 706 | "file_extension": ".py", 707 | "mimetype": "text/x-python", 708 | "name": "python", 709 | "nbconvert_exporter": "python", 710 | "pygments_lexer": "ipython3", 711 | "version": "3.12.4" 712 | } 713 | }, 714 | "nbformat": 4, 715 | "nbformat_minor": 4 716 | } 717 | -------------------------------------------------------------------------------- /Lecture7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building a Model from Scratch" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "
\n", 15 | " 💡 Mildred Dresselhaus: People said you’re crazy... But if you think you’re right, stick to it. And we were right.\n", 16 | "
" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "\n", 24 | "\n", 25 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture7-build)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## 🦾 Crystal hardness revisited\n", 33 | "\n", 34 | "We first tackled the [bulk modulus](https://en.wikipedia.org/wiki/Bulk_modulus) of inorganic crystals in Lecture 2. However our model development was not thorough back then.\n", 35 | "\n", 36 | "Let's revisit this problem using the new knowledge and tricks we have picked up. We will follow the same initial steps, making use of [matminer](https://matminer.readthedocs.io) to access the materials dataset and featurise the data." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# Installation of libraries\n", 46 | "!pip install matminer --quiet\n", 47 | "!pip install xgboost --quiet" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Downgrade scikit to avoid a conflict with xgboost\n", 57 | " # Note: Ignore the error message\n", 58 | "!pip uninstall -y scikit-learn --quiet\n", 59 | "!pip install scikit-learn==1.3.1 --quiet " 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# Import of modules\n", 69 | "import numpy as np \n", 70 | "import matplotlib.pyplot as plt \n", 71 | "import pandas as pd \n", 72 | "import pprint \n", 73 | "import seaborn as sns \n", 74 | "plt.style.use('ggplot') \n", 75 | "\n", 76 | "# Advanced\n", 77 | "from pymatgen.core import Structure \n", 78 | "import matminer \n", 79 | "from matminer.datasets.dataset_retrieval import load_dataset \n", 80 | "from monty.serialization import loadfn \n", 81 | "\n", 82 | "# To make the model run faster\n", 83 | "teaching_mode = True" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "
\n", 91 | "Colab error solution\n", 92 | "If running the import module cell fails with an \"AttributeError\", click `Runtime` -> `Restart Session` and then simply rerun the cell. \n", 93 | "
" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "tags": [] 100 | }, 101 | "source": [ 102 | "## Data preparation\n", 103 | "\n", 104 | "The steps to load and featurise the bulk modulus data were introduced in Notebook 2, so we can jump straight in." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Use matminer to load the dataset\n", 114 | "df = load_dataset('matbench_log_kvrh')\n", 115 | "print(f'The full dataset contains {df.shape[0]} entries. \\n')\n", 116 | "\n", 117 | "if teaching_mode:\n", 118 | " # Store the original DataFrame as a copy\n", 119 | " full_dataset_df = df.copy()\n", 120 | " # Create a subset of the original DataFrame for demonstration purposes\n", 121 | " df = df.sample(n=1000, random_state=33)\n", 122 | " print(f'For teaching purposes we will only work with {df.shape[0]} entries from the dataframe to make the model training and testing faster. \\n')\n", 123 | "\n", 124 | "print('The DataFrame is shown below:')\n", 125 | "df.head(10)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# Plot a histogram of values\n", 135 | "fig, ax = plt.subplots(figsize=(5, 4))\n", 136 | "ax.hist(df['log10(K_VRH)'])\n", 137 | "ax.set_xlabel(r'$log_{10}K_{VRH}$ [$log_{10}GPa$]' )\n", 138 | "ax.set_ylabel('Counts')\n", 139 | "plt.show()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Use matminer to featurise the dataset\n", 149 | "from matminer.featurizers.composition.composite import ElementProperty\n", 150 | "from matminer.featurizers.structure.order import DensityFeatures\n", 151 | "\n", 152 | "# Add a composition column to df using the composition property of the Structure class\n", 153 | "df['composition'] = df.structure.apply(lambda x: x.composition )\n", 154 | "\n", 155 | "# Create the ElementProperty featuriser\n", 156 | "el_prop_featuriser = ElementProperty.from_preset(preset_name='magpie')\n", 157 | "\n", 158 | "# By default multiprocessing is enabled, however, this can slow performance, so we disable it\n", 159 | "el_prop_featuriser.set_n_jobs(1)\n", 160 | "\n", 161 | "# Featurise using the ElementProperty featuriser\n", 162 | "df = el_prop_featuriser.featurize_dataframe(df, col_id='composition')\n", 163 | "\n", 164 | "# Add structure features\n", 165 | "density_featuriser = DensityFeatures()\n", 166 | "density_featuriser.set_n_jobs(1)\n", 167 | "df=density_featuriser.fit_featurize_dataframe(df, col_id='structure')\n", 168 | "\n", 169 | "# Print the shape of the DataFrame\n", 170 | "print(df.shape)\n", 171 | "df.head()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "Let's understand the feature space a little better." 
179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# Extract the feature columns (excluding the first three)\n", 188 | "feature_columns = df.columns[3:]\n", 189 | "\n", 190 | "# Create a unique colour for each feature\n", 191 | "colors = [plt.cm.jet(i / float(len(feature_columns))) for i in range(len(feature_columns))]\n", 192 | "\n", 193 | "# Plot the distribution of feature values with different colours\n", 194 | "plt.figure(figsize=(5, 4))\n", 195 | "for i, column in enumerate(feature_columns):\n", 196 | " df[column].plot(kind='hist', bins=0, alpha=0.5, color=colors[i], label=column)\n", 197 | "\n", 198 | "plt.title('Feature Distributions')\n", 199 | "plt.show()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "
\n", 207 | " Code hint \n", 208 | "Add some bins to your histogram. 10-20 should be sufficient.\n", 209 | "
\n", 210 | "\n", 211 | "Some dimensions have very different ranges, as you can see from the spread on the x-axis. We can standardise these. \n", 212 | "\n", 213 | "`MinMaxScaler` is a data scaling technique to transform numerical features within the range [0, 1]. It linearly scales data, preserving relationships between values, making it suitable for algorithms sensitive to feature magnitudes." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "from sklearn.preprocessing import MinMaxScaler\n", 223 | "\n", 224 | "scaled_df = df.copy()\n", 225 | "\n", 226 | "# Step 1: Standardise the feature columns\n", 227 | "scaler = MinMaxScaler()\n", 228 | "scaled_df[feature_columns] = scaler.fit_transform(scaled_df[feature_columns])\n", 229 | "\n", 230 | "# Step 2: Plot the standardised feature distributions\n", 231 | "plt.figure(figsize=(5, 4))\n", 232 | "for column in feature_columns:\n", 233 | " scaled_df[column].plot(kind='hist', bins=20, alpha=0.5, label=column)\n", 234 | "\n", 235 | "plt.title('Standardised Feature Distributions')\n", 236 | "plt.show()" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "Finally, let's prepare the data for model training. We need to split the dataset into the target variable `log10(K_VRH)` and the input features. For the input features, we must remove any non-numerical data to avoid getting errors later in our workflow." 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "# Define the features we want \n", 253 | "features_to_drop = ['structure','composition','log10(K_VRH)']\n", 254 | "feature_cols = [col for col in list(df.columns) if col not in features_to_drop]\n", 255 | "\n", 256 | "# Get an array of the features\n", 257 | "X = df[feature_cols].values\n", 258 | "scaled_X = scaled_df[feature_cols].values\n", 259 | "\n", 260 | "# Get an array of the target variable\n", 261 | "y = df['log10(K_VRH)'].values\n", 262 | "\n", 263 | "print(f'Shape of X: {X.shape}')\n", 264 | "print(f'Shape of y: {y.shape}')" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Model choice\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "We are dealing with a supervised regression problem, so should choose a suitable machine learning model. We can start by rebuilding a random forest. Are you curious if the feature scaling has an effect? I am." 
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "# Random forest - original features\n", 288 | "from sklearn.ensemble import RandomForestRegressor\n", 289 | "from sklearn import metrics\n", 290 | "\n", 291 | "# Define the model\n", 292 | "rf = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=42)\n", 293 | "\n", 294 | "# Fit the model\n", 295 | "rf.fit(X,y)\n", 296 | "\n", 297 | "# Wrap the lines of code for later sections\n", 298 | "def make_prediction_plot(X, y, model, label):\n", 299 | " y_pred = model.predict(X) # Calculate predictions here\n", 300 | " fig, ax = plt.subplots(figsize=(5, 4))\n", 301 | " ax.scatter(y, y_pred, c=y, cmap='viridis')\n", 302 | " ax.plot(y, y, 'r-')\n", 303 | " ax.set_xlabel(f'{label} True')\n", 304 | " ax.set_ylabel(f'{label} Predicted')\n", 305 | " plt.show()\n", 306 | " return y_pred # Return y_pred \n", 307 | "\n", 308 | "# Performance\n", 309 | "y_pred = make_prediction_plot(X, y, rf, 'log10(K_VRH)') \n", 310 | "\n", 311 | "print(f'The training MAE = {metrics.mean_absolute_error(y,y_pred):.3f} log10GPa')\n", 312 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y,y_pred)):.3f} log10GPa')\n", 313 | "print(f'The training r^2 = {rf.score(X,y):.3f}')" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# Random forest - scaled features\n", 323 | "\n", 324 | "# Define the model\n", 325 | "rf2 = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=42)\n", 326 | "\n", 327 | "# Fit the model\n", 328 | "rf2.fit(scaled_X, y)\n", 329 | "\n", 330 | "# Performance\n", 331 | "y_pred = make_prediction_plot(scaled_X, y, rf2, 'log10(K_VRH)') \n", 332 | "print(f'The training MAE = {metrics.mean_absolute_error(y, y_pred):.3f} log10GPa')\n", 333 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y, y_pred)):.3f} log10GPa')\n", 334 | "print(f'The training r^2 = {rf2.score(scaled_X, y):.3f}')" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "We can see that Random Forest is not sensitive to feature scaling. Recall that this model works by averaging over multiple decision trees, and the decision boundaries are determined by feature thresholds, not their absolute values. \n", 342 | "\n", 343 | "We have time to try one more model. Let's go with the popular [XGBoost](https://xgboost.readthedocs.io). Like Random Forest, it is an ensemble learning method. XGBoost uses a gradient-boosting framework and often achieves higher predictive accuracy by optimising for both bias and variance in the model." 
344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "# XGBoost model\n", 353 | "import xgboost as xgb\n", 354 | "\n", 355 | "# Define the model\n", 356 | "xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=42, objective='reg:squarederror')\n", 357 | "\n", 358 | "# Fit the model\n", 359 | "xgb_model.fit(scaled_X, y)\n", 360 | "\n", 361 | "# Performance\n", 362 | "y_pred = make_prediction_plot(scaled_X, y, xgb_model, 'log10(K_VRH)') \n", 363 | "print(f'The training MAE = {metrics.mean_absolute_error(y, y_pred):.3f} log10GPa')\n", 364 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y, y_pred)):.3f} log10GPa')\n", 365 | "print(f'The training r^2 = {xgb_model.score(scaled_X, y):.3f}')" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "XGBoost does a better job, but wait... \n", 373 | "\n", 374 | "We haven't performed proper training and testing yet 😱. These models are likely to be overfit and unable to make useful predictions for new inputs. On to the next stage!" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## Training and testing\n", 382 | "\n", 383 | "### Train-test split\n", 384 | "\n", 385 | "We are ready to build a real model now. Let's separate the training data from the unseen test set used to assess model performance." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "from slearn.model_selection import train_test_split\n", 395 | "\n", 396 | "# Split the data into 80% training and 20% testing\n", 397 | "X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42)\n", 398 | "\n", 399 | "# Print the sizes of the arrays\n", 400 | "print(f\"X_train shape: {X_train.shape}\")\n", 401 | "print(f\"y_train shape: {y_train.shape}\")\n", 402 | "print(f\"X_test shape: {X_test.shape}\")\n", 403 | "print(f\"y_test shape: {y_test.shape}\")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "
\n", 411 | " Code hint \n", 412 | "The library is \"sklearn\"!\n", 413 | "
" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "### Cross-validation \n", 421 | "\n", 422 | "Using the 80% training set, we can train a model by making use of [cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html) in an attempt to avoid overfitting. Note that this step may take a minute to run as 10 models are being trained (i.e. 5-fold cross-validation x 2 models).\n", 423 | "\n", 424 | "
\n", 425 | " Recap of cross-validation \n", 426 | "Cross-validation partitions data into multiple subsets, training the model on some and validating it on others, ensuring robust evaluation.\n", 427 | "\n", 428 | "_Key types include:_\n", 429 | "\n", 430 | "- **k-Fold Cross-Validation**: Data is split into *k* folds; each fold is used as a validation set once while training on the remaining *k-1* folds.\n", 431 | "- **Leave-One-Out Cross-Validation (LOOCV)**: Each data point is used as a validation set once, with the rest for training.\n", 432 | "- **Stratified k-Fold**: Preserves class proportions in each fold, useful for imbalanced datasets.\n", 433 | "- **Time Series Cross-Validation**: Ensures training always precedes validation, preserving temporal structure.\n", 434 | "\n", 435 | "_Typical workflow:_\n", 436 | "\n", 437 | "1. **Split Data**: Divide the dataset into *k* folds.\n", 438 | "2. **Train and Validate**: Train the model on *k-1* folds, validate on the remaining fold.\n", 439 | "3. **Repeat**: Cycle through all folds, ensuring each serves as a validation set.\n", 440 | "4. **Aggregate Results**: Compute performance metrics across all iterations.\n", 441 | "5. **Train Final Model:** Fit the model using the full training dataset based on cross-validation insights.\n", 442 | "
" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "from sklearn.model_selection import cross_val_score\n", 452 | "from xgboost import XGBRegressor\n", 453 | "\n", 454 | "# Define models\n", 455 | "xgb_model = XGBRegressor(n_estimators=100, max_depth=3, random_state=42, objective='reg:squarederror') \n", 456 | "rf_model = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42)\n", 457 | "\n", 458 | "# Perform cross-validation for XGBoost\n", 459 | "xgb_cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n", 460 | "xgb_rmse = np.sqrt(-xgb_cv_scores) # Convert to RMSE\n", 461 | "\n", 462 | "# Perform cross-validation for Random Forest\n", 463 | "rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n", 464 | "rf_rmse = np.sqrt(-rf_cv_scores) # Convert to RMSE\n", 465 | "\n", 466 | "# Print results\n", 467 | "# Compare the results\n", 468 | "print(\"XGBoost Cross-Validation Results\")\n", 469 | "print(f\" Mean RMSE: {xgb_rmse.mean():.3f}\")\n", 470 | "print(f\" Standard Deviation of RMSE: {xgb_rmse.std():.3f}\")\n", 471 | "\n", 472 | "print(\"\\nRandom Forest Cross-Validation Results\")\n", 473 | "print(f\" Mean RMSE: {rf_rmse.mean():.3f}\")\n", 474 | "print(f\" Standard Deviation of RMSE: {rf_rmse.std():.3f}\")" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "
\n", 482 | " 🙋 Cross-validation output:
\n", 483 | " • Mean RMSE: Mean error across the cross-validation folds (smaller = better).
\n", 484 | " • Standard Deviation of RMSE: Variability in error across the folds (smaller = more consistent).
\n", 485 | "
" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "### Hyperparamater optimisation\n", 493 | "\n", 494 | "XGBoost is in the lead! So far, we have not adjusted the models themselves. It is possible to improve performance by tuning the hyperparameters. Manually tuning would be laborious. We can use `GridSearchCV` to automate the search. \n", 495 | "\n", 496 | "Note that this step will be even more computationally expensive as we are performing cross-validation as a function of model hyperparameters for two separate models. You can see how computational cost quickly escalates and this is where powerful GPUs can become essential for machine learning! \n", 497 | "\n", 498 | "
\n", 499 | " ⏱️ This will take 2-3 min to run. Think about how the model is learning from data.\n", 500 | "
" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "from sklearn.model_selection import GridSearchCV\n", 510 | "\n", 511 | "# Hyperparameter grid for XGBoost\n", 512 | "xgb_param_grid = {\n", 513 | " 'n_estimators': [100, 200],\n", 514 | " 'max_depth': [3, 6],\n", 515 | " 'learning_rate': [0.1, 0.2]\n", 516 | "}\n", 517 | "\n", 518 | "xgb_grid_search = GridSearchCV(XGBRegressor(random_state=42), xgb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n", 519 | "xgb_grid_search.fit(X_train, y_train)\n", 520 | "\n", 521 | "best_xgb_params = xgb_grid_search.best_params_\n", 522 | "best_xgb_model = xgb_grid_search.best_estimator_\n", 523 | "\n", 524 | "# Hyperparameter grid for Random Forest\n", 525 | "rf_param_grid = {\n", 526 | " 'n_estimators': [100, 200],\n", 527 | " 'max_depth': [3, 6],\n", 528 | " 'min_samples_split': [2, 4]\n", 529 | "}\n", 530 | "\n", 531 | "rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n", 532 | "rf_grid_search.fit(X_train, y_train)\n", 533 | "\n", 534 | "best_rf_params = rf_grid_search.best_params_\n", 535 | "best_rf_model = rf_grid_search.best_estimator_\n", 536 | "\n", 537 | "# Evaluate the best models\n", 538 | "xgb_cv_scores = -cross_val_score(best_xgb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n", 539 | "xgb_rmse = np.sqrt(xgb_cv_scores)\n", 540 | "\n", 541 | "rf_cv_scores = -cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n", 542 | "rf_rmse = np.sqrt(rf_cv_scores)\n", 543 | "\n", 544 | "# Compare the results of the best models\n", 545 | "print(\"Best XGBoost Hyperparameters:\", best_xgb_params)\n", 546 | "print(\"Best XGBoost Cross-Validation Results\")\n", 547 | "print(f\" Mean RMSE: {xgb_rmse.mean():.3f}\")\n", 548 | "print(f\" Standard Deviation of RMSE: {xgb_rmse.std():.3f}\")\n", 549 | "\n", 550 | "print(\"\\nBest Random Forest Hyperparameters:\", best_rf_params)\n", 551 | "print(\"Best Random Forest Cross-Validation Results\")\n", 552 | "print(f\" Mean RMSE: {rf_rmse.mean():.3f}\")\n", 553 | "print(f\" Standard Deviation of RMSE: {rf_rmse.std():.3f}\")" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "Was it worth the effort? There should be improvements in the RMSE for both models. Note the optimal hyperparameters found.\n", 561 | "\n", 562 | "### Model assessment\n", 563 | "\n", 564 | "Now that we have our best trained models, let's see how they perform on *unseen* test data. Comparing test performance to training performance will help us determine if the model generalises well or shows signs of overfitting or underfitting." 
565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "from sklearn.metrics import mean_squared_error, r2_score\n", 574 | "\n", 575 | "# Test the best XGBoost model\n", 576 | "xgb_test_preds = best_xgb_model.predict(X_test)\n", 577 | "xgb_test_rmse = np.sqrt(mean_squared_error(y_test, xgb_test_preds))\n", 578 | "xgb_test_r2 = r2_score(y_test, xgb_test_preds)\n", 579 | "\n", 580 | "# Test the best Random Forest model\n", 581 | "rf_test_preds = best_rf_model.predict(X_test)\n", 582 | "rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_test_preds))\n", 583 | "rf_test_r2 = r2_score(y_test, rf_test_preds)\n", 584 | "\n", 585 | "# Print test results\n", 586 | "print(\"XGBoost test results:\")\n", 587 | "print(f\"RMSE: {xgb_test_rmse:.3f}\")\n", 588 | "print(f\"R²: {xgb_test_r2:.3f}\")\n", 589 | "\n", 590 | "print(\"\\nRandom Forest test results:\")\n", 591 | "print(f\"RMSE: {rf_test_rmse:.3f}\")\n", 592 | "print(f\"R²: {rf_test_r2:.3f}\")\n", 593 | "\n", 594 | "# Create a scatter plot with both models in different colors\n", 595 | "plt.figure(figsize=(5, 4))\n", 596 | "plt.scatter(y_test, xgb_test_preds, c='blue', label=f'XGBoost (R²={xgb_test_r2:.2f})', alpha=0.5)\n", 597 | "plt.scatter(y_test, rf_test_preds, c='green', label=f'Random Forest (R²={rf_test_r2:.2f})', alpha=0.5)\n", 598 | "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'k--', lw=2) # Reference line (y=x)\n", 599 | "plt.xlabel(\"Actual values\")\n", 600 | "plt.ylabel(\"Predicted values\")\n", 601 | "plt.title(\"Test set performance\")\n", 602 | "plt.legend()\n", 603 | "plt.show()" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "XGBoost outperforms Random Forest in both cross-validation and test performance for this task, with the slight increase in RMSE from train to test suggesting both models generalise reasonably well." 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "### Model speed\n", 618 | "\n", 619 | "The speed of a model may also be important, e.g. a use case involving millions of predictions. Several factors can influence the computational performance, including the dataset size, model complexity, and hardware. We can perform a simple comparison of our two models using `time`." 
620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "import time\n", 629 | "\n", 630 | "# Measure the training time for XGBoost\n", 631 | "start_time = time.time()\n", 632 | "xgb_model.fit(X_train, y_train)\n", 633 | "xgb_training_time = time.time() - start_time\n", 634 | "\n", 635 | "# Measure the training time for Random Forest\n", 636 | "start_time = time.time()\n", 637 | "rf_model.fit(X_train, y_train)\n", 638 | "rf_training_time = time.time() - start_time\n", 639 | "\n", 640 | "# Measure the prediction time for XGBoost\n", 641 | "start_time = time.time()\n", 642 | "xgb_test_preds = xgb_model.predict(X_test)\n", 643 | "xgb_prediction_time = time.time() - start_time\n", 644 | "\n", 645 | "# Measure the prediction time for Random Forest\n", 646 | "start_time = time.time()\n", 647 | "rf_test_preds = rf_model.predict(X_test)\n", 648 | "rf_prediction_time = time.time() - start_time\n", 649 | "\n", 650 | "print(f\"XGBoost training time: {xgb_training_time:.4f} seconds\")\n", 651 | "print(f\"Random Forest training time: {rf_training_time:.4f} seconds\")\n", 652 | "print(f\"\\nXGBoost prediction time: {xgb_prediction_time:.4f} seconds\")\n", 653 | "print(f\"Random Forest prediction time: {rf_prediction_time:.4f} seconds\")" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "It is clear that the XGBoost library has been well optimised to run quickly." 661 | ] 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": { 666 | "tags": [] 667 | }, 668 | "source": [ 669 | "## 🚨 Exercise 7\n", 670 | "\n", 671 | "
\n", 672 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n", 673 | "
\n", 674 | "\n", 675 | "### Your details" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": {}, 682 | "outputs": [], 683 | "source": [ 684 | "import numpy as np\n", 685 | "\n", 686 | "# Insert your values\n", 687 | "Name = \"No Name\" # Replace with your name\n", 688 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n", 689 | "\n", 690 | "# Set a random seed using the CID value\n", 691 | "CID = int(CID)\n", 692 | "np.random.seed(CID)\n", 693 | "\n", 694 | "# Print the message\n", 695 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": { 701 | "tags": [] 702 | }, 703 | "source": [ 704 | "### Problem\n", 705 | "\n", 706 | "Selecting the most appropriate ML model for a given purpose is important for achieving predictive performance. Your job will be to assess additional models (e.g. [Nearest Neighbours](https://scikit-learn.org/stable/modules/neighbors.html#neighbors) and [Support Vector Machines](https://scikit-learn.org/stable/modules/svm.html#svm)) for the hardness regression task. The tasks will be given in class." 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "metadata": {}, 713 | "outputs": [], 714 | "source": [ 715 | "#Empty block for your answers\n", 716 | "\n", 717 | "\n" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "#Empty block for your answers\n", 727 | "\n", 728 | "\n" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "
\n", 736 | " Task hint \n", 737 | "You can perform cross-validation following the same procedure as the random forest model in the main notebook.\n", 738 | "
\n", 739 | "\n", 740 | "
\n", 741 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n", 742 | "
" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "## 🌊 Dive deeper\n", 750 | "\n", 751 | "* _Level 1:_ Tackle Chapter 14 on Tree-Based Learners in [Machine Learning Refined](https://github.com/jermwatt/machine_learning_refined#what-is-new-in-the-second-edition). \n", 752 | "\n", 753 | "* _Level 2:_ Explore the XGBoost [tutorials](https://xgboost.readthedocs.io/en/stable/tutorials/model.html), e.g. predicting multiple properties with multi-output regression. \n", 754 | "\n", 755 | "* _Level 3:_ Find the best model (subject to time constraints) with [Automatminer](https://hackingmaterials.lbl.gov/automatminer) based on [TPOT](https://epistasislab.github.io/tpot)." 756 | ] 757 | } 758 | ], 759 | "metadata": { 760 | "kernelspec": { 761 | "display_name": "vscode24", 762 | "language": "python", 763 | "name": "python3" 764 | }, 765 | "language_info": { 766 | "codemirror_mode": { 767 | "name": "ipython", 768 | "version": 3 769 | }, 770 | "file_extension": ".py", 771 | "mimetype": "text/x-python", 772 | "name": "python", 773 | "nbconvert_exporter": "python", 774 | "pygments_lexer": "ipython3", 775 | "version": "3.12.4" 776 | } 777 | }, 778 | "nbformat": 4, 779 | "nbformat_minor": 4 780 | } 781 | -------------------------------------------------------------------------------- /Lecture8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Accelerated Discovery" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "
\n", 15 | " 💡 Geoffrey Hinton: It’s quite conceivable that humanity is just a passing phase in the evolution of intelligence.\n", 16 | "
" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "\n", 24 | "\n", 25 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture8-ai)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## 🤖 x 🧪 Closed-loop optimisation \n", 33 | "\n", 34 | "The combination of automation and optimisation is powerful. Closed-loop workflows are of growing importance in materials research for many reasons, including:\n", 35 | "\n", 36 | "1. **Efficiency:** Efficient allocation of resources, both in terms of time and materials. By continuously updating experimental parameters based on real-time feedback, we can reduce the number of trials needed to reach optimal outcomes. \n", 37 | "\n", 38 | "2. **Adapt to changing conditions:** Adaptive decision-making, ensuring that experiments remain effective even when external factors fluctuate. This adaptability is highly valuable for complex systems where traditional trial-and-error approaches are prone to fail.\n", 39 | "\n", 40 | "3. **Exploration of large parameter spaces:** Many materials science problems involve high-dimensional parameter spaces where exhaustive exploration is impractical. Techniques such as Bayesian optimisation can efficiently sample and search these spaces to identify optimal configurations and make discoveries.\n", 41 | "\n", 42 | "4. **Data-driven insights:** Generation of valuable data from ongoing experiments. This data can be analysed to gain a deeper understanding of the underlying processes and relationships, facilitating scientific discoveries and supporting future efforts.\n", 43 | "\n", 44 | "Today we will make use of the [scikit-optimise](https://scikit-optimize.github.io) package." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Installation of libraries\n", 54 | "!pip install scikit-optimize --quiet" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Import of modules\n", 64 | "import numpy as np \n", 65 | "import matplotlib.pyplot as plt \n", 66 | "from scipy.stats import norm # Statistical functions\n", 67 | "from skopt import gp_minimize, dummy_minimize # Bayesian optimisation\n", 68 | "from skopt.utils import create_result # Utility functions for skopt\n", 69 | "from sklearn.metrics import r2_score # R-squared metric" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "source": [ 78 | "## Bayesian optimisation (BO)\n", 79 | "\n", 80 | "BO is a powerful technique for optimising complex and expensive-to-evaluate functions. It combines probabilistic modeling and decision theory to search for the optimal set of parameters. In materials research, parameters like chemical composition, sample thickness, and processing conditions can be optimised.\n", 81 | "\n", 82 | "BO aims to find the global minimum (or maximum) of an objective function, $O(x)$, where $x$ represents a set of parameters or design variables. Instead of exhaustive searches, BP builds a surrogate model, typically a Gaussian Process (GP), that approximates the true objective function. This surrogate model captures both the mean $\\mu(x)$ and uncertainty $\\sigma(x)$ associated with $O(x)$. 
The GP is defined as:\n", 83 | "\n", 84 | "$$\n", 85 | "O(x) \\sim \\text{GP}(\\mu(x), k(x, x'))\n", 86 | "$$\n", 87 | "\n", 88 | "where $k(x, x')$ is a kernel function that quantifies the similarity between two input points $x$ and $x'$.\n", 89 | "\n", 90 | "The surrogate model balances exploration and exploitation using an acquisition function $\\alpha(x)$, which trades off between exploring uncertain regions and exploiting promising areas:\n", 91 | "\n", 92 | "$$\n", 93 | "x_{\\text{next}} = \\arg \\max_x \\alpha(x)\n", 94 | "$$\n", 95 | "\n", 96 | "Common acquisition functions include Probability of Improvement (PI), Expected Improvement (EI), and Upper Confidence Bound (UCB). Each of these functions aims to maximise the expected gain in performance over the current best solution.\n", 97 | "\n", 98 | "
\n", 99 | "Curious about the kernel function?\n", 100 | "\n", 101 | "The kernel determines the covariance structure of the GP. A commonly used kernel, and the default in `sklearn`, is the Radial Basis Function (RBF):\n", 102 | "\n", 103 | "$$\n", 104 | "k(x, x') = \\sigma^2 \\exp\\left(-\\frac{\\|x - x'\\|^2}{2l^2}\\right)\n", 105 | "$$\n", 106 | "\n", 107 | "where:\n", 108 | "- $\\sigma^2$ is the **signal variance**, which controls the overall magnitude of function variations,\n", 109 | "- $l$ is the **length scale**, which determines how quickly the function values change with respect to input differences.\n", 110 | "\n", 111 | "There are also many other choices, such as the [Matérn kernel](https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html), which differ in how they model smoothness and continuity.\n", 112 | "
" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Building a BO model\n", 120 | "\n", 121 | "### Step 1. Target function\n", 122 | "\n", 123 | "We can start by generating a simple sine-like target function with added noise to keep things interesting. This acts as our \"virtual experiment\", i.e. we can call the function to obtain an output." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# Fixing the random seed for reproducibility\n", 133 | "np.random.seed(42)\n", 134 | "\n", 135 | "# Define the target function\n", 136 | "def target_function(x):\n", 137 | " x = np.atleast_1d(x) # Ensure x is an array\n", 138 | " return np.sin(x[0]) + 0.1 * x[0] + 0.5 * np.random.randn()\n", 139 | "\n", 140 | "# Generate data for visualisation\n", 141 | "x_values = np.linspace(-5, 5, 200).reshape(-1, 1)\n", 142 | "y_values = np.vectorize(target_function)(x_values)\n", 143 | "\n", 144 | "# Plot the target function\n", 145 | "plt.figure(figsize=(5, 4))\n", 146 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n", 147 | "plt.xlabel('Input')\n", 148 | "plt.ylabel('Output')\n", 149 | "plt.legend()\n", 150 | "plt.show()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Let's randomly sample the target function and fit a simple polynomial function to get a feeling for how the model works." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Generate sample points from the target function\n", 167 | "num_initial_points = \n", 168 | "initial_points = np.random.uniform(-5, 5, num_initial_points)\n", 169 | "initial_values = np.vectorize(target_function)(initial_points)\n", 170 | "\n", 171 | "# Plot the sample points\n", 172 | "plt.figure(figsize=(5, 4))\n", 173 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n", 174 | "plt.scatter(initial_points, initial_values, color='blue', marker='o', label='Initial Samples')\n", 175 | "plt.xlabel('Input')\n", 176 | "plt.ylabel('Output')\n", 177 | "plt.legend()\n", 178 | "plt.show()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "
\n", 186 | " Code hint \n", 187 | "Try `num_initial_points = 10`\n", 188 | "
" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Perform a polynomial fit\n", 198 | "degree = # Adjust the degree of the polynomial fit\n", 199 | "coefficients = np.polyfit(initial_points, initial_values, degree)\n", 200 | "poly_fit = np.poly1d(coefficients)\n", 201 | "\n", 202 | "# Calculate R^2\n", 203 | "y_pred = poly_fit(initial_points)\n", 204 | "r_squared = r2_score(initial_values, y_pred)\n", 205 | "\n", 206 | "# Plot the sample points and polynomial fit\n", 207 | "plt.figure(figsize=(5, 4))\n", 208 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n", 209 | "plt.scatter(initial_points, initial_values, color='blue', marker='o', label='Initial Samples')\n", 210 | "plt.plot(x_values, poly_fit(x_values), 'g--', label=f'Polynomial Fit (degree {degree})\\n$R^2 = {r_squared:.4f}$')\n", 211 | "plt.xlabel('Input')\n", 212 | "plt.ylabel('Output')\n", 213 | "plt.legend()\n", 214 | "plt.show()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "
\n", 222 | " 🐢 Take a beat: Adjust the degree of the polynomial to see how good the fit is. Start with `degree = 2` and gradually increase it.\n", 223 | "
" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Step 3: Gaussian Process\n", 231 | "\n", 232 | "Now we can move to Bayesian Optimisation with a Gaussian Process model. The optimisation progress is visualised by plotting the target function, optimisation steps, and a colourbar indicating the step number.\n", 233 | "\n", 234 | "
\n", 235 | " ⏱️ This may take a minute to run. Reverend Bayes makes computers work hard!\n", 236 | "
" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Optimise the target function using Bayesian Optimisation\n", 246 | "result = gp_minimize(target_function, [(-5.0, 5.0)], n_calls=50, random_state=42)\n", 247 | "\n", 248 | "# Perform random sampling for comparison\n", 249 | "random_result = dummy_minimize(target_function, [(-5.0, 5.0)], n_calls=50, random_state=42)\n", 250 | "\n", 251 | "# Plot the Gaussian Process model after optimisation\n", 252 | "x_gp = np.array(result.x_iters).reshape(-1, 1)\n", 253 | "y_gp = result.func_vals\n", 254 | "\n", 255 | "# Plot the target function\n", 256 | "plt.figure(figsize=(5, 4))\n", 257 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target function')\n", 258 | "\n", 259 | "# Plot the optimisation steps with a colormap\n", 260 | "plt.scatter(x_gp, y_gp, c=range(len(x_gp)), cmap='viridis', marker='o', label='Step number')\n", 261 | "\n", 262 | "# Add colorbar to indicate the progress\n", 263 | "cbar = plt.colorbar()\n", 264 | "cbar.set_label('Step number')\n", 265 | "\n", 266 | "plt.title('BO: Gaussian Process Model')\n", 267 | "plt.xlabel('Input')\n", 268 | "plt.ylabel('Output')\n", 269 | "plt.legend()\n", 270 | "plt.show()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "We can use `plot_gaussian_process` from scikit-optimize to visualise the confidence intervals. `n_samples` determines the number of samples to draw from the Gaussian Process for the estimation." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from skopt.plots import plot_gaussian_process as plot_gp\n", 287 | "\n", 288 | "# Plot the Gaussian Process model with confidence intervals\n", 289 | "plt.figure(figsize=(5, 4))\n", 290 | "plot_gp(result)\n", 291 | "\n", 292 | "# Add the target function for reference\n", 293 | "plt.plot(x_values, y_values, 'r-', alpha=0.25, label='Target function')\n", 294 | "\n", 295 | "plt.title('Confidence Intervals')\n", 296 | "plt.xlabel('Input')\n", 297 | "plt.ylabel('Output')\n", 298 | "plt.legend()\n", 299 | "plt.show()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "The plot shows the GP mean (dashed green), confidence intervals (shaded), and sampled observations (red). The target function (light red) is also overlaid. The confidence region narrows where more observations exist and widens in unexplored areas, reflecting uncertainty in the GP model.\n", 307 | "\n", 308 | "We should always have a benchmark to compare our model to. This block extracts the best results from BO and random sampling, then compares and visualises their performance over optimisation steps." 
309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "# Extract the cumulative minimum values\n", 318 | "bo_min_values = np.minimum.accumulate(result.func_vals)\n", 319 | "random_min_values = np.minimum.accumulate(random_result.func_vals)\n", 320 | "\n", 321 | "# Plot the cumulative minimum values vs steps for both methods\n", 322 | "plt.figure(figsize=(5, 4))\n", 323 | "plt.plot(range(1, len(bo_min_values) + 1), bo_min_values, 'o-', label='Bayesian Optimisation')\n", 324 | "plt.plot(range(1, len(random_min_values) + 1), random_min_values, 'x-', label='Random Sampling')\n", 325 | "\n", 326 | "plt.title('Does BO Beat Random Sampling?')\n", 327 | "plt.xlabel('Step')\n", 328 | "plt.ylabel('Cumulative Minimum Value')\n", 329 | "plt.legend()\n", 330 | "plt.show()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "BO (blue) converges faster to a lower minimum value. Random sampling (orange) improves only in occasional jumps and stalls beyond a certain point. This highlights BO’s advantage in structured search over purely random exploration." 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": { 343 | "tags": [] 344 | }, 345 | "source": [ 346 | "## 🚨 Exercise 8\n", 347 | "\n", 348 | "
\n", 349 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n", 350 | "
\n", 351 | "\n", 352 | "### Your details" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "import numpy as np\n", 362 | "\n", 363 | "# Insert your values\n", 364 | "Name = \"No Name\" # Replace with your name\n", 365 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n", 366 | "\n", 367 | "# Set a random seed using the CID value\n", 368 | "CID = int(CID)\n", 369 | "np.random.seed(CID)\n", 370 | "\n", 371 | "# Print the message\n", 372 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "### Problem\n", 380 | "\n", 381 | "The Department of Materials has purchased a new automated thin-film deposition system. The machine has two dials that provide a 2D parameter space (x, y) for materials processing. We can define a (hypothetical) target loss function for optimising the transition temperature of our candidate thin-film superconductors as:\n", 382 | "\n", 383 | "```python\n", 384 | "# Target function for materials processing with x and y \"dials\"\n", 385 | "def supermat(inputs):\n", 386 | " x, y = inputs\n", 387 | " a = 2, b = 5.1 / (2 * np.pi**2)\n", 388 | " c = 3 / np.pi\n", 389 | " r = 4, s = 10, t = 1 / (8 * np.pi)\n", 390 | "\n", 391 | " term1 = a * (y - b * x**2 + c * x - r)**2\n", 392 | " term2 = s * (1 - t) * np.cos(x)\n", 393 | " term3 = s\n", 394 | "\n", 395 | " return term1 + term2 + term3\n", 396 | "\n", 397 | "# Example usage:\n", 398 | "dials = [2.0, 3.0]\n", 399 | "result = supermat(dials)\n", 400 | "print(f\"Experiment by setting dials to ({dials[0]}, {dials[1]}): {result}\")\n", 401 | "```\n", 402 | "\n", 403 | "The tasks will be provided in class." 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "#Empty block for your answers\n", 413 | "\n", 414 | "\n" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "#Empty block for your answers\n", 424 | "\n", 425 | "\n" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "
\n", 433 | " Task hint \n", 434 | "Remember to first define the target function and then call it using gp_minimize()\n", 435 | "
\n", 436 | "\n", 437 | "
\n", 438 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n", 439 | "
" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## 🌊 Dive deeper\n", 447 | "\n", 448 | "* _Level 1:_ Visually explore [Gaussian Processes](https://distill.pub/2019/visual-exploration-gaussian-processes/)\n", 449 | "\n", 450 | "* _Level 2:_ Read a perspective on [Bayesian optimisation for chemical problems](https://chemrxiv.org/engage/chemrxiv/article-details/656dfe74cf8b3c3cd7c611a5) by your teaching assistant Yifan, which includes links to tool and packages under development\n", 451 | "\n", 452 | "* _Level 3:_ Interact with the self-driving laboratory demo by [Sterling Baird](https://github.com/sparks-baird/self-driving-lab-demo)" 453 | ] 454 | } 455 | ], 456 | "metadata": { 457 | "kernelspec": { 458 | "display_name": "vscode24", 459 | "language": "python", 460 | "name": "python3" 461 | }, 462 | "language_info": { 463 | "codemirror_mode": { 464 | "name": "ipython", 465 | "version": 3 466 | }, 467 | "file_extension": ".py", 468 | "mimetype": "text/x-python", 469 | "name": "python", 470 | "nbconvert_exporter": "python", 471 | "pygments_lexer": "ipython3", 472 | "version": "3.12.4" 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 4 477 | } 478 | -------------------------------------------------------------------------------- /Lecture9.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Research Challenge" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "
\n", 15 | " 💡 Margaret Atwood: Every aspect of human technology has a dark side, including the bow and arrow.\n", 16 | "
" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## 🏅 Build your own model \n", 24 | "\n", 25 | "We have just completed a session on generative AI (see [Lecture slides](https://speakerdeck.com/aronwalsh/machine-learning-for-materials-lecture-9)), but it is time to go back to supervised machine learning problems.\n", 26 | "\n", 27 | "You have been assigned one dataset from [MatBench](https://matbench.materialsproject.org) as introduced in the [Challenge slides](https://speakerdeck.com/aronwalsh/mlformaterials-challenge-25). You are free to choose and tune any machine-learning model, with any Python library, but it should be appropriate for the problem. For instance, [XGBoost](https://xgboost.readthedocs.io) could be a good starting starting point to build a regression model. You can refer back to earlier notebooks and repurpose code as needed. \n", 28 | "\n", 29 | "You may reach the limits of computing processing power on Google Colab. Building a useful model with limited resources is a real-world skill. Using other free resources is allowed if you find an alternative service, as is running on your own computer. A model tracker such as [wandb](https://wandb.ai) could be helpful for advanced users. If you want to try a brute force approach, a library such as [Automatminer](https://hackingmaterials.lbl.gov/automatminer) may be of interest.\n", 30 | "\n", 31 | "This notebook should be used for keeping a record of your model development, submission, and even your presentation. You are free to edit (add/remove/delete) or rearrange the cells as you see fit.\n", 32 | "\n", 33 | "### Your details" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import numpy as np\n", 43 | "\n", 44 | "# Insert your values\n", 45 | "Name = \"No Name\" # Replace with your name\n", 46 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n", 47 | "\n", 48 | "# Set a random seed using the CID value\n", 49 | "CID = int(CID)\n", 50 | "np.random.seed(CID)\n", 51 | "\n", 52 | "# Print the message\n", 53 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Problem statement\n", 61 | "\n", 62 | "You have been assigned one dataset from the [list](https://matbench.materialsproject.org/Benchmark%20Info/matbench_v0.1/) on [MatBench](https://matbench.materialsproject.org). You should state what problem you are trying to solve and comment on the best-performing model in the benchmark. " 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# Spare cell\n", 72 | "\n", 73 | "\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "tags": [] 80 | }, 81 | "source": [ 82 | "## Data preparation\n", 83 | "\n", 84 | "Check the data distribution and apply appropriate pre-processing steps as required. 
" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Installation of libraries\n", 94 | "!pip install matminer # Datasets and featurisation " 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# Get dataset info from matminer\n", 104 | "from matminer.datasets import get_all_dataset_info\n", 105 | "from matminer.datasets import load_dataset\n", 106 | "\n", 107 | "# Uncomment the info line for your assigned challenge\n", 108 | "\n", 109 | " # A (GTAs - Xia, Kinga)\n", 110 | "#info = get_all_dataset_info(\"matbench_dielectric\")\n", 111 | "\n", 112 | " # B (GTAs - Irea, Pan)\n", 113 | "#info = get_all_dataset_info(\"matbench_expt_gap\")\n", 114 | "\n", 115 | " # C (GTAs - Yifan, Fintan)\n", 116 | "#info = get_all_dataset_info(\"matbench_glass\")\n", 117 | "\n", 118 | "# Check the dataset information\n", 119 | "print(info)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Load your dataset into a pandas DataFrame\n", 129 | "df = load_dataset(\" \")\n", 130 | "\n", 131 | "print(df)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "Choose relevant features, which may be based on composition or structure, depending on your problem. [matminer](https://hackingmaterials.lbl.gov/matminer/) is a good place to start." 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Model selection, testing and training \n", 146 | "\n", 147 | "Define your model and justify your choice based on the problem and available data. You can look back at earlier notebooks and investigate other examples online including in [scikit-learn](https://scikit-learn.org)." 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 4, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Spare cell\n", 157 | "\n", 158 | "\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Train, validate and test your model. Make sure to do proper data splits and to consider the hyperparamaters of your model.\n", 166 | "\n", 167 | "
\n", 168 | "Note on the ROC-AUC classification metric\n", 169 | "There is one metric we didn't cover but is used in Matbench. In binary classification models, the ROC-AUC (Receiver Operating Characteristic - Area Under the Curve) score can be used to evaluate performance. It quantifies the ability of the model to distinguish between positive and negative instances across different decision thresholds. A higher ROC-AUC score (ranging from 0.5 to 1) indicates better performance, with 1 representing a perfect classifier and 0.5 indicating performance no better than random chance. There is a more detailed discussion here: https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc.\n", 170 | "\n", 171 | "The metric can be calculated using the `roc_auc_score` function from the `sklearn.metrics` module, e.g.\n", 172 | "\n", 173 | "```python\n", 174 | "from sklearn.metrics import roc_auc_score\n", 175 | "\n", 176 | "# Assuming you have true labels (y_true) and predicted probabilities (y_pred_prob) \n", 177 | "y_true = [...] \n", 178 | "y_pred_prob = [...] \n", 179 | "\n", 180 | "# Calculate ROC-AUC\n", 181 | "roc_auc = roc_auc_score(y_true, y_pred_prob)\n", 182 | "\n", 183 | "# Display the result\n", 184 | "print(f'ROC-AUC Score: {roc_auc:.4f}')\n", 185 | "```\n", 186 | "
" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 5, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "# Spare cell\n", 196 | "\n", 197 | "\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Model analysis and discussion\n", 205 | "\n", 206 | "How well does your final model perform? Think of metrics and plots that are useful to dig a little deeper. \n", 207 | "\n", 208 | "Compare against the best-performing model on the [MatBench](https://matbench.materialsproject.org) leaderboard. With limited resources, don't expect to match this performance, but you should do better than a baseline model. " 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# Spare cell\n", 218 | "\n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Large Language Model (LLM) usage declaration\n", 227 | "\n", 228 | "Acknowledge use of a generative model during your assignment. Points to consider:\n", 229 | "\n", 230 | "* State which LLM (e.g. GPT-4, Gemini, Co-Pilot)\n", 231 | "\n", 232 | "* Specify tasks (e.g. summarising research or code snippets)\n", 233 | "\n", 234 | "* Were any limitations/biases noted?\n", 235 | "\n", 236 | "* How did you ensure ethical use?" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 6, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Spare cell\n", 246 | "\n", 247 | "\n" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## ☘️ Final word\n", 255 | "\n", 256 | "Good luck building your own model! We hope that you enjoyed the course and exercises. Dive deeper into the aspects that caught your interest. A useful starting point may be the [Resources](https://aronwalsh.github.io/MLforMaterials/Resources.html) page. \n", 257 | "\n", 258 | "Remember that submission is on Blackboard and you should upload both the completed Juypter Notebook (`.ipynb` file), as well as your recorded narrated presentation (maximum 5 minutes; see guides on using [Zoom](https://www.youtube.com/watch?v=H9qhoAIzW3E) or [Powerpoint](https://www.youtube.com/watch?v=Y5dgwwa5XRA) for this purpose)." 259 | ] 260 | } 261 | ], 262 | "metadata": { 263 | "kernelspec": { 264 | "display_name": "Python 3 (ipykernel)", 265 | "language": "python", 266 | "name": "python3" 267 | }, 268 | "language_info": { 269 | "codemirror_mode": { 270 | "name": "ipython", 271 | "version": 3 272 | }, 273 | "file_extension": ".py", 274 | "mimetype": "text/x-python", 275 | "name": "python", 276 | "nbconvert_exporter": "python", 277 | "pygments_lexer": "ipython3", 278 | "version": "3.11.7" 279 | } 280 | }, 281 | "nbformat": 4, 282 | "nbformat_minor": 4 283 | } 284 | -------------------------------------------------------------------------------- /Overview.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | _Machine Learning for Materials_ (MATE70026) provides an introduction to statistical research tools for materials theory and simulation. It is a module designed for senior undergraduate and junior postgraduate students in the Department of Materials at Imperial College London. 4 | 5 | You will consider how composition-structure-property information in materials science can be represented in a form suitable for machine learning. 
You will then build, train, and evaluate your own models using public tools and open datasets. 6 | 7 | A hybrid teaching style will be followed with a mixture of lectures and assignments. The course assumes a basic working knowledge of the Python 3 programming language. MSc students are required to complete [Introduction to Python](https://www.imperial.ac.uk/students/academic-support/graduate-school/professional-development/doctoral-students/research-computing-data-science/courses/python-for-researchers) before taking this course. 8 | 9 | If you have corrections or suggestions, please raise an [issue](https://github.com/aronwalsh/MLforMaterials/issues) on GitHub. 10 | 11 | ```{tableofcontents} 12 | ``` 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Made with Jupyter](https://img.shields.io/badge/Made%20with-Jupyter-orange?style=for-the-badge&logo=Jupyter)](https://jupyter.org/try) 2 | 3 | [![deploy-book](https://github.com/aronwalsh/MLforMaterials/actions/workflows/deploy.yml/badge.svg)](https://github.com/aronwalsh/MLforMaterials/actions/workflows/deploy.yml) 4 | [![made-with-Markdown](https://img.shields.io/badge/Made%20with-Markdown-1f425f.svg)](http://commonmark.org) 5 | [![CC-BY license](https://img.shields.io/badge/License-CC--BY-blue.svg)](https://creativecommons.org/licenses/by/4.0) 6 | 7 | # Machine Learning for Materials 8 | 9 | Online resource of a practical machine learning course in the Department of Materials at Imperial College London. 10 | 11 | You have the option to browse the files or download the complete folder using the green `clone or download` button on the top right of the screen ([zip file](https://github.com/aronwalsh/MLforMaterials/archive/master.zip)). 12 | 13 | ## Course Description 14 | 15 | _Machine Learning for Materials_ (MATE70026) provides an introduction to statistical research tools for materials theory and simulation. It is aimed at senior undergraduate or junior postgraduate students. 16 | 17 | You will consider how composition-structure-property information in materials science can be represented in a form suitable for machine learning. You will then build, train, and evaluate your own models using public tools and open datasets. 18 | 19 | A hybrid teaching style will be followed with a mixture of lectures and assignments. The course assumes a basic working knowledge of the Python 3 programming language. 20 | 21 | [Lecture Slides](./slides) 22 | 23 | [Post a Query](https://github.com/aronwalsh/MLforMaterials/issues) 24 | 25 | ## Course Website 26 | 27 | You can view the site at [https://aronwalsh.github.io/MLforMaterials](https://aronwalsh.github.io/MLforMaterials) 28 | 29 | To build a local copy, first install [Jupyter Book](https://jupyterbook.org): 30 | 31 | `pip install -U jupyter-book` 32 | 33 | then enter the repository and run 34 | 35 | `jupyter-book build .` 36 | 37 | ## Acknowledgements 38 | 39 | This module was developed by Aron Walsh with the assistance of [Anthony Onwuli](https://github.com/AntObi) and [Zhenzhu Li](https://github.com/lizhenzhupearl). -------------------------------------------------------------------------------- /Resources.md: -------------------------------------------------------------------------------- 1 | # Resources 2 | 3 | There is a vibrant community of machine learning developers and open-source packages for scientific research. 
Many of the links below have provided inspiration or borrowed content for this module. 4 | 5 | ![](./images/ml-python.png) 6 | (Image by [John Kitchin](https://kitchingroup.cheme.cmu.edu)) 7 | 8 | ## General Python 9 | 10 | * [Python for Physicists](https://lucydot.github.io/python_novice) 11 | 12 | * [Python for Chemists](https://pythoninchemistry.org) 13 | 14 | * [Phind AI helper](https://www.phind.com) 15 | 16 | ## Tools and Benchmarks 17 | 18 | * Classical ML: [scikit-learn](https://scikit-learn.org); [scikit-opt](https://scikit-optimize.github.io) 19 | 20 | * Deep Learning: [pytorch](https://pytorch.org); [tensorflow](https://www.tensorflow.org); [jax](https://github.com/google/jax); [keras](https://keras.io) 21 | 22 | * Materials benchmarks: [Matbench](https://matbench.materialsproject.org); [Matbench-Discovery](https://matbench-discovery.materialsproject.org); [JARVIS-Leaderboard](https://pages.nist.gov/jarvis_leaderboard) 23 | 24 | * Materials focused tools: [matminer](https://hackingmaterials.lbl.gov/matminer); [automatminer](https://hackingmaterials.lbl.gov/automatminer); [elementembeddings](https://github.com/WMD-group/ElementEmbeddings); [matgl](https://github.com/materialsvirtuallab/matgl); [dscribe](https://singroup.github.io/dscribe) 25 | 26 | * Molecular focused tools: [deepchem](https://deepchem.io); [stk](https://github.com/JelfsMaterialsGroup/stk); [chemiscope](https://chemiscope.org) 27 | 28 | * Model tracking: [mlflow](https://mlflow.org); [wandb](https://wandb.ai) 29 | 30 | * Other lists: [Awesome Materials Informatics](https://github.com/tilde-lab/awesome-materials-informatics); [Awesome Self Driving Labs](https://github.com/AccelerationConsortium/awesome-self-driving-labs); [Awesome Generative AI](https://github.com/aishwaryanr/awesome-generative-ai-guide); [Atomistic ML](https://github.com/JuDFTteam/best-of-atomistic-machine-learning) 31 | 32 | ## Books 33 | 34 | * [Applications of Artificial Intelligence in Chemistry](https://global.oup.com/academic/product/applications-of-artificial-intelligence-in-chemistry-9780198557364) 35 | 36 | * [Bayesian Computation](https://bayesiancomputationbook.com) 37 | 38 | * [Data Analysis: A Bayesian Tutorial](https://global.oup.com/academic/product/data-analysis-9780198568322) 39 | 40 | * [Deep Learning for Molecules and Materials](https://dmol.pub) 41 | 42 | * [Dive into Deep Learning](https://d2l.ai/index.html) 43 | 44 | * [Geometric Deep Learning](https://geometricdeeplearning.com/book) 45 | 46 | * [Interpretable Machine Learning](https://christophm.github.io/interpretable-ml-book) 47 | 48 | * [Machine Learning in Materials Science](https://pubs.acs.org/doi/10.1021/acsinfocus.7e5033) 49 | 50 | * [Reinforcement Learning and Optimal Control](https://web.mit.edu/dimitrib/www/RLbook.html) 51 | 52 | * [Understanding Deep Learning](https://udlbook.github.io/udlbook) 53 | 54 | ## Papers 55 | 56 | * [Best Practices in Machine Learning for Chemistry](https://www.nature.com/articles/s41557-021-00716-z) 57 | 58 | * [Machine Learning for Molecular and Materials Science](https://www.nature.com/articles/s41586-018-0337-2) 59 | 60 | * [Machine Learning for Materials Scientists: An Introductory Guide toward Best Practices](https://pubs.acs.org/doi/10.1021/acs.chemmater.0c01907) 61 | 62 | ## Videos 63 | 64 | * [Materials Cloud (Webinars)](https://www.youtube.com/@MaterialsCloud) 65 | 66 | * [Materials Project (Webinars)](https://www.youtube.com/@MaterialsProject) 67 | 68 | * [Materials Informatics (Taylor 
Sparks)](https://www.youtube.com/@TaylorSparks/videos) 69 | 70 | * [Neural networks (3Blue1Brown)](https://www.youtube.com/watch?v=aircAruvnKk) 71 | 72 | * [Python for Science (John Kitchin)](https://www.youtube.com/@JohnKitchin/videos) 73 | 74 | * [Statquest with Josh Starmer](https://www.youtube.com/@statquest/videos) 75 | 76 | ## Other Courses 77 | 78 | * [Automated Experiments](https://github.com/SergeiVKalinin/UTK-Spring-2023---Automated-Experiment) 79 | 80 | * [AI For Everyone](https://www.coursera.org/learn/ai-for-everyone) 81 | 82 | * [Data Driven Chemistry](https://github.com/Edinburgh-Chemistry-Teaching/Data-driven-chemistry) 83 | 84 | * [Deep Neural Networks](https://www.youtube.com/playlist?list=PL_iWQOsE6TfVmKkQHucjPAoRtIJYt8a5A) 85 | 86 | * [Gentle Introduction to Graph Neural Networks](https://distill.pub/2021/gnn-intro/) 87 | 88 | * [Homemade Machine Learning](https://github.com/trekhleb/homemade-machine-learning) 89 | 90 | * [Introduction to Machine Learning in Chemistry](https://github.com/ML4chemArg/Intro-to-Machine-Learning-in-Chemistry) 91 | 92 | * [Materials Informatics](https://www.youtube.com/watch?v=DDliZDwiAoU&list=PLL0SWcFqypCl4lrzk1dMWwTUrzQZFt7y0) 93 | 94 | * [Mathematics for Machine Learning Specialisation](https://www.coursera.org/specializations/mathematics-machine-learning) 95 | 96 | * [Mathematics in Materials](https://github.com/SergeiVKalinin/MSE_Spring2024) 97 | 98 | * [Machine Learning for Everyone](https://vas3k.com/blog/machine_learning/) 99 | 100 | * [Machine Learning for Materials](https://github.com/SergeiVKalinin/MSE_Fall2023) 101 | 102 | * [Machine Learning Refined](https://github.com/jermwatt/machine_learning_refined) 103 | 104 | * [Materials Informatics](https://github.com/sp8rks/MaterialsInformatics) 105 | 106 | * [Microsoft ML for Beginners](https://github.com/microsoft/ML-For-Beginners) 107 | 108 | * [ML for Chemistry](https://github.com/Edinburgh-Chemistry-Teaching/ML-for-Chemistry) 109 | 110 | * [ML in Materials](https://github.com/SergeiVKalinin/MSE_Fall2023) 111 | 112 | * [ML Training Resources](https://github.com/keeeto/ml-training-resources) 113 | 114 | * [Numerics of ML](https://github.com/philipphennig/NumericsOfML) 115 | 116 | * [Practical Deep Learning](https://course.fast.ai) 117 | 118 | * [Stanford ML Cheat Sheets](https://stanford.edu/~shervine/teaching) 119 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | # Learn more at https://jupyterbook.org/customize/config.html 3 | 4 | title: Machine Learning for Materials 5 | author: Aron Walsh 6 | logo: logo.png 7 | copyright: "2025" 8 | 9 | # Notebook execution is turned off for each build. 
10 | # See https://jupyterbook.org/content/execute.html 11 | execute: 12 | execute_notebooks: off 13 | 14 | # Define the name of the latex output file for PDF builds 15 | latex: 16 | latex_documents: 17 | targetname: ml4materials.tex 18 | 19 | # Add a bibtex file so that we can create citations 20 | bibtex_bibfiles: 21 | - ref.bib 22 | 23 | launch_buttons: 24 | colab_url: "https://colab.research.google.com" 25 | binderhub_url: "https://mybinder.org" 26 | thebe: true 27 | 28 | # Information about where the book exists on the web 29 | repository: 30 | url: https://github.com/aronwalsh/MLforMaterials # Online location of your book 31 | branch: 2025 # Which branch of the repository should be used when creating links (optional) 32 | 33 | # Add GitHub buttons to your book 34 | # See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository 35 | html: 36 | use_issues_button: true 37 | use_repository_button: true 38 | google_analytics_id: G-74ZFK336GP 39 | 40 | parse: 41 | myst_enable_extensions: 42 | # don't forget to list any other extensions you want enabled, 43 | # including those that are enabled by default! 44 | - dollarmath 45 | - amsmath 46 | - linkify 47 | - html_admonition 48 | - html_image -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | # Learn more at https://jupyterbook.org/customize/toc.html 3 | 4 | format: jb-book 5 | root: Overview 6 | parts: 7 | - caption: Course Details 8 | numbered: false 9 | chapters: 10 | - file: Contents 11 | - file: Learning 12 | - file: Resources 13 | - caption: Notebooks 14 | numbered: true 15 | chapters: 16 | - file: Lecture1 17 | - file: Lecture2 18 | - file: Lecture3 19 | - file: Lecture4 20 | - file: Lecture5 21 | - file: Lecture6 22 | - file: Lecture7 23 | - file: Lecture8 24 | - file: Lecture9 -------------------------------------------------------------------------------- /images/2_Cs2AgBiI6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/images/2_Cs2AgBiI6.png -------------------------------------------------------------------------------- /images/2_Cs2AgBiI6.vesta: -------------------------------------------------------------------------------- 1 | #VESTA_FORMAT_VERSION 3.5.4 2 | 3 | 4 | CRYSTAL 5 | 6 | TITLE 7 | Cs8 Ag4 Bi4 I24 8 | 9 | GROUP 10 | 225 1 F m -3 m 11 | SYMOP 12 | 0.000000 0.000000 0.000000 1 0 0 0 1 0 0 0 1 1 13 | 0.000000 0.000000 0.000000 -1 0 0 0 -1 0 0 0 -1 1 14 | 0.000000 0.000000 0.000000 -1 0 0 0 -1 0 0 0 1 1 15 | 0.000000 0.000000 0.000000 1 0 0 0 1 0 0 0 -1 1 16 | 0.000000 0.000000 0.000000 -1 0 0 0 1 0 0 0 -1 1 17 | 0.000000 0.000000 0.000000 1 0 0 0 -1 0 0 0 1 1 18 | 0.000000 0.000000 0.000000 1 0 0 0 -1 0 0 0 -1 1 19 | 0.000000 0.000000 0.000000 -1 0 0 0 1 0 0 0 1 1 20 | 0.000000 0.000000 0.000000 0 0 1 1 0 0 0 1 0 1 21 | 0.000000 0.000000 0.000000 0 0 -1 -1 0 0 0 -1 0 1 22 | 0.000000 0.000000 0.000000 0 0 1 -1 0 0 0 -1 0 1 23 | 0.000000 0.000000 0.000000 0 0 -1 1 0 0 0 1 0 1 24 | 0.000000 0.000000 0.000000 0 0 -1 -1 0 0 0 1 0 1 25 | 0.000000 0.000000 0.000000 0 0 1 1 0 0 0 -1 0 1 26 | 0.000000 0.000000 0.000000 0 0 -1 1 0 0 0 -1 0 1 27 | 0.000000 0.000000 0.000000 0 0 1 -1 0 0 0 1 0 1 28 | 0.000000 0.000000 0.000000 0 1 0 0 0 1 1 0 0 1 29 | 0.000000 0.000000 0.000000 0 -1 0 0 0 -1 -1 0 0 1 30 | 0.000000 0.000000 
0.000000 0 -1 0 0 0 1 -1 0 0 1 31 | 0.000000 0.000000 0.000000 0 1 0 0 0 -1 1 0 0 1 32 | 0.000000 0.000000 0.000000 0 1 0 0 0 -1 -1 0 0 1 33 | 0.000000 0.000000 0.000000 0 -1 0 0 0 1 1 0 0 1 34 | 0.000000 0.000000 0.000000 0 -1 0 0 0 -1 1 0 0 1 35 | 0.000000 0.000000 0.000000 0 1 0 0 0 1 -1 0 0 1 36 | 0.000000 0.000000 0.000000 0 1 0 1 0 0 0 0 -1 1 37 | 0.000000 0.000000 0.000000 0 -1 0 -1 0 0 0 0 1 1 38 | 0.000000 0.000000 0.000000 0 -1 0 -1 0 0 0 0 -1 1 39 | 0.000000 0.000000 0.000000 0 1 0 1 0 0 0 0 1 1 40 | 0.000000 0.000000 0.000000 0 1 0 -1 0 0 0 0 1 1 41 | 0.000000 0.000000 0.000000 0 -1 0 1 0 0 0 0 -1 1 42 | 0.000000 0.000000 0.000000 0 -1 0 1 0 0 0 0 1 1 43 | 0.000000 0.000000 0.000000 0 1 0 -1 0 0 0 0 -1 1 44 | 0.000000 0.000000 0.000000 1 0 0 0 0 1 0 -1 0 1 45 | 0.000000 0.000000 0.000000 -1 0 0 0 0 -1 0 1 0 1 46 | 0.000000 0.000000 0.000000 -1 0 0 0 0 1 0 1 0 1 47 | 0.000000 0.000000 0.000000 1 0 0 0 0 -1 0 -1 0 1 48 | 0.000000 0.000000 0.000000 -1 0 0 0 0 -1 0 -1 0 1 49 | 0.000000 0.000000 0.000000 1 0 0 0 0 1 0 1 0 1 50 | 0.000000 0.000000 0.000000 1 0 0 0 0 -1 0 1 0 1 51 | 0.000000 0.000000 0.000000 -1 0 0 0 0 1 0 -1 0 1 52 | 0.000000 0.000000 0.000000 0 0 1 0 1 0 -1 0 0 1 53 | 0.000000 0.000000 0.000000 0 0 -1 0 -1 0 1 0 0 1 54 | 0.000000 0.000000 0.000000 0 0 1 0 -1 0 1 0 0 1 55 | 0.000000 0.000000 0.000000 0 0 -1 0 1 0 -1 0 0 1 56 | 0.000000 0.000000 0.000000 0 0 -1 0 1 0 1 0 0 1 57 | 0.000000 0.000000 0.000000 0 0 1 0 -1 0 -1 0 0 1 58 | 0.000000 0.000000 0.000000 0 0 -1 0 -1 0 -1 0 0 1 59 | 0.000000 0.000000 0.000000 0 0 1 0 1 0 1 0 0 1 60 | 0.000000 0.500000 0.500000 1 0 0 0 1 0 0 0 1 1 61 | 0.000000 0.500000 0.500000 -1 0 0 0 -1 0 0 0 -1 1 62 | 0.000000 0.500000 0.500000 -1 0 0 0 -1 0 0 0 1 1 63 | 0.000000 0.500000 0.500000 1 0 0 0 1 0 0 0 -1 1 64 | 0.000000 0.500000 0.500000 -1 0 0 0 1 0 0 0 -1 1 65 | 0.000000 0.500000 0.500000 1 0 0 0 -1 0 0 0 1 1 66 | 0.000000 0.500000 0.500000 1 0 0 0 -1 0 0 0 -1 1 67 | 0.000000 0.500000 0.500000 -1 0 0 0 1 0 0 0 1 1 68 | 0.000000 0.500000 0.500000 0 0 1 1 0 0 0 1 0 1 69 | 0.000000 0.500000 0.500000 0 0 -1 -1 0 0 0 -1 0 1 70 | 0.000000 0.500000 0.500000 0 0 1 -1 0 0 0 -1 0 1 71 | 0.000000 0.500000 0.500000 0 0 -1 1 0 0 0 1 0 1 72 | 0.000000 0.500000 0.500000 0 0 -1 -1 0 0 0 1 0 1 73 | 0.000000 0.500000 0.500000 0 0 1 1 0 0 0 -1 0 1 74 | 0.000000 0.500000 0.500000 0 0 -1 1 0 0 0 -1 0 1 75 | 0.000000 0.500000 0.500000 0 0 1 -1 0 0 0 1 0 1 76 | 0.000000 0.500000 0.500000 0 1 0 0 0 1 1 0 0 1 77 | 0.000000 0.500000 0.500000 0 -1 0 0 0 -1 -1 0 0 1 78 | 0.000000 0.500000 0.500000 0 -1 0 0 0 1 -1 0 0 1 79 | 0.000000 0.500000 0.500000 0 1 0 0 0 -1 1 0 0 1 80 | 0.000000 0.500000 0.500000 0 1 0 0 0 -1 -1 0 0 1 81 | 0.000000 0.500000 0.500000 0 -1 0 0 0 1 1 0 0 1 82 | 0.000000 0.500000 0.500000 0 -1 0 0 0 -1 1 0 0 1 83 | 0.000000 0.500000 0.500000 0 1 0 0 0 1 -1 0 0 1 84 | 0.000000 0.500000 0.500000 0 1 0 1 0 0 0 0 -1 1 85 | 0.000000 0.500000 0.500000 0 -1 0 -1 0 0 0 0 1 1 86 | 0.000000 0.500000 0.500000 0 -1 0 -1 0 0 0 0 -1 1 87 | 0.000000 0.500000 0.500000 0 1 0 1 0 0 0 0 1 1 88 | 0.000000 0.500000 0.500000 0 1 0 -1 0 0 0 0 1 1 89 | 0.000000 0.500000 0.500000 0 -1 0 1 0 0 0 0 -1 1 90 | 0.000000 0.500000 0.500000 0 -1 0 1 0 0 0 0 1 1 91 | 0.000000 0.500000 0.500000 0 1 0 -1 0 0 0 0 -1 1 92 | 0.000000 0.500000 0.500000 1 0 0 0 0 1 0 -1 0 1 93 | 0.000000 0.500000 0.500000 -1 0 0 0 0 -1 0 1 0 1 94 | 0.000000 0.500000 0.500000 -1 0 0 0 0 1 0 1 0 1 95 | 0.000000 0.500000 0.500000 1 0 0 0 0 -1 0 -1 0 1 96 | 0.000000 0.500000 0.500000 -1 0 0 0 0 -1 
0 -1 0 1 97 | 0.000000 0.500000 0.500000 1 0 0 0 0 1 0 1 0 1 98 | 0.000000 0.500000 0.500000 1 0 0 0 0 -1 0 1 0 1 99 | 0.000000 0.500000 0.500000 -1 0 0 0 0 1 0 -1 0 1 100 | 0.000000 0.500000 0.500000 0 0 1 0 1 0 -1 0 0 1 101 | 0.000000 0.500000 0.500000 0 0 -1 0 -1 0 1 0 0 1 102 | 0.000000 0.500000 0.500000 0 0 1 0 -1 0 1 0 0 1 103 | 0.000000 0.500000 0.500000 0 0 -1 0 1 0 -1 0 0 1 104 | 0.000000 0.500000 0.500000 0 0 -1 0 1 0 1 0 0 1 105 | 0.000000 0.500000 0.500000 0 0 1 0 -1 0 -1 0 0 1 106 | 0.000000 0.500000 0.500000 0 0 -1 0 -1 0 -1 0 0 1 107 | 0.000000 0.500000 0.500000 0 0 1 0 1 0 1 0 0 1 108 | 0.500000 0.000000 0.500000 1 0 0 0 1 0 0 0 1 1 109 | 0.500000 0.000000 0.500000 -1 0 0 0 -1 0 0 0 -1 1 110 | 0.500000 0.000000 0.500000 -1 0 0 0 -1 0 0 0 1 1 111 | 0.500000 0.000000 0.500000 1 0 0 0 1 0 0 0 -1 1 112 | 0.500000 0.000000 0.500000 -1 0 0 0 1 0 0 0 -1 1 113 | 0.500000 0.000000 0.500000 1 0 0 0 -1 0 0 0 1 1 114 | 0.500000 0.000000 0.500000 1 0 0 0 -1 0 0 0 -1 1 115 | 0.500000 0.000000 0.500000 -1 0 0 0 1 0 0 0 1 1 116 | 0.500000 0.000000 0.500000 0 0 1 1 0 0 0 1 0 1 117 | 0.500000 0.000000 0.500000 0 0 -1 -1 0 0 0 -1 0 1 118 | 0.500000 0.000000 0.500000 0 0 1 -1 0 0 0 -1 0 1 119 | 0.500000 0.000000 0.500000 0 0 -1 1 0 0 0 1 0 1 120 | 0.500000 0.000000 0.500000 0 0 -1 -1 0 0 0 1 0 1 121 | 0.500000 0.000000 0.500000 0 0 1 1 0 0 0 -1 0 1 122 | 0.500000 0.000000 0.500000 0 0 -1 1 0 0 0 -1 0 1 123 | 0.500000 0.000000 0.500000 0 0 1 -1 0 0 0 1 0 1 124 | 0.500000 0.000000 0.500000 0 1 0 0 0 1 1 0 0 1 125 | 0.500000 0.000000 0.500000 0 -1 0 0 0 -1 -1 0 0 1 126 | 0.500000 0.000000 0.500000 0 -1 0 0 0 1 -1 0 0 1 127 | 0.500000 0.000000 0.500000 0 1 0 0 0 -1 1 0 0 1 128 | 0.500000 0.000000 0.500000 0 1 0 0 0 -1 -1 0 0 1 129 | 0.500000 0.000000 0.500000 0 -1 0 0 0 1 1 0 0 1 130 | 0.500000 0.000000 0.500000 0 -1 0 0 0 -1 1 0 0 1 131 | 0.500000 0.000000 0.500000 0 1 0 0 0 1 -1 0 0 1 132 | 0.500000 0.000000 0.500000 0 1 0 1 0 0 0 0 -1 1 133 | 0.500000 0.000000 0.500000 0 -1 0 -1 0 0 0 0 1 1 134 | 0.500000 0.000000 0.500000 0 -1 0 -1 0 0 0 0 -1 1 135 | 0.500000 0.000000 0.500000 0 1 0 1 0 0 0 0 1 1 136 | 0.500000 0.000000 0.500000 0 1 0 -1 0 0 0 0 1 1 137 | 0.500000 0.000000 0.500000 0 -1 0 1 0 0 0 0 -1 1 138 | 0.500000 0.000000 0.500000 0 -1 0 1 0 0 0 0 1 1 139 | 0.500000 0.000000 0.500000 0 1 0 -1 0 0 0 0 -1 1 140 | 0.500000 0.000000 0.500000 1 0 0 0 0 1 0 -1 0 1 141 | 0.500000 0.000000 0.500000 -1 0 0 0 0 -1 0 1 0 1 142 | 0.500000 0.000000 0.500000 -1 0 0 0 0 1 0 1 0 1 143 | 0.500000 0.000000 0.500000 1 0 0 0 0 -1 0 -1 0 1 144 | 0.500000 0.000000 0.500000 -1 0 0 0 0 -1 0 -1 0 1 145 | 0.500000 0.000000 0.500000 1 0 0 0 0 1 0 1 0 1 146 | 0.500000 0.000000 0.500000 1 0 0 0 0 -1 0 1 0 1 147 | 0.500000 0.000000 0.500000 -1 0 0 0 0 1 0 -1 0 1 148 | 0.500000 0.000000 0.500000 0 0 1 0 1 0 -1 0 0 1 149 | 0.500000 0.000000 0.500000 0 0 -1 0 -1 0 1 0 0 1 150 | 0.500000 0.000000 0.500000 0 0 1 0 -1 0 1 0 0 1 151 | 0.500000 0.000000 0.500000 0 0 -1 0 1 0 -1 0 0 1 152 | 0.500000 0.000000 0.500000 0 0 -1 0 1 0 1 0 0 1 153 | 0.500000 0.000000 0.500000 0 0 1 0 -1 0 -1 0 0 1 154 | 0.500000 0.000000 0.500000 0 0 -1 0 -1 0 -1 0 0 1 155 | 0.500000 0.000000 0.500000 0 0 1 0 1 0 1 0 0 1 156 | 0.500000 0.500000 0.000000 1 0 0 0 1 0 0 0 1 1 157 | 0.500000 0.500000 0.000000 -1 0 0 0 -1 0 0 0 -1 1 158 | 0.500000 0.500000 0.000000 -1 0 0 0 -1 0 0 0 1 1 159 | 0.500000 0.500000 0.000000 1 0 0 0 1 0 0 0 -1 1 160 | 0.500000 0.500000 0.000000 -1 0 0 0 1 0 0 0 -1 1 161 | 0.500000 0.500000 0.000000 1 0 0 0 -1 0 0 0 1 1 162 | 
0.500000 0.500000 0.000000 1 0 0 0 -1 0 0 0 -1 1 163 | 0.500000 0.500000 0.000000 -1 0 0 0 1 0 0 0 1 1 164 | 0.500000 0.500000 0.000000 0 0 1 1 0 0 0 1 0 1 165 | 0.500000 0.500000 0.000000 0 0 -1 -1 0 0 0 -1 0 1 166 | 0.500000 0.500000 0.000000 0 0 1 -1 0 0 0 -1 0 1 167 | 0.500000 0.500000 0.000000 0 0 -1 1 0 0 0 1 0 1 168 | 0.500000 0.500000 0.000000 0 0 -1 -1 0 0 0 1 0 1 169 | 0.500000 0.500000 0.000000 0 0 1 1 0 0 0 -1 0 1 170 | 0.500000 0.500000 0.000000 0 0 -1 1 0 0 0 -1 0 1 171 | 0.500000 0.500000 0.000000 0 0 1 -1 0 0 0 1 0 1 172 | 0.500000 0.500000 0.000000 0 1 0 0 0 1 1 0 0 1 173 | 0.500000 0.500000 0.000000 0 -1 0 0 0 -1 -1 0 0 1 174 | 0.500000 0.500000 0.000000 0 -1 0 0 0 1 -1 0 0 1 175 | 0.500000 0.500000 0.000000 0 1 0 0 0 -1 1 0 0 1 176 | 0.500000 0.500000 0.000000 0 1 0 0 0 -1 -1 0 0 1 177 | 0.500000 0.500000 0.000000 0 -1 0 0 0 1 1 0 0 1 178 | 0.500000 0.500000 0.000000 0 -1 0 0 0 -1 1 0 0 1 179 | 0.500000 0.500000 0.000000 0 1 0 0 0 1 -1 0 0 1 180 | 0.500000 0.500000 0.000000 0 1 0 1 0 0 0 0 -1 1 181 | 0.500000 0.500000 0.000000 0 -1 0 -1 0 0 0 0 1 1 182 | 0.500000 0.500000 0.000000 0 -1 0 -1 0 0 0 0 -1 1 183 | 0.500000 0.500000 0.000000 0 1 0 1 0 0 0 0 1 1 184 | 0.500000 0.500000 0.000000 0 1 0 -1 0 0 0 0 1 1 185 | 0.500000 0.500000 0.000000 0 -1 0 1 0 0 0 0 -1 1 186 | 0.500000 0.500000 0.000000 0 -1 0 1 0 0 0 0 1 1 187 | 0.500000 0.500000 0.000000 0 1 0 -1 0 0 0 0 -1 1 188 | 0.500000 0.500000 0.000000 1 0 0 0 0 1 0 -1 0 1 189 | 0.500000 0.500000 0.000000 -1 0 0 0 0 -1 0 1 0 1 190 | 0.500000 0.500000 0.000000 -1 0 0 0 0 1 0 1 0 1 191 | 0.500000 0.500000 0.000000 1 0 0 0 0 -1 0 -1 0 1 192 | 0.500000 0.500000 0.000000 -1 0 0 0 0 -1 0 -1 0 1 193 | 0.500000 0.500000 0.000000 1 0 0 0 0 1 0 1 0 1 194 | 0.500000 0.500000 0.000000 1 0 0 0 0 -1 0 1 0 1 195 | 0.500000 0.500000 0.000000 -1 0 0 0 0 1 0 -1 0 1 196 | 0.500000 0.500000 0.000000 0 0 1 0 1 0 -1 0 0 1 197 | 0.500000 0.500000 0.000000 0 0 -1 0 -1 0 1 0 0 1 198 | 0.500000 0.500000 0.000000 0 0 1 0 -1 0 1 0 0 1 199 | 0.500000 0.500000 0.000000 0 0 -1 0 1 0 -1 0 0 1 200 | 0.500000 0.500000 0.000000 0 0 -1 0 1 0 1 0 0 1 201 | 0.500000 0.500000 0.000000 0 0 1 0 -1 0 -1 0 0 1 202 | 0.500000 0.500000 0.000000 0 0 -1 0 -1 0 -1 0 0 1 203 | 0.500000 0.500000 0.000000 0 0 1 0 1 0 1 0 0 1 204 | -1.0 -1.0 -1.0 1 0 0 0 0 0 0 0 0 205 | TRANM 0 206 | 0.000000 0.000000 0.000000 1 0 0 0 1 0 0 0 1 207 | LTRANSL 208 | -1 209 | 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 210 | LORIENT 211 | -1 0 0 0 0 212 | 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 213 | 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 214 | LMATRIX 215 | 1.000000 0.000000 0.000000 0.000000 216 | 0.000000 1.000000 0.000000 0.000000 217 | 0.000000 0.000000 1.000000 0.000000 218 | 0.000000 0.000000 0.000000 1.000000 219 | 0.000000 0.000000 0.000000 220 | CELLP 221 | 12.253718 12.253718 12.253718 90.000000 90.000000 90.000000 222 | 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 223 | STRUC 224 | 1 Cs Cs0 1.0000 0.250000 0.250000 0.250000 8c -43m 225 | 0.000000 0.000000 0.000000 1.00 226 | 2 Ag Ag1 1.0000 0.000000 0.000000 0.500000 4b m-3m 227 | 0.000000 0.000000 0.000000 1.00 228 | 3 Bi Bi2 1.0000 0.000000 0.000000 0.000000 4a m-3m 229 | 0.000000 0.000000 0.000000 3.00 230 | 4 I I3 1.0000 0.000000 0.000000 0.253060 24e 4m. 
m 231 | 0.000000 0.000000 0.000000 -1.00 232 | 0 0 0 0 0 0 0 233 | THERI 1 234 | 1 Cs0 0.000000 235 | 2 Ag1 0.000000 236 | 3 Bi2 0.000000 237 | 4 I3 0.000000 238 | 0 0 0 239 | SHAPE 240 | 0 0 0 0 0.000000 0 192 192 192 192 241 | BOUND 242 | 0 1 0 1 0 1 243 | 0 0 0 0 0 244 | SBOND 245 | 1 Bi I 0.00000 3.38291 0 1 1 0 1 0.250 2.000 127 127 127 246 | 2 Ag I 0.00000 3.38291 0 1 1 0 1 0.250 2.000 127 127 127 247 | 0 0 0 0 248 | SITET 249 | 1 Cs0 2.7200 14 254 185 14 254 185 204 0 250 | 2 Ag1 1.4400 183 187 189 183 187 189 204 0 251 | 3 Bi2 1.8200 210 47 247 210 47 247 204 0 252 | 4 I3 1.3300 142 31 138 142 31 138 204 0 253 | 0 0 0 0 0 0 254 | VECTR 255 | 0 0 0 0 0 256 | VECTT 257 | 0 0 0 0 0 258 | SPLAN 259 | 0 0 0 0 260 | LBLAT 261 | -1 262 | LBLSP 263 | -1 264 | DLATM 265 | -1 266 | DLBND 267 | -1 268 | DLPLY 269 | -1 270 | PLN2D 271 | 0 0 0 0 272 | ATOMT 273 | 1 Cs 2.7200 14 254 185 14 254 185 204 274 | 2 Ag 1.4400 183 187 189 183 187 189 204 275 | 3 Bi 1.8200 210 47 247 210 47 247 204 276 | 4 I 1.3300 142 31 138 142 31 138 204 277 | 0 0 0 0 0 0 278 | SCENE 279 | -0.184211 0.982852 0.008235 0.000000 280 | -0.091074 -0.025410 0.995520 0.000000 281 | 0.978658 0.182636 0.094193 0.000000 282 | 0.000000 0.000000 0.000000 1.000000 283 | 0.000 0.000 284 | 0.000 285 | 1.000 286 | HBOND 0 2 287 | 288 | STYLE 289 | DISPF 37753794 290 | MODEL 2 1 0 291 | SURFS 0 1 1 292 | SECTS 32 1 293 | FORMS 0 1 294 | ATOMS 0 0 1 295 | BONDS 1 296 | POLYS 1 297 | VECTS 1.000000 298 | FORMP 299 | 1 1.0 0 0 0 300 | ATOMP 301 | 24 24 0 50 2.0 0 302 | BONDP 303 | 1 16 0.250 2.000 127 127 127 304 | POLYP 305 | 204 1 1.000 180 180 180 306 | ISURF 307 | 0 0 0 0 308 | TEX3P 309 | 1 0.00000E+00 1.00000E+00 310 | SECTP 311 | 1 0.00000E+00 1.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 312 | CONTR 313 | 0.1 -1 1 1 10 -1 2 5 314 | 2 1 2 1 315 | 0 0 0 316 | 0 0 0 317 | 0 0 0 318 | 0 0 0 319 | HKLPP 320 | 192 1 1.000 255 0 255 321 | UCOLP 322 | 0 1 1.000 0 0 0 323 | COMPS 0 324 | LABEL 1 12 1.000 0 325 | PROJT 0 0.962 326 | BKGRC 327 | 255 255 255 328 | DPTHQ 1 -0.5000 3.5000 329 | LIGHT0 1 330 | 1.000000 0.000000 0.000000 0.000000 331 | 0.000000 1.000000 0.000000 0.000000 332 | 0.000000 0.000000 1.000000 0.000000 333 | 0.000000 0.000000 0.000000 1.000000 334 | 0.000000 0.000000 20.000000 0.000000 335 | 0.000000 0.000000 -1.000000 336 | 26 26 26 255 337 | 179 179 179 255 338 | 255 255 255 255 339 | LIGHT1 340 | 1.000000 0.000000 0.000000 0.000000 341 | 0.000000 1.000000 0.000000 0.000000 342 | 0.000000 0.000000 1.000000 0.000000 343 | 0.000000 0.000000 0.000000 1.000000 344 | 0.000000 0.000000 20.000000 0.000000 345 | 0.000000 0.000000 -1.000000 346 | 0 0 0 0 347 | 0 0 0 0 348 | 0 0 0 0 349 | LIGHT2 350 | 1.000000 0.000000 0.000000 0.000000 351 | 0.000000 1.000000 0.000000 0.000000 352 | 0.000000 0.000000 1.000000 0.000000 353 | 0.000000 0.000000 0.000000 1.000000 354 | 0.000000 0.000000 20.000000 0.000000 355 | 0.000000 0.000000 -1.000000 356 | 0 0 0 0 357 | 0 0 0 0 358 | 0 0 0 0 359 | LIGHT3 360 | 1.000000 0.000000 0.000000 0.000000 361 | 0.000000 1.000000 0.000000 0.000000 362 | 0.000000 0.000000 1.000000 0.000000 363 | 0.000000 0.000000 0.000000 1.000000 364 | 0.000000 0.000000 20.000000 0.000000 365 | 0.000000 0.000000 -1.000000 366 | 0 0 0 0 367 | 0 0 0 0 368 | 0 0 0 0 369 | SECCL 0 370 | 371 | TEXCL 0 372 | 373 | ATOMM 374 | 204 204 204 255 375 | 25.600 376 | BONDM 377 | 255 255 255 255 378 | 128.000 379 | POLYM 380 | 255 255 255 255 381 | 128.000 382 | SURFM 383 | 0 0 0 255 384 | 128.000 385 | FORMM 386 | 255 
255 255 255 387 | 128.000 388 | HKLPM 389 | 255 255 255 255 390 | 128.000 391 | -------------------------------------------------------------------------------- /images/2_CsPbI3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/images/2_CsPbI3.png -------------------------------------------------------------------------------- /images/2_CsPbI3.vesta: -------------------------------------------------------------------------------- 1 | #VESTA_FORMAT_VERSION 3.5.4 2 | 3 | 4 | CRYSTAL 5 | 6 | TITLE 7 | Cs1 Pb1 I3 8 | 9 | GROUP 10 | 221 1 P m -3 m 11 | SYMOP 12 | 0.000000 0.000000 0.000000 1 0 0 0 1 0 0 0 1 1 13 | 0.000000 0.000000 0.000000 -1 0 0 0 -1 0 0 0 -1 1 14 | 0.000000 0.000000 0.000000 -1 0 0 0 -1 0 0 0 1 1 15 | 0.000000 0.000000 0.000000 1 0 0 0 1 0 0 0 -1 1 16 | 0.000000 0.000000 0.000000 -1 0 0 0 1 0 0 0 -1 1 17 | 0.000000 0.000000 0.000000 1 0 0 0 -1 0 0 0 1 1 18 | 0.000000 0.000000 0.000000 1 0 0 0 -1 0 0 0 -1 1 19 | 0.000000 0.000000 0.000000 -1 0 0 0 1 0 0 0 1 1 20 | 0.000000 0.000000 0.000000 0 0 1 1 0 0 0 1 0 1 21 | 0.000000 0.000000 0.000000 0 0 -1 -1 0 0 0 -1 0 1 22 | 0.000000 0.000000 0.000000 0 0 1 -1 0 0 0 -1 0 1 23 | 0.000000 0.000000 0.000000 0 0 -1 1 0 0 0 1 0 1 24 | 0.000000 0.000000 0.000000 0 0 -1 -1 0 0 0 1 0 1 25 | 0.000000 0.000000 0.000000 0 0 1 1 0 0 0 -1 0 1 26 | 0.000000 0.000000 0.000000 0 0 -1 1 0 0 0 -1 0 1 27 | 0.000000 0.000000 0.000000 0 0 1 -1 0 0 0 1 0 1 28 | 0.000000 0.000000 0.000000 0 1 0 0 0 1 1 0 0 1 29 | 0.000000 0.000000 0.000000 0 -1 0 0 0 -1 -1 0 0 1 30 | 0.000000 0.000000 0.000000 0 -1 0 0 0 1 -1 0 0 1 31 | 0.000000 0.000000 0.000000 0 1 0 0 0 -1 1 0 0 1 32 | 0.000000 0.000000 0.000000 0 1 0 0 0 -1 -1 0 0 1 33 | 0.000000 0.000000 0.000000 0 -1 0 0 0 1 1 0 0 1 34 | 0.000000 0.000000 0.000000 0 -1 0 0 0 -1 1 0 0 1 35 | 0.000000 0.000000 0.000000 0 1 0 0 0 1 -1 0 0 1 36 | 0.000000 0.000000 0.000000 0 1 0 1 0 0 0 0 -1 1 37 | 0.000000 0.000000 0.000000 0 -1 0 -1 0 0 0 0 1 1 38 | 0.000000 0.000000 0.000000 0 -1 0 -1 0 0 0 0 -1 1 39 | 0.000000 0.000000 0.000000 0 1 0 1 0 0 0 0 1 1 40 | 0.000000 0.000000 0.000000 0 1 0 -1 0 0 0 0 1 1 41 | 0.000000 0.000000 0.000000 0 -1 0 1 0 0 0 0 -1 1 42 | 0.000000 0.000000 0.000000 0 -1 0 1 0 0 0 0 1 1 43 | 0.000000 0.000000 0.000000 0 1 0 -1 0 0 0 0 -1 1 44 | 0.000000 0.000000 0.000000 1 0 0 0 0 1 0 -1 0 1 45 | 0.000000 0.000000 0.000000 -1 0 0 0 0 -1 0 1 0 1 46 | 0.000000 0.000000 0.000000 -1 0 0 0 0 1 0 1 0 1 47 | 0.000000 0.000000 0.000000 1 0 0 0 0 -1 0 -1 0 1 48 | 0.000000 0.000000 0.000000 -1 0 0 0 0 -1 0 -1 0 1 49 | 0.000000 0.000000 0.000000 1 0 0 0 0 1 0 1 0 1 50 | 0.000000 0.000000 0.000000 1 0 0 0 0 -1 0 1 0 1 51 | 0.000000 0.000000 0.000000 -1 0 0 0 0 1 0 -1 0 1 52 | 0.000000 0.000000 0.000000 0 0 1 0 1 0 -1 0 0 1 53 | 0.000000 0.000000 0.000000 0 0 -1 0 -1 0 1 0 0 1 54 | 0.000000 0.000000 0.000000 0 0 1 0 -1 0 1 0 0 1 55 | 0.000000 0.000000 0.000000 0 0 -1 0 1 0 -1 0 0 1 56 | 0.000000 0.000000 0.000000 0 0 -1 0 1 0 1 0 0 1 57 | 0.000000 0.000000 0.000000 0 0 1 0 -1 0 -1 0 0 1 58 | 0.000000 0.000000 0.000000 0 0 -1 0 -1 0 -1 0 0 1 59 | 0.000000 0.000000 0.000000 0 0 1 0 1 0 1 0 0 1 60 | -1.0 -1.0 -1.0 1 0 0 0 0 0 0 0 0 61 | TRANM 0 62 | 0.000000 0.000000 0.000000 1 0 0 0 1 0 0 0 1 63 | LTRANSL 64 | -1 65 | 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 66 | LORIENT 67 | -1 0 0 0 0 68 | 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 69 | 0.000000 
0.000000 1.000000 0.000000 0.000000 1.000000 70 | LMATRIX 71 | 1.000000 0.000000 0.000000 0.000000 72 | 0.000000 1.000000 0.000000 0.000000 73 | 0.000000 0.000000 1.000000 0.000000 74 | 0.000000 0.000000 0.000000 1.000000 75 | 0.000000 0.000000 0.000000 76 | CELLP 77 | 6.275142 6.275142 6.275142 90.000000 90.000000 90.000000 78 | 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 79 | STRUC 80 | 1 Cs Cs0 1.0000 0.500000 0.500000 0.500000 1b m-3m 81 | 0.000000 0.000000 0.000000 1.00 82 | 2 Pb Pb1 1.0000 0.000000 0.000000 0.000000 1a m-3m 83 | 0.000000 0.000000 0.000000 2.00 84 | 3 I I2 1.0000 0.000000 0.000000 0.500000 3d 4/mm. m 85 | 0.000000 0.000000 0.000000 -1.00 86 | 0 0 0 0 0 0 0 87 | THERI 1 88 | 1 Cs0 0.000000 89 | 2 Pb1 0.000000 90 | 3 I2 0.000000 91 | 0 0 0 92 | SHAPE 93 | 0 0 0 0 0.000000 0 192 192 192 192 94 | BOUND 95 | 0 1 0 1 0 1 96 | 0 0 0 0 0 97 | SBOND 98 | 1 Pb I 0.00000 3.69562 0 1 1 0 1 0.250 2.000 127 127 127 99 | 0 0 0 0 100 | SITET 101 | 1 Cs0 2.7200 14 254 185 14 254 185 204 0 102 | 2 Pb1 1.7500 82 83 91 82 83 91 204 0 103 | 3 I2 1.3300 142 31 138 142 31 138 204 0 104 | 0 0 0 0 0 0 105 | VECTR 106 | 0 0 0 0 0 107 | VECTT 108 | 0 0 0 0 0 109 | SPLAN 110 | 0 0 0 0 111 | LBLAT 112 | -1 113 | LBLSP 114 | -1 115 | DLATM 116 | -1 117 | DLBND 118 | -1 119 | DLPLY 120 | -1 121 | PLN2D 122 | 0 0 0 0 123 | ATOMT 124 | 1 Cs 2.7200 14 254 185 14 254 185 204 125 | 2 Pb 1.7500 82 83 91 82 83 91 204 126 | 3 I 1.3300 142 31 138 142 31 138 204 127 | 0 0 0 0 0 0 128 | SCENE 129 | 0.971230 0.013601 -0.237754 0.000000 130 | -0.048255 0.988897 -0.140552 0.000000 131 | 0.233202 0.147982 0.961103 0.000000 132 | 0.000000 0.000000 0.000000 1.000000 133 | 0.000 0.000 134 | 0.000 135 | 0.629 136 | HBOND 0 2 137 | 138 | STYLE 139 | DISPF 37753794 140 | MODEL 2 1 0 141 | SURFS 0 1 1 142 | SECTS 32 1 143 | FORMS 0 1 144 | ATOMS 0 0 1 145 | BONDS 1 146 | POLYS 1 147 | VECTS 1.000000 148 | FORMP 149 | 1 1.0 0 0 0 150 | ATOMP 151 | 24 24 0 50 2.0 0 152 | BONDP 153 | 1 16 0.250 2.000 127 127 127 154 | POLYP 155 | 204 1 1.000 180 180 180 156 | ISURF 157 | 0 0 0 0 158 | TEX3P 159 | 1 0.00000E+00 1.00000E+00 160 | SECTP 161 | 1 0.00000E+00 1.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 162 | CONTR 163 | 0.1 -1 1 1 10 -1 2 5 164 | 2 1 2 1 165 | 0 0 0 166 | 0 0 0 167 | 0 0 0 168 | 0 0 0 169 | HKLPP 170 | 192 1 1.000 255 0 255 171 | UCOLP 172 | 0 1 1.000 0 0 0 173 | COMPS 0 174 | LABEL 1 12 1.000 0 175 | PROJT 0 0.962 176 | BKGRC 177 | 255 255 255 178 | DPTHQ 1 -0.5000 3.5000 179 | LIGHT0 1 180 | 1.000000 0.000000 0.000000 0.000000 181 | 0.000000 1.000000 0.000000 0.000000 182 | 0.000000 0.000000 1.000000 0.000000 183 | 0.000000 0.000000 0.000000 1.000000 184 | 0.000000 0.000000 20.000000 0.000000 185 | 0.000000 0.000000 -1.000000 186 | 51 51 51 255 187 | 196 196 196 255 188 | 255 255 255 255 189 | LIGHT1 190 | 1.000000 0.000000 0.000000 0.000000 191 | 0.000000 1.000000 0.000000 0.000000 192 | 0.000000 0.000000 1.000000 0.000000 193 | 0.000000 0.000000 0.000000 1.000000 194 | 0.000000 0.000000 20.000000 0.000000 195 | 0.000000 0.000000 -1.000000 196 | 0 0 0 0 197 | 0 0 0 0 198 | 0 0 0 0 199 | LIGHT2 200 | 1.000000 0.000000 0.000000 0.000000 201 | 0.000000 1.000000 0.000000 0.000000 202 | 0.000000 0.000000 1.000000 0.000000 203 | 0.000000 0.000000 0.000000 1.000000 204 | 0.000000 0.000000 20.000000 0.000000 205 | 0.000000 0.000000 -1.000000 206 | 0 0 0 0 207 | 0 0 0 0 208 | 0 0 0 0 209 | LIGHT3 210 | 1.000000 0.000000 0.000000 0.000000 211 | 0.000000 1.000000 0.000000 0.000000 212 | 
0.000000 0.000000 1.000000 0.000000 213 | 0.000000 0.000000 0.000000 1.000000 214 | 0.000000 0.000000 20.000000 0.000000 215 | 0.000000 0.000000 -1.000000 216 | 0 0 0 0 217 | 0 0 0 0 218 | 0 0 0 0 219 | SECCL 0 220 | 221 | TEXCL 0 222 | 223 | ATOMM 224 | 204 204 204 255 225 | 25.600 226 | BONDM 227 | 255 255 255 255 228 | 128.000 229 | POLYM 230 | 255 255 255 255 231 | 128.000 232 | SURFM 233 | 0 0 0 255 234 | 128.000 235 | FORMM 236 | 255 255 255 255 237 | 128.000 238 | HKLPM 239 | 255 255 255 255 240 | 128.000 241 | -------------------------------------------------------------------------------- /images/2_sum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/images/2_sum.png -------------------------------------------------------------------------------- /images/5_bands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/images/5_bands.png -------------------------------------------------------------------------------- /images/6_tem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/images/6_tem.png -------------------------------------------------------------------------------- /images/ml-python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/images/ml-python.png -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/logo.png -------------------------------------------------------------------------------- /ref.bib: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @inproceedings{holdgraf_evidence_2014, 5 | address = {Brisbane, Australia, Australia}, 6 | title = {Evidence for {Predictive} {Coding} in {Human} {Auditory} {Cortex}}, 7 | booktitle = {International {Conference} on {Cognitive} {Neuroscience}}, 8 | publisher = {Frontiers in Neuroscience}, 9 | author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. and Knight, Robert T.}, 10 | year = {2014} 11 | } 12 | 13 | @article{holdgraf_rapid_2016, 14 | title = {Rapid tuning shifts in human auditory cortex enhance speech intelligibility}, 15 | volume = {7}, 16 | issn = {2041-1723}, 17 | url = {http://www.nature.com/doifinder/10.1038/ncomms13654}, 18 | doi = {10.1038/ncomms13654}, 19 | number = {May}, 20 | journal = {Nature Communications}, 21 | author = {Holdgraf, Christopher Ramsay and de Heer, Wendy and Pasley, Brian N. and Rieger, Jochem W. and Crone, Nathan and Lin, Jack J. and Knight, Robert T. and Theunissen, Frédéric E.}, 22 | year = {2016}, 23 | pages = {13654}, 24 | file = {Holdgraf et al. - 2016 - Rapid tuning shifts in human auditory cortex enhance speech intelligibility.pdf:C\:\\Users\\chold\\Zotero\\storage\\MDQP3JWE\\Holdgraf et al. 
- 2016 - Rapid tuning shifts in human auditory cortex enhance speech intelligibility.pdf:application/pdf} 25 | } 26 | 27 | @inproceedings{holdgraf_portable_2017, 28 | title = {Portable learning environments for hands-on computational instruction using container- and cloud-based technology to teach data science}, 29 | volume = {Part F1287}, 30 | isbn = {978-1-4503-5272-7}, 31 | doi = {10.1145/3093338.3093370}, 32 | abstract = {© 2017 ACM. There is an increasing interest in learning outside of the traditional classroom setting. This is especially true for topics covering computational tools and data science, as both are challenging to incorporate in the standard curriculum. These atypical learning environments offer new opportunities for teaching, particularly when it comes to combining conceptual knowledge with hands-on experience/expertise with methods and skills. Advances in cloud computing and containerized environments provide an attractive opportunity to improve the efficiency and ease with which students can learn. This manuscript details recent advances towards using commonly-available cloud computing services and advanced cyberinfrastructure support for improving the learning experience in bootcamp-style events. We cover the benefits (and challenges) of using a server hosted remotely instead of relying on student laptops, discuss the technology that was used in order to make this possible, and give suggestions for how others could implement and improve upon this model for pedagogy and reproducibility.}, 33 | booktitle = {{ACM} {International} {Conference} {Proceeding} {Series}}, 34 | author = {Holdgraf, Christopher Ramsay and Culich, A. and Rokem, A. and Deniz, F. and Alegro, M. and Ushizima, D.}, 35 | year = {2017}, 36 | keywords = {Teaching, Bootcamps, Cloud computing, Data science, Docker, Pedagogy} 37 | } 38 | 39 | @article{holdgraf_encoding_2017, 40 | title = {Encoding and decoding models in cognitive electrophysiology}, 41 | volume = {11}, 42 | issn = {16625137}, 43 | doi = {10.3389/fnsys.2017.00061}, 44 | abstract = {© 2017 Holdgraf, Rieger, Micheli, Martin, Knight and Theunissen. Cognitive neuroscience has seen rapid growth in the size and complexity of data recorded from the human brain as well as in the computational tools available to analyze this data. This data explosion has resulted in an increased use of multivariate, model-based methods for asking neuroscience questions, allowing scientists to investigate multiple hypotheses with a single dataset, to use complex, time-varying stimuli, and to study the human brain under more naturalistic conditions. These tools come in the form of “Encoding” models, in which stimulus features are used to model brain activity, and “Decoding” models, in which neural features are used to generate a stimulus output. Here we review the current state of encoding and decoding models in cognitive electrophysiology and provide a practical guide toward conducting experiments and analyses in this emerging field. Our examples focus on using linear models in the study of human language and audition. We show how to calculate auditory receptive fields from natural sounds as well as how to decode neural recordings to predict speech. The paper aims to be a useful tutorial to these approaches, and a practical introduction to using machine learning and applied statistics to build models of neural activity. 
/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book 2 | jupyterlab_myst 3 | myst-nb 4 | numpy 5 | matplotlib 6 | pandas 7 | scikit-learn 8 | seaborn -------------------------------------------------------------------------------- /slides/MLforMaterials_Challenge_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Challenge_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture1_Intro_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture1_Intro_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture2_Basics_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture2_Basics_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture3_Data_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture3_Data_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture4_Representations_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture4_Representations_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture5_Classical_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture5_Classical_25.pdf
-------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture6_NN_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture6_NN_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture7_Build_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture7_Build_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture8_Discovery_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture8_Discovery_25.pdf -------------------------------------------------------------------------------- /slides/MLforMaterials_Lecture9_GenAI_25.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aronwalsh/MLforMaterials/5149c2776deffd18ae02912b56e4052575c4cd4d/slides/MLforMaterials_Lecture9_GenAI_25.pdf --------------------------------------------------------------------------------
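
The pinned packages in requirements.txt above are everything needed to build the book locally. A minimal sketch of that workflow, assuming pip and the jupyter-book CLI are available in the active environment; the script name is hypothetical and not part of the repository:

# build_local.py -- hypothetical helper, not part of this repository.
# Installs the dependencies pinned in requirements.txt, then builds the
# Jupyter Book from the repository root (HTML lands in _build/html by default).
import subprocess
import sys

# Install the pinned dependencies into the current Python environment.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])

# Build the book; jupyter-book is the command-line tool installed by the
# jupyter-book package listed in requirements.txt.
subprocess.check_call(["jupyter-book", "build", "."])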