├── .github
│   └── workflows
│       ├── binder.yaml
│       └── deploy.yml
├── .gitignore
├── Contents.md
├── LICENSE
├── Learning.md
├── Lecture1.ipynb
├── Lecture2.ipynb
├── Lecture3.ipynb
├── Lecture4.ipynb
├── Lecture5.ipynb
├── Lecture6.ipynb
├── Lecture7.ipynb
├── Lecture8.ipynb
├── Lecture9.ipynb
├── Overview.md
├── README.md
├── Resources.md
├── _config.yml
├── _toc.yml
├── images
│   ├── 2_Cs2AgBiI6.png
│   ├── 2_Cs2AgBiI6.vesta
│   ├── 2_CsPbI3.png
│   ├── 2_CsPbI3.vesta
│   ├── 2_sum.png
│   ├── 5_bands.png
│   ├── 6_tem.png
│   └── ml-python.png
├── logo.png
├── ref.bib
├── requirements.txt
└── slides
    ├── MLforMaterials_Challenge_25.pdf
    ├── MLforMaterials_Lecture1_Intro_25.pdf
    ├── MLforMaterials_Lecture2_Basics_25.pdf
    ├── MLforMaterials_Lecture3_Data_25.pdf
    ├── MLforMaterials_Lecture4_Representations_25.pdf
    ├── MLforMaterials_Lecture5_Classical_25.pdf
    ├── MLforMaterials_Lecture6_NN_25.pdf
    ├── MLforMaterials_Lecture7_Build_25.pdf
    ├── MLforMaterials_Lecture8_Discovery_25.pdf
    └── MLforMaterials_Lecture9_GenAI_25.pdf
/.github/workflows/binder.yaml:
--------------------------------------------------------------------------------
1 | name: Binder
2 | on: [push]
3 |
4 | jobs:
5 | Create-MyBinderOrg-Cache:
6 | runs-on: ubuntu-latest
7 | steps:
8 | - name: cache binder build on mybinder.org
9 | uses: jupyterhub/repo2docker-action@master
10 | with:
11 | NO_PUSH: true
12 | MYBINDERORG_TAG: ${{ github.event.ref }} # This builds the container on mybinder.org with the branch that was pushed on.
13 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: deploy-book
2 |
3 | # Only run this when the 2025 branch changes
4 | on:
5 | push:
6 | branches:
7 | - 2025
8 | # If your git repository has the Jupyter Book within some-subfolder next to
9 | # unrelated files, you can make this run only if a file within that specific
10 | # folder has been modified.
11 | #
12 | # paths:
13 | # - some-subfolder/**
14 |
15 | # This job installs dependencies, builds the book, and pushes it to `gh-pages`
16 | jobs:
17 | deploy-book:
18 | runs-on: ubuntu-latest
19 | steps:
20 | - uses: actions/checkout@v2
21 |
22 | # Install dependencies
23 | - name: Set up Python 3.8
24 | uses: actions/setup-python@v2
25 | with:
26 | python-version: 3.8
27 |
28 | - name: Install dependencies
29 | run: |
30 | pip install -r requirements.txt
31 |
32 | # Build the book
33 | - name: Build the book
34 | run: |
35 | jupyter-book build .
36 |
37 | # Push the book's HTML to github-pages
38 | - name: GitHub Pages action
39 | uses: peaceiris/actions-gh-pages@v3.6.1
40 | with:
41 | github_token: ${{ secrets.GITHUB_TOKEN }}
42 | publish_dir: ./_build/html
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | .DS_Store
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 | _build/
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 | notebooks/CsPbI3.cif
134 | imagenet_labels.json
135 |
--------------------------------------------------------------------------------
/Contents.md:
--------------------------------------------------------------------------------
1 | # Course Contents
2 |
3 | 1. **Introduction**
4 | * Overview
5 | * Expectations and assessments
6 | * _Exercise: Getting started_
7 |
8 | 2. **Machine Learning Basics**
9 | * Terminology
10 | * Learning by example
11 | * Supervised
12 | * Unsupervised
13 | * Reinforcement
14 | * _Exercise: Crystal hardness_
15 |
16 | 3. **Materials Data**
17 | * Data sources and formats
18 | * API queries
19 | * _Exercise: Data-driven thermoelectrics_
20 |
21 | 4. **Crystal Representations**
22 | * Compositional
23 | * Structural
24 | * Graphs
25 | * _Exercise: Navigating crystal space_
26 |
27 | 5. **Classical Learning**
28 | * _k_-nearest neighbours
29 | * _k_-means clustering
30 | * Decision trees and beyond
31 | * _Exercise: Metal or insulator?_
32 |
33 | 6. **Artificial Neural Networks**
34 | * From neuron to perceptron
35 | * Network architecture and training
36 | * Convolutional neural networks
37 | * _Exercise: Learning microstructure_
38 |
39 | 7. **Building a Model from Scratch**
40 | * Data preparation
41 | * Model choice
42 | * Training and testing
43 | * _Exercise: Crystal hardness II_
44 |
45 | 8. **Accelerated Discovery**
46 | * Automated experiments
47 | * Bayesian optimisation
48 | * Reinforcement learning
49 | * _Exercise: Closed-loop optimisation_
50 |
51 | 9. **Generative Artificial Intelligence**
52 | * Large language models
53 | * From latent space to diffusion
54 | * _Exercise: Research challenge_
55 |
56 | 10. **Recent Advances**
57 | * Guest lecture
58 | * _Exercise: Research challenge_
59 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/Learning.md:
--------------------------------------------------------------------------------
1 | # Learning Outcomes
2 |
3 | At the end of this course, you will be able to:
4 |
5 | - Specify and interpret the central concepts underpinning supervised, unsupervised, and reinforcement learning.
6 |
7 | - Describe approaches for materials representation including chemical composition and crystal structure.
8 |
9 | - Discover structure and property information from public databases using Python.
10 |
11 | - Compare a range of classical machine learning and deep learning approaches.
12 |
13 | - Train and evaluate machine learning models for chemical problems.
14 |
--------------------------------------------------------------------------------
/Lecture1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "uDQYZDh0ciGP"
7 | },
8 | "source": [
9 | "# Introduction"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "
\n",
17 | " 💡 Ada Lovelace: The more I study, the more insatiable do I feel my genius for it to be.\n",
18 | "
"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "qlQmXeW3ciGS"
25 | },
26 | "source": [
27 | "\n",
28 | "\n",
29 | "[Lecture slides](https://speakerdeck.com/aronwalsh/machine-learning-for-materials-lecture-1)"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {
35 | "id": "aB6tYmdQciGS",
36 | "tags": []
37 | },
38 | "source": [
39 | "## 👋 Getting started\n",
40 | "\n",
41 | "Welcome to our first practical session!\n",
42 | "\n",
43 | "This is a Jupyter Notebook loaded inside a Jupyter Book. They are part of [Project Jupyter](https://jupyter.org), a suite of open-source tools. A Jupyter Notebook also allows you to run and easily share computer code. This combination makes Jupyter notebooks a useful tool for analysing data.\n",
44 | "\n",
45 | "Unlike spreadsheets or combinations of separate data analysis codes, you can collect descriptions and notes for individual experiments, links to the raw data collected, the computer code that performs any necessary data analysis, and the final figures generated with these data, ready for use in a report or published paper.\n",
46 | "\n",
47 | "There are a few components to be aware of:\n",
48 | "\n",
49 | "### Python\n",
50 | "A working knowledge of the [Python](https://www.python.org) programming language is assumed for this course. If you are rusty, Chapters 1-4 of [Datacamp](https://www.datacamp.com/courses/intro-to-python-for-data-science) cover the base concepts, as do many other online resources including Imperial's [Introduction to Python](https://www.imperial.ac.uk/students/academic-support/graduate-school/professional-development/doctoral-students/research-computing-data-science/courses/python-for-researchers) course.\n",
51 | "\n",
52 | "\n",
61 | "\n",
62 | "
\n",
63 | "
Choose your degree programme:
\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | "
\n",
68 | "
\n",
69 | " \n",
70 | " \n",
71 | "
\n",
72 | "
\n",
73 | "\n",
74 | "
\n",
75 | "
If MSc, have you completed the introductory Python course:
\n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | "
\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | "
\n",
84 | "
\n",
85 | "\n",
86 | "
\n",
87 | "
Rate your current Python level:
\n",
88 | "
\n",
89 | " \n",
90 | " \n",
91 | "
\n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | "
\n",
96 | "
\n",
97 | " \n",
98 | " \n",
99 | "
\n",
100 | "
\n",
101 | "\n",
102 | "### Markdown\n",
103 | "Markdown is a markup language that allows easy formatting of text. It is widely used for creating and formatting online content. It is easier to read and write than html. A guide to the syntax can be found [here](https://www.markdownguide.org/basic-syntax/).\n",
104 | "\n",
105 | "```\n",
106 | "# Heading\n",
107 | "## Smaller heading\n",
108 | "### Even smaller heading\n",
109 | "```\n",
110 | "\n",
111 | "### Github\n",
112 | "[GitHub](https://github.com) is a platform for writing and sharing code. There are many materials science projects hosted there, which enable researchers from around the world to contribute to their development. These notebooks are hosted on GitHub too. If you find an error, you can raise an [issue](https://github.com/aronwalsh/MLforMaterials/issues) or even better fix it yourself with a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests).\n",
113 | "\n",
114 | "### Live coding\n",
115 | "The weekly notebooks are designed to be run online directly in your browser. You can activate the server by clicking the rocket icon on the top right and selecting `Live Code`. There is an option to open in [Binder](https://mybinder.org) or [Google Colab](https://colab.research.google.com). Colab is more powerful, but the formatting won't be as nice. You can opt to install Python on your own computer with [Anaconda](https://www.anaconda.com/products/distribution) and run the notebooks locally, but we do not offer support if things go wrong."
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "id": "95tjL6dJciGU"
122 | },
123 | "source": [
124 | "## Analyse data with code\n",
125 | "\n",
126 | "By programming a series of instructions, researchers can consistently obtain the same results from a given dataset. This approach enables us to share datasets and code, allowing other scientists to review, repeat and reuse the analysis. The transparency and reproducibility of code-based analysis enhances research integrity and credibility, while minimising errors. It also enables efficient handling of large datasets and complex calculations, accelerating the exploration of different techniques.\n",
127 | "\n",
128 | "### Running code\n",
129 | "\n",
130 | "Different programming languages can be used in Jupyter notebooks. We will be using Python 3. The large scientific community for Python means that well-developed resources exist for data processing and specific prewritten tools for manipulating and plotting data.\n",
131 | "\n",
132 | "Any code typed into a code cell can be run (executed) by pressing the `run` button. You can also run the selected code block using `Shift-Enter` combination on your keyboard."
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "colab": {
140 | "base_uri": "https://localhost:8080/"
141 | },
142 | "collapsed": false,
143 | "id": "wCimyGVFciGU",
144 | "jupyter": {
145 | "outputs_hidden": false
146 | },
147 | "outputId": "6f46572f-a956-4342-def3-8713a99c224d"
148 | },
149 | "outputs": [],
150 | "source": [
151 | "2+3 # run this cell"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "colab": {
159 | "base_uri": "https://localhost:8080/"
160 | },
161 | "collapsed": false,
162 | "id": "2VOKhE8pciGW",
163 | "jupyter": {
164 | "outputs_hidden": false
165 | },
166 | "outputId": "c14bdab6-a0e1-4181-b0bb-dc43afb85865"
167 | },
168 | "outputs": [],
169 | "source": [
170 | "print(\"Beware of 小妖精\") # anything after '#' is a comment and ignored"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "colab": {
178 | "base_uri": "https://localhost:8080/"
179 | },
180 | "collapsed": false,
181 | "id": "iRqw3mAwciGW",
182 | "jupyter": {
183 | "outputs_hidden": false
184 | },
185 | "outputId": "e774b03f-36f0-420c-9d2d-d29426602fa3"
186 | },
187 | "outputs": [],
188 | "source": [
189 | "12*2.40*3737*12 # you get the idea"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {
196 | "colab": {
197 | "base_uri": "https://localhost:8080/"
198 | },
199 | "collapsed": false,
200 | "id": "unZ26LEociGW",
201 | "jupyter": {
202 | "outputs_hidden": false
203 | },
204 | "outputId": "65ccf1d5-52a2-49c7-dec9-6999d12ddd8e"
205 | },
206 | "outputs": [],
207 | "source": [
208 | "2**1000 - 2 # a big number"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "metadata": {
215 | "colab": {
216 | "base_uri": "https://localhost:8080/"
217 | },
218 | "collapsed": false,
219 | "id": "MyM32PMxciGW",
220 | "jupyter": {
221 | "outputs_hidden": false
222 | },
223 | "outputId": "a53bd082-8a05-4dbf-c5cb-807809c725aa"
224 | },
225 | "outputs": [],
226 | "source": [
227 | "import math as m # import a math module\n",
228 | "m.pi"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {
235 | "colab": {
236 | "base_uri": "https://localhost:8080/"
237 | },
238 | "collapsed": false,
239 | "id": "P574cgsSciGX",
240 | "jupyter": {
241 | "outputs_hidden": false
242 | },
243 | "outputId": "a1f16417-6f1d-417b-b6ad-5ab5321a5dfd"
244 | },
245 | "outputs": [],
246 | "source": [
247 | "20*m.atan(1/7)+8*m.atan(3/79) # Euler's approximation"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {
253 | "id": "5B698R2pciGX"
254 | },
255 | "source": [
256 | "### Plotting with Matplotlib"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {
262 | "id": "neke0J4KifCW"
263 | },
264 | "source": [
265 | "Let's import the package [Matplotlib](https://matplotlib.org), which we will be using a lot for data visualisation."
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {
272 | "colab": {
273 | "base_uri": "https://localhost:8080/",
274 | "height": 448
275 | },
276 | "collapsed": false,
277 | "id": "oyBEOTXociGX",
278 | "jupyter": {
279 | "outputs_hidden": false
280 | },
281 | "outputId": "8b8a7522-08e3-4235-b064-f64adbf1b6b1",
282 | "tags": []
283 | },
284 | "outputs": [],
285 | "source": [
286 | "# Imports\n",
287 | "import matplotlib.pyplot as plt # Plotting\n",
288 | "import numpy as np # Numerical operations\n",
289 | "%matplotlib inline\n",
290 | "\n",
291 | "x = np.arange(0, 10, 0.001) # x = 0 to 10 in steps of 0.001\n",
292 | "y = np.sin(x*x) # define your function\n",
293 | "plt.figure(figsize=(5, 3)) # create a new figure (5x3 inches)\n",
294 | "plt.plot(,y) # plot x against y"
295 | ]
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {
300 | "id": "lxLc8V4tb5zh"
301 | },
302 | "source": [
303 | "\n",
304 | " Code hint \n",
305 | "You need to plot x vs y. Fix the plot command to (x,y).\n",
306 | ""
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {
312 | "id": "Z8_nYqMH2MW9"
313 | },
314 | "source": [
315 | "### Using a DataFrame\n",
316 | "\n",
317 | "A DataFrame organises data into a 2-dimensional table of rows and columns, much like a spreadsheet. They are useful tools to store, access, and modify large sets of data. \n",
318 | "\n",
319 | "In this module, we'll make use of [Pandas](https://pandas.pydata.org) to process input and output data for our machine learning models."
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {
326 | "colab": {
327 | "base_uri": "https://localhost:8080/"
328 | },
329 | "id": "UamDrzLn2LoS",
330 | "outputId": "47b9cf3b-9333-46d7-e785-d0b936bbc93e",
331 | "tags": []
332 | },
333 | "outputs": [],
334 | "source": [
335 | "import pandas as pd # Data manipulation using DataFrames\n",
336 | "\n",
337 | "df = pd.DataFrame() # This instantiates an empty pandas DataFrame\n",
338 | "\n",
339 | "data = {\n",
340 | " \"Element\" : ['C', 'O', 'Fe', 'Mg', 'Xe'],\n",
341 | " \"Atomic Number\" : [6, 8, 26, 12, 54],\n",
342 | " \"Atomic Mass\" : [12, 16, 56, 24, 131]\n",
343 | "}\n",
344 | "\n",
345 | "# Let's try loading data into DataFrame df\n",
346 | "df = pd.DataFrame(data)\n",
347 | "\n",
348 | "# We can make the 'Element' column the index using the set_index function\n",
349 | "df = df.set_index(\"Element\")\n",
350 | "\n",
351 | "# Printing the values in the 'Atomic Number' column\n",
352 | "print(df[\"Atom Number\"])"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {
358 | "id": "BOl6MmQuifCW",
359 | "tags": []
360 | },
361 | "source": [
362 | "\n",
363 | " Code hint \n",
364 | "Check you are printing the correct column name. Try out some of the other options.\n",
365 | ""
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {
372 | "colab": {
373 | "base_uri": "https://localhost:8080/"
374 | },
375 | "id": "gcUlJMzWb5zi",
376 | "outputId": "7bb8809f-3477-4593-f177-857e9bc1a1b4",
377 | "tags": []
378 | },
379 | "outputs": [],
380 | "source": [
381 | "# Add a new column\n",
382 | "df[\"Energy (eV)\"] = [5.47, 5.14, 0.12, 4.34, 7.01]\n",
383 | "\n",
384 | "print(df[\"Energy (eV)\"])"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "metadata": {
391 | "colab": {
392 | "base_uri": "https://localhost:8080/"
393 | },
394 | "id": "HxPmwuvub5zi",
395 | "outputId": "bbe23a6f-6569-40dc-d5ed-64431df3a9be",
396 | "tags": []
397 | },
398 | "outputs": [],
399 | "source": [
400 | "# Print a row from the DataFrame\n",
401 | "\n",
402 | "# The df.loc[index] function to print the entry \"C\"\n",
403 | "print(df.loc[''])\n",
404 | "\n",
405 | "print('-----')\n",
406 | "\n",
407 | "# The df.iloc[index] function to print the first entry (counting starts at 0...)\n",
408 | "print(df.iloc[0])"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {
414 | "id": "LsKd-p8Ob5zi"
415 | },
416 | "source": [
417 | "\n",
418 | " Code hint \n",
419 | "You need to tell `df.loc` what to look for. Put an element name in between the quotes.\n",
420 | ""
421 | ]
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {
426 | "id": "Ug7HnFwUciGX"
427 | },
428 | "source": [
429 | "### Write an equation"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {
435 | "id": "tlakAjCMciGX"
436 | },
437 | "source": [
438 | "This equation is written in [LaTeX](https://www.overleaf.com/learn/latex/Learn_LaTeX_in_30_minutes) format. It's easy to learn and useful for complex expressions, e.g. `\\frac{x}{y}` writes x/y as a fraction $\\dfrac{x}{y}$.\n",
439 | "\n",
440 | "`$-\\frac{\\hslash^2}{2m} \\, \\frac{\\partial^2 \\psi}{\\partial x^2}$`\n",
441 | "\n",
442 | "renders as\n",
443 | "\n",
444 | "$-\\dfrac{\\hslash^2}{2m} \\, \\dfrac{\\partial^2 \\psi}{\\partial x^2}$"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "metadata": {
450 | "id": "6LT9mCDQciGX"
451 | },
452 | "source": [
453 | "### Link an image"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {
459 | "id": "oteHuO9DciGY"
460 | },
461 | "source": [
462 | "The syntax employed here is Markdown. It can be used in notebooks, is popular on Github for documentation, and can even be a fast way to take notes during lectures.\n",
463 | "\n",
464 | "``\n",
465 | "\n",
466 | "which renders as\n",
467 | "\n",
468 | ""
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {
474 | "id": "8uepYP7rciGY"
475 | },
476 | "source": [
477 | "## Computational science\n",
478 | "\n",
479 | "### Thermally-actived diffusion\n",
480 | "\n",
481 | "Ion transport in crystals is a fundamental process that underpins various technological applications, from batteries to semiconductor devices. Understanding the kinetics of ion movement within and between materials is crucial for optimising device performance.\n",
482 | "\n",
483 | "Like many chemical processes, solid-state diffusion transport is thermally activated. We can describe ion motion in a crystal using a familiar Arrhenius relationship.\n",
484 | "\n",
485 | "The diffusion coefficient of a species is given by $D_{ion} = D_0 \\cdot e^{-(\\frac{\\Delta E_a}{k_BT})}$, where:\n",
486 | "- $D_{ion}$ is the diffusion coefficient for a particular ion,\n",
487 | "- $D_0$ is the temperature-independent prefactor (containing an attempt frequency),\n",
488 | "- $\\Delta E_a$ is the activation energy for diffusion,\n",
489 | "- $k_B$ is the Boltzmann constant, and\n",
490 | "- $T$ is the temperature.\n",
491 | "\n",
492 | "Let's write a function for it, which will take advantage of the wonderful [NumPy](https://numpy.org) package. It also uses the [physical constants](https://docs.scipy.org/doc/scipy/reference/constants.html#physical-constants) in [SciPy](https://scipy.org), and explains the function with a [docstring](https://en.wikipedia.org/wiki/Docstring)."
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": null,
498 | "metadata": {
499 | "tags": []
500 | },
501 | "outputs": [],
502 | "source": [
503 | "import numpy as np\n",
504 | "from scipy.constants import physical_constants\n",
505 | "\n",
506 | "# Define constants\n",
507 | "k_B = physical_constants['Boltzmann constant in eV/K'][0]\n",
508 | "\n",
509 | "# Arrhenius function\n",
510 | "def arrhenius(activation_energy, temperature, D0=1):\n",
511 | " \"\"\"\n",
512 | " Calculates the rate using the Arrhenius equation.\n",
513 | " \n",
514 | " Parameters:\n",
515 | " activation_energy (float): the activation energy in eV.\n",
516 | " temperature (float): the temperature in K (must be > 0).\n",
517 | " D0 (float): the pre-exponential factor (default is 1).\n",
518 | " \n",
519 | " Returns:\n",
520 | " float: the rate of the reaction.\n",
521 | " \"\"\"\n",
522 | " if np.any(temperature <= 0):\n",
523 | " raise ValueError(\"Temperature must be greater than 0 K\")\n",
524 | " return D0 * np.exp(-activation_energy / (k_B * temperature))"
525 | ]
526 | },
527 | {
528 | "cell_type": "markdown",
529 | "metadata": {
530 | "id": "R8aKxKtuciGY"
531 | },
532 | "source": [
533 | "This function takes `activation_energy` (eV) and `temperature` (K) as inputs and returns the corresponding diffusion coefficient. Recall that the units of the exponential term cancel out, so $D_{ion}$ takes the same units as $D_0$. Now let's use the function:"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {
540 | "colab": {
541 | "base_uri": "https://localhost:8080/"
542 | },
543 | "id": "gO22e47tciGY",
544 | "outputId": "7f1557d3-674b-45b3-e878-7de2021946ae",
545 | "tags": []
546 | },
547 | "outputs": [],
548 | "source": [
549 | " # Call the function for Ea = 0.12 eV; T = 1000 K\n",
550 | "arrhenius(0.12, 1000) "
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {
556 | "id": "7rMW0e9bciGY"
557 | },
558 | "source": [
559 | "This value tells us the likelihood that each attempt has of overcoming the thermodynamic barrier for ionic diffusion. Decrease the temperature to 100 K and see the difference.\n",
560 | "\n",
561 | "Now let's take advantage of the function to make a plot. We will use the numpy function `linspace`, which is documented over [here](https://numpy.org/doc/stable/reference/generated/numpy.linspace.html). It is used here to generate 100 numbers evenly spaced between 100 and 5000 that represent the temperature range of our \"experiments\"."
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {
568 | "colab": {
569 | "base_uri": "https://localhost:8080/",
570 | "height": 472
571 | },
572 | "id": "WkWCrwDsb5zj",
573 | "outputId": "f4fade20-aa13-4ab5-9ba9-c6434e97e16b",
574 | "tags": []
575 | },
576 | "outputs": [],
577 | "source": [
578 | "import matplotlib.pyplot as plt\n",
579 | "\n",
580 | "# Pre-exponential term in cm^2/s\n",
581 | "D0 = 0.5\n",
582 | "\n",
583 | "# Range of activation energies in eV\n",
584 | "activation_energies = np.linspace(0.1, 1, 0) # Range from 0.1 to 1 eV in n steps\n",
585 | "\n",
586 | "# Temperature range in K\n",
587 | "T = np.linspace(100, 5000, 100)\n",
588 | "\n",
589 | "# Calculate rates and plot curves\n",
590 | "plt.figure(figsize=(5, 3)) \n",
591 | "\n",
592 | "for activation_energy in activation_energies:\n",
593 | " rates = arrhenius(activation_energy, T, D0)\n",
594 | " plt.plot(T, rates, label=f'{activation_energy:.1f} eV')\n",
595 | "\n",
596 | "plt.xlabel('Temperature (K)')\n",
597 | "plt.ylabel('$D_{ion}$ (cm$^2$/s)') \n",
598 | "plt.title('Varying activation energy')\n",
599 | "plt.legend()\n",
600 | "plt.grid(True)\n",
601 | "plt.show()"
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "metadata": {
607 | "id": "8DvhjGpCb5zk"
608 | },
609 | "source": [
610 | "\n",
611 | " Code hint \n",
612 | "'np.linspace' requires three arguments (start, stop, number of points). 0 points won't work. Try changing it to 5.\n",
613 | ""
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {
619 | "id": "uWlaJMBQciGZ"
620 | },
621 | "source": [
622 | "To better visualise the trends, we can make an Arrhenius plot by plotting the natural logarithm of $D$ versus the inverse temperature, 1/T. We use 1000/T to give a nicer range on the $x$-axis."
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": null,
628 | "metadata": {
629 | "colab": {
630 | "base_uri": "https://localhost:8080/",
631 | "height": 472
632 | },
633 | "id": "7Wgi_g2wciGZ",
634 | "outputId": "8b602c70-d78e-40c2-a00b-226582c57e43",
635 | "tags": []
636 | },
637 | "outputs": [],
638 | "source": [
639 | "# Plotting ln(R) vs 1000/T\n",
640 | "plt.figure(figsize=(5, 3)) \n",
641 | "\n",
642 | "for activation_energy in activation_energies:\n",
643 | " rates = arrhenius(activation_energy, T, D0)\n",
644 | " plt.plot(1000/T, np.log(rates), label=f'{activation_energy:.1f} eV')\n",
645 | "\n",
646 | "plt.xlabel('1000 / Temperature (1/K)')\n",
647 | "plt.ylabel('ln($D_{ion}$)')\n",
648 | "plt.title('Arrhenius plot')\n",
649 | "plt.legend()\n",
650 | "plt.grid(True)\n",
651 | "plt.show()"
652 | ]
653 | },
654 | {
655 | "cell_type": "markdown",
656 | "metadata": {
657 | "id": "GzN2cRN0ciGZ"
658 | },
659 | "source": [
660 | "The last technique to pick up in this class is data fitting. Later in the module, we will use more complex functions in high dimensions, but let's start with linear regression. There is no need to code this by hand as we can use a [function](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) in the machine learning package [scikit-learn](https://scikit-learn.org). The real power of Python is the quality and quantity of available libraries such as this one."
661 | ]
662 | },
663 | {
664 | "cell_type": "code",
665 | "execution_count": null,
666 | "metadata": {
667 | "colab": {
668 | "base_uri": "https://localhost:8080/"
669 | },
670 | "id": "8eOomWxMb5zk",
671 | "outputId": "b3cf49b2-076a-42c0-a274-486f7270b0ad",
672 | "tags": []
673 | },
674 | "outputs": [],
675 | "source": [
676 | "import numpy as np\n",
677 | "import pandas as pd\n",
678 | "\n",
679 | "num_points = # Number of data points to generate\n",
680 | "\n",
681 | "# Generate random x-y data points\n",
682 | "x_data = np.random.uniform(0, 10, num_points) # Adjust the range as needed\n",
683 | "y_data = np.random.uniform(0, 10, num_points)\n",
684 | "\n",
685 | "# Create a DataFrame\n",
686 | "data = {'X': x_data, 'Y': y_data}\n",
687 | "df = pd.DataFrame(data)\n",
688 | "\n",
689 | "# Print the DataFrame\n",
690 | "print(df)"
691 | ]
692 | },
693 | {
694 | "cell_type": "markdown",
695 | "metadata": {
696 | "id": "Ewl5y52Hb5zk"
697 | },
698 | "source": [
699 | "\n",
700 | " Code hint \n",
701 | "Again you need to choose the number of points. 50 should be fine, but you have the power to decide.\n",
702 | ""
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": null,
708 | "metadata": {
709 | "colab": {
710 | "base_uri": "https://localhost:8080/",
711 | "height": 472
712 | },
713 | "id": "eWwUdHjGciGZ",
714 | "outputId": "c733327e-7559-4f0b-c029-aaf1d843855f",
715 | "tags": []
716 | },
717 | "outputs": [],
718 | "source": [
719 | "from sklearn.linear_model import LinearRegression\n",
720 | "from sklearn.metrics import r2_score, mean_squared_error\n",
721 | "\n",
722 | "# Perform linear regression\n",
723 | "X = df['X'].values.reshape(-1, 1) # Reshape X for compatibility with sklearn\n",
724 | "y = df['Y'].values\n",
725 | "model = LinearRegression().fit(X, y)\n",
726 | "y_pred = model.predict(X)\n",
727 | "\n",
728 | "# Calculate error bars\n",
729 | "residuals = y - y_pred\n",
730 | "error_bars = np.abs(residuals)\n",
731 | "\n",
732 | "# Plot the linear regression line\n",
733 | "plt.figure(figsize=(5, 3)) \n",
734 | "plt.errorbar(df['X'], df['Y'], yerr=error_bars, fmt='o', color='skyblue', label='Prediction errors')\n",
735 | "plt.scatter(df['X'], df['Y'])\n",
736 | "plt.plot(df['X'], y_pred, color='red', label='Regression line')\n",
737 | "plt.xlabel('X')\n",
738 | "plt.ylabel('Y')\n",
739 | "plt.title('Linear regression')\n",
740 | "plt.legend()\n",
741 | "plt.show()"
742 | ]
743 | },
744 | {
745 | "cell_type": "markdown",
746 | "metadata": {
747 | "id": "nCig-VAmciGZ"
748 | },
749 | "source": [
750 | "There are a number of useful analysis tools built into `sklearn`, which we can use to probe the model properties."
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": null,
756 | "metadata": {
757 | "tags": []
758 | },
759 | "outputs": [],
760 | "source": [
761 | "# Print the model parameters and performance\n",
762 | "try:\n",
763 | " print(f'Slope: {model2.coef_[0]:.2f}') # Assuming model.coef_ might be an array for multidimensional X\n",
764 | " print(f'Intercept: {model2.intercept_:.2f}')\n",
765 | " print(f'R^2 Score: {r2_score(y, y_pred):.3f}') # R^2 - coefficient of determination\n",
766 | " print(f'RMSE: {np.sqrt(mean_squared_error(y, y_pred)):.3f}') # Root Mean Squared Error\n",
767 | "except Exception as e:\n",
768 | " print(\"Error in calculating model parameters or performance metrics:\", e)"
769 | ]
770 | },
771 | {
772 | "cell_type": "markdown",
773 | "metadata": {
774 | "id": "7K-w3ba2b5zs"
775 | },
776 | "source": [
777 | "\n",
778 | " Code hint \n",
779 | "Your model is not called `model2`. Try changing the name.\n",
780 | ""
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "metadata": {
786 | "id": "2D92BAYzciGa"
787 | },
788 | "source": [
789 | "## 🚨 Exercise 1\n",
790 | "\n",
791 | "
\n",
792 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
793 | "
\n",
794 | "\n",
795 | "### Your details"
796 | ]
797 | },
798 | {
799 | "cell_type": "code",
800 | "execution_count": null,
801 | "metadata": {
802 | "colab": {
803 | "base_uri": "https://localhost:8080/"
804 | },
805 | "id": "xqgBbaSjb5zs",
806 | "outputId": "27965ae7-5d0b-40f2-f4ae-7757939dfb1d",
807 | "tags": []
808 | },
809 | "outputs": [],
810 | "source": [
811 | "import numpy as np\n",
812 | "\n",
813 | "# Insert your values\n",
814 | "Name = \"No Name\" # Replace with your name\n",
815 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
816 | "\n",
817 | "# Set a random seed using the CID value\n",
818 | "CID = int(CID)\n",
819 | "np.random.seed(CID)\n",
820 | "\n",
821 | "# Print the message\n",
822 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
823 | ]
824 | },
825 | {
826 | "cell_type": "markdown",
827 | "metadata": {
828 | "id": "DIia0_h9ciGa"
829 | },
830 | "source": [
831 | "### Problem\n",
832 | "\n",
833 | "Due to their importance in the electronics industry, the diffusion of atoms in semiconductors has been well studied for decades. Below is a set of data for impurity diffusion in crystalline Si [Source: [Casey and Pearson (1975)](https://link.springer.com/chapter/10.1007/978-1-4684-0904-8_2)]. It has been arranged into a DataFrame for your convenience.\n",
834 | "\n",
835 | "```python\n",
836 | "import pandas as pd\n",
837 | "\n",
838 | "data = {\n",
839 | " 'Impurity': ['B', 'Al', 'Ga', 'In', 'P', 'As', 'Sb', 'Bi'],\n",
840 | " 'Mass': [10.81, 26.98, 69.72, 114.82, 30.97, 74.92, 121.76, 208.98], # atomic mass in g/mol\n",
841 | " 'D0': [5.1, 8.0, 3.6, 16.5, 10.5, 60.0, 12.9, 1.03E3], # cm2/sec\n",
842 | " 'Eact': [3.70, 3.47, 3.51, 3.91, 3.69, 4.20, 3.98, 4.63] # eV\n",
843 | "}\n",
844 | "\n",
845 | "df = pd.DataFrame(data)\n",
846 | "print(df)\n",
847 | "```\n",
848 | "\n",
849 | "Two tasks will be given in class."
850 | ]
851 | },
852 | {
853 | "cell_type": "code",
854 | "execution_count": null,
855 | "metadata": {
856 | "colab": {
857 | "base_uri": "https://localhost:8080/",
858 | "height": 472
859 | },
860 | "id": "g01sLM1xifCa",
861 | "outputId": "6d0c3d79-37f3-4dc4-d246-b069fc19e7ae",
862 | "tags": []
863 | },
864 | "outputs": [],
865 | "source": [
866 | "#Empty block for your answers\n",
867 | "\n",
868 | "\n"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": null,
874 | "metadata": {
875 | "id": "wh5CNdABifCa"
876 | },
877 | "outputs": [],
878 | "source": [
879 | "#Empty block for your answers\n",
880 | "\n",
881 | "\n"
882 | ]
883 | },
884 | {
885 | "cell_type": "markdown",
886 | "metadata": {},
887 | "source": [
888 | "
\n",
889 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
890 | "
\n",
19 | " 💡 Hugh Cartwright: The tools of science are changing; artificial intelligence has spread to the laboratory.\n",
20 | "
"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {
26 | "id": "vN4ra0MROMFp"
27 | },
28 | "source": [
29 | "\n",
30 | "\n",
31 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture5-classical)"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {
37 | "id": "IPrAgT4POMFp"
38 | },
39 | "source": [
40 | "## 🎲 Metal or insulator?\n",
41 | "\n",
42 | "In life, some decisions are difficult to make. We hope that our experience informs a choice that is better than a random guess. The same is true for machine learning models.\n",
43 | "\n",
44 | "There are many situations where we want to classify materials according to their properties. One fundamental characteristic is whether a material is a metal or insulator. For this exercise, we can refer to these as class `0` and class `1` materials, respectively. \n",
45 | "\n",
46 | "From our general knowledge, Cu should be `0` and MgO should be `1`, but what about Tl2O3 or Ni2Zn4?\n",
47 | "\n",
48 | "### Theoretical background\n",
49 | "\n",
50 | "Metals are characterised by their free electrons that facilitate the flow of electric current. This arises from a partially filled conduction band, allowing electrons to move easily when subjected to an electric field.\n",
51 | "\n",
52 | "Insulators are characterised by an occupied valence band and empty conduction band, impeding the flow of current. The absence of charge carriers hinders electrical conductivity, making them effective insulators of electricity. Understanding these fundamental differences is crucial for designing and optimising electronic devices.\n",
53 | "\n",
54 | "In this practical, we can use the electronic band gap of a material as a simple descriptor of whether it is a metal (Eg = 0) or an insulator (Eg > 0).\n",
55 | "\n",
56 | "$$\n",
57 | "E_g = E^{conduction-band}_{minimum} - E^{valence-band}_{maximum}\n",
58 | "$$\n",
59 | "\n",
60 | "This classification is coarse as we are ignoring the intermediate regime of semiconductors and more exotic behaviour such as superconductivity.\n",
61 | "\n",
62 | "\n",
63 | "\n",
64 | "## $k$-means clustering\n",
65 | "\n",
66 | "Let's start by generating synthetic data for materials along with their class labels. To make the analysis faster and more illustrative, we can perform dimensionality reduction from a 10D to 2D feature space, and then cluster the data using $k$-means."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "id": "CLEjvAiAOMFp",
74 | "tags": []
75 | },
76 | "outputs": [],
77 | "source": [
78 | "# Installation of libraries\n",
79 | "!pip install elementembeddings --quiet\n",
80 | "!pip install matminer --quiet"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 2,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Import of modules\n",
90 | "import numpy as np # Numerical operations\n",
91 | "import pandas as pd # DataFrames\n",
92 | "import matplotlib.pyplot as plt # Plotting\n",
93 | "import seaborn as sns # Visualisation\n",
94 | "from sklearn.decomposition import PCA # Principal component analysis (PCA)\n",
95 | "from sklearn.cluster import KMeans # k-means clustering\n",
96 | "from sklearn.metrics import accuracy_score, confusion_matrix # Model evaluation\n",
97 | "from sklearn.tree import DecisionTreeClassifier # Decision tree classifier"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "\n",
105 | "Colab error solution\n",
106 | "If running the import module cell fails with an \"AttributeError\", click `Runtime` -> `Restart Session` and then simply rerun the cell.\n",
107 | ""
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "### Uncorrelated data\n",
115 | "\n",
116 | "Pay attention to each step in the process:"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "# Step 0: Set the number of clusters\n",
126 | "n_clusters = 0\n",
127 | "\n",
128 | "# Step 1: Generating synthetic (random) data\n",
129 | "np.random.seed(42)\n",
130 | "num_materials = 200\n",
131 | "num_features = 10\n",
132 | "data = np.random.rand(num_materials, num_features)\n",
133 | "labels = np.random.randint(0, 2, num_materials)\n",
134 | "\n",
135 | "# Step 2: Reduce dimensions to 2 using PCA\n",
136 | "pca = PCA(n_components=2)\n",
137 | "reduced_data = pca.fit_transform(data)\n",
138 | "\n",
139 | "# Step 3: Cluster the data using k-means\n",
140 | "kmeans = KMeans(n_clusters=n_clusters, random_state=42)\n",
141 | "predicted_labels = kmeans.fit_predict(reduced_data)\n",
142 | "\n",
143 | "# Step 4: Create a plot to visualise the clusters and known labels\n",
144 | "plt.figure(figsize=(5, 4))\n",
145 | "\n",
146 | "# Plot the materials labeled as metal (label=1)\n",
147 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n",
148 | "# Plot the materials labeled as insulator (label=0)\n",
149 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n",
150 | "# Plot the cluster centres as stars\n",
151 | "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='gold', s=200, label='Cluster centres', marker='*')\n",
152 | "\n",
153 | "# Draw cluster boundaries\n",
154 | "h = 0.02 # step size for the meshgrid\n",
155 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n",
156 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n",
157 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
158 | "Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])\n",
159 | "Z = Z.reshape(xx.shape)\n",
160 | "plt.contourf(xx, yy, Z, alpha=0.2, cmap='Pastel1')\n",
161 | "\n",
162 | "plt.xlabel('Principal Component 1')\n",
163 | "plt.ylabel('Principal Component 2')\n",
164 | "plt.title('$k$-means clustering of synthetic data')\n",
165 | "plt.legend()\n",
166 | "plt.show()"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "id": "iyjuZJYmOMFq"
173 | },
174 | "source": [
175 | "\n",
176 | " Code hint \n",
177 | "The algorithm fails for 0 clusters. \n",
178 | "Increase the value of `n_clusters` and look at the behaviour.\n",
179 | ""
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {
185 | "id": "RWChaS_mOMFr"
186 | },
187 | "source": [
188 | "The cluster centres are shown by yellow stars. The model doesn't perform well, as we just generated this \"materials data\" from random numbers. There are no correlations for the algorithms to exploit. Nonetheless, this type of \"failed experiment\" is common in real research.\n",
189 | "\n",
190 | "Since we know the labels, we can quantify how bad the model using the classification accuracy. Is it better than flipping a coin? "
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {
197 | "colab": {
198 | "base_uri": "https://localhost:8080/"
199 | },
200 | "id": "XyShX2J-OMFr",
201 | "outputId": "c3c1425f-4354-4080-c42d-f025bedca416",
202 | "tags": []
203 | },
204 | "outputs": [],
205 | "source": [
206 | "# Step 5: Quantify classification accuracy\n",
207 | "accuracy = accuracy_score(labels, predicted_labels)\n",
208 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n",
209 | "\n",
210 | "print(\"Accuracy:\", accuracy)\n",
211 | "print(\"\\nConfusion matrix:\")\n",
212 | "print(conf_matrix)"
213 | ]
214 | },
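For comparison, a minimal sketch of the coin-flip baseline mentioned above, assuming the `labels` array from the cells earlier:

```python
# Minimal sketch: accuracy of random guessing, assuming `labels` from the cells above
import numpy as np
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
coin_flips = rng.integers(0, 2, size=len(labels))  # random 0/1 guesses
print("Coin-flip accuracy:", accuracy_score(labels, coin_flips))
```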
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {
218 | "id": "uSM_A4-ZOMFr"
219 | },
220 | "source": [
221 | "## Decision tree classifier\n",
222 | "\n",
223 | "Let's see if we can do better using a dedicated classifier. We will now train a decision tree to tackle the same problem and visualise the decision boundary."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "colab": {
231 | "base_uri": "https://localhost:8080/",
232 | "height": 564
233 | },
234 | "id": "ZKbtozXuOMFr",
235 | "outputId": "1e23e4bb-e043-4c37-b74a-413a5e002bc1",
236 | "tags": []
237 | },
238 | "outputs": [],
239 | "source": [
240 | "# Step 0: Set the depth of the decision tree\n",
241 | "max_tree_depth = 0\n",
242 | "\n",
243 | "# Step 1: Train a decision tree classifier\n",
244 | "def train_decision_tree(depth, reduced_data, labels):\n",
245 | " tree_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)\n",
246 | " tree_classifier.fit(reduced_data, labels)\n",
247 | " return tree_classifier\n",
248 | "\n",
249 | "tree_classifier = train_decision_tree(max_tree_depth, reduced_data, labels)\n",
250 | "predicted_labels = tree_classifier.predict(reduced_data)\n",
251 | "\n",
252 | "# Step 2: Create a plot to visualise the decision boundary of the decision tree\n",
253 | "plt.figure(figsize=(5, 4))\n",
254 | "\n",
255 | "# Plot the materials labeled as metal (label=1)\n",
256 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n",
257 | "# Plot the materials labeled as insulator (label=0)\n",
258 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n",
259 | "# Plot the decision boundary of the decision tree classifier\n",
260 | "h = 0.02 # step size for the meshgrid\n",
261 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n",
262 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n",
263 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
264 | "Z = tree_classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n",
265 | "Z = Z.reshape(xx.shape)\n",
266 | "plt.contourf(xx, yy, Z, alpha=0.5, cmap='Pastel1')\n",
267 | "\n",
268 | "plt.xlabel('Principal Component 1')\n",
269 | "plt.ylabel('Principal Component 2')\n",
270 | "plt.title(f'Decision tree (max depth={max_tree_depth}) of synthetic data')\n",
271 | "plt.legend()\n",
272 | "\n",
273 | "plt.show()"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {
279 | "id": "SW0VbC_4OMFr"
280 | },
281 | "source": [
282 | "\n",
283 | " Code hint \n",
284 | "With no nodes, you have made an indecisive tree 🥁.\n",
285 | " \n",
286 | "Increase the value of `max_tree_depth` and look at the behaviour.\n",
287 | ""
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {
293 | "id": "sOqtQymnOMFs"
294 | },
295 | "source": [
296 | "There should be more structure in the decision boundary due to the more complex model, especially as you increase the tree depth.\n",
297 | "\n",
298 | "$k$-means clustering provides a simple way to group materials based on similarity, yielding a clear linear decision boundary. On the other hand, the decision tree classifier does better in handling non-linear separations. It constructs a boundary based on different feature thresholds, enabling it to capture fine-grained patterns. As always in ML, there is a balance of trade-offs between simplicity and accuracy.\n",
299 | "\n",
300 | "Is the decision tree more accurate? Let's see."
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {
307 | "colab": {
308 | "base_uri": "https://localhost:8080/"
309 | },
310 | "id": "PucrDphBOMFs",
311 | "outputId": "09ad3aa5-1ee9-4597-e2ca-c38b5d85057b",
312 | "tags": []
313 | },
314 | "outputs": [],
315 | "source": [
316 | "# Step 3: Quantify classification accuracy\n",
317 | "accuracy = accuracy_score(labels, predicted_labels)\n",
318 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n",
319 | "\n",
320 | "print(\"Accuracy:\", accuracy)\n",
321 | "print(\"\\nConfusion Matrix:\")\n",
322 | "print(conf_matrix)"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {
328 | "id": "SNQRw2RhOMFs"
329 | },
330 | "source": [
331 | "If you choose a large value for the tree depth, the decision tree will approach a perfect accuracy of 1.0. It does this by memorising (overfitting) the training data but is unlikely to generalise well to new (unseen) data, i.e. overfitting. In contrast, the accuracy of $k$-means clustering is lower because it is an unsupervised algorithm designed for clustering, not classification. Its performance depends on the data structure and the presence of distinct clusters in that feature space.\n",
332 | "\n",
333 | "### Correlated data\n",
334 | "\n",
335 | "Let's try again, but this time we will (manually) add some correlations into the dataset."
336 | ]
337 | },
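{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Before adding the correlations, here is a quick way to see the overfitting described above. This is a minimal sketch, assuming the `reduced_data` and `labels` arrays from the synthetic example: we hold out 30% of the points and compare training and test accuracy as the tree depth grows (the depths chosen are arbitrary).\n",
  "\n",
  "```python\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn.tree import DecisionTreeClassifier\n",
  "from sklearn.metrics import accuracy_score\n",
  "\n",
  "# Hold out 30% of the synthetic data as a test set\n",
  "X_tr, X_te, y_tr, y_te = train_test_split(reduced_data, labels, test_size=0.3, random_state=42)\n",
  "\n",
  "for depth in [1, 3, 10, None]:  # None = grow the tree until all leaves are pure\n",
  "    tree = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_tr, y_tr)\n",
  "    train_acc = accuracy_score(y_tr, tree.predict(X_tr))\n",
  "    test_acc = accuracy_score(y_te, tree.predict(X_te))\n",
  "    print(f'depth={depth}: train accuracy = {train_acc:.2f}, test accuracy = {test_acc:.2f}')\n",
  "```\n",
  "\n",
  "For random (uncorrelated) features, the training accuracy keeps rising with depth while the test accuracy stays close to chance, which is the signature of overfitting."
 ]
},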
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "# Modify dataset with correlation\n",
345 | "correlation_strength = 0.333\n",
346 | "for i in range(num_features):\n",
347 | " # For some features, add a linear correlation with the labels\n",
348 | " if i % 2 == 0: # Correlate every other feature\n",
349 | " data[:, i] = correlation_strength * labels + (1 - correlation_strength) * np.random.rand(num_materials)\n",
350 | "\n",
351 | "pca = PCA(n_components=2)\n",
352 | "reduced_data = pca.fit_transform(data)\n",
353 | "\n",
354 | "# Step 0: Set the depth of the decision tree\n",
355 | "max_tree_depth = 1\n",
356 | "\n",
357 | "# Step 1: Train a decision tree classifier\n",
358 | "def train_decision_tree(depth, reduced_data, labels):\n",
359 | " tree_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)\n",
360 | " tree_classifier.fit(reduced_data, labels)\n",
361 | " return tree_classifier\n",
362 | "\n",
363 | "tree_classifier = train_decision_tree(max_tree_depth, reduced_data, labels)\n",
364 | "predicted_labels = tree_classifier.predict(reduced_data)\n",
365 | "\n",
366 | "# Step 2: Create a plot to visualise the decision boundary of the decision tree\n",
367 | "plt.figure(figsize=(5, 4))\n",
368 | "\n",
369 | "# Plot the materials labeled as metal (label=1)\n",
370 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n",
371 | "# Plot the materials labeled as insulator (label=0)\n",
372 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n",
373 | "# Plot the decision boundary of the decision tree classifier\n",
374 | "h = 0.02 # step size for the meshgrid\n",
375 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n",
376 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n",
377 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
378 | "Z = tree_classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n",
379 | "Z = Z.reshape(xx.shape)\n",
380 | "plt.contourf(xx, yy, Z, alpha=0.5, cmap='Pastel1')\n",
381 | "\n",
382 | "plt.xlabel('Principal Component 1')\n",
383 | "plt.ylabel('Principal Component 2')\n",
384 | "plt.title(f'Decision tree (max depth={max_tree_depth}) for artificial materials')\n",
385 | "plt.legend()\n",
386 | "\n",
387 | "plt.show()\n",
388 | "\n",
389 | "# Step 3: Quantify classification accuracy\n",
390 | "accuracy = accuracy_score(labels, predicted_labels)\n",
391 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n",
392 | "\n",
393 | "print(\"Accuracy:\", accuracy)\n",
394 | "print(\"\\nConfusion Matrix:\")\n",
395 | "print(conf_matrix)"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "Now even a very simple tree can effectively draw a decision boundary. Machine learning models take advantage of such correlations in high dimensional feature spaces. You can modify the correlation strength on line 2 to see the effect."
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "id": "yAJreGhfOMFs",
409 | "tags": []
410 | },
411 | "source": [
412 | "## Real materials\n",
413 | "\n",
414 | "We can save time again by making use of a pre-built dataset. We will return to [matminer](https://hackingmaterials.lbl.gov/matminer), which we used before, and load `matbench_expt_is_metal`.\n",
415 | "\n",
416 | "### Load dataset"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {
423 | "tags": []
424 | },
425 | "outputs": [],
426 | "source": [
427 | "import matminer\n",
428 | "from matminer.datasets.dataset_retrieval import load_dataset\n",
429 | "\n",
430 | "# Use matminer to download the dataset\n",
431 | "df = load_dataset('matbench_expt_is_metal')\n",
432 | "print(f'The full dataset contains {df.shape[0]} entries. \\n')\n",
433 | "\n",
434 | "# Display the first 10 entries\n",
435 | "df.head(10)"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {
441 | "id": "sXq9bXwGOMFs"
442 | },
443 | "source": [
444 | "\n",
445 | " Code hint \n",
446 | "To load a different dataset, you simply change the name in 'load_dataset()'.\n",
447 | ""
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {
453 | "id": "Y76d3NLhZADO"
454 | },
455 | "source": [
456 | "### Materials featurisation\n",
457 | "\n",
458 | "Revisiting concepts from earlier Notebooks, featurising the chemical compositions is necessary to create a useful set of input vectors. This allows the presence (or absence) of an element (or element combinations) to act as a feature that the classifier takes account for.\n",
459 | "\n",
460 | "We will use [ElementEmbeddings](https://wmd-group.github.io/ElementEmbeddings) to featurise the `composition` column. The importance of the pooling method can be tested by generating two sets of features. In the first, the mean of the atomic vectors is used, while in the second, a max pooling method takes the maximum value of each component across all the atomic vectors in the composition."
461 | ]
462 | },
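{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "To make the difference between the two pooling strategies concrete, here is a minimal sketch that pools two made-up 4-dimensional element vectors by hand (the numbers are invented for illustration and are not real embeddings):\n",
  "\n",
  "```python\n",
  "import numpy as np\n",
  "\n",
  "# Hypothetical embedding vectors for the two elements in a binary composition\n",
  "vec_a = np.array([0.2, 1.0, 0.5, 3.0])\n",
  "vec_b = np.array([0.8, 0.4, 0.5, 1.0])\n",
  "vectors = np.stack([vec_a, vec_b])\n",
  "\n",
  "mean_pool = vectors.mean(axis=0)  # average of each component -> [0.5, 0.7, 0.5, 2.0]\n",
  "max_pool = vectors.max(axis=0)    # maximum of each component -> [0.8, 1.0, 0.5, 3.0]\n",
  "\n",
  "print('Mean pooled:', mean_pool)\n",
  "print('Max pooled: ', max_pool)\n",
  "```\n",
  "\n",
  "Mean pooling keeps a contribution from every atom, while max pooling is dominated by whichever element has the largest value of each feature."
 ]
},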
463 | {
464 | "cell_type": "code",
465 | "execution_count": null,
466 | "metadata": {
467 | "colab": {
468 | "base_uri": "https://localhost:8080/",
469 | "height": 559
470 | },
471 | "id": "sTJg5-4yY9au",
472 | "outputId": "18f140fe-b6c1-41dc-b1d2-3adfc6c40e73",
473 | "tags": []
474 | },
475 | "outputs": [],
476 | "source": [
477 | "# Featurise all chemical compositions\n",
478 | "from elementembeddings.composition import composition_featuriser\n",
479 | "\n",
480 | "# Compute element embeddings using mean and max pooling\n",
481 | "mean_df = composition_featuriser(df[\"composition\"], embedding=\"magpie\", stats=[\"mean\"])\n",
482 | "max_df = composition_featuriser(df[\"composition\"], embedding=\"magpie\", stats=[\"maxpool\"])\n",
483 | "\n",
484 | "# Convert \"is_metal\" column to integer labels (0, 1)\n",
485 | "df['is_metal'] = df['is_metal'].astype(int)\n",
486 | "mean_df['is_metal'] = df['is_metal']\n",
487 | "max_df['is_metal'] = df['is_metal']\n",
488 | "\n",
489 | "# Define feature matrices and target variable\n",
490 | "cols_to_drop = ['is_metal', 'formula']\n",
491 | "\n",
492 | "X_mean = mean_df.drop(columns=cols_to_drop, errors='ignore').values\n",
493 | "X_max = max_df.drop(columns=cols_to_drop, errors='ignore').values\n",
494 | "y = df['is_metal'].values # Target variable\n",
495 | "\n",
496 | "# Preview first two rows \n",
497 | "print(\"Mean pooling features (first two rows, first 4 columns):\")\n",
498 | "print(mean_df.iloc[:2, :4]) \n",
499 | "print(\"\\nMax pooling features (first two rows, first 4 columns):\")\n",
500 | "print(max_df.iloc[:2, :4]) "
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "In the output, you can see two numerical representations of the chemical compositions using different feature extraction techniques. Now let's see how they cluster."
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "### $k$-means clustering \n",
515 | "\n",
516 | "#### Mean pool"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": null,
522 | "metadata": {},
523 | "outputs": [],
524 | "source": [
525 | "# Perform k-means clustering\n",
526 | "kmeans = KMeans(n_clusters=2, random_state=42)\n",
527 | "predicted_labels = kmeans.fit_predict(X_mean)\n",
528 | "\n",
529 | "# Adjust k-means output to match true labels\n",
530 | "if accuracy_score(y, predicted_labels) < 0.5:\n",
531 | " predicted_labels = 1 - predicted_labels\n",
532 | "\n",
533 | "# Assess performance\n",
534 | "accuracy = accuracy_score(y, predicted_labels)\n",
535 | "print(f\"Accuracy: {accuracy:.2f}\")\n",
536 | "\n",
537 | "conf_matrix = confusion_matrix(y, predicted_labels)\n",
538 | "\n",
539 | "plt.figure(figsize=(5, 4))\n",
540 | "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", \n",
541 | " xticklabels=['Predicted Insulator', 'Predicted Metal'], \n",
542 | " yticklabels=['True Insulator', 'True Metal'])\n",
543 | "plt.xlabel('Predicted label')\n",
544 | "plt.ylabel('True label')\n",
545 | "plt.show()"
546 | ]
547 | },
548 | {
549 | "cell_type": "markdown",
550 | "metadata": {},
551 | "source": [
552 | "#### Max pool"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "# Perform k-means clustering\n",
562 | "kmeans = KMeans(n_clusters=2, random_state=42)\n",
563 | "predicted_labels = kmeans.fit_predict(X_max)\n",
564 | "\n",
565 | "# Adjust k-means output to match true labels\n",
566 | "if accuracy_score(y, predicted_labels) < 0.5:\n",
567 | " predicted_labels = 1 - predicted_labels\n",
568 | "\n",
569 | "# Assess performance\n",
570 | "accuracy = accuracy_score(y, predicted_labels)\n",
571 | "print(f\"Accuracy: {accuracy:.2f}\")\n",
572 | "\n",
573 | "conf_matrix = confusion_matrix(y, predicted_labels)\n",
574 | "\n",
575 | "plt.figure(figsize=(5, 4))\n",
576 | "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", \n",
577 | " xticklabels=['Predicted Insulator', 'Predicted Metal'], \n",
578 | " yticklabels=['True Insulator', 'True Metal'])\n",
579 | "plt.xlabel('Predicted label')\n",
580 | "plt.ylabel('True label')\n",
581 | "plt.show()"
582 | ]
583 | },
584 | {
585 | "cell_type": "markdown",
586 | "metadata": {},
587 | "source": [
588 | "The difference in accuracy between the two methods for this simple example highlights the importance of choosing an appropriate pooling strategy when featurising materials data. In this case, mean pooling provides a more balanced representation, which better distinguishes between metals and insulators."
589 | ]
590 | },
591 | {
592 | "cell_type": "markdown",
593 | "metadata": {
594 | "tags": []
595 | },
596 | "source": [
597 | "## 🚨 Exercise 5\n",
598 | "\n",
599 | "
\n",
600 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
601 | "
\n",
602 | "\n",
603 | "### Your details"
604 | ]
605 | },
606 | {
607 | "cell_type": "code",
608 | "execution_count": null,
609 | "metadata": {},
610 | "outputs": [],
611 | "source": [
612 | "import numpy as np\n",
613 | "\n",
614 | "# Insert your values\n",
615 | "Name = \"No Name\" # Replace with your name\n",
616 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
617 | "\n",
618 | "# Set a random seed using the CID value\n",
619 | "CID = int(CID)\n",
620 | "np.random.seed(CID)\n",
621 | "\n",
622 | "# Print the message\n",
623 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
624 | ]
625 | },
626 | {
627 | "cell_type": "markdown",
628 | "metadata": {
629 | "id": "4WAC3QJYOMFs",
630 | "tags": []
631 | },
632 | "source": [
633 | "### Problem\n",
634 | "\n",
635 | "The choice of featurisation method can significantly impact the performance of machine learning models, particularly in decision trees, which rely on the features to make accurate splits. \n",
636 | "\n",
637 | "Tasks will be given in class focusing on comparing the impact of different featurisation methods on classification performance."
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": null,
643 | "metadata": {},
644 | "outputs": [],
645 | "source": [
646 | "#Empty block for your answers\n",
647 | "\n",
648 | "\n"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": null,
654 | "metadata": {},
655 | "outputs": [],
656 | "source": [
657 | "#Empty block for your answers\n",
658 | "\n",
659 | "\n"
660 | ]
661 | },
662 | {
663 | "cell_type": "markdown",
664 | "metadata": {},
665 | "source": [
666 | "\n",
667 | " Task hint \n",
668 | "For task 4, you can featurise a new composition using a command such as `new_material = composition_featuriser([\"AlGaN2\"], embedding=\"atomic\", stats=[\"sum\"])`\n",
669 | "\n",
670 | "\n",
671 | "
\n",
672 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
673 | "
\n",
15 | " 💡 Mildred Dresselhaus: People said you’re crazy... But if you think you’re right, stick to it. And we were right.\n",
16 | "
"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "\n",
24 | "\n",
25 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture7-build)"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## 🦾 Crystal hardness revisited\n",
33 | "\n",
34 | "We first tackled the [bulk modulus](https://en.wikipedia.org/wiki/Bulk_modulus) of inorganic crystals in Lecture 2. However our model development was not thorough back then.\n",
35 | "\n",
36 | "Let's revisit this problem using the new knowledge and tricks we have picked up. We will follow the same initial steps, making use of [matminer](https://matminer.readthedocs.io) to access the materials dataset and featurise the data."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# Installation of libraries\n",
46 | "!pip install matminer --quiet\n",
47 | "!pip install xgboost --quiet"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "# Downgrade scikit to avoid a conflict with xgboost\n",
57 | " # Note: Ignore the error message\n",
58 | "!pip uninstall -y scikit-learn --quiet\n",
59 | "!pip install scikit-learn==1.3.1 --quiet "
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# Import of modules\n",
69 | "import numpy as np \n",
70 | "import matplotlib.pyplot as plt \n",
71 | "import pandas as pd \n",
72 | "import pprint \n",
73 | "import seaborn as sns \n",
74 | "plt.style.use('ggplot') \n",
75 | "\n",
76 | "# Advanced\n",
77 | "from pymatgen.core import Structure \n",
78 | "import matminer \n",
79 | "from matminer.datasets.dataset_retrieval import load_dataset \n",
80 | "from monty.serialization import loadfn \n",
81 | "\n",
82 | "# To make the model run faster\n",
83 | "teaching_mode = True"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "\n",
91 | "Colab error solution\n",
92 | "If running the import module cell fails with an \"AttributeError\", click `Runtime` -> `Restart Session` and then simply rerun the cell. \n",
93 | ""
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {
99 | "tags": []
100 | },
101 | "source": [
102 | "## Data preparation\n",
103 | "\n",
104 | "The steps to load and featurise the bulk modulus data were introduced in Notebook 2, so we can jump straight in."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# Use matminer to load the dataset\n",
114 | "df = load_dataset('matbench_log_kvrh')\n",
115 | "print(f'The full dataset contains {df.shape[0]} entries. \\n')\n",
116 | "\n",
117 | "if teaching_mode:\n",
118 | " # Store the original DataFrame as a copy\n",
119 | " full_dataset_df = df.copy()\n",
120 | " # Create a subset of the original DataFrame for demonstration purposes\n",
121 | " df = df.sample(n=1000, random_state=33)\n",
122 | " print(f'For teaching purposes we will only work with {df.shape[0]} entries from the dataframe to make the model training and testing faster. \\n')\n",
123 | "\n",
124 | "print('The DataFrame is shown below:')\n",
125 | "df.head(10)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "# Plot a histogram of values\n",
135 | "fig, ax = plt.subplots(figsize=(5, 4))\n",
136 | "ax.hist(df['log10(K_VRH)'])\n",
137 | "ax.set_xlabel(r'$log_{10}K_{VRH}$ [$log_{10}GPa$]' )\n",
138 | "ax.set_ylabel('Counts')\n",
139 | "plt.show()"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# Use matminer to featurise the dataset\n",
149 | "from matminer.featurizers.composition.composite import ElementProperty\n",
150 | "from matminer.featurizers.structure.order import DensityFeatures\n",
151 | "\n",
152 | "# Add a composition column to df using the composition property of the Structure class\n",
153 | "df['composition'] = df.structure.apply(lambda x: x.composition )\n",
154 | "\n",
155 | "# Create the ElementProperty featuriser\n",
156 | "el_prop_featuriser = ElementProperty.from_preset(preset_name='magpie')\n",
157 | "\n",
158 | "# By default multiprocessing is enabled, however, this can slow performance, so we disable it\n",
159 | "el_prop_featuriser.set_n_jobs(1)\n",
160 | "\n",
161 | "# Featurise using the ElementProperty featuriser\n",
162 | "df = el_prop_featuriser.featurize_dataframe(df, col_id='composition')\n",
163 | "\n",
164 | "# Add structure features\n",
165 | "density_featuriser = DensityFeatures()\n",
166 | "density_featuriser.set_n_jobs(1)\n",
167 | "df=density_featuriser.fit_featurize_dataframe(df, col_id='structure')\n",
168 | "\n",
169 | "# Print the shape of the DataFrame\n",
170 | "print(df.shape)\n",
171 | "df.head()"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "Let's understand the feature space a little better."
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "# Extract the feature columns (excluding the first three)\n",
188 | "feature_columns = df.columns[3:]\n",
189 | "\n",
190 | "# Create a unique colour for each feature\n",
191 | "colors = [plt.cm.jet(i / float(len(feature_columns))) for i in range(len(feature_columns))]\n",
192 | "\n",
193 | "# Plot the distribution of feature values with different colours\n",
194 | "plt.figure(figsize=(5, 4))\n",
195 | "for i, column in enumerate(feature_columns):\n",
196 | " df[column].plot(kind='hist', bins=0, alpha=0.5, color=colors[i], label=column)\n",
197 | "\n",
198 | "plt.title('Feature Distributions')\n",
199 | "plt.show()"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "\n",
207 | " Code hint \n",
208 | "Add some bins to your histogram. 10-20 should be sufficient.\n",
209 | "\n",
210 | "\n",
211 | "Some dimensions have very different ranges, as you can see from the spread on the x-axis. We can standardise these. \n",
212 | "\n",
213 | "`MinMaxScaler` is a data scaling technique to transform numerical features within the range [0, 1]. It linearly scales data, preserving relationships between values, making it suitable for algorithms sensitive to feature magnitudes."
214 | ]
215 | },
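{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a quick check of what the scaler does, here is a minimal sketch of the min-max transform $x' = (x - x_{min}) / (x_{max} - x_{min})$ applied to a single toy column (the numbers are made up):\n",
  "\n",
  "```python\n",
  "import numpy as np\n",
  "from sklearn.preprocessing import MinMaxScaler\n",
  "\n",
  "column = np.array([[2.0], [4.0], [10.0]])  # toy feature with min = 2 and max = 10\n",
  "\n",
  "scaler = MinMaxScaler()\n",
  "scaled = scaler.fit_transform(column)\n",
  "\n",
  "print(scaled.ravel())  # [0.   0.25 1.  ] since x' = (x - 2) / (10 - 2)\n",
  "```\n",
  "\n",
  "The cell below applies the same transform to every feature column of the DataFrame."
 ]
},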
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "from sklearn.preprocessing import MinMaxScaler\n",
223 | "\n",
224 | "scaled_df = df.copy()\n",
225 | "\n",
226 | "# Step 1: Standardise the feature columns\n",
227 | "scaler = MinMaxScaler()\n",
228 | "scaled_df[feature_columns] = scaler.fit_transform(scaled_df[feature_columns])\n",
229 | "\n",
230 | "# Step 2: Plot the standardised feature distributions\n",
231 | "plt.figure(figsize=(5, 4))\n",
232 | "for column in feature_columns:\n",
233 | " scaled_df[column].plot(kind='hist', bins=20, alpha=0.5, label=column)\n",
234 | "\n",
235 | "plt.title('Standardised Feature Distributions')\n",
236 | "plt.show()"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "Finally, let's prepare the data for model training. We need to split the dataset into the target variable `log10(K_VRH)` and the input features. For the input features, we must remove any non-numerical data to avoid getting errors later in our workflow."
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "# Define the features we want \n",
253 | "features_to_drop = ['structure','composition','log10(K_VRH)']\n",
254 | "feature_cols = [col for col in list(df.columns) if col not in features_to_drop]\n",
255 | "\n",
256 | "# Get an array of the features\n",
257 | "X = df[feature_cols].values\n",
258 | "scaled_X = scaled_df[feature_cols].values\n",
259 | "\n",
260 | "# Get an array of the target variable\n",
261 | "y = df['log10(K_VRH)'].values\n",
262 | "\n",
263 | "print(f'Shape of X: {X.shape}')\n",
264 | "print(f'Shape of y: {y.shape}')"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "## Model choice\n"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "We are dealing with a supervised regression problem, so should choose a suitable machine learning model. We can start by rebuilding a random forest. Are you curious if the feature scaling has an effect? I am."
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "# Random forest - original features\n",
288 | "from sklearn.ensemble import RandomForestRegressor\n",
289 | "from sklearn import metrics\n",
290 | "\n",
291 | "# Define the model\n",
292 | "rf = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=42)\n",
293 | "\n",
294 | "# Fit the model\n",
295 | "rf.fit(X,y)\n",
296 | "\n",
297 | "# Wrap the lines of code for later sections\n",
298 | "def make_prediction_plot(X, y, model, label):\n",
299 | " y_pred = model.predict(X) # Calculate predictions here\n",
300 | " fig, ax = plt.subplots(figsize=(5, 4))\n",
301 | " ax.scatter(y, y_pred, c=y, cmap='viridis')\n",
302 | " ax.plot(y, y, 'r-')\n",
303 | " ax.set_xlabel(f'{label} True')\n",
304 | " ax.set_ylabel(f'{label} Predicted')\n",
305 | " plt.show()\n",
306 | " return y_pred # Return y_pred \n",
307 | "\n",
308 | "# Performance\n",
309 | "y_pred = make_prediction_plot(X, y, rf, 'log10(K_VRH)') \n",
310 | "\n",
311 | "print(f'The training MAE = {metrics.mean_absolute_error(y,y_pred):.3f} log10GPa')\n",
312 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y,y_pred)):.3f} log10GPa')\n",
313 | "print(f'The training r^2 = {rf.score(X,y):.3f}')"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "# Random forest - scaled features\n",
323 | "\n",
324 | "# Define the model\n",
325 | "rf2 = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=42)\n",
326 | "\n",
327 | "# Fit the model\n",
328 | "rf2.fit(scaled_X, y)\n",
329 | "\n",
330 | "# Performance\n",
331 | "y_pred = make_prediction_plot(scaled_X, y, rf2, 'log10(K_VRH)') \n",
332 | "print(f'The training MAE = {metrics.mean_absolute_error(y, y_pred):.3f} log10GPa')\n",
333 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y, y_pred)):.3f} log10GPa')\n",
334 | "print(f'The training r^2 = {rf2.score(scaled_X, y):.3f}')"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "We can see that Random Forest is not sensitive to feature scaling. Recall that this model works by averaging over multiple decision trees, and the decision boundaries are determined by feature thresholds, not their absolute values. \n",
342 | "\n",
343 | "We have time to try one more model. Let's go with the popular [XGBoost](https://xgboost.readthedocs.io). Like Random Forest, it is an ensemble learning method. XGBoost uses a gradient-boosting framework and often achieves higher predictive accuracy by optimising for both bias and variance in the model."
344 | ]
345 | },
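{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The gradient-boosting idea is easy to sketch for the squared-error case: each new tree is fitted to the residuals of the current ensemble, and the prediction is the running sum of all the trees. Below is a minimal illustration using plain scikit-learn decision trees rather than the optimised XGBoost implementation, assuming the `scaled_X`, `y`, `np` and `metrics` objects defined earlier; the learning rate and number of rounds are arbitrary choices.\n",
  "\n",
  "```python\n",
  "from sklearn.tree import DecisionTreeRegressor\n",
  "\n",
  "learning_rate = 0.1\n",
  "n_rounds = 20\n",
  "base = y.mean()          # start the ensemble from the mean of the target\n",
  "residual = y - base      # what is left to explain\n",
  "trees = []\n",
  "\n",
  "for _ in range(n_rounds):\n",
  "    tree = DecisionTreeRegressor(max_depth=3, random_state=42)\n",
  "    tree.fit(scaled_X, residual)                          # fit the current residuals\n",
  "    trees.append(tree)\n",
  "    residual -= learning_rate * tree.predict(scaled_X)    # update the residuals\n",
  "\n",
  "# Prediction = base value + weighted sum of the trees\n",
  "y_boost = base + learning_rate * np.sum([t.predict(scaled_X) for t in trees], axis=0)\n",
  "print(f'Hand-rolled boosting training MAE = {metrics.mean_absolute_error(y, y_boost):.3f} log10GPa')\n",
  "```\n",
  "\n",
  "XGBoost builds on the same principle but adds regularisation, clever tree construction and a highly optimised implementation."
 ]
},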
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "# XGBoost model\n",
353 | "import xgboost as xgb\n",
354 | "\n",
355 | "# Define the model\n",
356 | "xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=42, objective='reg:squarederror')\n",
357 | "\n",
358 | "# Fit the model\n",
359 | "xgb_model.fit(scaled_X, y)\n",
360 | "\n",
361 | "# Performance\n",
362 | "y_pred = make_prediction_plot(scaled_X, y, xgb_model, 'log10(K_VRH)') \n",
363 | "print(f'The training MAE = {metrics.mean_absolute_error(y, y_pred):.3f} log10GPa')\n",
364 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y, y_pred)):.3f} log10GPa')\n",
365 | "print(f'The training r^2 = {xgb_model.score(scaled_X, y):.3f}')"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "XGBoost does a better job, but wait... \n",
373 | "\n",
374 | "We haven't performed proper training and testing yet 😱. These models are likely to be overfit and unable to make useful predictions for new inputs. On to the next stage!"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "## Training and testing\n",
382 | "\n",
383 | "### Train-test split\n",
384 | "\n",
385 | "We are ready to build a real model now. Let's separate the training data from the unseen test set used to assess model performance."
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "from slearn.model_selection import train_test_split\n",
395 | "\n",
396 | "# Split the data into 80% training and 20% testing\n",
397 | "X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42)\n",
398 | "\n",
399 | "# Print the sizes of the arrays\n",
400 | "print(f\"X_train shape: {X_train.shape}\")\n",
401 | "print(f\"y_train shape: {y_train.shape}\")\n",
402 | "print(f\"X_test shape: {X_test.shape}\")\n",
403 | "print(f\"y_test shape: {y_test.shape}\")"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "\n",
411 | " Code hint \n",
412 | "The library is \"sklearn\"!\n",
413 | ""
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "### Cross-validation \n",
421 | "\n",
422 | "Using the 80% training set, we can train a model by making use of [cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html) in an attempt to avoid overfitting. Note that this step may take a minute to run as 10 models are being trained (i.e. 5-fold cross-validation x 2 models).\n",
423 | "\n",
424 | "\n",
425 | " Recap of cross-validation \n",
426 | "Cross-validation partitions data into multiple subsets, training the model on some and validating it on others, ensuring robust evaluation.\n",
427 | "\n",
428 | "_Key types include:_\n",
429 | "\n",
430 | "- **k-Fold Cross-Validation**: Data is split into *k* folds; each fold is used as a validation set once while training on the remaining *k-1* folds.\n",
431 | "- **Leave-One-Out Cross-Validation (LOOCV)**: Each data point is used as a validation set once, with the rest for training.\n",
432 | "- **Stratified k-Fold**: Preserves class proportions in each fold, useful for imbalanced datasets.\n",
433 | "- **Time Series Cross-Validation**: Ensures training always precedes validation, preserving temporal structure.\n",
434 | "\n",
435 | "_Typical workflow:_\n",
436 | "\n",
437 | "1. **Split Data**: Divide the dataset into *k* folds.\n",
438 | "2. **Train and Validate**: Train the model on *k-1* folds, validate on the remaining fold.\n",
439 | "3. **Repeat**: Cycle through all folds, ensuring each serves as a validation set.\n",
440 | "4. **Aggregate Results**: Compute performance metrics across all iterations.\n",
441 | "5. **Train Final Model:** Fit the model using the full training dataset based on cross-validation insights.\n",
442 | ""
443 | ]
444 | },
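{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Before handing this over to `cross_val_score`, it can help to see what 5-fold cross-validation does under the hood. A minimal sketch with `KFold`, assuming the `X_train` and `y_train` arrays from the split above (the model settings mirror the random forest used below):\n",
  "\n",
  "```python\n",
  "from sklearn.model_selection import KFold\n",
  "from sklearn.ensemble import RandomForestRegressor\n",
  "from sklearn.metrics import mean_squared_error\n",
  "\n",
  "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
  "fold_rmse = []\n",
  "\n",
  "for train_idx, val_idx in kf.split(X_train):\n",
  "    model = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42)\n",
  "    model.fit(X_train[train_idx], y_train[train_idx])       # train on 4 folds\n",
  "    preds = model.predict(X_train[val_idx])                  # validate on the held-out fold\n",
  "    fold_rmse.append(np.sqrt(mean_squared_error(y_train[val_idx], preds)))\n",
  "\n",
  "print('RMSE per fold:', [f'{r:.3f}' for r in fold_rmse])\n",
  "print(f'Mean RMSE over folds: {np.mean(fold_rmse):.3f}')\n",
  "```\n",
  "\n",
  "`cross_val_score` in the next cell automates exactly this loop (it uses unshuffled folds by default)."
 ]
},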
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "from sklearn.model_selection import cross_val_score\n",
452 | "from xgboost import XGBRegressor\n",
453 | "\n",
454 | "# Define models\n",
455 | "xgb_model = XGBRegressor(n_estimators=100, max_depth=3, random_state=42, objective='reg:squarederror') \n",
456 | "rf_model = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42)\n",
457 | "\n",
458 | "# Perform cross-validation for XGBoost\n",
459 | "xgb_cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
460 | "xgb_rmse = np.sqrt(-xgb_cv_scores) # Convert to RMSE\n",
461 | "\n",
462 | "# Perform cross-validation for Random Forest\n",
463 | "rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
464 | "rf_rmse = np.sqrt(-rf_cv_scores) # Convert to RMSE\n",
465 | "\n",
466 | "# Print results\n",
467 | "# Compare the results\n",
468 | "print(\"XGBoost Cross-Validation Results\")\n",
469 | "print(f\" Mean RMSE: {xgb_rmse.mean():.3f}\")\n",
470 | "print(f\" Standard Deviation of RMSE: {xgb_rmse.std():.3f}\")\n",
471 | "\n",
472 | "print(\"\\nRandom Forest Cross-Validation Results\")\n",
473 | "print(f\" Mean RMSE: {rf_rmse.mean():.3f}\")\n",
474 | "print(f\" Standard Deviation of RMSE: {rf_rmse.std():.3f}\")"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {},
480 | "source": [
481 | "
\n",
482 | " 🙋 Cross-validation output: \n",
483 | " • Mean RMSE: Mean error across the cross-validation folds (smaller = better). \n",
484 | " • Standard Deviation of RMSE: Variability in error across the folds (smaller = more consistent). \n",
485 | "
"
486 | ]
487 | },
488 | {
489 | "cell_type": "markdown",
490 | "metadata": {},
491 | "source": [
492 | "### Hyperparamater optimisation\n",
493 | "\n",
494 | "XGBoost is in the lead! So far, we have not adjusted the models themselves. It is possible to improve performance by tuning the hyperparameters. Manually tuning would be laborious. We can use `GridSearchCV` to automate the search. \n",
495 | "\n",
496 | "Note that this step will be even more computationally expensive as we are performing cross-validation as a function of model hyperparameters for two separate models. You can see how computational cost quickly escalates and this is where powerful GPUs can become essential for machine learning! \n",
497 | "\n",
498 | "
\n",
499 | " ⏱️ This will take 2-3 min to run. Think about how the model is learning from data.\n",
500 | "
"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": null,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": [
509 | "from sklearn.model_selection import GridSearchCV\n",
510 | "\n",
511 | "# Hyperparameter grid for XGBoost\n",
512 | "xgb_param_grid = {\n",
513 | " 'n_estimators': [100, 200],\n",
514 | " 'max_depth': [3, 6],\n",
515 | " 'learning_rate': [0.1, 0.2]\n",
516 | "}\n",
517 | "\n",
518 | "xgb_grid_search = GridSearchCV(XGBRegressor(random_state=42), xgb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n",
519 | "xgb_grid_search.fit(X_train, y_train)\n",
520 | "\n",
521 | "best_xgb_params = xgb_grid_search.best_params_\n",
522 | "best_xgb_model = xgb_grid_search.best_estimator_\n",
523 | "\n",
524 | "# Hyperparameter grid for Random Forest\n",
525 | "rf_param_grid = {\n",
526 | " 'n_estimators': [100, 200],\n",
527 | " 'max_depth': [3, 6],\n",
528 | " 'min_samples_split': [2, 4]\n",
529 | "}\n",
530 | "\n",
531 | "rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n",
532 | "rf_grid_search.fit(X_train, y_train)\n",
533 | "\n",
534 | "best_rf_params = rf_grid_search.best_params_\n",
535 | "best_rf_model = rf_grid_search.best_estimator_\n",
536 | "\n",
537 | "# Evaluate the best models\n",
538 | "xgb_cv_scores = -cross_val_score(best_xgb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
539 | "xgb_rmse = np.sqrt(xgb_cv_scores)\n",
540 | "\n",
541 | "rf_cv_scores = -cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
542 | "rf_rmse = np.sqrt(rf_cv_scores)\n",
543 | "\n",
544 | "# Compare the results of the best models\n",
545 | "print(\"Best XGBoost Hyperparameters:\", best_xgb_params)\n",
546 | "print(\"Best XGBoost Cross-Validation Results\")\n",
547 | "print(f\" Mean RMSE: {xgb_rmse.mean():.3f}\")\n",
548 | "print(f\" Standard Deviation of RMSE: {xgb_rmse.std():.3f}\")\n",
549 | "\n",
550 | "print(\"\\nBest Random Forest Hyperparameters:\", best_rf_params)\n",
551 | "print(\"Best Random Forest Cross-Validation Results\")\n",
552 | "print(f\" Mean RMSE: {rf_rmse.mean():.3f}\")\n",
553 | "print(f\" Standard Deviation of RMSE: {rf_rmse.std():.3f}\")"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {},
559 | "source": [
560 | "Was it worth the effort? There should be improvements in the RMSE for both models. Note the optimal hyperparameters found.\n",
561 | "\n",
562 | "### Model assessment\n",
563 | "\n",
564 | "Now that we have our best trained models, let's see how they perform on *unseen* test data. Comparing test performance to training performance will help us determine if the model generalises well or shows signs of overfitting or underfitting."
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "from sklearn.metrics import mean_squared_error, r2_score\n",
574 | "\n",
575 | "# Test the best XGBoost model\n",
576 | "xgb_test_preds = best_xgb_model.predict(X_test)\n",
577 | "xgb_test_rmse = np.sqrt(mean_squared_error(y_test, xgb_test_preds))\n",
578 | "xgb_test_r2 = r2_score(y_test, xgb_test_preds)\n",
579 | "\n",
580 | "# Test the best Random Forest model\n",
581 | "rf_test_preds = best_rf_model.predict(X_test)\n",
582 | "rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_test_preds))\n",
583 | "rf_test_r2 = r2_score(y_test, rf_test_preds)\n",
584 | "\n",
585 | "# Print test results\n",
586 | "print(\"XGBoost test results:\")\n",
587 | "print(f\"RMSE: {xgb_test_rmse:.3f}\")\n",
588 | "print(f\"R²: {xgb_test_r2:.3f}\")\n",
589 | "\n",
590 | "print(\"\\nRandom Forest test results:\")\n",
591 | "print(f\"RMSE: {rf_test_rmse:.3f}\")\n",
592 | "print(f\"R²: {rf_test_r2:.3f}\")\n",
593 | "\n",
594 | "# Create a scatter plot with both models in different colors\n",
595 | "plt.figure(figsize=(5, 4))\n",
596 | "plt.scatter(y_test, xgb_test_preds, c='blue', label=f'XGBoost (R²={xgb_test_r2:.2f})', alpha=0.5)\n",
597 | "plt.scatter(y_test, rf_test_preds, c='green', label=f'Random Forest (R²={rf_test_r2:.2f})', alpha=0.5)\n",
598 | "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'k--', lw=2) # Reference line (y=x)\n",
599 | "plt.xlabel(\"Actual values\")\n",
600 | "plt.ylabel(\"Predicted values\")\n",
601 | "plt.title(\"Test set performance\")\n",
602 | "plt.legend()\n",
603 | "plt.show()"
604 | ]
605 | },
606 | {
607 | "cell_type": "markdown",
608 | "metadata": {},
609 | "source": [
610 | "XGBoost outperforms Random Forest in both cross-validation and test performance for this task, with the slight increase in RMSE from train to test suggesting both models generalise reasonably well."
611 | ]
612 | },
613 | {
614 | "cell_type": "markdown",
615 | "metadata": {},
616 | "source": [
617 | "### Model speed\n",
618 | "\n",
619 | "The speed of a model may also be important, e.g. a use case involving millions of predictions. Several factors can influence the computational performance, including the dataset size, model complexity, and hardware. We can perform a simple comparison of our two models using `time`."
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": null,
625 | "metadata": {},
626 | "outputs": [],
627 | "source": [
628 | "import time\n",
629 | "\n",
630 | "# Measure the training time for XGBoost\n",
631 | "start_time = time.time()\n",
632 | "xgb_model.fit(X_train, y_train)\n",
633 | "xgb_training_time = time.time() - start_time\n",
634 | "\n",
635 | "# Measure the training time for Random Forest\n",
636 | "start_time = time.time()\n",
637 | "rf_model.fit(X_train, y_train)\n",
638 | "rf_training_time = time.time() - start_time\n",
639 | "\n",
640 | "# Measure the prediction time for XGBoost\n",
641 | "start_time = time.time()\n",
642 | "xgb_test_preds = xgb_model.predict(X_test)\n",
643 | "xgb_prediction_time = time.time() - start_time\n",
644 | "\n",
645 | "# Measure the prediction time for Random Forest\n",
646 | "start_time = time.time()\n",
647 | "rf_test_preds = rf_model.predict(X_test)\n",
648 | "rf_prediction_time = time.time() - start_time\n",
649 | "\n",
650 | "print(f\"XGBoost training time: {xgb_training_time:.4f} seconds\")\n",
651 | "print(f\"Random Forest training time: {rf_training_time:.4f} seconds\")\n",
652 | "print(f\"\\nXGBoost prediction time: {xgb_prediction_time:.4f} seconds\")\n",
653 | "print(f\"Random Forest prediction time: {rf_prediction_time:.4f} seconds\")"
654 | ]
655 | },
656 | {
657 | "cell_type": "markdown",
658 | "metadata": {},
659 | "source": [
660 | "It is clear that the XGBoost library has been well optimised to run quickly."
661 | ]
662 | },
663 | {
664 | "cell_type": "markdown",
665 | "metadata": {
666 | "tags": []
667 | },
668 | "source": [
669 | "## 🚨 Exercise 7\n",
670 | "\n",
671 | "
\n",
672 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
673 | "
\n",
674 | "\n",
675 | "### Your details"
676 | ]
677 | },
678 | {
679 | "cell_type": "code",
680 | "execution_count": null,
681 | "metadata": {},
682 | "outputs": [],
683 | "source": [
684 | "import numpy as np\n",
685 | "\n",
686 | "# Insert your values\n",
687 | "Name = \"No Name\" # Replace with your name\n",
688 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
689 | "\n",
690 | "# Set a random seed using the CID value\n",
691 | "CID = int(CID)\n",
692 | "np.random.seed(CID)\n",
693 | "\n",
694 | "# Print the message\n",
695 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
696 | ]
697 | },
698 | {
699 | "cell_type": "markdown",
700 | "metadata": {
701 | "tags": []
702 | },
703 | "source": [
704 | "### Problem\n",
705 | "\n",
706 | "Selecting the most appropriate ML model for a given purpose is important for achieving predictive performance. Your job will be to assess additional models (e.g. [Nearest Neighbours](https://scikit-learn.org/stable/modules/neighbors.html#neighbors) and [Support Vector Machines](https://scikit-learn.org/stable/modules/svm.html#svm)) for the hardness regression task. The tasks will be given in class."
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": null,
712 | "metadata": {},
713 | "outputs": [],
714 | "source": [
715 | "#Empty block for your answers\n",
716 | "\n",
717 | "\n"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {},
724 | "outputs": [],
725 | "source": [
726 | "#Empty block for your answers\n",
727 | "\n",
728 | "\n"
729 | ]
730 | },
731 | {
732 | "cell_type": "markdown",
733 | "metadata": {},
734 | "source": [
735 | "\n",
736 | " Task hint \n",
737 | "You can perform cross-validation following the same procedure as the random forest model in the main notebook.\n",
738 | "\n",
739 | "\n",
740 | "
\n",
741 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
742 | "
\n",
15 | " 💡 Geoffrey Hinton: It’s quite conceivable that humanity is just a passing phase in the evolution of intelligence.\n",
16 | "
"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "\n",
24 | "\n",
25 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture8-ai)"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## 🤖 x 🧪 Closed-loop optimisation \n",
33 | "\n",
34 | "The combination of automation and optimisation is powerful. Closed-loop workflows are of growing importance in materials research for many reasons, including:\n",
35 | "\n",
36 | "1. **Efficiency:** Efficient allocation of resources, both in terms of time and materials. By continuously updating experimental parameters based on real-time feedback, we can reduce the number of trials needed to reach optimal outcomes. \n",
37 | "\n",
38 | "2. **Adapt to changing conditions:** Adaptive decision-making, ensuring that experiments remain effective even when external factors fluctuate. This adaptability is highly valuable for complex systems where traditional trial-and-error approaches are prone to fail.\n",
39 | "\n",
40 | "3. **Exploration of large parameter spaces:** Many materials science problems involve high-dimensional parameter spaces where exhaustive exploration is impractical. Techniques such as Bayesian optimisation can efficiently sample and search these spaces to identify optimal configurations and make discoveries.\n",
41 | "\n",
42 | "4. **Data-driven insights:** Generation of valuable data from ongoing experiments. This data can be analysed to gain a deeper understanding of the underlying processes and relationships, facilitating scientific discoveries and supporting future efforts.\n",
43 | "\n",
44 | "Today we will make use of the [scikit-optimise](https://scikit-optimize.github.io) package."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Installation of libraries\n",
54 | "!pip install scikit-optimize --quiet"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Import of modules\n",
64 | "import numpy as np \n",
65 | "import matplotlib.pyplot as plt \n",
66 | "from scipy.stats import norm # Statistical functions\n",
67 | "from skopt import gp_minimize, dummy_minimize # Bayesian optimisation\n",
68 | "from skopt.utils import create_result # Utility functions for skopt\n",
69 | "from sklearn.metrics import r2_score # R-squared metric"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {
75 | "tags": []
76 | },
77 | "source": [
78 | "## Bayesian optimisation (BO)\n",
79 | "\n",
80 | "BO is a powerful technique for optimising complex and expensive-to-evaluate functions. It combines probabilistic modeling and decision theory to search for the optimal set of parameters. In materials research, parameters like chemical composition, sample thickness, and processing conditions can be optimised.\n",
81 | "\n",
82 | "BO aims to find the global minimum (or maximum) of an objective function, $O(x)$, where $x$ represents a set of parameters or design variables. Instead of exhaustive searches, BP builds a surrogate model, typically a Gaussian Process (GP), that approximates the true objective function. This surrogate model captures both the mean $\\mu(x)$ and uncertainty $\\sigma(x)$ associated with $O(x)$. The GP is defined as:\n",
83 | "\n",
84 | "$$\n",
85 | "O(x) \\sim \\text{GP}(\\mu(x), k(x, x'))\n",
86 | "$$\n",
87 | "\n",
88 | "where $k(x, x')$ is a kernel function that quantifies the similarity between two input points $x$ and $x'$.\n",
89 | "\n",
90 | "The surrogate model balances exploration and exploitation using an acquisition function $\\alpha(x)$, which trades off between exploring uncertain regions and exploiting promising areas:\n",
91 | "\n",
92 | "$$\n",
93 | "x_{\\text{next}} = \\arg \\max_x \\alpha(x)\n",
94 | "$$\n",
95 | "\n",
96 | "Common acquisition functions include Probability of Improvement (PI), Expected Improvement (EI), and Upper Confidence Bound (UCB). Each of these functions aims to maximise the expected gain in performance over the current best solution.\n",
97 | "\n",
98 | "\n",
99 | "Curious about the kernel function?\n",
100 | "\n",
101 | "The kernel determines the covariance structure of the GP. A commonly used kernel, and the default in `sklearn`, is the Radial Basis Function (RBF):\n",
102 | "\n",
103 | "$$\n",
104 | "k(x, x') = \\sigma^2 \\exp\\left(-\\frac{\\|x - x'\\|^2}{2l^2}\\right)\n",
105 | "$$\n",
106 | "\n",
107 | "where:\n",
108 | "- $\\sigma^2$ is the **signal variance**, which controls the overall magnitude of function variations,\n",
109 | "- $l$ is the **length scale**, which determines how quickly the function values change with respect to input differences.\n",
110 | "\n",
111 | "There are also many other choices, such as the [Matérn kernel](https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html), which differ in how they model smoothness and continuity.\n",
112 | ""
113 | ]
114 | },
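{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "To make these ingredients more tangible, here is a minimal stand-alone sketch of the RBF kernel and the Expected Improvement acquisition function written with NumPy (the hyperparameter values are arbitrary, and this is an illustration rather than how scikit-optimize implements them internally):\n",
  "\n",
  "```python\n",
  "import numpy as np\n",
  "from scipy.stats import norm\n",
  "\n",
  "def rbf_kernel(x1, x2, sigma=1.0, length_scale=1.0):\n",
  "    # RBF (squared-exponential) kernel between two points\n",
  "    return sigma**2 * np.exp(-np.sum((x1 - x2)**2) / (2 * length_scale**2))\n",
  "\n",
  "def expected_improvement(mu, sigma, best_f):\n",
  "    # Expected Improvement for minimisation, given the GP mean and std at a point\n",
  "    sigma = max(sigma, 1e-9)                 # avoid division by zero\n",
  "    improvement = best_f - mu                # gain over the current best (lower is better)\n",
  "    z = improvement / sigma\n",
  "    return improvement * norm.cdf(z) + sigma * norm.pdf(z)\n",
  "\n",
  "# Nearby points are strongly correlated; distant points are almost independent\n",
  "print(rbf_kernel(np.array([0.0]), np.array([0.1])))   # ~0.995\n",
  "print(rbf_kernel(np.array([0.0]), np.array([3.0])))   # ~0.011\n",
  "\n",
  "# With the same predicted mean, a more uncertain point has a higher EI\n",
  "print(expected_improvement(mu=0.5, sigma=0.1, best_f=0.4))\n",
  "print(expected_improvement(mu=0.5, sigma=1.0, best_f=0.4))\n",
  "```\n",
  "\n",
  "The second pair of prints shows the exploration effect: the acquisition function rewards points where the surrogate model is uncertain, even if their predicted mean is no better than the current best."
 ]
},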
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "## Building a BO model\n",
120 | "\n",
121 | "### Step 1. Target function\n",
122 | "\n",
123 | "We can start by generating a simple sine-like target function with added noise to keep things interesting. This acts as our \"virtual experiment\", i.e. we can call the function to obtain an output."
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "# Fixing the random seed for reproducibility\n",
133 | "np.random.seed(42)\n",
134 | "\n",
135 | "# Define the target function\n",
136 | "def target_function(x):\n",
137 | " x = np.atleast_1d(x) # Ensure x is an array\n",
138 | " return np.sin(x[0]) + 0.1 * x[0] + 0.5 * np.random.randn()\n",
139 | "\n",
140 | "# Generate data for visualisation\n",
141 | "x_values = np.linspace(-5, 5, 200).reshape(-1, 1)\n",
142 | "y_values = np.vectorize(target_function)(x_values)\n",
143 | "\n",
144 | "# Plot the target function\n",
145 | "plt.figure(figsize=(5, 4))\n",
146 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n",
147 | "plt.xlabel('Input')\n",
148 | "plt.ylabel('Output')\n",
149 | "plt.legend()\n",
150 | "plt.show()"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "Let's randomly sample the target function and fit a simple polynomial function to get a feeling for how the model works."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "# Generate sample points from the target function\n",
167 | "num_initial_points = \n",
168 | "initial_points = np.random.uniform(-5, 5, num_initial_points)\n",
169 | "initial_values = np.vectorize(target_function)(initial_points)\n",
170 | "\n",
171 | "# Plot the sample points\n",
172 | "plt.figure(figsize=(5, 4))\n",
173 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n",
174 | "plt.scatter(initial_points, initial_values, color='blue', marker='o', label='Initial Samples')\n",
175 | "plt.xlabel('Input')\n",
176 | "plt.ylabel('Output')\n",
177 | "plt.legend()\n",
178 | "plt.show()"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "\n",
186 | " Code hint \n",
187 | "Try `num_initial_points = 10`\n",
188 | ""
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "# Perform a polynomial fit\n",
198 | "degree = # Adjust the degree of the polynomial fit\n",
199 | "coefficients = np.polyfit(initial_points, initial_values, degree)\n",
200 | "poly_fit = np.poly1d(coefficients)\n",
201 | "\n",
202 | "# Calculate R^2\n",
203 | "y_pred = poly_fit(initial_points)\n",
204 | "r_squared = r2_score(initial_values, y_pred)\n",
205 | "\n",
206 | "# Plot the sample points and polynomial fit\n",
207 | "plt.figure(figsize=(5, 4))\n",
208 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n",
209 | "plt.scatter(initial_points, initial_values, color='blue', marker='o', label='Initial Samples')\n",
210 | "plt.plot(x_values, poly_fit(x_values), 'g--', label=f'Polynomial Fit (degree {degree})\\n$R^2 = {r_squared:.4f}$')\n",
211 | "plt.xlabel('Input')\n",
212 | "plt.ylabel('Output')\n",
213 | "plt.legend()\n",
214 | "plt.show()"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "
\n",
222 | " 🐢 Take a beat: Adjust the degree of the polynomial to see how good the fit is. Start with `degree = 2` and gradually increase it.\n",
223 | "
"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "### Step 3: Gaussian Process\n",
231 | "\n",
232 | "Now we can move to Bayesian Optimisation with a Gaussian Process model. The optimisation progress is visualised by plotting the target function, optimisation steps, and a colourbar indicating the step number.\n",
233 | "\n",
234 | "
\n",
235 | " ⏱️ This may take a minute to run. Reverend Bayes makes computers work hard!\n",
236 | "
"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "# Optimise the target function using Bayesian Optimisation\n",
246 | "result = gp_minimize(target_function, [(-5.0, 5.0)], n_calls=50, random_state=42)\n",
247 | "\n",
248 | "# Perform random sampling for comparison\n",
249 | "random_result = dummy_minimize(target_function, [(-5.0, 5.0)], n_calls=50, random_state=42)\n",
250 | "\n",
251 | "# Plot the Gaussian Process model after optimisation\n",
252 | "x_gp = np.array(result.x_iters).reshape(-1, 1)\n",
253 | "y_gp = result.func_vals\n",
254 | "\n",
255 | "# Plot the target function\n",
256 | "plt.figure(figsize=(5, 4))\n",
257 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target function')\n",
258 | "\n",
259 | "# Plot the optimisation steps with a colormap\n",
260 | "plt.scatter(x_gp, y_gp, c=range(len(x_gp)), cmap='viridis', marker='o', label='Step number')\n",
261 | "\n",
262 | "# Add colorbar to indicate the progress\n",
263 | "cbar = plt.colorbar()\n",
264 | "cbar.set_label('Step number')\n",
265 | "\n",
266 | "plt.title('BO: Gaussian Process Model')\n",
267 | "plt.xlabel('Input')\n",
268 | "plt.ylabel('Output')\n",
269 | "plt.legend()\n",
270 | "plt.show()"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "We can use `plot_gaussian_process` from scikit-optimize to visualise the confidence intervals. `n_samples` determines the number of samples to draw from the Gaussian Process for the estimation."
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "from skopt.plots import plot_gaussian_process as plot_gp\n",
287 | "\n",
288 | "# Plot the Gaussian Process model with confidence intervals\n",
289 | "plt.figure(figsize=(5, 4))\n",
290 | "plot_gp(result)\n",
291 | "\n",
292 | "# Add the target function for reference\n",
293 | "plt.plot(x_values, y_values, 'r-', alpha=0.25, label='Target function')\n",
294 | "\n",
295 | "plt.title('Confidence Intervals')\n",
296 | "plt.xlabel('Input')\n",
297 | "plt.ylabel('Output')\n",
298 | "plt.legend()\n",
299 | "plt.show()"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "The plot shows the GP mean (dashed green), confidence intervals (shaded), and sampled observations (red). The target function (light red) is also overlaid. The confidence region narrows where more observations exist and widens in unexplored areas, reflecting uncertainty in the GP model.\n",
307 | "\n",
308 | "We should always have a benchmark to compare our model to. This block extracts the best results from BO and random sampling, then compares and visualises their performance over optimisation steps."
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "# Extract the cumulative minimum values\n",
318 | "bo_min_values = np.minimum.accumulate(result.func_vals)\n",
319 | "random_min_values = np.minimum.accumulate(random_result.func_vals)\n",
320 | "\n",
321 | "# Plot the cumulative minimum values vs steps for both methods\n",
322 | "plt.figure(figsize=(5, 4))\n",
323 | "plt.plot(range(1, len(bo_min_values) + 1), bo_min_values, 'o-', label='Bayesian Optimisation')\n",
324 | "plt.plot(range(1, len(random_min_values) + 1), random_min_values, 'x-', label='Random Sampling')\n",
325 | "\n",
326 | "plt.title('Does BO Beat Random Sampling?')\n",
327 | "plt.xlabel('Step')\n",
328 | "plt.ylabel('Cumulative Minimum Value')\n",
329 | "plt.legend()\n",
330 | "plt.show()"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "BO (blue) converges faster to a lower minimum value. Random sampling (orange) fluctuates and struggles to improve beyond a certain point. This highlights BO’s advantage in structured search over purely random exploration."
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {
343 | "tags": []
344 | },
345 | "source": [
346 | "## 🚨 Exercise 8\n",
347 | "\n",
348 | "
\n",
349 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
350 | "
\n",
351 | "\n",
352 | "### Your details"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "import numpy as np\n",
362 | "\n",
363 | "# Insert your values\n",
364 | "Name = \"No Name\" # Replace with your name\n",
365 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
366 | "\n",
367 | "# Set a random seed using the CID value\n",
368 | "CID = int(CID)\n",
369 | "np.random.seed(CID)\n",
370 | "\n",
371 | "# Print the message\n",
372 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
373 | ]
374 | },
375 | {
376 | "cell_type": "markdown",
377 | "metadata": {},
378 | "source": [
379 | "### Problem\n",
380 | "\n",
381 | "The Department of Materials has purchased a new automated thin-film deposition system. The machine has two dials that provide a 2D parameter space (x, y) for materials processing. We can define a (hypothetical) target loss function for optimising the transition temperature of our candidate thin-film superconductors as:\n",
382 | "\n",
383 | "```python\n",
384 | "# Target function for materials processing with x and y \"dials\"\n",
385 | "def supermat(inputs):\n",
386 | " x, y = inputs\n",
387 | " a = 2, b = 5.1 / (2 * np.pi**2)\n",
388 | " c = 3 / np.pi\n",
389 | " r = 4, s = 10, t = 1 / (8 * np.pi)\n",
390 | "\n",
391 | " term1 = a * (y - b * x**2 + c * x - r)**2\n",
392 | " term2 = s * (1 - t) * np.cos(x)\n",
393 | " term3 = s\n",
394 | "\n",
395 | " return term1 + term2 + term3\n",
396 | "\n",
397 | "# Example usage:\n",
398 | "dials = [2.0, 3.0]\n",
399 | "result = supermat(dials)\n",
400 | "print(f\"Experiment by setting dials to ({dials[0]}, {dials[1]}): {result}\")\n",
401 | "```\n",
402 | "\n",
403 | "The tasks will be provided in class."
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": [
412 | "#Empty block for your answers\n",
413 | "\n",
414 | "\n"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "#Empty block for your answers\n",
424 | "\n",
425 | "\n"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "\n",
433 | " Task hint \n",
434 | "Remember to first define the target function and then call it using gp_minimize()\n",
435 | "\n",
436 | "\n",
437 | "
\n",
438 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
439 | "