├── .github
│   └── workflows
│       ├── binder.yaml
│       └── deploy.yml
├── .gitignore
├── Contents.md
├── LICENSE
├── Learning.md
├── Lecture1.ipynb
├── Lecture2.ipynb
├── Lecture3.ipynb
├── Lecture4.ipynb
├── Lecture5.ipynb
├── Lecture6.ipynb
├── Lecture7.ipynb
├── Lecture8.ipynb
├── Lecture9.ipynb
├── Overview.md
├── README.md
├── Resources.md
├── _config.yml
├── _toc.yml
├── images
│   ├── 2_Cs2AgBiI6.png
│   ├── 2_Cs2AgBiI6.vesta
│   ├── 2_CsPbI3.png
│   ├── 2_CsPbI3.vesta
│   ├── 2_sum.png
│   ├── 5_bands.png
│   ├── 6_tem.png
│   └── ml-python.png
├── logo.png
├── ref.bib
├── requirements.txt
└── slides
    ├── MLforMaterials_Challenge_25.pdf
    ├── MLforMaterials_Lecture1_Intro_25.pdf
    ├── MLforMaterials_Lecture2_Basics_25.pdf
    ├── MLforMaterials_Lecture3_Data_25.pdf
    ├── MLforMaterials_Lecture4_Representations_25.pdf
    ├── MLforMaterials_Lecture5_Classical_25.pdf
    ├── MLforMaterials_Lecture6_NN_25.pdf
    ├── MLforMaterials_Lecture7_Build_25.pdf
    ├── MLforMaterials_Lecture8_Discovery_25.pdf
    └── MLforMaterials_Lecture9_GenAI_25.pdf
/.github/workflows/binder.yaml:
--------------------------------------------------------------------------------
1 | name: Binder
2 | on: [push]
3 |
4 | jobs:
5 | Create-MyBinderOrg-Cache:
6 | runs-on: ubuntu-latest
7 | steps:
8 | - name: cache binder build on mybinder.org
9 | uses: jupyterhub/repo2docker-action@master
10 | with:
11 | NO_PUSH: true
12 | MYBINDERORG_TAG: ${{ github.event.ref }} # This builds the container on mybinder.org with the branch that was pushed on.
13 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: deploy-book
2 |
3 | # Only run this when the 2025 branch changes
4 | on:
5 | push:
6 | branches:
7 | - 2025
8 | # If your git repository has the Jupyter Book within some-subfolder next to
9 | # unrelated files, you can make this run only if a file within that specific
10 | # folder has been modified.
11 | #
12 | # paths:
13 | # - some-subfolder/**
14 |
15 | # This job installs dependencies, builds the book, and pushes it to `gh-pages`
16 | jobs:
17 | deploy-book:
18 | runs-on: ubuntu-latest
19 | steps:
20 | - uses: actions/checkout@v2
21 |
22 | # Install dependencies
23 | - name: Set up Python 3.8
24 | uses: actions/setup-python@v2
25 | with:
26 | python-version: 3.8
27 |
28 | - name: Install dependencies
29 | run: |
30 | pip install -r requirements.txt
31 |
32 | # Build the book
33 | - name: Build the book
34 | run: |
35 | jupyter-book build .
36 |
37 | # Push the book's HTML to github-pages
38 | - name: GitHub Pages action
39 | uses: peaceiris/actions-gh-pages@v3.6.1
40 | with:
41 | github_token: ${{ secrets.GITHUB_TOKEN }}
42 | publish_dir: ./_build/html
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | .DS_Store
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 | _build/
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 | notebooks/CsPbI3.cif
134 | imagenet_labels.json
135 |
--------------------------------------------------------------------------------
/Contents.md:
--------------------------------------------------------------------------------
1 | # Course Contents
2 |
3 | 1. **Introduction**
4 | * Overview
5 | * Expectations and assessments
6 | * _Exercise: Getting started_
7 |
8 | 2. **Machine Learning Basics**
9 | * Terminology
10 | * Learning by example
11 | * Supervised
12 | * Unsupervised
13 | * Reinforcement
14 | * _Exercise: Crystal hardness_
15 |
16 | 3. **Materials Data**
17 | * Data sources and formats
18 | * API queries
19 | * _Exercise: Data-driven thermoelectrics_
20 |
21 | 4. **Crystal Representations**
22 | * Compositional
23 | * Structural
24 | * Graphs
25 | * _Exercise: Navigating crystal space_
26 |
27 | 5. **Classical Learning**
28 | * _k_-nearest neighbours
29 | * _k_-means clustering
30 | * Decision trees and beyond
31 | * _Exercise: Metal or insulator?_
32 |
33 | 6. **Artificial Neural Networks**
34 | * From neuron to perceptron
35 | * Network architecture and training
36 | * Convolutional neural networks
37 | * _Exercise: Learning microstructure_
38 |
39 | 7. **Building a Model from Scratch**
40 | * Data preparation
41 | * Model choice
42 | * Training and testing
43 | * _Exercise: Crystal hardness II_
44 |
45 | 8. **Accelerated Discovery**
46 | * Automated experiments
47 | * Bayesian optimisation
48 | * Reinforcement learning
49 | * _Exercise: Closed-loop optimisation_
50 |
51 | 9. **Generative Artificial Intelligence**
52 | * Large language models
53 | * From latent space to diffusion
54 | * _Exercise: Research challenge_
55 |
56 | 10. **Recent Advances**
57 | * Guest lecture
58 | * _Exercise: Research challenge_
59 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/Learning.md:
--------------------------------------------------------------------------------
1 | # Learning Outcomes
2 |
3 | At the end of this course, you will be able to:
4 |
5 | - Specify and interpret the central concepts underpinning supervised, unsupervised, and reinforcement learning.
6 |
7 | - Describe approaches for materials representation including chemical composition and crystal structure.
8 |
9 | - Discover structure and property information from public databases using Python.
10 |
11 | - Compare a range of classical machine learning and deep learning approaches.
12 |
13 | - Train and evaluate machine learning models for chemical problems.
14 |
--------------------------------------------------------------------------------
/Lecture1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "uDQYZDh0ciGP"
7 | },
8 | "source": [
9 | "# Introduction"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "
\n",
17 | " 💡 Ada Lovelace: The more I study, the more insatiable do I feel my genius for it to be.\n",
18 | "
"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "qlQmXeW3ciGS"
25 | },
26 | "source": [
27 | "\n",
28 | "\n",
29 | "[Lecture slides](https://speakerdeck.com/aronwalsh/machine-learning-for-materials-lecture-1)"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {
35 | "id": "aB6tYmdQciGS",
36 | "tags": []
37 | },
38 | "source": [
39 | "## 👋 Getting started\n",
40 | "\n",
41 | "Welcome to our first practical session!\n",
42 | "\n",
43 | "This is a Jupyter Notebook loaded inside a Jupyter Book. They are part of [Project Jupyter](https://jupyter.org), a suite of open-source tools. A Jupyter Notebook also allows you to run and easily share computer code. This combination makes Jupyter notebooks a useful tool for analysing data.\n",
44 | "\n",
45 | "Unlike spreadsheets or combinations of separate data analysis codes, you can collect descriptions and notes for individual experiments, links to the raw data collected, the computer code that performs any necessary data analysis, and the final figures generated with these data, ready for use in a report or published paper.\n",
46 | "\n",
47 | "There are a few components to be aware of:\n",
48 | "\n",
49 | "### Python\n",
50 | "A working knowledge of the [Python](https://www.python.org) programming language is assumed for this course. If you are rusty, Chapters 1-4 of [Datacamp](https://www.datacamp.com/courses/intro-to-python-for-data-science) cover the base concepts, as do many other online resources including Imperial's [Introduction to Python](https://www.imperial.ac.uk/students/academic-support/graduate-school/professional-development/doctoral-students/research-computing-data-science/courses/python-for-researchers) course.\n",
51 | "\n",
52 | "\n",
61 | "\n",
62 | "
\n",
63 | "
Choose your degree programme:
\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | "
\n",
68 | "
\n",
69 | " \n",
70 | " \n",
71 | "
\n",
72 | "
\n",
73 | "\n",
74 | "
\n",
75 | "
If MSc, have you completed the introductory Python course:
\n",
76 | "
\n",
77 | " \n",
78 | " \n",
79 | "
\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | "
\n",
84 | "
\n",
85 | "\n",
86 | "
\n",
87 | "
Rate your current Python level:
\n",
88 | "
\n",
89 | " \n",
90 | " \n",
91 | "
\n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | "
\n",
96 | "
\n",
97 | " \n",
98 | " \n",
99 | "
\n",
100 | "
\n",
101 | "\n",
102 | "### Markdown\n",
103 | "Markdown is a markup language that allows easy formatting of text. It is widely used for creating and formatting online content. It is easier to read and write than html. A guide to the syntax can be found [here](https://www.markdownguide.org/basic-syntax/).\n",
104 | "\n",
105 | "```\n",
106 | "# Heading\n",
107 | "## Smaller heading\n",
108 | "### Even smaller heading\n",
109 | "```\n",
110 | "\n",
111 | "### Github\n",
112 | "[GitHub](https://github.com) is a platform for writing and sharing code. There are many materials science projects hosted there, which enable researchers from around the world to contribute to their development. These notebooks are hosted on GitHub too. If you find an error, you can raise an [issue](https://github.com/aronwalsh/MLforMaterials/issues) or even better fix it yourself with a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests).\n",
113 | "\n",
114 | "### Live coding\n",
115 | "The weekly notebooks are designed to be run online directly in your browser. You can activate the server by clicking the rocket icon on the top right and selecting `Live Code`. There is an option to open in [Binder](https://mybinder.org) or [Google Colab](https://colab.research.google.com). Colab is more powerful, but the formatting won't be as nice. You can opt to install Python on your own computer with [Anaconda](https://www.anaconda.com/products/distribution) and run the notebooks locally, but we do not offer support if things go wrong."
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {
121 | "id": "95tjL6dJciGU"
122 | },
123 | "source": [
124 | "## Analyse data with code\n",
125 | "\n",
126 | "By programming a series of instructions, researchers can consistently obtain the same results from a given dataset. This approach enables us to share datasets and code, allowing other scientists to review, repeat and reuse the analysis. The transparency and reproducibility of code-based analysis enhances research integrity and credibility, while minimising errors. It also enables efficient handling of large datasets and complex calculations, accelerating the exploration of different techniques.\n",
127 | "\n",
128 | "### Running code\n",
129 | "\n",
130 | "Different programming languages can be used in Jupyter notebooks. We will be using Python 3. The large scientific community for Python means that well-developed resources exist for data processing and specific prewritten tools for manipulating and plotting data.\n",
131 | "\n",
132 | "Any code typed into a code cell can be run (executed) by pressing the `run` button. You can also run the selected code block using `Shift-Enter` combination on your keyboard."
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "colab": {
140 | "base_uri": "https://localhost:8080/"
141 | },
142 | "collapsed": false,
143 | "id": "wCimyGVFciGU",
144 | "jupyter": {
145 | "outputs_hidden": false
146 | },
147 | "outputId": "6f46572f-a956-4342-def3-8713a99c224d"
148 | },
149 | "outputs": [],
150 | "source": [
151 | "2+3 # run this cell"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "colab": {
159 | "base_uri": "https://localhost:8080/"
160 | },
161 | "collapsed": false,
162 | "id": "2VOKhE8pciGW",
163 | "jupyter": {
164 | "outputs_hidden": false
165 | },
166 | "outputId": "c14bdab6-a0e1-4181-b0bb-dc43afb85865"
167 | },
168 | "outputs": [],
169 | "source": [
170 | "print(\"Beware of 小妖精\") # anything after '#' is a comment and ignored"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "colab": {
178 | "base_uri": "https://localhost:8080/"
179 | },
180 | "collapsed": false,
181 | "id": "iRqw3mAwciGW",
182 | "jupyter": {
183 | "outputs_hidden": false
184 | },
185 | "outputId": "e774b03f-36f0-420c-9d2d-d29426602fa3"
186 | },
187 | "outputs": [],
188 | "source": [
189 | "12*2.40*3737*12 # you get the idea"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {
196 | "colab": {
197 | "base_uri": "https://localhost:8080/"
198 | },
199 | "collapsed": false,
200 | "id": "unZ26LEociGW",
201 | "jupyter": {
202 | "outputs_hidden": false
203 | },
204 | "outputId": "65ccf1d5-52a2-49c7-dec9-6999d12ddd8e"
205 | },
206 | "outputs": [],
207 | "source": [
208 | "2**1000 - 2 # a big number"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "metadata": {
215 | "colab": {
216 | "base_uri": "https://localhost:8080/"
217 | },
218 | "collapsed": false,
219 | "id": "MyM32PMxciGW",
220 | "jupyter": {
221 | "outputs_hidden": false
222 | },
223 | "outputId": "a53bd082-8a05-4dbf-c5cb-807809c725aa"
224 | },
225 | "outputs": [],
226 | "source": [
227 | "import math as m # import a math module\n",
228 | "m.pi"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {
235 | "colab": {
236 | "base_uri": "https://localhost:8080/"
237 | },
238 | "collapsed": false,
239 | "id": "P574cgsSciGX",
240 | "jupyter": {
241 | "outputs_hidden": false
242 | },
243 | "outputId": "a1f16417-6f1d-417b-b6ad-5ab5321a5dfd"
244 | },
245 | "outputs": [],
246 | "source": [
247 | "20*m.atan(1/7)+8*m.atan(3/79) # Euler's approximation"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {
253 | "id": "5B698R2pciGX"
254 | },
255 | "source": [
256 | "### Plotting with Matplotlib"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {
262 | "id": "neke0J4KifCW"
263 | },
264 | "source": [
265 | "Let's import the package [Matplotlib](https://matplotlib.org), which we will be using a lot for data visualisation."
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {
272 | "colab": {
273 | "base_uri": "https://localhost:8080/",
274 | "height": 448
275 | },
276 | "collapsed": false,
277 | "id": "oyBEOTXociGX",
278 | "jupyter": {
279 | "outputs_hidden": false
280 | },
281 | "outputId": "8b8a7522-08e3-4235-b064-f64adbf1b6b1",
282 | "tags": []
283 | },
284 | "outputs": [],
285 | "source": [
286 | "# Imports\n",
287 | "import matplotlib.pyplot as plt # Plotting\n",
288 | "import numpy as np # Numerical operations\n",
289 | "%matplotlib inline\n",
290 | "\n",
291 | "x = np.arange(0, 10, 0.001) # x = 0 to 10 in steps of 0.001\n",
292 | "y = np.sin(x*x) # define your function\n",
293 | "plt.figure(figsize=(5, 3)) # create a new figure (5x3 inches)\n",
294 | "plt.plot(,y) # plot x against y"
295 | ]
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {
300 | "id": "lxLc8V4tb5zh"
301 | },
302 | "source": [
303 | "\n",
304 | " Code hint \n",
305 | "You need to plot x vs y. Fix the plot command to (x,y).\n",
306 | ""
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {
312 | "id": "Z8_nYqMH2MW9"
313 | },
314 | "source": [
315 | "### Using a DataFrame\n",
316 | "\n",
317 | "A DataFrame organises data into a 2-dimensional table of rows and columns, much like a spreadsheet. They are useful tools to store, access, and modify large sets of data. \n",
318 | "\n",
319 | "In this module, we'll make use of [Pandas](https://pandas.pydata.org) to process input and output data for our machine learning models."
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {
326 | "colab": {
327 | "base_uri": "https://localhost:8080/"
328 | },
329 | "id": "UamDrzLn2LoS",
330 | "outputId": "47b9cf3b-9333-46d7-e785-d0b936bbc93e",
331 | "tags": []
332 | },
333 | "outputs": [],
334 | "source": [
335 | "import pandas as pd # Data manipulation using DataFrames\n",
336 | "\n",
337 | "df = pd.DataFrame() # This instantiates an empty pandas DataFrame\n",
338 | "\n",
339 | "data = {\n",
340 | " \"Element\" : ['C', 'O', 'Fe', 'Mg', 'Xe'],\n",
341 | " \"Atomic Number\" : [6, 8, 26, 12, 54],\n",
342 | " \"Atomic Mass\" : [12, 16, 56, 24, 131]\n",
343 | "}\n",
344 | "\n",
345 | "# Let's try loading data into DataFrame df\n",
346 | "df = pd.DataFrame(data)\n",
347 | "\n",
348 | "# We can make the 'Element' column the index using the set_index function\n",
349 | "df = df.set_index(\"Element\")\n",
350 | "\n",
351 | "# Printing the values in the 'Atomic Number' column\n",
352 | "print(df[\"Atom Number\"])"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {
358 | "id": "BOl6MmQuifCW",
359 | "tags": []
360 | },
361 | "source": [
362 | "\n",
363 | " Code hint \n",
364 | "Check you are printing the correct column name. Try out some of the other options.\n",
365 | ""
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {
372 | "colab": {
373 | "base_uri": "https://localhost:8080/"
374 | },
375 | "id": "gcUlJMzWb5zi",
376 | "outputId": "7bb8809f-3477-4593-f177-857e9bc1a1b4",
377 | "tags": []
378 | },
379 | "outputs": [],
380 | "source": [
381 | "# Add a new column\n",
382 | "df[\"Energy (eV)\"] = [5.47, 5.14, 0.12, 4.34, 7.01]\n",
383 | "\n",
384 | "print(df[\"Energy (eV)\"])"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "metadata": {
391 | "colab": {
392 | "base_uri": "https://localhost:8080/"
393 | },
394 | "id": "HxPmwuvub5zi",
395 | "outputId": "bbe23a6f-6569-40dc-d5ed-64431df3a9be",
396 | "tags": []
397 | },
398 | "outputs": [],
399 | "source": [
400 | "# Print a row from the DataFrame\n",
401 | "\n",
402 | "# The df.loc[index] function to print the entry \"C\"\n",
403 | "print(df.loc[''])\n",
404 | "\n",
405 | "print('-----')\n",
406 | "\n",
407 | "# The df.iloc[index] function to print the first entry (counting starts at 0...)\n",
408 | "print(df.iloc[0])"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {
414 | "id": "LsKd-p8Ob5zi"
415 | },
416 | "source": [
417 | "\n",
418 | " Code hint \n",
419 | "You need to tell `df.loc` what to look for. Put an element name in between the quotes.\n",
420 | ""
421 | ]
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {
426 | "id": "Ug7HnFwUciGX"
427 | },
428 | "source": [
429 | "### Write an equation"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {
435 | "id": "tlakAjCMciGX"
436 | },
437 | "source": [
438 | "This equation is written in [LaTeX](https://www.overleaf.com/learn/latex/Learn_LaTeX_in_30_minutes) format. It's easy to learn and useful for complex expressions, e.g. `\\frac{x}{y}` writes x/y as a fraction $\\dfrac{x}{y}$.\n",
439 | "\n",
440 | "`$-\\frac{\\hslash^2}{2m} \\, \\frac{\\partial^2 \\psi}{\\partial x^2}$`\n",
441 | "\n",
442 | "renders as\n",
443 | "\n",
444 | "$-\\dfrac{\\hslash^2}{2m} \\, \\dfrac{\\partial^2 \\psi}{\\partial x^2}$"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "metadata": {
450 | "id": "6LT9mCDQciGX"
451 | },
452 | "source": [
453 | "### Link an image"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {
459 | "id": "oteHuO9DciGY"
460 | },
461 | "source": [
462 | "The syntax employed here is Markdown. It can be used in notebooks, is popular on Github for documentation, and can even be a fast way to take notes during lectures.\n",
463 | "\n",
464 | "``\n",
465 | "\n",
466 | "which renders as\n",
467 | "\n",
468 | ""
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {
474 | "id": "8uepYP7rciGY"
475 | },
476 | "source": [
477 | "## Computational science\n",
478 | "\n",
479 | "### Thermally-actived diffusion\n",
480 | "\n",
481 | "Ion transport in crystals is a fundamental process that underpins various technological applications, from batteries to semiconductor devices. Understanding the kinetics of ion movement within and between materials is crucial for optimising device performance.\n",
482 | "\n",
483 | "Like many chemical processes, solid-state diffusion transport is thermally activated. We can describe ion motion in a crystal using a familiar Arrhenius relationship.\n",
484 | "\n",
485 | "The diffusion coefficient of a species is given by $D_{ion} = D_0 \\cdot e^{-(\\frac{\\Delta E_a}{k_BT})}$, where:\n",
486 | "- $D_{ion}$ is the diffusion coefficient for a particular ion,\n",
487 | "- $D_0$ is the temperature-independent prefactor (containing an attempt frequency),\n",
488 | "- $\\Delta E_a$ is the activation energy for diffusion,\n",
489 | "- $k_B$ is the Boltzmann constant, and\n",
490 | "- $T$ is the temperature.\n",
491 | "\n",
492 | "Let's write a function for it, which will take advantage of the wonderful [NumPy](https://numpy.org) package. It also uses the [physical constants](https://docs.scipy.org/doc/scipy/reference/constants.html#physical-constants) in [SciPy](https://scipy.org), and explains the function with a [docstring](https://en.wikipedia.org/wiki/Docstring)."
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": null,
498 | "metadata": {
499 | "tags": []
500 | },
501 | "outputs": [],
502 | "source": [
503 | "import numpy as np\n",
504 | "from scipy.constants import physical_constants\n",
505 | "\n",
506 | "# Define constants\n",
507 | "k_B = physical_constants['Boltzmann constant in eV/K'][0]\n",
508 | "\n",
509 | "# Arrhenius function\n",
510 | "def arrhenius(activation_energy, temperature, D0=1):\n",
511 | " \"\"\"\n",
512 | " Calculates the rate using the Arrhenius equation.\n",
513 | " \n",
514 | " Parameters:\n",
515 | " activation_energy (float): the activation energy in eV.\n",
516 | " temperature (float): the temperature in K (must be > 0).\n",
517 | " D0 (float): the pre-exponential factor (default is 1).\n",
518 | " \n",
519 | " Returns:\n",
520 | " float: the rate of the reaction.\n",
521 | " \"\"\"\n",
522 | " if np.any(temperature <= 0):\n",
523 | " raise ValueError(\"Temperature must be greater than 0 K\")\n",
524 | " return D0 * np.exp(-activation_energy / (k_B * temperature))"
525 | ]
526 | },
527 | {
528 | "cell_type": "markdown",
529 | "metadata": {
530 | "id": "R8aKxKtuciGY"
531 | },
532 | "source": [
533 | "This function takes `activation_energy` (eV) and `temperature` (K) as inputs and returns the corresponding diffusion coefficient. Recall that the units of the exponential term cancel out, so $D_{ion}$ takes the same units as $D_0$. Now let's use the function:"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {
540 | "colab": {
541 | "base_uri": "https://localhost:8080/"
542 | },
543 | "id": "gO22e47tciGY",
544 | "outputId": "7f1557d3-674b-45b3-e878-7de2021946ae",
545 | "tags": []
546 | },
547 | "outputs": [],
548 | "source": [
549 | " # Call the function for Ea = 0.12 eV; T = 1000 K\n",
550 | "arrhenius(0.12, 1000) "
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {
556 | "id": "7rMW0e9bciGY"
557 | },
558 | "source": [
559 | "This value tells us the likelihood that each attempt has of overcoming the thermodynamic barrier for ionic diffusion. Decrease the temperature to 100 K and see the difference.\n",
560 | "\n",
561 | "Now let's take advantage of the function to make a plot. We will use the numpy function `linspace`, which is documented over [here](https://numpy.org/doc/stable/reference/generated/numpy.linspace.html). It is used here to generate 100 numbers evenly spaced between 100 and 5000 that represent the temperature range of our \"experiments\"."
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {
568 | "colab": {
569 | "base_uri": "https://localhost:8080/",
570 | "height": 472
571 | },
572 | "id": "WkWCrwDsb5zj",
573 | "outputId": "f4fade20-aa13-4ab5-9ba9-c6434e97e16b",
574 | "tags": []
575 | },
576 | "outputs": [],
577 | "source": [
578 | "import matplotlib.pyplot as plt\n",
579 | "\n",
580 | "# Pre-exponential term in cm^2/s\n",
581 | "D0 = 0.5\n",
582 | "\n",
583 | "# Range of activation energies in eV\n",
584 | "activation_energies = np.linspace(0.1, 1, 0) # Range from 0.1 to 1 eV in n steps\n",
585 | "\n",
586 | "# Temperature range in K\n",
587 | "T = np.linspace(100, 5000, 100)\n",
588 | "\n",
589 | "# Calculate rates and plot curves\n",
590 | "plt.figure(figsize=(5, 3)) \n",
591 | "\n",
592 | "for activation_energy in activation_energies:\n",
593 | " rates = arrhenius(activation_energy, T, D0)\n",
594 | " plt.plot(T, rates, label=f'{activation_energy:.1f} eV')\n",
595 | "\n",
596 | "plt.xlabel('Temperature (K)')\n",
597 | "plt.ylabel('$D_{ion}$ (cm$^2$/s)') \n",
598 | "plt.title('Varying activation energy')\n",
599 | "plt.legend()\n",
600 | "plt.grid(True)\n",
601 | "plt.show()"
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "metadata": {
607 | "id": "8DvhjGpCb5zk"
608 | },
609 | "source": [
610 | "\n",
611 | " Code hint \n",
612 | "'np.linspace' requires three arguments (start, stop, number of points). 0 points won't work. Try changing it to 5.\n",
613 | ""
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {
619 | "id": "uWlaJMBQciGZ"
620 | },
621 | "source": [
622 | "To better visualise the trends, we can make an Arrhenius plot by plotting the natural logarithm of $D$ versus the inverse temperature, 1/T. We use 1000/T to give a nicer range on the $x$-axis."
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": null,
628 | "metadata": {
629 | "colab": {
630 | "base_uri": "https://localhost:8080/",
631 | "height": 472
632 | },
633 | "id": "7Wgi_g2wciGZ",
634 | "outputId": "8b602c70-d78e-40c2-a00b-226582c57e43",
635 | "tags": []
636 | },
637 | "outputs": [],
638 | "source": [
639 | "# Plotting ln(R) vs 1000/T\n",
640 | "plt.figure(figsize=(5, 3)) \n",
641 | "\n",
642 | "for activation_energy in activation_energies:\n",
643 | " rates = arrhenius(activation_energy, T, D0)\n",
644 | " plt.plot(1000/T, np.log(rates), label=f'{activation_energy:.1f} eV')\n",
645 | "\n",
646 | "plt.xlabel('1000 / Temperature (1/K)')\n",
647 | "plt.ylabel('ln($D_{ion}$)')\n",
648 | "plt.title('Arrhenius plot')\n",
649 | "plt.legend()\n",
650 | "plt.grid(True)\n",
651 | "plt.show()"
652 | ]
653 | },
654 | {
655 | "cell_type": "markdown",
656 | "metadata": {
657 | "id": "GzN2cRN0ciGZ"
658 | },
659 | "source": [
660 | "The last technique to pick up in this class is data fitting. Later in the module, we will use more complex functions in high dimensions, but let's start with linear regression. There is no need to code this by hand as we can use a [function](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) in the machine learning package [scikit-learn](https://scikit-learn.org). The real power of Python is the quality and quantity of available libraries such as this one."
661 | ]
662 | },
663 | {
664 | "cell_type": "code",
665 | "execution_count": null,
666 | "metadata": {
667 | "colab": {
668 | "base_uri": "https://localhost:8080/"
669 | },
670 | "id": "8eOomWxMb5zk",
671 | "outputId": "b3cf49b2-076a-42c0-a274-486f7270b0ad",
672 | "tags": []
673 | },
674 | "outputs": [],
675 | "source": [
676 | "import numpy as np\n",
677 | "import pandas as pd\n",
678 | "\n",
679 | "num_points = # Number of data points to generate\n",
680 | "\n",
681 | "# Generate random x-y data points\n",
682 | "x_data = np.random.uniform(0, 10, num_points) # Adjust the range as needed\n",
683 | "y_data = np.random.uniform(0, 10, num_points)\n",
684 | "\n",
685 | "# Create a DataFrame\n",
686 | "data = {'X': x_data, 'Y': y_data}\n",
687 | "df = pd.DataFrame(data)\n",
688 | "\n",
689 | "# Print the DataFrame\n",
690 | "print(df)"
691 | ]
692 | },
693 | {
694 | "cell_type": "markdown",
695 | "metadata": {
696 | "id": "Ewl5y52Hb5zk"
697 | },
698 | "source": [
699 | "\n",
700 | " Code hint \n",
701 | "Again you need to choose the number of points. 50 should be fine, but you have the power to decide.\n",
702 | ""
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": null,
708 | "metadata": {
709 | "colab": {
710 | "base_uri": "https://localhost:8080/",
711 | "height": 472
712 | },
713 | "id": "eWwUdHjGciGZ",
714 | "outputId": "c733327e-7559-4f0b-c029-aaf1d843855f",
715 | "tags": []
716 | },
717 | "outputs": [],
718 | "source": [
719 | "from sklearn.linear_model import LinearRegression\n",
720 | "from sklearn.metrics import r2_score, mean_squared_error\n",
721 | "\n",
722 | "# Perform linear regression\n",
723 | "X = df['X'].values.reshape(-1, 1) # Reshape X for compatibility with sklearn\n",
724 | "y = df['Y'].values\n",
725 | "model = LinearRegression().fit(X, y)\n",
726 | "y_pred = model.predict(X)\n",
727 | "\n",
728 | "# Calculate error bars\n",
729 | "residuals = y - y_pred\n",
730 | "error_bars = np.abs(residuals)\n",
731 | "\n",
732 | "# Plot the linear regression line\n",
733 | "plt.figure(figsize=(5, 3)) \n",
734 | "plt.errorbar(df['X'], df['Y'], yerr=error_bars, fmt='o', color='skyblue', label='Prediction errors')\n",
735 | "plt.scatter(df['X'], df['Y'])\n",
736 | "plt.plot(df['X'], y_pred, color='red', label='Regression line')\n",
737 | "plt.xlabel('X')\n",
738 | "plt.ylabel('Y')\n",
739 | "plt.title('Linear regression')\n",
740 | "plt.legend()\n",
741 | "plt.show()"
742 | ]
743 | },
744 | {
745 | "cell_type": "markdown",
746 | "metadata": {
747 | "id": "nCig-VAmciGZ"
748 | },
749 | "source": [
750 | "There are a number of useful analysis tools built into `sklearn`, which we can use to probe the model properties."
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": null,
756 | "metadata": {
757 | "tags": []
758 | },
759 | "outputs": [],
760 | "source": [
761 | "# Print the model parameters and performance\n",
762 | "try:\n",
763 | " print(f'Slope: {model2.coef_[0]:.2f}') # Assuming model.coef_ might be an array for multidimensional X\n",
764 | " print(f'Intercept: {model2.intercept_:.2f}')\n",
765 | " print(f'R^2 Score: {r2_score(y, y_pred):.3f}') # R^2 - coefficient of determination\n",
766 | " print(f'RMSE: {np.sqrt(mean_squared_error(y, y_pred)):.3f}') # Root Mean Squared Error\n",
767 | "except Exception as e:\n",
768 | " print(\"Error in calculating model parameters or performance metrics:\", e)"
769 | ]
770 | },
771 | {
772 | "cell_type": "markdown",
773 | "metadata": {
774 | "id": "7K-w3ba2b5zs"
775 | },
776 | "source": [
777 | "\n",
778 | " Code hint \n",
779 | "Your model is not called `model2`. Try changing the name.\n",
780 | ""
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "metadata": {
786 | "id": "2D92BAYzciGa"
787 | },
788 | "source": [
789 | "## 🚨 Exercise 1\n",
790 | "\n",
791 | "
\n",
792 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
793 | "
\n",
794 | "\n",
795 | "### Your details"
796 | ]
797 | },
798 | {
799 | "cell_type": "code",
800 | "execution_count": null,
801 | "metadata": {
802 | "colab": {
803 | "base_uri": "https://localhost:8080/"
804 | },
805 | "id": "xqgBbaSjb5zs",
806 | "outputId": "27965ae7-5d0b-40f2-f4ae-7757939dfb1d",
807 | "tags": []
808 | },
809 | "outputs": [],
810 | "source": [
811 | "import numpy as np\n",
812 | "\n",
813 | "# Insert your values\n",
814 | "Name = \"No Name\" # Replace with your name\n",
815 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
816 | "\n",
817 | "# Set a random seed using the CID value\n",
818 | "CID = int(CID)\n",
819 | "np.random.seed(CID)\n",
820 | "\n",
821 | "# Print the message\n",
822 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
823 | ]
824 | },
825 | {
826 | "cell_type": "markdown",
827 | "metadata": {
828 | "id": "DIia0_h9ciGa"
829 | },
830 | "source": [
831 | "### Problem\n",
832 | "\n",
833 | "Due to their importance in the electronics industry, the diffusion of atoms in semiconductors has been well studied for decades. Below is a set of data for impurity diffusion in crystalline Si [Source: [Casey and Pearson (1975)](https://link.springer.com/chapter/10.1007/978-1-4684-0904-8_2)]. It has been arranged into a DataFrame for your convenience.\n",
834 | "\n",
835 | "```python\n",
836 | "import pandas as pd\n",
837 | "\n",
838 | "data = {\n",
839 | " 'Impurity': ['B', 'Al', 'Ga', 'In', 'P', 'As', 'Sb', 'Bi'],\n",
840 | " 'Mass': [10.81, 26.98, 69.72, 114.82, 30.97, 74.92, 121.76, 208.98], # atomic mass in g/mol\n",
841 | " 'D0': [5.1, 8.0, 3.6, 16.5, 10.5, 60.0, 12.9, 1.03E3], # cm2/sec\n",
842 | " 'Eact': [3.70, 3.47, 3.51, 3.91, 3.69, 4.20, 3.98, 4.63] # eV\n",
843 | "}\n",
844 | "\n",
845 | "df = pd.DataFrame(data)\n",
846 | "print(df)\n",
847 | "```\n",
848 | "\n",
849 | "Two tasks will be given in class."
850 | ]
851 | },
852 | {
853 | "cell_type": "code",
854 | "execution_count": null,
855 | "metadata": {
856 | "colab": {
857 | "base_uri": "https://localhost:8080/",
858 | "height": 472
859 | },
860 | "id": "g01sLM1xifCa",
861 | "outputId": "6d0c3d79-37f3-4dc4-d246-b069fc19e7ae",
862 | "tags": []
863 | },
864 | "outputs": [],
865 | "source": [
866 | "#Empty block for your answers\n",
867 | "\n",
868 | "\n"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": null,
874 | "metadata": {
875 | "id": "wh5CNdABifCa"
876 | },
877 | "outputs": [],
878 | "source": [
879 | "#Empty block for your answers\n",
880 | "\n",
881 | "\n"
882 | ]
883 | },
884 | {
885 | "cell_type": "markdown",
886 | "metadata": {},
887 | "source": [
888 | "
\n",
889 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
890 | "
\n",
19 | " 💡 Hugh Cartwright: The tools of science are changing; artificial intelligence has spread to the laboratory.\n",
20 | "
"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {
26 | "id": "vN4ra0MROMFp"
27 | },
28 | "source": [
29 | "\n",
30 | "\n",
31 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture5-classical)"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {
37 | "id": "IPrAgT4POMFp"
38 | },
39 | "source": [
40 | "## 🎲 Metal or insulator?\n",
41 | "\n",
42 | "In life, some decisions are difficult to make. We hope that our experience informs a choice that is better than a random guess. The same is true for machine learning models.\n",
43 | "\n",
44 | "There are many situations where we want to classify materials according to their properties. One fundamental characteristic is whether a material is a metal or insulator. For this exercise, we can refer to these as class `0` and class `1` materials, respectively. \n",
45 | "\n",
46 | "From our general knowledge, Cu should be `0` and MgO should be `1`, but what about Tl2O3 or Ni2Zn4?\n",
47 | "\n",
48 | "### Theoretical background\n",
49 | "\n",
50 | "Metals are characterised by their free electrons that facilitate the flow of electric current. This arises from a partially filled conduction band, allowing electrons to move easily when subjected to an electric field.\n",
51 | "\n",
52 | "Insulators are characterised by an occupied valence band and empty conduction band, impeding the flow of current. The absence of charge carriers hinders electrical conductivity, making them effective insulators of electricity. Understanding these fundamental differences is crucial for designing and optimising electronic devices.\n",
53 | "\n",
54 | "In this practical, we can use the electronic band gap of a material as a simple descriptor of whether it is a metal (Eg = 0) or an insulator (Eg > 0).\n",
55 | "\n",
56 | "$$\n",
57 | "E_g = E^{conduction-band}_{minimum} - E^{valence-band}_{maximum}\n",
58 | "$$\n",
59 | "\n",
60 | "This classification is coarse as we are ignoring the intermediate regime of semiconductors and more exotic behaviour such as superconductivity.\n",
61 | "\n",
62 | "\n",
63 | "\n",
64 | "## $k$-means clustering\n",
65 | "\n",
66 | "Let's start by generating synthetic data for materials along with their class labels. To make the analysis faster and more illustrative, we can perform dimensionality reduction from a 10D to 2D feature space, and then cluster the data using $k$-means."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "id": "CLEjvAiAOMFp",
74 | "tags": []
75 | },
76 | "outputs": [],
77 | "source": [
78 | "# Installation of libraries\n",
79 | "!pip install elementembeddings --quiet\n",
80 | "!pip install matminer --quiet"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 2,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Import of modules\n",
90 | "import numpy as np # Numerical operations\n",
91 | "import pandas as pd # DataFrames\n",
92 | "import matplotlib.pyplot as plt # Plotting\n",
93 | "import seaborn as sns # Visualisation\n",
94 | "from sklearn.decomposition import PCA # Principal component analysis (PCA)\n",
95 | "from sklearn.cluster import KMeans # k-means clustering\n",
96 | "from sklearn.metrics import accuracy_score, confusion_matrix # Model evaluation\n",
97 | "from sklearn.tree import DecisionTreeClassifier # Decision tree classifier"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "\n",
105 | "Colab error solution\n",
106 | "If running the import module cell fails with an \"AttributeError\", click `Runtime` -> `Restart Session` and then simply rerun the cell.\n",
107 | ""
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "### Uncorrelated data\n",
115 | "\n",
116 | "Pay attention to each step in the process:"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "# Step 0: Set the number of clusters\n",
126 | "n_clusters = 0\n",
127 | "\n",
128 | "# Step 1: Generating synthetic (random) data\n",
129 | "np.random.seed(42)\n",
130 | "num_materials = 200\n",
131 | "num_features = 10\n",
132 | "data = np.random.rand(num_materials, num_features)\n",
133 | "labels = np.random.randint(0, 2, num_materials)\n",
134 | "\n",
135 | "# Step 2: Reduce dimensions to 2 using PCA\n",
136 | "pca = PCA(n_components=2)\n",
137 | "reduced_data = pca.fit_transform(data)\n",
138 | "\n",
139 | "# Step 3: Cluster the data using k-means\n",
140 | "kmeans = KMeans(n_clusters=n_clusters, random_state=42)\n",
141 | "predicted_labels = kmeans.fit_predict(reduced_data)\n",
142 | "\n",
143 | "# Step 4: Create a plot to visualise the clusters and known labels\n",
144 | "plt.figure(figsize=(5, 4))\n",
145 | "\n",
146 | "# Plot the materials labeled as metal (label=1)\n",
147 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n",
148 | "# Plot the materials labeled as insulator (label=0)\n",
149 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n",
150 | "# Plot the cluster centres as stars\n",
151 | "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='gold', s=200, label='Cluster centres', marker='*')\n",
152 | "\n",
153 | "# Draw cluster boundaries\n",
154 | "h = 0.02 # step size for the meshgrid\n",
155 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n",
156 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n",
157 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
158 | "Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])\n",
159 | "Z = Z.reshape(xx.shape)\n",
160 | "plt.contourf(xx, yy, Z, alpha=0.2, cmap='Pastel1')\n",
161 | "\n",
162 | "plt.xlabel('Principal Component 1')\n",
163 | "plt.ylabel('Principal Component 2')\n",
164 | "plt.title('$k$-means clustering of synthetic data')\n",
165 | "plt.legend()\n",
166 | "plt.show()"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "id": "iyjuZJYmOMFq"
173 | },
174 | "source": [
175 | "\n",
176 | " Code hint \n",
177 | "The algorithm fails for 0 clusters. \n",
178 | "Increase the value of `n_clusters` and look at the behaviour.\n",
179 | ""
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {
185 | "id": "RWChaS_mOMFr"
186 | },
187 | "source": [
188 | "The cluster centres are shown by yellow stars. The model doesn't perform well, as we just generated this \"materials data\" from random numbers. There are no correlations for the algorithms to exploit. Nonetheless, this type of \"failed experiment\" is common in real research.\n",
189 | "\n",
190 | "Since we know the labels, we can quantify how bad the model using the classification accuracy. Is it better than flipping a coin? "
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {
197 | "colab": {
198 | "base_uri": "https://localhost:8080/"
199 | },
200 | "id": "XyShX2J-OMFr",
201 | "outputId": "c3c1425f-4354-4080-c42d-f025bedca416",
202 | "tags": []
203 | },
204 | "outputs": [],
205 | "source": [
206 | "# Step 5: Quantify classification accuracy\n",
207 | "accuracy = accuracy_score(labels, predicted_labels)\n",
208 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n",
209 | "\n",
210 | "print(\"Accuracy:\", accuracy)\n",
211 | "print(\"\\nConfusion matrix:\")\n",
212 | "print(conf_matrix)"
213 | ]
214 | },
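For comparison, a minimal sketch of the coin-flip baseline mentioned above, assuming the `labels` array from the cells earlier:

```python
# Minimal sketch: accuracy of random guessing, assuming `labels` from the cells above
import numpy as np
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
coin_flips = rng.integers(0, 2, size=len(labels))  # random 0/1 guesses
print("Coin-flip accuracy:", accuracy_score(labels, coin_flips))
```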
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {
218 | "id": "uSM_A4-ZOMFr"
219 | },
220 | "source": [
221 | "## Decision tree classifier\n",
222 | "\n",
223 | "Let's see if we can do better using a dedicated classifier. We will now train a decision tree to tackle the same problem and visualise the decision boundary."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "colab": {
231 | "base_uri": "https://localhost:8080/",
232 | "height": 564
233 | },
234 | "id": "ZKbtozXuOMFr",
235 | "outputId": "1e23e4bb-e043-4c37-b74a-413a5e002bc1",
236 | "tags": []
237 | },
238 | "outputs": [],
239 | "source": [
240 | "# Step 0: Set the depth of the decision tree\n",
241 | "max_tree_depth = 0\n",
242 | "\n",
243 | "# Step 1: Train a decision tree classifier\n",
244 | "def train_decision_tree(depth, reduced_data, labels):\n",
245 | " tree_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)\n",
246 | " tree_classifier.fit(reduced_data, labels)\n",
247 | " return tree_classifier\n",
248 | "\n",
249 | "tree_classifier = train_decision_tree(max_tree_depth, reduced_data, labels)\n",
250 | "predicted_labels = tree_classifier.predict(reduced_data)\n",
251 | "\n",
252 | "# Step 2: Create a plot to visualise the decision boundary of the decision tree\n",
253 | "plt.figure(figsize=(5, 4))\n",
254 | "\n",
255 | "# Plot the materials labeled as metal (label=1)\n",
256 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n",
257 | "# Plot the materials labeled as insulator (label=0)\n",
258 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n",
259 | "# Plot the decision boundary of the decision tree classifier\n",
260 | "h = 0.02 # step size for the meshgrid\n",
261 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n",
262 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n",
263 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
264 | "Z = tree_classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n",
265 | "Z = Z.reshape(xx.shape)\n",
266 | "plt.contourf(xx, yy, Z, alpha=0.5, cmap='Pastel1')\n",
267 | "\n",
268 | "plt.xlabel('Principal Component 1')\n",
269 | "plt.ylabel('Principal Component 2')\n",
270 | "plt.title(f'Decision tree (max depth={max_tree_depth}) of synthetic data')\n",
271 | "plt.legend()\n",
272 | "\n",
273 | "plt.show()"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {
279 | "id": "SW0VbC_4OMFr"
280 | },
281 | "source": [
282 | "\n",
283 | " Code hint \n",
284 | "With no nodes, you have made an indecisive tree 🥁.\n",
285 | " \n",
286 | "Increase the value of `max_tree_depth` and look at the behaviour.\n",
287 | ""
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {
293 | "id": "sOqtQymnOMFs"
294 | },
295 | "source": [
296 | "There should be more structure in the decision boundary due to the more complex model, especially as you increase the tree depth.\n",
297 | "\n",
298 | "$k$-means clustering provides a simple way to group materials based on similarity, yielding a clear linear decision boundary. On the other hand, the decision tree classifier does better in handling non-linear separations. It constructs a boundary based on different feature thresholds, enabling it to capture fine-grained patterns. As always in ML, there is a balance of trade-offs between simplicity and accuracy.\n",
299 | "\n",
300 | "Is the decision tree more accurate? Let's see."
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {
307 | "colab": {
308 | "base_uri": "https://localhost:8080/"
309 | },
310 | "id": "PucrDphBOMFs",
311 | "outputId": "09ad3aa5-1ee9-4597-e2ca-c38b5d85057b",
312 | "tags": []
313 | },
314 | "outputs": [],
315 | "source": [
316 | "# Step 3: Quantify classification accuracy\n",
317 | "accuracy = accuracy_score(labels, predicted_labels)\n",
318 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n",
319 | "\n",
320 | "print(\"Accuracy:\", accuracy)\n",
321 | "print(\"\\nConfusion Matrix:\")\n",
322 | "print(conf_matrix)"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {
328 | "id": "SNQRw2RhOMFs"
329 | },
330 | "source": [
331 | "If you choose a large value for the tree depth, the decision tree will approach a perfect accuracy of 1.0. It does this by memorising (overfitting) the training data but is unlikely to generalise well to new (unseen) data, i.e. overfitting. In contrast, the accuracy of $k$-means clustering is lower because it is an unsupervised algorithm designed for clustering, not classification. Its performance depends on the data structure and the presence of distinct clusters in that feature space.\n",
332 | "\n",
333 | "### Correlated data\n",
334 | "\n",
335 | "Let's try again, but this time we will (manually) add some correlations into the dataset."
336 | ]
337 | },
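{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Before adding the correlations, here is a quick way to see the overfitting described above. This is a minimal sketch, assuming the `reduced_data` and `labels` arrays from the synthetic example: we hold out 30% of the points and compare training and test accuracy as the tree depth grows (the depths chosen are arbitrary).\n",
  "\n",
  "```python\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn.tree import DecisionTreeClassifier\n",
  "from sklearn.metrics import accuracy_score\n",
  "\n",
  "# Hold out 30% of the synthetic data as a test set\n",
  "X_tr, X_te, y_tr, y_te = train_test_split(reduced_data, labels, test_size=0.3, random_state=42)\n",
  "\n",
  "for depth in [1, 3, 10, None]:  # None = grow the tree until all leaves are pure\n",
  "    tree = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_tr, y_tr)\n",
  "    train_acc = accuracy_score(y_tr, tree.predict(X_tr))\n",
  "    test_acc = accuracy_score(y_te, tree.predict(X_te))\n",
  "    print(f'depth={depth}: train accuracy = {train_acc:.2f}, test accuracy = {test_acc:.2f}')\n",
  "```\n",
  "\n",
  "For random (uncorrelated) features, the training accuracy keeps rising with depth while the test accuracy stays close to chance, which is the signature of overfitting."
 ]
},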
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "# Modify dataset with correlation\n",
345 | "correlation_strength = 0.333\n",
346 | "for i in range(num_features):\n",
347 | " # For some features, add a linear correlation with the labels\n",
348 | " if i % 2 == 0: # Correlate every other feature\n",
349 | " data[:, i] = correlation_strength * labels + (1 - correlation_strength) * np.random.rand(num_materials)\n",
350 | "\n",
351 | "pca = PCA(n_components=2)\n",
352 | "reduced_data = pca.fit_transform(data)\n",
353 | "\n",
354 | "# Step 0: Set the depth of the decision tree\n",
355 | "max_tree_depth = 1\n",
356 | "\n",
357 | "# Step 1: Train a decision tree classifier\n",
358 | "def train_decision_tree(depth, reduced_data, labels):\n",
359 | " tree_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)\n",
360 | " tree_classifier.fit(reduced_data, labels)\n",
361 | " return tree_classifier\n",
362 | "\n",
363 | "tree_classifier = train_decision_tree(max_tree_depth, reduced_data, labels)\n",
364 | "predicted_labels = tree_classifier.predict(reduced_data)\n",
365 | "\n",
366 | "# Step 2: Create a plot to visualise the decision boundary of the decision tree\n",
367 | "plt.figure(figsize=(5, 4))\n",
368 | "\n",
369 | "# Plot the materials labeled as metal (label=1)\n",
370 | "plt.scatter(reduced_data[labels == 1, 0], reduced_data[labels == 1, 1], c='lightblue', label='Metal')\n",
371 | "# Plot the materials labeled as insulator (label=0)\n",
372 | "plt.scatter(reduced_data[labels == 0, 0], reduced_data[labels == 0, 1], c='lightcoral', label='Insulator')\n",
373 | "# Plot the decision boundary of the decision tree classifier\n",
374 | "h = 0.02 # step size for the meshgrid\n",
375 | "x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\n",
376 | "y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\n",
377 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
378 | "Z = tree_classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n",
379 | "Z = Z.reshape(xx.shape)\n",
380 | "plt.contourf(xx, yy, Z, alpha=0.5, cmap='Pastel1')\n",
381 | "\n",
382 | "plt.xlabel('Principal Component 1')\n",
383 | "plt.ylabel('Principal Component 2')\n",
384 | "plt.title(f'Decision tree (max depth={max_tree_depth}) for artificial materials')\n",
385 | "plt.legend()\n",
386 | "\n",
387 | "plt.show()\n",
388 | "\n",
389 | "# Step 3: Quantify classification accuracy\n",
390 | "accuracy = accuracy_score(labels, predicted_labels)\n",
391 | "conf_matrix = confusion_matrix(labels, predicted_labels)\n",
392 | "\n",
393 | "print(\"Accuracy:\", accuracy)\n",
394 | "print(\"\\nConfusion Matrix:\")\n",
395 | "print(conf_matrix)"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "Now even a very simple tree can effectively draw a decision boundary. Machine learning models take advantage of such correlations in high dimensional feature spaces. You can modify the correlation strength on line 2 to see the effect."
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "id": "yAJreGhfOMFs",
409 | "tags": []
410 | },
411 | "source": [
412 | "## Real materials\n",
413 | "\n",
414 | "We can save time again by making use of a pre-built dataset. We will return to [matminer](https://hackingmaterials.lbl.gov/matminer), which we used before, and load `matbench_expt_is_metal`.\n",
415 | "\n",
416 | "### Load dataset"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {
423 | "tags": []
424 | },
425 | "outputs": [],
426 | "source": [
427 | "import matminer\n",
428 | "from matminer.datasets.dataset_retrieval import load_dataset\n",
429 | "\n",
430 | "# Use matminer to download the dataset\n",
431 | "df = load_dataset('matbench_expt_is_metal')\n",
432 | "print(f'The full dataset contains {df.shape[0]} entries. \\n')\n",
433 | "\n",
434 | "# Display the first 10 entries\n",
435 | "df.head(10)"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {
441 | "id": "sXq9bXwGOMFs"
442 | },
443 | "source": [
444 | "\n",
445 | " Code hint \n",
446 | "To load a different dataset, you simply change the name in 'load_dataset()'.\n",
447 | ""
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {
453 | "id": "Y76d3NLhZADO"
454 | },
455 | "source": [
456 | "### Materials featurisation\n",
457 | "\n",
458 | "Revisiting concepts from earlier Notebooks, featurising the chemical compositions is necessary to create a useful set of input vectors. This allows the presence (or absence) of an element (or element combinations) to act as a feature that the classifier takes account for.\n",
459 | "\n",
460 | "We will use [ElementEmbeddings](https://wmd-group.github.io/ElementEmbeddings) to featurise the `composition` column. The importance of the pooling method can be tested by generating two sets of features. In the first, the mean of the atomic vectors is used, while in the second, a max pooling method takes the maximum value of each component across all the atomic vectors in the composition."
461 | ]
462 | },
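{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "To make the difference between the two pooling strategies concrete, here is a minimal sketch that pools two made-up 4-dimensional element vectors by hand (the numbers are invented for illustration and are not real embeddings):\n",
  "\n",
  "```python\n",
  "import numpy as np\n",
  "\n",
  "# Hypothetical embedding vectors for the two elements in a binary composition\n",
  "vec_a = np.array([0.2, 1.0, 0.5, 3.0])\n",
  "vec_b = np.array([0.8, 0.4, 0.5, 1.0])\n",
  "vectors = np.stack([vec_a, vec_b])\n",
  "\n",
  "mean_pool = vectors.mean(axis=0)  # average of each component -> [0.5, 0.7, 0.5, 2.0]\n",
  "max_pool = vectors.max(axis=0)    # maximum of each component -> [0.8, 1.0, 0.5, 3.0]\n",
  "\n",
  "print('Mean pooled:', mean_pool)\n",
  "print('Max pooled: ', max_pool)\n",
  "```\n",
  "\n",
  "Mean pooling keeps a contribution from every atom, while max pooling is dominated by whichever element has the largest value of each feature."
 ]
},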
463 | {
464 | "cell_type": "code",
465 | "execution_count": null,
466 | "metadata": {
467 | "colab": {
468 | "base_uri": "https://localhost:8080/",
469 | "height": 559
470 | },
471 | "id": "sTJg5-4yY9au",
472 | "outputId": "18f140fe-b6c1-41dc-b1d2-3adfc6c40e73",
473 | "tags": []
474 | },
475 | "outputs": [],
476 | "source": [
477 | "# Featurise all chemical compositions\n",
478 | "from elementembeddings.composition import composition_featuriser\n",
479 | "\n",
480 | "# Compute element embeddings using mean and max pooling\n",
481 | "mean_df = composition_featuriser(df[\"composition\"], embedding=\"magpie\", stats=[\"mean\"])\n",
482 | "max_df = composition_featuriser(df[\"composition\"], embedding=\"magpie\", stats=[\"maxpool\"])\n",
483 | "\n",
484 | "# Convert \"is_metal\" column to integer labels (0, 1)\n",
485 | "df['is_metal'] = df['is_metal'].astype(int)\n",
486 | "mean_df['is_metal'] = df['is_metal']\n",
487 | "max_df['is_metal'] = df['is_metal']\n",
488 | "\n",
489 | "# Define feature matrices and target variable\n",
490 | "cols_to_drop = ['is_metal', 'formula']\n",
491 | "\n",
492 | "X_mean = mean_df.drop(columns=cols_to_drop, errors='ignore').values\n",
493 | "X_max = max_df.drop(columns=cols_to_drop, errors='ignore').values\n",
494 | "y = df['is_metal'].values # Target variable\n",
495 | "\n",
496 | "# Preview first two rows \n",
497 | "print(\"Mean pooling features (first two rows, first 4 columns):\")\n",
498 | "print(mean_df.iloc[:2, :4]) \n",
499 | "print(\"\\nMax pooling features (first two rows, first 4 columns):\")\n",
500 | "print(max_df.iloc[:2, :4]) "
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "In the output, you can see two numerical representations of the chemical compositions using different feature extraction techniques. Now let's see how they cluster."
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "### $k$-means clustering \n",
515 | "\n",
516 | "#### Mean pool"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": null,
522 | "metadata": {},
523 | "outputs": [],
524 | "source": [
525 | "# Perform k-means clustering\n",
526 | "kmeans = KMeans(n_clusters=2, random_state=42)\n",
527 | "predicted_labels = kmeans.fit_predict(X_mean)\n",
528 | "\n",
529 | "# Adjust k-means output to match true labels\n",
530 | "if accuracy_score(y, predicted_labels) < 0.5:\n",
531 | " predicted_labels = 1 - predicted_labels\n",
532 | "\n",
533 | "# Assess performance\n",
534 | "accuracy = accuracy_score(y, predicted_labels)\n",
535 | "print(f\"Accuracy: {accuracy:.2f}\")\n",
536 | "\n",
537 | "conf_matrix = confusion_matrix(y, predicted_labels)\n",
538 | "\n",
539 | "plt.figure(figsize=(5, 4))\n",
540 | "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", \n",
541 | " xticklabels=['Predicted Insulator', 'Predicted Metal'], \n",
542 | " yticklabels=['True Insulator', 'True Metal'])\n",
543 | "plt.xlabel('Predicted label')\n",
544 | "plt.ylabel('True label')\n",
545 | "plt.show()"
546 | ]
547 | },
548 | {
549 | "cell_type": "markdown",
550 | "metadata": {},
551 | "source": [
552 | "#### Max pool"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "# Perform k-means clustering\n",
562 | "kmeans = KMeans(n_clusters=2, random_state=42)\n",
563 | "predicted_labels = kmeans.fit_predict(X_max)\n",
564 | "\n",
565 | "# Adjust k-means output to match true labels\n",
566 | "if accuracy_score(y, predicted_labels) < 0.5:\n",
567 | " predicted_labels = 1 - predicted_labels\n",
568 | "\n",
569 | "# Assess performance\n",
570 | "accuracy = accuracy_score(y, predicted_labels)\n",
571 | "print(f\"Accuracy: {accuracy:.2f}\")\n",
572 | "\n",
573 | "conf_matrix = confusion_matrix(y, predicted_labels)\n",
574 | "\n",
575 | "plt.figure(figsize=(5, 4))\n",
576 | "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\", \n",
577 | " xticklabels=['Predicted Insulator', 'Predicted Metal'], \n",
578 | " yticklabels=['True Insulator', 'True Metal'])\n",
579 | "plt.xlabel('Predicted label')\n",
580 | "plt.ylabel('True label')\n",
581 | "plt.show()"
582 | ]
583 | },
584 | {
585 | "cell_type": "markdown",
586 | "metadata": {},
587 | "source": [
588 | "The difference in accuracy between the two methods for this simple example highlights the importance of choosing an appropriate pooling strategy when featurising materials data. In this case, mean pooling provides a more balanced representation, which better distinguishes between metals and insulators."
589 | ]
590 | },
591 | {
592 | "cell_type": "markdown",
593 | "metadata": {
594 | "tags": []
595 | },
596 | "source": [
597 | "## 🚨 Exercise 5\n",
598 | "\n",
599 | "
\n",
600 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
601 | "
\n",
602 | "\n",
603 | "### Your details"
604 | ]
605 | },
606 | {
607 | "cell_type": "code",
608 | "execution_count": null,
609 | "metadata": {},
610 | "outputs": [],
611 | "source": [
612 | "import numpy as np\n",
613 | "\n",
614 | "# Insert your values\n",
615 | "Name = \"No Name\" # Replace with your name\n",
616 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
617 | "\n",
618 | "# Set a random seed using the CID value\n",
619 | "CID = int(CID)\n",
620 | "np.random.seed(CID)\n",
621 | "\n",
622 | "# Print the message\n",
623 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
624 | ]
625 | },
626 | {
627 | "cell_type": "markdown",
628 | "metadata": {
629 | "id": "4WAC3QJYOMFs",
630 | "tags": []
631 | },
632 | "source": [
633 | "### Problem\n",
634 | "\n",
635 | "The choice of featurisation method can significantly impact the performance of machine learning models, particularly in decision trees, which rely on the features to make accurate splits. \n",
636 | "\n",
637 | "Tasks will be given in class focusing on comparing the impact of different featurisation methods on classification performance."
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": null,
643 | "metadata": {},
644 | "outputs": [],
645 | "source": [
646 | "#Empty block for your answers\n",
647 | "\n",
648 | "\n"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": null,
654 | "metadata": {},
655 | "outputs": [],
656 | "source": [
657 | "#Empty block for your answers\n",
658 | "\n",
659 | "\n"
660 | ]
661 | },
662 | {
663 | "cell_type": "markdown",
664 | "metadata": {},
665 | "source": [
666 | "\n",
667 | " Task hint \n",
668 | "For task 4, you can featurise a new composition using a command such as `new_material = composition_featuriser([\"AlGaN2\"], embedding=\"atomic\", stats=[\"sum\"])`\n",
669 | "\n",
670 | "\n",
671 | "
\n",
672 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
673 | "
\n",
15 | " 💡 Mildred Dresselhaus: People said you’re crazy... But if you think you’re right, stick to it. And we were right.\n",
16 | "
"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "\n",
24 | "\n",
25 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture7-build)"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## 🦾 Crystal hardness revisited\n",
33 | "\n",
34 | "We first tackled the [bulk modulus](https://en.wikipedia.org/wiki/Bulk_modulus) of inorganic crystals in Lecture 2. However our model development was not thorough back then.\n",
35 | "\n",
36 | "Let's revisit this problem using the new knowledge and tricks we have picked up. We will follow the same initial steps, making use of [matminer](https://matminer.readthedocs.io) to access the materials dataset and featurise the data."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# Installation of libraries\n",
46 | "!pip install matminer --quiet\n",
47 | "!pip install xgboost --quiet"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "# Downgrade scikit to avoid a conflict with xgboost\n",
57 | " # Note: Ignore the error message\n",
58 | "!pip uninstall -y scikit-learn --quiet\n",
59 | "!pip install scikit-learn==1.3.1 --quiet "
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# Import of modules\n",
69 | "import numpy as np \n",
70 | "import matplotlib.pyplot as plt \n",
71 | "import pandas as pd \n",
72 | "import pprint \n",
73 | "import seaborn as sns \n",
74 | "plt.style.use('ggplot') \n",
75 | "\n",
76 | "# Advanced\n",
77 | "from pymatgen.core import Structure \n",
78 | "import matminer \n",
79 | "from matminer.datasets.dataset_retrieval import load_dataset \n",
80 | "from monty.serialization import loadfn \n",
81 | "\n",
82 | "# To make the model run faster\n",
83 | "teaching_mode = True"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "\n",
91 | "Colab error solution\n",
92 | "If running the import module cell fails with an \"AttributeError\", click `Runtime` -> `Restart Session` and then simply rerun the cell. \n",
93 | ""
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {
99 | "tags": []
100 | },
101 | "source": [
102 | "## Data preparation\n",
103 | "\n",
104 | "The steps to load and featurise the bulk modulus data were introduced in Notebook 2, so we can jump straight in."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# Use matminer to load the dataset\n",
114 | "df = load_dataset('matbench_log_kvrh')\n",
115 | "print(f'The full dataset contains {df.shape[0]} entries. \\n')\n",
116 | "\n",
117 | "if teaching_mode:\n",
118 | " # Store the original DataFrame as a copy\n",
119 | " full_dataset_df = df.copy()\n",
120 | " # Create a subset of the original DataFrame for demonstration purposes\n",
121 | " df = df.sample(n=1000, random_state=33)\n",
122 | " print(f'For teaching purposes we will only work with {df.shape[0]} entries from the dataframe to make the model training and testing faster. \\n')\n",
123 | "\n",
124 | "print('The DataFrame is shown below:')\n",
125 | "df.head(10)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "# Plot a histogram of values\n",
135 | "fig, ax = plt.subplots(figsize=(5, 4))\n",
136 | "ax.hist(df['log10(K_VRH)'])\n",
137 | "ax.set_xlabel(r'$log_{10}K_{VRH}$ [$log_{10}GPa$]' )\n",
138 | "ax.set_ylabel('Counts')\n",
139 | "plt.show()"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# Use matminer to featurise the dataset\n",
149 | "from matminer.featurizers.composition.composite import ElementProperty\n",
150 | "from matminer.featurizers.structure.order import DensityFeatures\n",
151 | "\n",
152 | "# Add a composition column to df using the composition property of the Structure class\n",
153 | "df['composition'] = df.structure.apply(lambda x: x.composition )\n",
154 | "\n",
155 | "# Create the ElementProperty featuriser\n",
156 | "el_prop_featuriser = ElementProperty.from_preset(preset_name='magpie')\n",
157 | "\n",
158 | "# By default multiprocessing is enabled, however, this can slow performance, so we disable it\n",
159 | "el_prop_featuriser.set_n_jobs(1)\n",
160 | "\n",
161 | "# Featurise using the ElementProperty featuriser\n",
162 | "df = el_prop_featuriser.featurize_dataframe(df, col_id='composition')\n",
163 | "\n",
164 | "# Add structure features\n",
165 | "density_featuriser = DensityFeatures()\n",
166 | "density_featuriser.set_n_jobs(1)\n",
167 | "df=density_featuriser.fit_featurize_dataframe(df, col_id='structure')\n",
168 | "\n",
169 | "# Print the shape of the DataFrame\n",
170 | "print(df.shape)\n",
171 | "df.head()"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "Let's understand the feature space a little better."
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "# Extract the feature columns (excluding the first three)\n",
188 | "feature_columns = df.columns[3:]\n",
189 | "\n",
190 | "# Create a unique colour for each feature\n",
191 | "colors = [plt.cm.jet(i / float(len(feature_columns))) for i in range(len(feature_columns))]\n",
192 | "\n",
193 | "# Plot the distribution of feature values with different colours\n",
194 | "plt.figure(figsize=(5, 4))\n",
195 | "for i, column in enumerate(feature_columns):\n",
196 | " df[column].plot(kind='hist', bins=0, alpha=0.5, color=colors[i], label=column)\n",
197 | "\n",
198 | "plt.title('Feature Distributions')\n",
199 | "plt.show()"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "\n",
207 | " Code hint \n",
208 | "Add some bins to your histogram. 10-20 should be sufficient.\n",
209 | "\n",
210 | "\n",
211 | "Some dimensions have very different ranges, as you can see from the spread on the x-axis. We can standardise these. \n",
212 | "\n",
213 | "`MinMaxScaler` is a data scaling technique to transform numerical features within the range [0, 1]. It linearly scales data, preserving relationships between values, making it suitable for algorithms sensitive to feature magnitudes."
214 | ]
215 | },
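{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a quick check of what the scaler does, here is a minimal sketch of the min-max transform $x' = (x - x_{min}) / (x_{max} - x_{min})$ applied to a single toy column (the numbers are made up):\n",
  "\n",
  "```python\n",
  "import numpy as np\n",
  "from sklearn.preprocessing import MinMaxScaler\n",
  "\n",
  "column = np.array([[2.0], [4.0], [10.0]])  # toy feature with min = 2 and max = 10\n",
  "\n",
  "scaler = MinMaxScaler()\n",
  "scaled = scaler.fit_transform(column)\n",
  "\n",
  "print(scaled.ravel())  # [0.   0.25 1.  ] since x' = (x - 2) / (10 - 2)\n",
  "```\n",
  "\n",
  "The cell below applies the same transform to every feature column of the DataFrame."
 ]
},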
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "from sklearn.preprocessing import MinMaxScaler\n",
223 | "\n",
224 | "scaled_df = df.copy()\n",
225 | "\n",
226 | "# Step 1: Standardise the feature columns\n",
227 | "scaler = MinMaxScaler()\n",
228 | "scaled_df[feature_columns] = scaler.fit_transform(scaled_df[feature_columns])\n",
229 | "\n",
230 | "# Step 2: Plot the standardised feature distributions\n",
231 | "plt.figure(figsize=(5, 4))\n",
232 | "for column in feature_columns:\n",
233 | " scaled_df[column].plot(kind='hist', bins=20, alpha=0.5, label=column)\n",
234 | "\n",
235 | "plt.title('Standardised Feature Distributions')\n",
236 | "plt.show()"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "Finally, let's prepare the data for model training. We need to split the dataset into the target variable `log10(K_VRH)` and the input features. For the input features, we must remove any non-numerical data to avoid getting errors later in our workflow."
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "# Define the features we want \n",
253 | "features_to_drop = ['structure','composition','log10(K_VRH)']\n",
254 | "feature_cols = [col for col in list(df.columns) if col not in features_to_drop]\n",
255 | "\n",
256 | "# Get an array of the features\n",
257 | "X = df[feature_cols].values\n",
258 | "scaled_X = scaled_df[feature_cols].values\n",
259 | "\n",
260 | "# Get an array of the target variable\n",
261 | "y = df['log10(K_VRH)'].values\n",
262 | "\n",
263 | "print(f'Shape of X: {X.shape}')\n",
264 | "print(f'Shape of y: {y.shape}')"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "## Model choice\n"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "We are dealing with a supervised regression problem, so should choose a suitable machine learning model. We can start by rebuilding a random forest. Are you curious if the feature scaling has an effect? I am."
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "# Random forest - original features\n",
288 | "from sklearn.ensemble import RandomForestRegressor\n",
289 | "from sklearn import metrics\n",
290 | "\n",
291 | "# Define the model\n",
292 | "rf = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=42)\n",
293 | "\n",
294 | "# Fit the model\n",
295 | "rf.fit(X,y)\n",
296 | "\n",
297 | "# Wrap the lines of code for later sections\n",
298 | "def make_prediction_plot(X, y, model, label):\n",
299 | " y_pred = model.predict(X) # Calculate predictions here\n",
300 | " fig, ax = plt.subplots(figsize=(5, 4))\n",
301 | " ax.scatter(y, y_pred, c=y, cmap='viridis')\n",
302 | " ax.plot(y, y, 'r-')\n",
303 | " ax.set_xlabel(f'{label} True')\n",
304 | " ax.set_ylabel(f'{label} Predicted')\n",
305 | " plt.show()\n",
306 | " return y_pred # Return y_pred \n",
307 | "\n",
308 | "# Performance\n",
309 | "y_pred = make_prediction_plot(X, y, rf, 'log10(K_VRH)') \n",
310 | "\n",
311 | "print(f'The training MAE = {metrics.mean_absolute_error(y,y_pred):.3f} log10GPa')\n",
312 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y,y_pred)):.3f} log10GPa')\n",
313 | "print(f'The training r^2 = {rf.score(X,y):.3f}')"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "# Random forest - scaled features\n",
323 | "\n",
324 | "# Define the model\n",
325 | "rf2 = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=42)\n",
326 | "\n",
327 | "# Fit the model\n",
328 | "rf2.fit(scaled_X, y)\n",
329 | "\n",
330 | "# Performance\n",
331 | "y_pred = make_prediction_plot(scaled_X, y, rf2, 'log10(K_VRH)') \n",
332 | "print(f'The training MAE = {metrics.mean_absolute_error(y, y_pred):.3f} log10GPa')\n",
333 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y, y_pred)):.3f} log10GPa')\n",
334 | "print(f'The training r^2 = {rf2.score(scaled_X, y):.3f}')"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "We can see that Random Forest is not sensitive to feature scaling. Recall that this model works by averaging over multiple decision trees, and the decision boundaries are determined by feature thresholds, not their absolute values. \n",
342 | "\n",
343 | "We have time to try one more model. Let's go with the popular [XGBoost](https://xgboost.readthedocs.io). Like Random Forest, it is an ensemble learning method. XGBoost uses a gradient-boosting framework and often achieves higher predictive accuracy by optimising for both bias and variance in the model."
344 | ]
345 | },
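{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The gradient-boosting idea is easy to sketch for the squared-error case: each new tree is fitted to the residuals of the current ensemble, and the prediction is the running sum of all the trees. Below is a minimal illustration using plain scikit-learn decision trees rather than the optimised XGBoost implementation, assuming the `scaled_X`, `y`, `np` and `metrics` objects defined earlier; the learning rate and number of rounds are arbitrary choices.\n",
  "\n",
  "```python\n",
  "from sklearn.tree import DecisionTreeRegressor\n",
  "\n",
  "learning_rate = 0.1\n",
  "n_rounds = 20\n",
  "base = y.mean()          # start the ensemble from the mean of the target\n",
  "residual = y - base      # what is left to explain\n",
  "trees = []\n",
  "\n",
  "for _ in range(n_rounds):\n",
  "    tree = DecisionTreeRegressor(max_depth=3, random_state=42)\n",
  "    tree.fit(scaled_X, residual)                          # fit the current residuals\n",
  "    trees.append(tree)\n",
  "    residual -= learning_rate * tree.predict(scaled_X)    # update the residuals\n",
  "\n",
  "# Prediction = base value + weighted sum of the trees\n",
  "y_boost = base + learning_rate * np.sum([t.predict(scaled_X) for t in trees], axis=0)\n",
  "print(f'Hand-rolled boosting training MAE = {metrics.mean_absolute_error(y, y_boost):.3f} log10GPa')\n",
  "```\n",
  "\n",
  "XGBoost builds on the same principle but adds regularisation, clever tree construction and a highly optimised implementation."
 ]
},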
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "# XGBoost model\n",
353 | "import xgboost as xgb\n",
354 | "\n",
355 | "# Define the model\n",
356 | "xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=42, objective='reg:squarederror')\n",
357 | "\n",
358 | "# Fit the model\n",
359 | "xgb_model.fit(scaled_X, y)\n",
360 | "\n",
361 | "# Performance\n",
362 | "y_pred = make_prediction_plot(scaled_X, y, xgb_model, 'log10(K_VRH)') \n",
363 | "print(f'The training MAE = {metrics.mean_absolute_error(y, y_pred):.3f} log10GPa')\n",
364 | "print(f'The training RMSE = {np.sqrt(metrics.mean_squared_error(y, y_pred)):.3f} log10GPa')\n",
365 | "print(f'The training r^2 = {xgb_model.score(scaled_X, y):.3f}')"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "XGBoost does a better job, but wait... \n",
373 | "\n",
374 | "We haven't performed proper training and testing yet 😱. These models are likely to be overfit and unable to make useful predictions for new inputs. On to the next stage!"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "## Training and testing\n",
382 | "\n",
383 | "### Train-test split\n",
384 | "\n",
385 | "We are ready to build a real model now. Let's separate the training data from the unseen test set used to assess model performance."
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "from slearn.model_selection import train_test_split\n",
395 | "\n",
396 | "# Split the data into 80% training and 20% testing\n",
397 | "X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42)\n",
398 | "\n",
399 | "# Print the sizes of the arrays\n",
400 | "print(f\"X_train shape: {X_train.shape}\")\n",
401 | "print(f\"y_train shape: {y_train.shape}\")\n",
402 | "print(f\"X_test shape: {X_test.shape}\")\n",
403 | "print(f\"y_test shape: {y_test.shape}\")"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "\n",
411 | " Code hint \n",
412 | "The library is \"sklearn\"!\n",
413 | ""
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "### Cross-validation \n",
421 | "\n",
422 | "Using the 80% training set, we can train a model by making use of [cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html) in an attempt to avoid overfitting. Note that this step may take a minute to run as 10 models are being trained (i.e. 5-fold cross-validation x 2 models).\n",
423 | "\n",
424 | "\n",
425 | " Recap of cross-validation \n",
426 | "Cross-validation partitions data into multiple subsets, training the model on some and validating it on others, ensuring robust evaluation.\n",
427 | "\n",
428 | "_Key types include:_\n",
429 | "\n",
430 | "- **k-Fold Cross-Validation**: Data is split into *k* folds; each fold is used as a validation set once while training on the remaining *k-1* folds.\n",
431 | "- **Leave-One-Out Cross-Validation (LOOCV)**: Each data point is used as a validation set once, with the rest for training.\n",
432 | "- **Stratified k-Fold**: Preserves class proportions in each fold, useful for imbalanced datasets.\n",
433 | "- **Time Series Cross-Validation**: Ensures training always precedes validation, preserving temporal structure.\n",
434 | "\n",
435 | "_Typical workflow:_\n",
436 | "\n",
437 | "1. **Split Data**: Divide the dataset into *k* folds.\n",
438 | "2. **Train and Validate**: Train the model on *k-1* folds, validate on the remaining fold.\n",
439 | "3. **Repeat**: Cycle through all folds, ensuring each serves as a validation set.\n",
440 | "4. **Aggregate Results**: Compute performance metrics across all iterations.\n",
441 | "5. **Train Final Model:** Fit the model using the full training dataset based on cross-validation insights.\n",
442 | ""
443 | ]
444 | },
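{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Before handing this over to `cross_val_score`, it can help to see what 5-fold cross-validation does under the hood. A minimal sketch with `KFold`, assuming the `X_train` and `y_train` arrays from the split above (the model settings mirror the random forest used below):\n",
  "\n",
  "```python\n",
  "from sklearn.model_selection import KFold\n",
  "from sklearn.ensemble import RandomForestRegressor\n",
  "from sklearn.metrics import mean_squared_error\n",
  "\n",
  "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
  "fold_rmse = []\n",
  "\n",
  "for train_idx, val_idx in kf.split(X_train):\n",
  "    model = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42)\n",
  "    model.fit(X_train[train_idx], y_train[train_idx])       # train on 4 folds\n",
  "    preds = model.predict(X_train[val_idx])                  # validate on the held-out fold\n",
  "    fold_rmse.append(np.sqrt(mean_squared_error(y_train[val_idx], preds)))\n",
  "\n",
  "print('RMSE per fold:', [f'{r:.3f}' for r in fold_rmse])\n",
  "print(f'Mean RMSE over folds: {np.mean(fold_rmse):.3f}')\n",
  "```\n",
  "\n",
  "`cross_val_score` in the next cell automates exactly this loop (it uses unshuffled folds by default)."
 ]
},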
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "from sklearn.model_selection import cross_val_score\n",
452 | "from xgboost import XGBRegressor\n",
453 | "\n",
454 | "# Define models\n",
455 | "xgb_model = XGBRegressor(n_estimators=100, max_depth=3, random_state=42, objective='reg:squarederror') \n",
456 | "rf_model = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42)\n",
457 | "\n",
458 | "# Perform cross-validation for XGBoost\n",
459 | "xgb_cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
460 | "xgb_rmse = np.sqrt(-xgb_cv_scores) # Convert to RMSE\n",
461 | "\n",
462 | "# Perform cross-validation for Random Forest\n",
463 | "rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
464 | "rf_rmse = np.sqrt(-rf_cv_scores) # Convert to RMSE\n",
465 | "\n",
466 | "# Print results\n",
467 | "# Compare the results\n",
468 | "print(\"XGBoost Cross-Validation Results\")\n",
469 | "print(f\" Mean RMSE: {xgb_rmse.mean():.3f}\")\n",
470 | "print(f\" Standard Deviation of RMSE: {xgb_rmse.std():.3f}\")\n",
471 | "\n",
472 | "print(\"\\nRandom Forest Cross-Validation Results\")\n",
473 | "print(f\" Mean RMSE: {rf_rmse.mean():.3f}\")\n",
474 | "print(f\" Standard Deviation of RMSE: {rf_rmse.std():.3f}\")"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {},
480 | "source": [
481 | "
\n",
482 | " 🙋 Cross-validation output: \n",
483 | " • Mean RMSE: Mean error across the cross-validation folds (smaller = better). \n",
484 | " • Standard Deviation of RMSE: Variability in error across the folds (smaller = more consistent). \n",
485 | "
"
486 | ]
487 | },
488 | {
489 | "cell_type": "markdown",
490 | "metadata": {},
491 | "source": [
492 | "### Hyperparamater optimisation\n",
493 | "\n",
494 | "XGBoost is in the lead! So far, we have not adjusted the models themselves. It is possible to improve performance by tuning the hyperparameters. Manually tuning would be laborious. We can use `GridSearchCV` to automate the search. \n",
495 | "\n",
496 | "Note that this step will be even more computationally expensive as we are performing cross-validation as a function of model hyperparameters for two separate models. You can see how computational cost quickly escalates and this is where powerful GPUs can become essential for machine learning! \n",
497 | "\n",
498 | "
\n",
499 | " ⏱️ This will take 2-3 min to run. Think about how the model is learning from data.\n",
500 | "
"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": null,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": [
509 | "from sklearn.model_selection import GridSearchCV\n",
510 | "\n",
511 | "# Hyperparameter grid for XGBoost\n",
512 | "xgb_param_grid = {\n",
513 | " 'n_estimators': [100, 200],\n",
514 | " 'max_depth': [3, 6],\n",
515 | " 'learning_rate': [0.1, 0.2]\n",
516 | "}\n",
517 | "\n",
518 | "xgb_grid_search = GridSearchCV(XGBRegressor(random_state=42), xgb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n",
519 | "xgb_grid_search.fit(X_train, y_train)\n",
520 | "\n",
521 | "best_xgb_params = xgb_grid_search.best_params_\n",
522 | "best_xgb_model = xgb_grid_search.best_estimator_\n",
523 | "\n",
524 | "# Hyperparameter grid for Random Forest\n",
525 | "rf_param_grid = {\n",
526 | " 'n_estimators': [100, 200],\n",
527 | " 'max_depth': [3, 6],\n",
528 | " 'min_samples_split': [2, 4]\n",
529 | "}\n",
530 | "\n",
531 | "rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n",
532 | "rf_grid_search.fit(X_train, y_train)\n",
533 | "\n",
534 | "best_rf_params = rf_grid_search.best_params_\n",
535 | "best_rf_model = rf_grid_search.best_estimator_\n",
536 | "\n",
537 | "# Evaluate the best models\n",
538 | "xgb_cv_scores = -cross_val_score(best_xgb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
539 | "xgb_rmse = np.sqrt(xgb_cv_scores)\n",
540 | "\n",
541 | "rf_cv_scores = -cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
542 | "rf_rmse = np.sqrt(rf_cv_scores)\n",
543 | "\n",
544 | "# Compare the results of the best models\n",
545 | "print(\"Best XGBoost Hyperparameters:\", best_xgb_params)\n",
546 | "print(\"Best XGBoost Cross-Validation Results\")\n",
547 | "print(f\" Mean RMSE: {xgb_rmse.mean():.3f}\")\n",
548 | "print(f\" Standard Deviation of RMSE: {xgb_rmse.std():.3f}\")\n",
549 | "\n",
550 | "print(\"\\nBest Random Forest Hyperparameters:\", best_rf_params)\n",
551 | "print(\"Best Random Forest Cross-Validation Results\")\n",
552 | "print(f\" Mean RMSE: {rf_rmse.mean():.3f}\")\n",
553 | "print(f\" Standard Deviation of RMSE: {rf_rmse.std():.3f}\")"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {},
559 | "source": [
560 | "Was it worth the effort? There should be improvements in the RMSE for both models. Note the optimal hyperparameters found.\n",
561 | "\n",
562 | "### Model assessment\n",
563 | "\n",
564 | "Now that we have our best trained models, let's see how they perform on *unseen* test data. Comparing test performance to training performance will help us determine if the model generalises well or shows signs of overfitting or underfitting."
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "from sklearn.metrics import mean_squared_error, r2_score\n",
574 | "\n",
575 | "# Test the best XGBoost model\n",
576 | "xgb_test_preds = best_xgb_model.predict(X_test)\n",
577 | "xgb_test_rmse = np.sqrt(mean_squared_error(y_test, xgb_test_preds))\n",
578 | "xgb_test_r2 = r2_score(y_test, xgb_test_preds)\n",
579 | "\n",
580 | "# Test the best Random Forest model\n",
581 | "rf_test_preds = best_rf_model.predict(X_test)\n",
582 | "rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_test_preds))\n",
583 | "rf_test_r2 = r2_score(y_test, rf_test_preds)\n",
584 | "\n",
585 | "# Print test results\n",
586 | "print(\"XGBoost test results:\")\n",
587 | "print(f\"RMSE: {xgb_test_rmse:.3f}\")\n",
588 | "print(f\"R²: {xgb_test_r2:.3f}\")\n",
589 | "\n",
590 | "print(\"\\nRandom Forest test results:\")\n",
591 | "print(f\"RMSE: {rf_test_rmse:.3f}\")\n",
592 | "print(f\"R²: {rf_test_r2:.3f}\")\n",
593 | "\n",
594 | "# Create a scatter plot with both models in different colors\n",
595 | "plt.figure(figsize=(5, 4))\n",
596 | "plt.scatter(y_test, xgb_test_preds, c='blue', label=f'XGBoost (R²={xgb_test_r2:.2f})', alpha=0.5)\n",
597 | "plt.scatter(y_test, rf_test_preds, c='green', label=f'Random Forest (R²={rf_test_r2:.2f})', alpha=0.5)\n",
598 | "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'k--', lw=2) # Reference line (y=x)\n",
599 | "plt.xlabel(\"Actual values\")\n",
600 | "plt.ylabel(\"Predicted values\")\n",
601 | "plt.title(\"Test set performance\")\n",
602 | "plt.legend()\n",
603 | "plt.show()"
604 | ]
605 | },
606 | {
607 | "cell_type": "markdown",
608 | "metadata": {},
609 | "source": [
610 | "XGBoost outperforms Random Forest in both cross-validation and test performance for this task, with the slight increase in RMSE from train to test suggesting both models generalise reasonably well."
611 | ]
612 | },
613 | {
614 | "cell_type": "markdown",
615 | "metadata": {},
616 | "source": [
617 | "### Model speed\n",
618 | "\n",
619 | "The speed of a model may also be important, e.g. a use case involving millions of predictions. Several factors can influence the computational performance, including the dataset size, model complexity, and hardware. We can perform a simple comparison of our two models using `time`."
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": null,
625 | "metadata": {},
626 | "outputs": [],
627 | "source": [
628 | "import time\n",
629 | "\n",
630 | "# Measure the training time for XGBoost\n",
631 | "start_time = time.time()\n",
632 | "xgb_model.fit(X_train, y_train)\n",
633 | "xgb_training_time = time.time() - start_time\n",
634 | "\n",
635 | "# Measure the training time for Random Forest\n",
636 | "start_time = time.time()\n",
637 | "rf_model.fit(X_train, y_train)\n",
638 | "rf_training_time = time.time() - start_time\n",
639 | "\n",
640 | "# Measure the prediction time for XGBoost\n",
641 | "start_time = time.time()\n",
642 | "xgb_test_preds = xgb_model.predict(X_test)\n",
643 | "xgb_prediction_time = time.time() - start_time\n",
644 | "\n",
645 | "# Measure the prediction time for Random Forest\n",
646 | "start_time = time.time()\n",
647 | "rf_test_preds = rf_model.predict(X_test)\n",
648 | "rf_prediction_time = time.time() - start_time\n",
649 | "\n",
650 | "print(f\"XGBoost training time: {xgb_training_time:.4f} seconds\")\n",
651 | "print(f\"Random Forest training time: {rf_training_time:.4f} seconds\")\n",
652 | "print(f\"\\nXGBoost prediction time: {xgb_prediction_time:.4f} seconds\")\n",
653 | "print(f\"Random Forest prediction time: {rf_prediction_time:.4f} seconds\")"
654 | ]
655 | },
656 | {
657 | "cell_type": "markdown",
658 | "metadata": {},
659 | "source": [
660 | "It is clear that the XGBoost library has been well optimised to run quickly."
661 | ]
662 | },
663 | {
664 | "cell_type": "markdown",
665 | "metadata": {
666 | "tags": []
667 | },
668 | "source": [
669 | "## 🚨 Exercise 7\n",
670 | "\n",
671 | "
\n",
672 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
673 | "
\n",
674 | "\n",
675 | "### Your details"
676 | ]
677 | },
678 | {
679 | "cell_type": "code",
680 | "execution_count": null,
681 | "metadata": {},
682 | "outputs": [],
683 | "source": [
684 | "import numpy as np\n",
685 | "\n",
686 | "# Insert your values\n",
687 | "Name = \"No Name\" # Replace with your name\n",
688 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
689 | "\n",
690 | "# Set a random seed using the CID value\n",
691 | "CID = int(CID)\n",
692 | "np.random.seed(CID)\n",
693 | "\n",
694 | "# Print the message\n",
695 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
696 | ]
697 | },
698 | {
699 | "cell_type": "markdown",
700 | "metadata": {
701 | "tags": []
702 | },
703 | "source": [
704 | "### Problem\n",
705 | "\n",
706 | "Selecting the most appropriate ML model for a given purpose is important for achieving predictive performance. Your job will be to assess additional models (e.g. [Nearest Neighbours](https://scikit-learn.org/stable/modules/neighbors.html#neighbors) and [Support Vector Machines](https://scikit-learn.org/stable/modules/svm.html#svm)) for the hardness regression task. The tasks will be given in class."
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": null,
712 | "metadata": {},
713 | "outputs": [],
714 | "source": [
715 | "#Empty block for your answers\n",
716 | "\n",
717 | "\n"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {},
724 | "outputs": [],
725 | "source": [
726 | "#Empty block for your answers\n",
727 | "\n",
728 | "\n"
729 | ]
730 | },
731 | {
732 | "cell_type": "markdown",
733 | "metadata": {},
734 | "source": [
735 | "\n",
736 | " Task hint \n",
737 | "You can perform cross-validation following the same procedure as the random forest model in the main notebook.\n",
738 | "\n",
739 | "\n",
740 | "
\n",
741 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
742 | "
\n",
15 | " 💡 Geoffrey Hinton: It’s quite conceivable that humanity is just a passing phase in the evolution of intelligence.\n",
16 | "
"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "\n",
24 | "\n",
25 | "[Lecture slides](https://speakerdeck.com/aronwalsh/mlformaterials-lecture8-ai)"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## 🤖 x 🧪 Closed-loop optimisation \n",
33 | "\n",
34 | "The combination of automation and optimisation is powerful. Closed-loop workflows are of growing importance in materials research for many reasons, including:\n",
35 | "\n",
36 | "1. **Efficiency:** Efficient allocation of resources, both in terms of time and materials. By continuously updating experimental parameters based on real-time feedback, we can reduce the number of trials needed to reach optimal outcomes. \n",
37 | "\n",
38 | "2. **Adapt to changing conditions:** Adaptive decision-making, ensuring that experiments remain effective even when external factors fluctuate. This adaptability is highly valuable for complex systems where traditional trial-and-error approaches are prone to fail.\n",
39 | "\n",
40 | "3. **Exploration of large parameter spaces:** Many materials science problems involve high-dimensional parameter spaces where exhaustive exploration is impractical. Techniques such as Bayesian optimisation can efficiently sample and search these spaces to identify optimal configurations and make discoveries.\n",
41 | "\n",
42 | "4. **Data-driven insights:** Generation of valuable data from ongoing experiments. This data can be analysed to gain a deeper understanding of the underlying processes and relationships, facilitating scientific discoveries and supporting future efforts.\n",
43 | "\n",
44 | "Today we will make use of the [scikit-optimise](https://scikit-optimize.github.io) package."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Installation of libraries\n",
54 | "!pip install scikit-optimize --quiet"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Import of modules\n",
64 | "import numpy as np \n",
65 | "import matplotlib.pyplot as plt \n",
66 | "from scipy.stats import norm # Statistical functions\n",
67 | "from skopt import gp_minimize, dummy_minimize # Bayesian optimisation\n",
68 | "from skopt.utils import create_result # Utility functions for skopt\n",
69 | "from sklearn.metrics import r2_score # R-squared metric"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {
75 | "tags": []
76 | },
77 | "source": [
78 | "## Bayesian optimisation (BO)\n",
79 | "\n",
80 | "BO is a powerful technique for optimising complex and expensive-to-evaluate functions. It combines probabilistic modeling and decision theory to search for the optimal set of parameters. In materials research, parameters like chemical composition, sample thickness, and processing conditions can be optimised.\n",
81 | "\n",
82 | "BO aims to find the global minimum (or maximum) of an objective function, $O(x)$, where $x$ represents a set of parameters or design variables. Instead of exhaustive searches, BP builds a surrogate model, typically a Gaussian Process (GP), that approximates the true objective function. This surrogate model captures both the mean $\\mu(x)$ and uncertainty $\\sigma(x)$ associated with $O(x)$. The GP is defined as:\n",
83 | "\n",
84 | "$$\n",
85 | "O(x) \\sim \\text{GP}(\\mu(x), k(x, x'))\n",
86 | "$$\n",
87 | "\n",
88 | "where $k(x, x')$ is a kernel function that quantifies the similarity between two input points $x$ and $x'$.\n",
89 | "\n",
90 | "The surrogate model balances exploration and exploitation using an acquisition function $\\alpha(x)$, which trades off between exploring uncertain regions and exploiting promising areas:\n",
91 | "\n",
92 | "$$\n",
93 | "x_{\\text{next}} = \\arg \\max_x \\alpha(x)\n",
94 | "$$\n",
95 | "\n",
96 | "Common acquisition functions include Probability of Improvement (PI), Expected Improvement (EI), and Upper Confidence Bound (UCB). Each of these functions aims to maximise the expected gain in performance over the current best solution.\n",
97 | "\n",
98 | "\n",
99 | "Curious about the kernel function?\n",
100 | "\n",
101 | "The kernel determines the covariance structure of the GP. A commonly used kernel, and the default in `sklearn`, is the Radial Basis Function (RBF):\n",
102 | "\n",
103 | "$$\n",
104 | "k(x, x') = \\sigma^2 \\exp\\left(-\\frac{\\|x - x'\\|^2}{2l^2}\\right)\n",
105 | "$$\n",
106 | "\n",
107 | "where:\n",
108 | "- $\\sigma^2$ is the **signal variance**, which controls the overall magnitude of function variations,\n",
109 | "- $l$ is the **length scale**, which determines how quickly the function values change with respect to input differences.\n",
110 | "\n",
111 | "There are also many other choices, such as the [Matérn kernel](https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html), which differ in how they model smoothness and continuity.\n",
112 | ""
113 | ]
114 | },
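{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "To make these ingredients more tangible, here is a minimal stand-alone sketch of the RBF kernel and the Expected Improvement acquisition function written with NumPy (the hyperparameter values are arbitrary, and this is an illustration rather than how scikit-optimize implements them internally):\n",
  "\n",
  "```python\n",
  "import numpy as np\n",
  "from scipy.stats import norm\n",
  "\n",
  "def rbf_kernel(x1, x2, sigma=1.0, length_scale=1.0):\n",
  "    # RBF (squared-exponential) kernel between two points\n",
  "    return sigma**2 * np.exp(-np.sum((x1 - x2)**2) / (2 * length_scale**2))\n",
  "\n",
  "def expected_improvement(mu, sigma, best_f):\n",
  "    # Expected Improvement for minimisation, given the GP mean and std at a point\n",
  "    sigma = max(sigma, 1e-9)                 # avoid division by zero\n",
  "    improvement = best_f - mu                # gain over the current best (lower is better)\n",
  "    z = improvement / sigma\n",
  "    return improvement * norm.cdf(z) + sigma * norm.pdf(z)\n",
  "\n",
  "# Nearby points are strongly correlated; distant points are almost independent\n",
  "print(rbf_kernel(np.array([0.0]), np.array([0.1])))   # ~0.995\n",
  "print(rbf_kernel(np.array([0.0]), np.array([3.0])))   # ~0.011\n",
  "\n",
  "# With the same predicted mean, a more uncertain point has a higher EI\n",
  "print(expected_improvement(mu=0.5, sigma=0.1, best_f=0.4))\n",
  "print(expected_improvement(mu=0.5, sigma=1.0, best_f=0.4))\n",
  "```\n",
  "\n",
  "The second pair of prints shows the exploration effect: the acquisition function rewards points where the surrogate model is uncertain, even if their predicted mean is no better than the current best."
 ]
},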
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "## Building a BO model\n",
120 | "\n",
121 | "### Step 1. Target function\n",
122 | "\n",
123 | "We can start by generating a simple sine-like target function with added noise to keep things interesting. This acts as our \"virtual experiment\", i.e. we can call the function to obtain an output."
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "# Fixing the random seed for reproducibility\n",
133 | "np.random.seed(42)\n",
134 | "\n",
135 | "# Define the target function\n",
136 | "def target_function(x):\n",
137 | " x = np.atleast_1d(x) # Ensure x is an array\n",
138 | " return np.sin(x[0]) + 0.1 * x[0] + 0.5 * np.random.randn()\n",
139 | "\n",
140 | "# Generate data for visualisation\n",
141 | "x_values = np.linspace(-5, 5, 200).reshape(-1, 1)\n",
142 | "y_values = np.vectorize(target_function)(x_values)\n",
143 | "\n",
144 | "# Plot the target function\n",
145 | "plt.figure(figsize=(5, 4))\n",
146 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n",
147 | "plt.xlabel('Input')\n",
148 | "plt.ylabel('Output')\n",
149 | "plt.legend()\n",
150 | "plt.show()"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "Let's randomly sample the target function and fit a simple polynomial function to get a feeling for how the model works."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "# Generate sample points from the target function\n",
167 | "num_initial_points = \n",
168 | "initial_points = np.random.uniform(-5, 5, num_initial_points)\n",
169 | "initial_values = np.vectorize(target_function)(initial_points)\n",
170 | "\n",
171 | "# Plot the sample points\n",
172 | "plt.figure(figsize=(5, 4))\n",
173 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n",
174 | "plt.scatter(initial_points, initial_values, color='blue', marker='o', label='Initial Samples')\n",
175 | "plt.xlabel('Input')\n",
176 | "plt.ylabel('Output')\n",
177 | "plt.legend()\n",
178 | "plt.show()"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "\n",
186 | " Code hint \n",
187 | "Try `num_initial_points = 10`\n",
188 | ""
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "# Perform a polynomial fit\n",
198 | "degree = # Adjust the degree of the polynomial fit\n",
199 | "coefficients = np.polyfit(initial_points, initial_values, degree)\n",
200 | "poly_fit = np.poly1d(coefficients)\n",
201 | "\n",
202 | "# Calculate R^2\n",
203 | "y_pred = poly_fit(initial_points)\n",
204 | "r_squared = r2_score(initial_values, y_pred)\n",
205 | "\n",
206 | "# Plot the sample points and polynomial fit\n",
207 | "plt.figure(figsize=(5, 4))\n",
208 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target Function')\n",
209 | "plt.scatter(initial_points, initial_values, color='blue', marker='o', label='Initial Samples')\n",
210 | "plt.plot(x_values, poly_fit(x_values), 'g--', label=f'Polynomial Fit (degree {degree})\\n$R^2 = {r_squared:.4f}$')\n",
211 | "plt.xlabel('Input')\n",
212 | "plt.ylabel('Output')\n",
213 | "plt.legend()\n",
214 | "plt.show()"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "
\n",
222 | " 🐢 Take a beat: Adjust the degree of the polynomial to see how good the fit is. Start with `degree = 2` and gradually increase it.\n",
223 | "
"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "### Step 3: Gaussian Process\n",
231 | "\n",
232 | "Now we can move to Bayesian Optimisation with a Gaussian Process model. The optimisation progress is visualised by plotting the target function, optimisation steps, and a colourbar indicating the step number.\n",
233 | "\n",
234 | "
\n",
235 | " ⏱️ This may take a minute to run. Reverend Bayes makes computers work hard!\n",
236 | "
"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "# Optimise the target function using Bayesian Optimisation\n",
246 | "result = gp_minimize(target_function, [(-5.0, 5.0)], n_calls=50, random_state=42)\n",
247 | "\n",
248 | "# Perform random sampling for comparison\n",
249 | "random_result = dummy_minimize(target_function, [(-5.0, 5.0)], n_calls=50, random_state=42)\n",
250 | "\n",
251 | "# Plot the Gaussian Process model after optimisation\n",
252 | "x_gp = np.array(result.x_iters).reshape(-1, 1)\n",
253 | "y_gp = result.func_vals\n",
254 | "\n",
255 | "# Plot the target function\n",
256 | "plt.figure(figsize=(5, 4))\n",
257 | "plt.plot(x_values, y_values, 'r-', alpha=0.5, label='Target function')\n",
258 | "\n",
259 | "# Plot the optimisation steps with a colormap\n",
260 | "plt.scatter(x_gp, y_gp, c=range(len(x_gp)), cmap='viridis', marker='o', label='Step number')\n",
261 | "\n",
262 | "# Add colorbar to indicate the progress\n",
263 | "cbar = plt.colorbar()\n",
264 | "cbar.set_label('Step number')\n",
265 | "\n",
266 | "plt.title('BO: Gaussian Process Model')\n",
267 | "plt.xlabel('Input')\n",
268 | "plt.ylabel('Output')\n",
269 | "plt.legend()\n",
270 | "plt.show()"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "We can use `plot_gaussian_process` from scikit-optimize to visualise the confidence intervals. `n_samples` determines the number of samples to draw from the Gaussian Process for the estimation."
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "from skopt.plots import plot_gaussian_process as plot_gp\n",
287 | "\n",
288 | "# Plot the Gaussian Process model with confidence intervals\n",
289 | "plt.figure(figsize=(5, 4))\n",
290 | "plot_gp(result)\n",
291 | "\n",
292 | "# Add the target function for reference\n",
293 | "plt.plot(x_values, y_values, 'r-', alpha=0.25, label='Target function')\n",
294 | "\n",
295 | "plt.title('Confidence Intervals')\n",
296 | "plt.xlabel('Input')\n",
297 | "plt.ylabel('Output')\n",
298 | "plt.legend()\n",
299 | "plt.show()"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "The plot shows the GP mean (dashed green), confidence intervals (shaded), and sampled observations (red). The target function (light red) is also overlaid. The confidence region narrows where more observations exist and widens in unexplored areas, reflecting uncertainty in the GP model.\n",
307 | "\n",
308 | "We should always have a benchmark to compare our model to. This block extracts the best results from BO and random sampling, then compares and visualises their performance over optimisation steps."
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "# Extract the cumulative minimum values\n",
318 | "bo_min_values = np.minimum.accumulate(result.func_vals)\n",
319 | "random_min_values = np.minimum.accumulate(random_result.func_vals)\n",
320 | "\n",
321 | "# Plot the cumulative minimum values vs steps for both methods\n",
322 | "plt.figure(figsize=(5, 4))\n",
323 | "plt.plot(range(1, len(bo_min_values) + 1), bo_min_values, 'o-', label='Bayesian Optimisation')\n",
324 | "plt.plot(range(1, len(random_min_values) + 1), random_min_values, 'x-', label='Random Sampling')\n",
325 | "\n",
326 | "plt.title('Does BO Beat Random Sampling?')\n",
327 | "plt.xlabel('Step')\n",
328 | "plt.ylabel('Cumulative Minimum Value')\n",
329 | "plt.legend()\n",
330 | "plt.show()"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "BO (blue) converges faster to a lower minimum value. Random sampling (orange) fluctuates and struggles to improve beyond a certain point. This highlights BO’s advantage in structured search over purely random exploration."
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {
343 | "tags": []
344 | },
345 | "source": [
346 | "## 🚨 Exercise 8\n",
347 | "\n",
348 | "
\n",
349 | " 💡 Coding exercises: The exercises are designed to apply what you have learned with room for creativity. It is fine to discuss solutions with your classmates, but the actual code should not be directly copied.\n",
350 | "
\n",
351 | "\n",
352 | "### Your details"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "import numpy as np\n",
362 | "\n",
363 | "# Insert your values\n",
364 | "Name = \"No Name\" # Replace with your name\n",
365 | "CID = 123446 # Replace with your College ID (as a numeric value with no leading 0s)\n",
366 | "\n",
367 | "# Set a random seed using the CID value\n",
368 | "CID = int(CID)\n",
369 | "np.random.seed(CID)\n",
370 | "\n",
371 | "# Print the message\n",
372 | "print(\"This is the work of \" + Name + \" [CID: \" + str(CID) + \"]\")"
373 | ]
374 | },
375 | {
376 | "cell_type": "markdown",
377 | "metadata": {},
378 | "source": [
379 | "### Problem\n",
380 | "\n",
381 | "The Department of Materials has purchased a new automated thin-film deposition system. The machine has two dials that provide a 2D parameter space (x, y) for materials processing. We can define a (hypothetical) target loss function for optimising the transition temperature of our candidate thin-film superconductors as:\n",
382 | "\n",
383 | "```python\n",
384 | "# Target function for materials processing with x and y \"dials\"\n",
385 | "def supermat(inputs):\n",
386 | " x, y = inputs\n",
387 | " a = 2, b = 5.1 / (2 * np.pi**2)\n",
388 | " c = 3 / np.pi\n",
389 | " r = 4, s = 10, t = 1 / (8 * np.pi)\n",
390 | "\n",
391 | " term1 = a * (y - b * x**2 + c * x - r)**2\n",
392 | " term2 = s * (1 - t) * np.cos(x)\n",
393 | " term3 = s\n",
394 | "\n",
395 | " return term1 + term2 + term3\n",
396 | "\n",
397 | "# Example usage:\n",
398 | "dials = [2.0, 3.0]\n",
399 | "result = supermat(dials)\n",
400 | "print(f\"Experiment by setting dials to ({dials[0]}, {dials[1]}): {result}\")\n",
401 | "```\n",
402 | "\n",
403 | "The tasks will be provided in class."
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": [
412 | "#Empty block for your answers\n",
413 | "\n",
414 | "\n"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "#Empty block for your answers\n",
424 | "\n",
425 | "\n"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "\n",
433 | " Task hint \n",
434 | "Remember to first define the target function and then call it using gp_minimize()\n",
435 | "\n",
436 | "\n",
437 | "
\n",
438 | " 📓 Submission: When your notebook is complete in Google Colab, go to File > Download and choose .ipynb. The completed file should be uploaded to Blackboard under assignments for MATE70026.\n",
439 | "