├── .gitignore ├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ └── main.yml └── ISSUE_TEMPLATE.md ├── .devcontainer └── devcontainer.json ├── CONTRIBUTING.md ├── NOTICE ├── .vscode └── settings.json ├── README.md ├── requirements.txt ├── LICENSE ├── algos.ipynb └── soln.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | .tmp 4 | npm-debug.log 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Codeowners for these exercise files: 2 | # * (asterisk) denotes "all files and folders" 3 | # Example: * @producer @instructor 4 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Copy To Branches 2 | on: 3 | workflow_dispatch: 4 | jobs: 5 | copy-to-branches: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v2 9 | with: 10 | fetch-depth: 0 11 | - name: Copy To Branches Action 12 | uses: planetoftheweb/copy-to-branches@v1.2 13 | env: 14 | key: main 15 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "extensions": [ 3 | "GitHub.github-vscode-theme", 4 | "ms-toolsai.jupyter", 5 | "ms-python.python" 6 | // Additional Extensions Here 7 | ], 8 | "onCreateCommand" : "[ -f requirements.txt ] && pip install -r requirements.txt; echo PS1='\"$ \"' >> ~/.bashrc", //Set Terminal Prompt to $ 9 | } 10 | 11 | // DevContainer Reference: https://code.visualstudio.com/docs/remote/devcontainerjson-reference 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | Contribution Agreement 3 | ====================== 4 | 5 | This repository does not accept pull requests (PRs). All pull requests will be closed. 6 | 7 | However, if any contributions (through pull requests, issues, feedback or otherwise) are provided, as a contributor, you represent that the code you submit is your original work or that of your employer (in which case you represent you have the right to bind your employer). By submitting code (or otherwise providing feedback), you (and, if applicable, your employer) are licensing the submitted code (and/or feedback) to LinkedIn and the open source community subject to the BSD 2-Clause license. 8 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2024 LinkedIn Corporation 2 | All Rights Reserved. 3 | 4 | Licensed under the LinkedIn Learning Exercise File License (the "License"). 5 | See LICENSE in the project root for license information. 6 | 7 | Please note, this project may automatically load third party code from external 8 | repositories (for example, NPM modules, Composer packages, or other dependencies). 
9 | If so, such third party code may be subject to other license terms than as set 10 | forth above. In addition, such third party code may also depend on and load 11 | multiple tiers of dependencies. Please review the applicable licenses of the 12 | additional dependencies. 13 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.bracketPairColorization.enabled": true, 3 | "editor.cursorBlinking": "solid", 4 | "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace", 5 | "editor.fontLigatures": false, 6 | "editor.fontSize": 22, 7 | "editor.formatOnPaste": true, 8 | "editor.formatOnSave": true, 9 | "editor.lineNumbers": "on", 10 | "editor.matchBrackets": "always", 11 | "editor.minimap.enabled": false, 12 | "editor.smoothScrolling": true, 13 | "editor.tabSize": 2, 14 | "editor.useTabStops": true, 15 | "emmet.triggerExpansionOnTab": true, 16 | "explorer.openEditors.visible": 0, 17 | "files.autoSave": "afterDelay", 18 | "screencastMode.onlyKeyboardShortcuts": true, 19 | "terminal.integrated.fontSize": 18, 20 | "workbench.colorTheme": "Visual Studio Dark", 21 | "workbench.fontAliasing": "antialiased", 22 | "workbench.statusBar.visible": true 23 | } 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | ## Issue Overview 9 | 10 | 11 | ## Describe your environment 12 | 13 | 14 | ## Steps to Reproduce 15 | 16 | 1. 17 | 2. 18 | 3. 19 | 4. 20 | 21 | ## Expected Behavior 22 | 23 | 24 | ## Current Behavior 25 | 26 | 27 | ## Possible Solution 28 | 29 | 30 | ## Screenshots / Video 31 | 32 | 33 | ## Related Issues 34 | 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Applied Machine Learning: Algorithms 2 | This is the repository for the LinkedIn Learning course `Applied Machine Learning: Algorithms`. The full course is available from [LinkedIn Learning][lil-course-url]. 3 | 4 | ![lil-thumbnail-url] 5 | 6 |
With the growing importance of machine learning in almost every sector, professionals need a deeper understanding and practical approach to implementing ML algorithms effectively. 7 |
8 | This course covers commonly used machine learning algorithms. Instructor Matt Harrison focuses on non-deep learning algorithms, covering PCA, clustering, linear and logistic regression, decision trees, random forests, and gradient boosting. 9 |
10 | Join Matt in this course to understand common ML algorithms, learn their pros and cons, and develop hands-on skills to leverage them by following along with challenges and solutions in GitHub Codespaces. 11 | 12 | ### Instructor 13 | 14 | Matt Harrison 15 | 16 | Python and Data Science Corporate Trainer, Author, Speaker, Consultant 17 | 18 | Check out my other courses on [LinkedIn Learning](https://www.linkedin.com/learning/instructors/matt-harrison?u=104). 19 | 20 | 21 | 22 | [0]: # (Replace these placeholder URLs with actual course URLs) 23 | 24 | [lil-course-url]: https://www.linkedin.com/learning/applied-machine-learning-algorithms-23750732 25 | [lil-thumbnail-url]: https://media.licdn.com/dms/image/D560DAQG8-Uu_NXvIPQ/learning-public-crop_675_1200/0/1712273059536?e=2147483647&v=beta&t=MsAmeJayaZcGlo45adY21zt_BlzTqqZdqmC9t1b7XPc 26 | 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.1 2 | aiosignal==1.3.1 3 | anyio==4.2.0 4 | argon2-cffi==23.1.0 5 | argon2-cffi-bindings==21.2.0 6 | arrow==1.3.0 7 | asttokens==2.4.1 8 | async-lru==2.0.4 9 | async-timeout==4.0.3 10 | attrs==23.2.0 11 | Babel==2.14.0 12 | beautifulsoup4==4.12.2 13 | bleach==6.1.0 14 | certifi==2023.11.17 15 | cffi==1.16.0 16 | charset-normalizer==3.3.2 17 | colorama==0.4.6 18 | comm==0.2.1 19 | contourpy==1.2.0 20 | cycler==0.12.1 21 | datasets==2.16.1 22 | debugpy==1.8.0 23 | decorator==5.1.1 24 | defusedxml==0.7.1 25 | dill==0.3.7 26 | exceptiongroup==1.2.0 27 | executing==2.0.1 28 | fastjsonschema==2.19.1 29 | filelock==3.13.1 30 | fonttools==4.47.0 31 | fqdn==1.5.1 32 | frozenlist==1.4.1 33 | fsspec==2023.10.0 34 | gitdb==4.0.11 35 | GitPython==3.1.41 36 | huggingface-hub==0.20.3 37 | idna==3.6 38 | ipykernel==6.28.0 39 | ipython==8.20.0 40 | isoduration==20.11.0 41 | jedi==0.19.1 42 | Jinja2==3.1.2 43 | joblib==1.3.2 44 | json5==0.9.14 45 | jsonpointer==2.4 46 | jsonschema==4.20.0 47 | jsonschema-specifications==2023.12.1 48 | jupyter-events==0.9.0 49 | jupyter-lsp==2.2.1 50 | jupyter-server-mathjax==0.2.6 51 | jupyter_client==8.6.0 52 | jupyter_core==5.7.1 53 | jupyter_server==2.12.3 54 | jupyter_server_terminals==0.5.1 55 | jupyterlab==4.0.10 56 | jupyterlab_git==0.50.0 57 | jupyterlab_pygments==0.3.0 58 | jupyterlab_server==2.25.2 59 | kiwisolver==1.4.5 60 | llvmlite==0.41.1 61 | MarkupSafe==2.1.3 62 | matplotlib==3.8.2 63 | matplotlib-inline==0.1.6 64 | mistune==3.0.2 65 | mpmath==1.3.0 66 | multidict==6.0.4 67 | multiprocess==0.70.15 68 | nbclient==0.9.0 69 | nbconvert==7.14.0 70 | nbdime==4.0.1 71 | nbformat==5.9.2 72 | nest-asyncio==1.5.8 73 | networkx==3.2.1 74 | notebook_shim==0.2.3 75 | numba==0.58.1 76 | numpy==1.26.3 77 | nvidia-cublas-cu12==12.1.3.1 78 | nvidia-cuda-cupti-cu12==12.1.105 79 | nvidia-cuda-nvrtc-cu12==12.1.105 80 | nvidia-cuda-runtime-cu12==12.1.105 81 | nvidia-cudnn-cu12==8.9.2.26 82 | nvidia-cufft-cu12==11.0.2.54 83 | nvidia-curand-cu12==10.3.2.106 84 | nvidia-cusolver-cu12==11.4.5.107 85 | nvidia-cusparse-cu12==12.1.0.106 86 | nvidia-nccl-cu12==2.18.1 87 | nvidia-nvjitlink-cu12==12.3.101 88 | nvidia-nvtx-cu12==12.1.105 89 | overrides==7.4.0 90 | packaging==23.2 91 | pandas==2.2.0 92 | pandocfilters==1.5.0 93 | parso==0.8.3 94 | pexpect==4.9.0 95 | pillow==10.2.0 96 | platformdirs==4.1.0 97 | plotly==5.18.0 98 | prometheus-client==0.19.0 99 | prompt-toolkit==3.0.43 100 | psutil==5.9.7 101 | ptyprocess==0.7.0 102 | pure-eval==0.2.2 
103 | pyarrow==15.0.0 104 | pyarrow-hotfix==0.6 105 | pycparser==2.21 106 | Pygments==2.17.2 107 | pynndescent==0.5.11 108 | pyparsing==3.1.1 109 | python-dateutil==2.8.2 110 | python-json-logger==2.0.7 111 | pytz==2023.3.post1 112 | PyYAML==6.0.1 113 | pyzmq==25.1.2 114 | referencing==0.32.1 115 | requests==2.31.0 116 | rfc3339-validator==0.1.4 117 | rfc3986-validator==0.1.1 118 | rpds-py==0.16.2 119 | scikit-learn==1.3.2 120 | scipy==1.11.4 121 | seaborn==0.13.1 122 | Send2Trash==1.8.2 123 | six==1.16.0 124 | smmap==5.0.1 125 | sniffio==1.3.0 126 | soupsieve==2.5 127 | stack-data==0.6.3 128 | sympy==1.12 129 | tenacity==8.2.3 130 | terminado==0.18.0 131 | threadpoolctl==3.2.0 132 | tinycss2==1.2.1 133 | tomli==2.0.1 134 | torch==2.1.2 135 | tornado==6.4 136 | tqdm==4.66.1 137 | traitlets==5.14.1 138 | triton==2.1.0 139 | types-python-dateutil==2.8.19.20240106 140 | typing_extensions==4.9.0 141 | tzdata==2023.4 142 | umap-learn==0.5.5 143 | uri-template==1.3.0 144 | urllib3==2.0.7 145 | wcwidth==0.2.13 146 | webcolors==1.13 147 | webencodings==0.5.1 148 | websocket-client==1.7.0 149 | xgboost==2.0.3 150 | xxhash==3.4.1 151 | yarl==1.9.4 152 | yellowbrick==1.5 153 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LinkedIn Learning Exercise Files License Agreement 2 | ================================================== 3 | 4 | This License Agreement (the "Agreement") is a binding legal agreement 5 | between you (as an individual or entity, as applicable) and LinkedIn 6 | Corporation (“LinkedIn”). By downloading or using the LinkedIn Learning 7 | exercise files in this repository (“Licensed Materials”), you agree to 8 | be bound by the terms of this Agreement. If you do not agree to these 9 | terms, do not download or use the Licensed Materials. 10 | 11 | 1. License. 12 | - a. Subject to the terms of this Agreement, LinkedIn hereby grants LinkedIn 13 | members during their LinkedIn Learning subscription a non-exclusive, 14 | non-transferable copyright license, for internal use only, to 1) make a 15 | reasonable number of copies of the Licensed Materials, and 2) make 16 | derivative works of the Licensed Materials for the sole purpose of 17 | practicing skills taught in LinkedIn Learning courses. 18 | - b. Distribution. Unless otherwise noted in the Licensed Materials, subject 19 | to the terms of this Agreement, LinkedIn hereby grants LinkedIn members 20 | with a LinkedIn Learning subscription a non-exclusive, non-transferable 21 | copyright license to distribute the Licensed Materials, except the 22 | Licensed Materials may not be included in any product or service (or 23 | otherwise used) to instruct or educate others. 24 | 25 | 2. Restrictions and Intellectual Property. 26 | - a. You may not to use, modify, copy, make derivative works of, publish, 27 | distribute, rent, lease, sell, sublicense, assign or otherwise transfer the 28 | Licensed Materials, except as expressly set forth above in Section 1. 29 | - b. Linkedin (and its licensors) retains its intellectual property rights 30 | in the Licensed Materials. Except as expressly set forth in Section 1, 31 | LinkedIn grants no licenses. 32 | - c. 
You indemnify LinkedIn and its licensors and affiliates for i) any 33 | alleged infringement or misappropriation of any intellectual property rights 34 | of any third party based on modifications you make to the Licensed Materials, 35 | ii) any claims arising from your use or distribution of all or part of the 36 | Licensed Materials and iii) a breach of this Agreement. You will defend, hold 37 | harmless, and indemnify LinkedIn and its affiliates (and our and their 38 | respective employees, shareholders, and directors) from any claim or action 39 | brought by a third party, including all damages, liabilities, costs and 40 | expenses, including reasonable attorneys’ fees, to the extent resulting from, 41 | alleged to have resulted from, or in connection with: (a) your breach of your 42 | obligations herein; or (b) your use or distribution of any Licensed Materials. 43 | 44 | 3. Open source. This code may include open source software, which may be 45 | subject to other license terms as provided in the files. 46 | 47 | 4. Warranty Disclaimer. LINKEDIN PROVIDES THE LICENSED MATERIALS ON AN “AS IS” 48 | AND “AS AVAILABLE” BASIS. LINKEDIN MAKES NO REPRESENTATION OR WARRANTY, 49 | WHETHER EXPRESS OR IMPLIED, ABOUT THE LICENSED MATERIALS, INCLUDING ANY 50 | REPRESENTATION THAT THE LICENSED MATERIALS WILL BE FREE OF ERRORS, BUGS OR 51 | INTERRUPTIONS, OR THAT THE LICENSED MATERIALS ARE ACCURATE, COMPLETE OR 52 | OTHERWISE VALID. TO THE FULLEST EXTENT PERMITTED BY LAW, LINKEDIN AND ITS 53 | AFFILIATES DISCLAIM ANY IMPLIED OR STATUTORY WARRANTY OR CONDITION, INCLUDING 54 | ANY IMPLIED WARRANTY OR CONDITION OF MERCHANTABILITY OR FITNESS FOR A 55 | PARTICULAR PURPOSE, AVAILABILITY, SECURITY, TITLE AND/OR NON-INFRINGEMENT. 56 | YOUR USE OF THE LICENSED MATERIALS IS AT YOUR OWN DISCRETION AND RISK, AND 57 | YOU WILL BE SOLELY RESPONSIBLE FOR ANY DAMAGE THAT RESULTS FROM USE OF THE 58 | LICENSED MATERIALS TO YOUR COMPUTER SYSTEM OR LOSS OF DATA. NO ADVICE OR 59 | INFORMATION, WHETHER ORAL OR WRITTEN, OBTAINED BY YOU FROM US OR THROUGH OR 60 | FROM THE LICENSED MATERIALS WILL CREATE ANY WARRANTY OR CONDITION NOT 61 | EXPRESSLY STATED IN THESE TERMS. 62 | 63 | 5. Limitation of Liability. LINKEDIN SHALL NOT BE LIABLE FOR ANY INDIRECT, 64 | INCIDENTAL, SPECIAL, PUNITIVE, CONSEQUENTIAL OR EXEMPLARY DAMAGES, INCLUDING 65 | BUT NOT LIMITED TO, DAMAGES FOR LOSS OF PROFITS, GOODWILL, USE, DATA OR OTHER 66 | INTANGIBLE LOSSES . IN NO EVENT WILL LINKEDIN'S AGGREGATE LIABILITY TO YOU 67 | EXCEED $100. THIS LIMITATION OF LIABILITY SHALL: 68 | - i. APPLY REGARDLESS OF WHETHER (A) YOU BASE YOUR CLAIM ON CONTRACT, TORT, 69 | STATUTE, OR ANY OTHER LEGAL THEORY, (B) WE KNEW OR SHOULD HAVE KNOWN ABOUT 70 | THE POSSIBILITY OF SUCH DAMAGES, OR (C) THE LIMITED REMEDIES PROVIDED IN THIS 71 | SECTION FAIL OF THEIR ESSENTIAL PURPOSE; AND 72 | - ii. NOT APPLY TO ANY DAMAGE THAT LINKEDIN MAY CAUSE YOU INTENTIONALLY OR 73 | KNOWINGLY IN VIOLATION OF THESE TERMS OR APPLICABLE LAW, OR AS OTHERWISE 74 | MANDATED BY APPLICABLE LAW THAT CANNOT BE DISCLAIMED IN THESE TERMS. 75 | 76 | 6. Termination. This Agreement automatically terminates upon your breach of 77 | this Agreement or termination of your LinkedIn Learning subscription. On 78 | termination, all licenses granted under this Agreement will terminate 79 | immediately and you will delete the Licensed Materials. Sections 2-7 of this 80 | Agreement survive any termination of this Agreement. 
LinkedIn may discontinue 81 | the availability of some or all of the Licensed Materials at any time for any 82 | reason. 83 | 84 | 7. Miscellaneous. This Agreement will be governed by and construed in 85 | accordance with the laws of the State of California without regard to conflict 86 | of laws principles. The exclusive forum for any disputes arising out of or 87 | relating to this Agreement shall be an appropriate federal or state court 88 | sitting in the County of Santa Clara, State of California. If LinkedIn does 89 | not act to enforce a breach of this Agreement, that does not mean that 90 | LinkedIn has waived its right to enforce this Agreement. The Agreement does 91 | not create a partnership, agency relationship, or joint venture between the 92 | parties. Neither party has the power or authority to bind the other or to 93 | create any obligation or responsibility on behalf of the other. You may not, 94 | without LinkedIn’s prior written consent, assign or delegate any rights or 95 | obligations under these terms, including in connection with a change of 96 | control. Any purported assignment and delegation shall be ineffective. The 97 | Agreement shall bind and inure to the benefit of the parties, their respective 98 | successors and permitted assigns. If any provision of the Agreement is 99 | unenforceable, that provision will be modified to render it enforceable to the 100 | extent possible to give effect to the parties’ intentions and the remaining 101 | provisions will not be affected. This Agreement is the only agreement between 102 | you and LinkedIn regarding the Licensed Materials, and supersedes all prior 103 | agreements relating to the Licensed Materials. 104 | 105 | Last Updated: March 2019 106 | -------------------------------------------------------------------------------- /algos.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning Algorithms\n", 8 | "\n", 9 | "This notebook provides an overview of the most common machine learning algorithms.\n", 10 | "\n", 11 | "You should feel comfortable with:\n", 12 | "\n", 13 | "- Jupyter Notebooks\n", 14 | "- Python\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Clustering" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## K-means clustering\n", 43 | " \n", 44 | "K-means clustering is an unsupervised learning algorithm that partitions the data into $k$ clusters, assigning each data point to the cluster whose centroid is nearest. \n", 45 | "\n", 46 | "The algorithm is as follows:\n", 47 | "\n", 48 | "- Choose the number of clusters $k$ and a distance metric.\n", 49 | "- Initialize $k$ centroids (for example, by picking $k$ random samples).\n", 50 | "- Assign each sample to its nearest centroid.\n", 51 | "- Update each centroid to be the mean of the samples assigned to it.\n", 52 | "- Repeat the assignment and update steps until convergence.\n", 53 | "\n", 54 | "Note that because this algorithm takes distance into account, it is important that the features (columns) are on the same scale. For the iris dataset, they are, but for other datasets, you may need to scale the features."
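To make the assignment/update loop above concrete, here is a minimal from-scratch sketch of the algorithm in NumPy. This is illustrative only; the course code below uses `sklearn.cluster.KMeans`, and the `kmeans` helper name is mine, not from the exercise files.

```python
import numpy as np

def kmeans(X, k, n_iter=100, seed=42):
    """Minimal K-means sketch. Assumes X is an (n_samples, n_features) array
    and, for simplicity, that no cluster ever ends up empty."""
    X = np.asarray(X)
    rng = np.random.default_rng(seed)
    # initialize centroids by picking k distinct random samples
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # assignment step: each sample goes to its nearest centroid
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # update step: each centroid moves to the mean of its assigned samples
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centroids, centroids):  # converged
            break
        centroids = new_centroids
    return labels, centroids
```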
55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false, 62 | "jupyter": { 63 | "outputs_hidden": false 64 | } 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.cluster import KMeans\n", 69 | "import sklearn.datasets\n", 70 | "import matplotlib.pyplot as plt\n", 71 | "\n", 72 | "dataset = sklearn.datasets.load_iris(as_frame=True)\n", 73 | "\n", 74 | "X = (dataset\n", 75 | " ['data']\n", 76 | " .loc[:, ['sepal length (cm)', 'sepal width (cm)']]\n", 77 | ")\n", 78 | "y = dataset['target']\n", 79 | "\n", 80 | "# demonstration of k-means clustering with iris dataset\n", 81 | "# keep list of all centroids\n", 82 | "centroids = []\n", 83 | "\n", 84 | "# loop over a few iterations and plot the results\n", 85 | "for i in range(10):\n", 86 | " model = KMeans(n_clusters=3, init='random', n_init=1,\n", 87 | " max_iter=i+1, random_state=42)\n", 88 | " model.fit(X)\n", 89 | " label = model.predict(X)\n", 90 | " # plot the input data color by cluster\n", 91 | " fig, ax = plt.subplots(figsize=(8, 6))\n", 92 | " X.plot.scatter(x='sepal length (cm)', y='sepal width (cm)', c=label, cmap='viridis', ax=ax)\n", 93 | " # plot the centers\n", 94 | " centers = model.cluster_centers_\n", 95 | " ax.scatter(centers[:, 0], centers[:, 1], marker='*', s=250, color='r', alpha=.5)\n", 96 | " ax.set_title('Iteration: ' + str(i))\n", 97 | " # plot previous centroids with reduced alpha value\n", 98 | " if i > 0:\n", 99 | " for centroid in centroids:\n", 100 | " ax.scatter(centroid[:, 0], centroid[:, 1], marker='*', s=250, color='r', alpha=.1)\n", 101 | " # save the current centroids\n", 102 | " centroids.append(model.cluster_centers_)\n", 103 | " \n", 104 | " \n", 105 | " # plot the original data color by target\n", 106 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 107 | "X.plot.scatter(x='sepal length (cm)', y='sepal width (cm)', c=y, cmap='viridis', ax=ax)\n", 108 | "ax.set_title('Original Data')\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## K Evaluation for K-means\n", 137 | "\n", 138 | "We specify the K value for K-means clustering. However, we do not know the best K value for a given dataset. \n", 139 | "\n", 140 | "There are a few methods to evaluate the K value for K-means clustering:\n", 141 | "\n", 142 | "- Elbow method - Track the \"inertia\" of the model as the K value increases. The inertia is the sum of squared distances of samples to their closest cluster center. \n", 143 | "- Silhouette coefficient - The silhouette coefficient is a measure of how similar an object is to its own cluster compared to other clusters. The silhouette coefficient ranges from -1 to 1. 
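Besides the elbow and silhouette plots shown below, scikit-learn can compute the average silhouette coefficient directly. A rough sketch of scoring several candidate K values this way (assuming the iris feature frame `X` from above):

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# average silhouette coefficient for a few candidate K values;
# higher is better, and K must be at least 2
for k in range(2, 8):
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    print(f'k={k}: mean silhouette = {silhouette_score(X, km.labels_):.3f}')
```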
" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Run elbow method to find optimal number of clusters\n", 153 | "\n", 154 | "inertias = []\n", 155 | "max_clusters = 20\n", 156 | "for i in range(max_clusters):\n", 157 | " km = KMeans(n_clusters=i+1, n_init='auto',\n", 158 | " max_iter=300, random_state=42)\n", 159 | " km.fit(X)\n", 160 | " inertias.append(km.inertia_)\n", 161 | "\n", 162 | "# plot the results\n", 163 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 164 | "ax.plot(range(1, max_clusters+1), inertias, marker='o')\n", 165 | "ax.set_xlabel('Number of clusters')\n", 166 | "ax.set_ylabel('Inertia')\n", 167 | "ax.set_title('Elbow Method')\n" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "!pip install yellowbrick" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "from yellowbrick.cluster import SilhouetteVisualizer\n", 186 | "SilhouetteVisualizer?" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "# list matplotlib fonts available\n", 196 | "import matplotlib.font_manager\n", 197 | "matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# if on linux, set matplotlib font to DejaVuSans\n", 207 | "# get rid of warnings in next cell\n", 208 | "import matplotlib\n", 209 | "matplotlib.rcParams['font.family'] = 'DejaVu Sans'\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# Run silhouette method to find optimal number of clusters\n", 219 | "from yellowbrick.cluster import silhouette_visualizer \n", 220 | "\n", 221 | "max_clusters = 6\n", 222 | "for i in range(max_clusters):\n", 223 | " km = KMeans(n_clusters=i+2, n_init='auto',\n", 224 | " max_iter=300, random_state=42)\n", 225 | "\n", 226 | " fig, ax = plt.subplots(figsize=(8, 6))\n", 227 | " # setting show=False so we can set xlim to same value for all plots\n", 228 | " viz = silhouette_visualizer(km, X, colors='yellowbrick', ax=ax, show=False)\n", 229 | " ax.set_xlim([-0.1, .8]) \n", 230 | "\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Understanding Clustering Results\n", 259 | "\n", 260 | "We can use the following methods to understand the clustering results:\n", 261 | "\n", 262 | "- Create a surrogate model to predict the cluster label for a given sample.\n", 263 | "- Summarize the cluster by the mean of each feature.\n", 264 | "- Visualize the clustering results in 2D or 3D." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "!pip install datasets" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# going to cluster electricity data from Australia\n", 283 | "# https://www.openml.org/search?type=data&sort=runs&id=151&status=active\n", 284 | "from datasets import load_dataset\n", 285 | "electricity = load_dataset('inria-soda/tabular-benchmark', data_files='clf_num/electricity.csv')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "electricity" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "print(dir(electricity['train']))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "elec = electricity['train'].to_pandas()\n", 313 | "elec" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "X = elec.drop(columns=['class'])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# get inertias for different numbers of clusters\n", 332 | "inertias = []\n", 333 | "max_clusters = 20\n", 334 | "for i in range(max_clusters):\n", 335 | " km = KMeans(n_clusters=i+1, n_init='auto',\n", 336 | " max_iter=300, random_state=42)\n", 337 | " km.fit(X)\n", 338 | " inertias.append(km.inertia_)\n", 339 | "\n", 340 | "# plot the results\n", 341 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 342 | "ax.plot(range(1, max_clusters+1), inertias, marker='o')" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "# Takes 2min 20s w/o sampling\n", 352 | "# get silhouette scores for different numbers of clusters\n", 353 | "from yellowbrick.cluster import silhouette_visualizer\n", 354 | "\n", 355 | "max_clusters = 6\n", 356 | "for i in range(max_clusters):\n", 357 | " km = KMeans(n_clusters=i+2, n_init='auto',\n", 358 | " max_iter=300, random_state=42)\n", 359 | "\n", 360 | " fig, ax = plt.subplots(figsize=(8, 6))\n", 361 | " # setting show=False so we can set xlim to same value for all plots\n", 362 | " viz = silhouette_visualizer(km, X.sample(1_000, random_state=42), colors='yellowbrick', ax=ax, show=False)\n", 363 | " ax.set_xlim([-0.1, .8])" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "# going to choose 5 clusters (if 5, 6, or 7 are all close, choose the simpler model)\n", 373 | "# summarize the results by group\n", 374 | "\n", 375 | "km = KMeans(n_clusters=5, n_init='auto',\n", 376 | " max_iter=300, random_state=42)\n", 377 | "km.fit(X)\n", 378 | "label = km.predict(X)\n", 379 | "(elec\n", 380 | " .assign(cluster=label)\n", 381 | " .groupby('cluster')\n", 382 | " .agg('mean', numeric_only=True)\n", 383 | " .T\n", 384 | " .style\n", 385 | " .background_gradient(cmap='RdBu', axis='columns')\n", 386 | ")" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | 
"outputs": [], 394 | "source": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "# Summarize by surrogate model decision tree\n", 403 | "from sklearn.tree import DecisionTreeClassifier\n", 404 | "\n", 405 | "dt = DecisionTreeClassifier(max_depth=3, random_state=42)\n", 406 | "dt.fit(X, label)\n", 407 | "# plot the tree\n", 408 | "from sklearn.tree import plot_tree\n", 409 | "fig, ax = plt.subplots(figsize=(14, 8))\n", 410 | "# make string for class names\n", 411 | "class_names = [str(i) for i in range(0, 5)]\n", 412 | "_ = plot_tree(dt, ax=ax, feature_names=X.columns, class_names=class_names, filled=True, fontsize=10)\n" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Other Clustering Algorithms\n", 441 | "\n", 442 | "- Hierarchical clustering - This clusters by distance. It first clusters the two closest points, then clusters the next closest point to the first cluster, and so on. It is not as efficient as k-means, but it does not require you to specify the number of clusters. It is also more robust to outliers than k-means.\n", 443 | "- DBSCAN - This clusters by density. It finds the densest region of points and clusters them together. It is also more robust to outliers than k-means.\n" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "## Challenge: K-means Clustering\n", 465 | "\n", 466 | "With the Titanic dataset:\n", 467 | "- drop the missing values\n", 468 | "- drop the categorical features\n", 469 | "- scale the features\n", 470 | "- run K-means clustering\n", 471 | "- use the elbow method to find the best K value" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "## Solution: K-means Clustering\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "# PCA" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "## Intro to PCA\n", 514 | "\n", 515 | "Principal component analysis (PCA) is a dimensionality reduction technique. 
It finds the linear combinations of the features that explain the most variance in the data.\n" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "from sklearn.cluster import KMeans\n", 525 | "import sklearn.datasets\n", 526 | "import matplotlib.pyplot as plt\n", 527 | "\n", 528 | "dataset = sklearn.datasets.load_iris(as_frame=True)\n", 529 | "\n", 530 | "X = (dataset\n", 531 | " ['data']\n", 532 | " .loc[:, ['sepal length (cm)', 'sepal width (cm)']]\n", 533 | ")\n", 534 | "y = dataset['target']" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "X" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# X_pca is created in the next cell, after PCA is fit" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "# Run PCA on two features of the Iris dataset, keeping two components\n", 562 | "from sklearn.decomposition import PCA\n", 563 | "pca = PCA(n_components=2)\n", 564 | "pca.fit(X)\n", 565 | "X_pca = pca.transform(X)\n", 566 | "\n", 567 | "# plot the results (transform returns a NumPy array here, so index by position)\n", 568 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 569 | "ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')\n", 570 | "ax.set_xlabel('PC1')\n", 571 | "ax.set_ylabel('PC2')\n", 572 | "ax.set_title('PCA on Iris Data')\n" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "X" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "# Run PCA on all of the Iris dataset\n", 591 | "pca = PCA()\n", 592 | "\n", 593 | "X_all = (dataset\n", 594 | " ['data']\n", 595 | " #.loc[:, ['sepal length (cm)', 'sepal width (cm)']]\n", 596 | ")\n", 597 | "pca.fit(X_all)\n", 598 | "X_pca = pca.transform(X_all)\n", 599 | "\n", 600 | "# Plot the first 2 components\n", 601 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 602 | "ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')\n", 603 | "ax.set_xlabel('PC1')\n", 604 | "ax.set_ylabel('PC2')\n", 605 | "ax.set_title('PCA on Iris Data')" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [] 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "## Components of PCA\n", 634 | "\n", 635 | "When we run PCA, we get back the following:\n", 636 | "\n", 637 | "- Principal components (PCs): The PCs are the linear combinations of the features. 
\n", 638 | "- Explained variance ratio: The explained variance ratio tells us how much variance is explained by each PC.\n", 639 | "- Feature weights: The feature weights tell us how much each feature contributes to each PC.\n" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "# tell sklearn to output pandas dataframes\n", 649 | "sklearn.set_config(transform_output='pandas')\n", 650 | "\n", 651 | "pca.transform(X_all)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "# I don't like the column names, so I'll rename them\n", 661 | "# change pca0 to PC1, pca1 to PC2, etc\n", 662 | "\n", 663 | "def rename_pc0_to_PC1(col):\n", 664 | " num = int(col[3:]) + 1\n", 665 | " return 'PC' + str(num)\n", 666 | "\n", 667 | "pca.transform(X_all).rename(columns=rename_pc0_to_PC1)\n" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "# Scree plot of explained variance\n", 677 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 678 | "ax.plot(range(1, len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_, marker='o')\n", 679 | "ax.set_xlabel('Principal Component')\n", 680 | "ax.set_ylabel('Explained Variance Ratio')\n", 681 | "ax.set_title('Scree Plot')" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "# The explained variance ratio is the percentage of variance explained by each of the selected components.\n", 691 | "# The first principal component explains 92.5% of the variance in the data, and \n", 692 | "# the second principal component explains 5.3% of the variance in the data.\n", 693 | "pca.explained_variance_ratio_" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "pca.explained_variance_ratio_.cumsum()" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "## Weights\n", 731 | "\n", 732 | "For every PC, we get a set of weights for each feature. The weights tell us how much each feature contributes to the PC." 
733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "# convert components to a dataframe\n", 742 | "import pandas as pd\n", 743 | "\n", 744 | "components = pd.DataFrame(pca.components_, columns=X_all.columns,\n", 745 | " index=['PC1', 'PC2', 'PC3', 'PC4'])\n", 746 | " \n", 747 | "components\n" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "# Centered data - for next cell's calculation\n", 757 | "X_all - X_all.mean()" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [ 766 | "# calculating PC1 by hand for first row - linear combination of \n", 767 | "# centered variables and the first component \n", 768 | "-.743333 * .3613 + 0.4426 * -0.0845 + -2.358 * 0.8566 + -0.9993 * 0.3582" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": null, 774 | "metadata": {}, 775 | "outputs": [], 776 | "source": [ 777 | "pca.transform(X_all).rename(columns=rename_pc0_to_PC1)" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "# Manually calculating PCA with numpy\n", 787 | "import numpy as np\n", 788 | "nums = X_all - X_all.mean()\n", 789 | "vals, vecs = np.linalg.eig(nums.cov())\n", 790 | "idxs = pd.Series(vals).argsort()\n", 791 | "\n", 792 | "explained_variance = pd.Series(sorted(vals, reverse=True))\n", 793 | "\n", 794 | "def set_columns(df_):\n", 795 | " df_.columns = [f'PC{i+1}' for i in range(len(df_.columns))]\n", 796 | " return df_\n", 797 | "\n", 798 | "comps = (pd.DataFrame(vecs, index=nums.columns)\n", 799 | " .iloc[:, idxs[::-1]]\n", 800 | " .pipe(set_columns)\n", 801 | ")\n", 802 | "\n", 803 | "pcas = (nums.dot(comps))\n", 804 | "pcas" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [] 813 | }, 814 | { 815 | "cell_type": "markdown", 816 | "metadata": {}, 817 | "source": [ 818 | "## Scatter Plot" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": null, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [ 827 | "# Use plotly to plot the first three components\n", 828 | "import plotly.express as px\n", 829 | "fig = px.scatter_3d(pcas, x='PC1', y='PC2', z='PC3', color=y)\n", 830 | "fig.show()\n" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": null, 836 | "metadata": {}, 837 | "outputs": [], 838 | "source": [] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": {}, 858 | "outputs": [], 859 | "source": [] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": null, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [] 874 | }, 875 | { 876 | "cell_type": "markdown", 877 | "metadata": {}, 878 | "source": [ 879 | "## Other Dimensionality Reduction Techniques\n", 880 | "\n", 
881 | "- t-SNE - t-distributed stochastic neighbor embedding. Tries to preserve the local structure of the data.\n", 882 | "- UMAP - Uniform Manifold Approximation and Projection. Tries to preserve both the global and local structure of the data.\n", 883 | "- Autoencoders\n" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": {}, 890 | "outputs": [], 891 | "source": [ 892 | "# run t-SNE on the Iris dataset\n", 893 | "from sklearn.manifold import TSNE\n", 894 | "tsne = TSNE(n_components=3, random_state=42)\n", 895 | "X_tsne = tsne.fit_transform(X_all)\n" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [ 904 | "X_tsne" 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": null, 910 | "metadata": {}, 911 | "outputs": [], 912 | "source": [ 913 | "# plot results with plotly\n", 914 | "import plotly.express as px\n", 915 | "fig = px.scatter_3d(X_tsne, x='tsne0', y='tsne1', z='tsne2', color=y)\n", 916 | "fig.show()\n" 917 | ] 918 | }, 919 | { 920 | "cell_type": "code", 921 | "execution_count": null, 922 | "metadata": {}, 923 | "outputs": [], 924 | "source": [ 925 | "!pip install umap-learn" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": {}, 932 | "outputs": [], 933 | "source": [ 934 | "# run UMAP\n", 935 | "import umap\n", 936 | "reducer = umap.UMAP(random_state=42, n_components=3)\n", 937 | "X_umap = pd.DataFrame(reducer.fit_transform(X_all), columns=['umap0', 'umap1', 'umap2'])\n", 938 | "X_umap" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": null, 944 | "metadata": {}, 945 | "outputs": [], 946 | "source": [ 947 | "# plot results with plotly\n", 948 | "import plotly.express as px\n", 949 | "fig = px.scatter_3d(X_umap, x='umap0', y='umap1', z='umap2', color=y)\n", 950 | "fig.show()" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": null, 956 | "metadata": {}, 957 | "outputs": [], 958 | "source": [] 959 | }, 960 | { 961 | "cell_type": "code", 962 | "execution_count": null, 963 | "metadata": {}, 964 | "outputs": [], 965 | "source": [] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": {}, 977 | "source": [ 978 | "## Challenge: PCA\n", 979 | "\n", 980 | "Run PCA on the numeric columns of the Titanic data. Plot the result of the first three dimensions using plotly. 
981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": null, 986 | "metadata": {}, 987 | "outputs": [], 988 | "source": [] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": {}, 993 | "source": [ 994 | "## Solution: PCA" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": null, 1000 | "metadata": {}, 1001 | "outputs": [], 1002 | "source": [] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": null, 1007 | "metadata": {}, 1008 | "outputs": [], 1009 | "source": [] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": null, 1014 | "metadata": {}, 1015 | "outputs": [], 1016 | "source": [] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "metadata": {}, 1021 | "source": [ 1022 | "# Linear Regression" 1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "code", 1027 | "execution_count": null, 1028 | "metadata": {}, 1029 | "outputs": [], 1030 | "source": [] 1031 | }, 1032 | { 1033 | "cell_type": "markdown", 1034 | "metadata": {}, 1035 | "source": [ 1036 | "## Linear Regression Algorithm\n", 1037 | "\n", 1038 | "Linear regression calculates an intercept and slope (weights) for a line that minimizes the sum of squared errors between the line and the data points.\n", 1039 | "\n", 1040 | "The formula for linear regression is as follows:\n", 1041 | "\n", 1042 | "$$y = \\beta_0 + \\beta_1 x_1 + \\beta_2 x_2 + ... + \\beta_n x_n$$\n", 1043 | "\n", 1044 | "where $y$ is the target variable, $\\beta_0$ is the intercept, $\\beta_1$ to $\\beta_n$ are the weights, and $x_1$ to $x_n$ are the features.\n", 1045 | "\n", 1046 | "One iterative way to fit the weights is as follows:\n", 1047 | "\n", 1048 | "- Initialize the weights.\n", 1049 | "- Calculate the predicted values.\n", 1050 | "- Calculate the error.\n", 1051 | "- Update the weights.\n", 1052 | "- Repeat the steps above until convergence.\n", 1053 | "\nIn practice, ordinary least squares also has a closed-form solution, so the slope and intercept can be calculated directly; that is what we do below.\n" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "metadata": {}, 1060 | "outputs": [], 1061 | "source": [ 1062 | "# Load anscombe's quartet\n", 1063 | "x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n", 1064 | "y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]\n", 1065 | "y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]\n", 1066 | "y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]\n", 1067 | "x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]\n", 1068 | "y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]\n", 1069 | "anscombe = (pd.DataFrame({'x': x, 'y1': y1, 'y2': y2, 'y3': y3, 'x4': x4, 'y4': y4})\n", 1070 | " )\n", 1071 | "\n", 1072 | "anscombe" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": null, 1078 | "metadata": {}, 1079 | "outputs": [], 1080 | "source": [ 1081 | "# plot x y1\n", 1082 | "fig, ax = plt.subplots(figsize=(3, 3))\n", 1083 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "markdown", 1088 | "metadata": {}, 1089 | "source": [ 1090 | "Let's run the algorithm on x and y1.\n", 1091 | "\n", 1092 | "Calculate the slope:\n", 1093 | "\n", 1094 | "$$\\beta_1 = \\frac{\\sum_{i=1}^{n} (x_i - \\bar{x})(y_i - \\bar{y})}{\\sum_{i=1}^{n} (x_i - \\bar{x})^2}$$\n", 1095 | "\n", 1096 | "Calculate the intercept:\n", 1097 | "\n", 1098 | "$$\\beta_0 = \\bar{y} - \\beta_1 \\bar{x}$$\n", 1099 | "\n", 1100 | "Model Equation:\n", 1101 | "\n", 1102 | "$$y = \\beta_0 + \\beta_1 x$$" 1103 | ] 1104 | }, 1105 | { 1106 |
"cell_type": "code", 1107 | "execution_count": null, 1108 | "metadata": {}, 1109 | "outputs": [], 1110 | "source": [ 1111 | "# slope\n", 1112 | "\n", 1113 | "x1 = anscombe['x']\n", 1114 | "y1 = anscombe['y1']\n", 1115 | "slope = ((x1 - x1.mean())*(y1 - y1.mean())).sum() / ((x1 - x1.mean())**2).sum()\n", 1116 | "slope" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": {}, 1123 | "outputs": [], 1124 | "source": [ 1125 | "# intercept\n", 1126 | "\n", 1127 | "intercept = y1.mean() - slope * x1.mean()\n", 1128 | "intercept" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "execution_count": null, 1134 | "metadata": {}, 1135 | "outputs": [], 1136 | "source": [ 1137 | "# plot x y1\n", 1138 | "fig, ax = plt.subplots(figsize=(3, 3))\n", 1139 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 1140 | "# plot the line\n", 1141 | "x1 = np.linspace(4, 14, 100)\n", 1142 | "y1 = slope * x1 + intercept\n", 1143 | "ax.plot(x1, y1, color='r')" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": null, 1149 | "metadata": {}, 1150 | "outputs": [], 1151 | "source": [] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": null, 1156 | "metadata": {}, 1157 | "outputs": [], 1158 | "source": [] 1159 | }, 1160 | { 1161 | "cell_type": "code", 1162 | "execution_count": null, 1163 | "metadata": {}, 1164 | "outputs": [], 1165 | "source": [] 1166 | }, 1167 | { 1168 | "cell_type": "markdown", 1169 | "metadata": {}, 1170 | "source": [ 1171 | "## Examples with Scikit-learn" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "code", 1176 | "execution_count": null, 1177 | "metadata": {}, 1178 | "outputs": [], 1179 | "source": [ 1180 | "from sklearn.linear_model import LinearRegression\n", 1181 | "\n", 1182 | "x1 = anscombe[['x']]\n", 1183 | "y1 = anscombe['y1']\n", 1184 | "y2 = anscombe['y2']\n", 1185 | "y3 = anscombe['y3']\n", 1186 | "\n", 1187 | "lr1 = LinearRegression()\n", 1188 | "lr1.fit(x1, y1)\n", 1189 | "\n" 1190 | ] 1191 | }, 1192 | { 1193 | "cell_type": "code", 1194 | "execution_count": null, 1195 | "metadata": {}, 1196 | "outputs": [], 1197 | "source": [ 1198 | "lr1.coef_" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": null, 1204 | "metadata": {}, 1205 | "outputs": [], 1206 | "source": [ 1207 | "lr1.intercept_" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 1212 | "execution_count": null, 1213 | "metadata": {}, 1214 | "outputs": [], 1215 | "source": [ 1216 | "lr2 = LinearRegression()\n", 1217 | "lr2.fit(x1, y2)\n", 1218 | "lr3 = LinearRegression()\n", 1219 | "lr3.fit(x1, y3)" 1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": null, 1225 | "metadata": {}, 1226 | "outputs": [], 1227 | "source": [ 1228 | "# plot 1, 2 and 3 in different colors\n", 1229 | "fig, axs = plt.subplots(1, 3, figsize=(9, 3))\n", 1230 | "anscombe.plot.scatter(x='x', y='y1', ax=axs[0], color='k')\n", 1231 | "axs[0].plot(x1, lr1.predict(x1), color='#aaa')\n", 1232 | "axs[0].set_ylim(3, 13)\n", 1233 | "anscombe.plot.scatter(x='x', y='y2', ax=axs[1], color='b')\n", 1234 | "axs[1].plot(x1, lr2.predict(x1), color='#55a')\n", 1235 | "axs[1].set_ylim(3, 13)\n", 1236 | "anscombe.plot.scatter(x='x', y='y3', ax=axs[2], color='g')\n", 1237 | "axs[2].plot(x1, lr3.predict(x1), color='#5a5')\n", 1238 | "axs[2].set_ylim(3, 13)" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": null, 1244 | "metadata": {}, 1245 | "outputs": [], 
1246 | "source": [] 1247 | }, 1248 | { 1249 | "cell_type": "code", 1250 | "execution_count": null, 1251 | "metadata": {}, 1252 | "outputs": [], 1253 | "source": [] 1254 | }, 1255 | { 1256 | "cell_type": "code", 1257 | "execution_count": null, 1258 | "metadata": {}, 1259 | "outputs": [], 1260 | "source": [] 1261 | }, 1262 | { 1263 | "cell_type": "markdown", 1264 | "metadata": {}, 1265 | "source": [ 1266 | "## Real world example with Aircraft Elevators\n", 1267 | "\n", 1268 | "From website: This data set is also obtained from the task of controlling a F16 aircraft, although the target variable and attributes are different from the ailerons domain. In this case the goal variable is related to an action taken on the elevators of the aircraft.\n" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": null, 1274 | "metadata": {}, 1275 | "outputs": [], 1276 | "source": [ 1277 | "# https://www.openml.org/search?type=data&sort=runs&id=216&satatus=active \n", 1278 | "from datasets import load_dataset\n", 1279 | "elevators = load_dataset('inria-soda/tabular-benchmark', data_files='reg_num/elevators.csv')" 1280 | ] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": null, 1285 | "metadata": {}, 1286 | "outputs": [], 1287 | "source": [ 1288 | "elev = elevators['train'].to_pandas()\n", 1289 | "elev\n" 1290 | ] 1291 | }, 1292 | { 1293 | "cell_type": "code", 1294 | "execution_count": null, 1295 | "metadata": {}, 1296 | "outputs": [], 1297 | "source": [ 1298 | "X = elev.drop(columns=['Goal'])\n", 1299 | "y = elev['Goal']\n", 1300 | "\n", 1301 | "lr_elev = LinearRegression()\n", 1302 | "lr_elev.fit(X, y)" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "code", 1307 | "execution_count": null, 1308 | "metadata": {}, 1309 | "outputs": [], 1310 | "source": [ 1311 | "lr_elev.coef_\n" 1312 | ] 1313 | }, 1314 | { 1315 | "cell_type": "code", 1316 | "execution_count": null, 1317 | "metadata": {}, 1318 | "outputs": [], 1319 | "source": [ 1320 | "lr_elev.intercept_" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": null, 1326 | "metadata": {}, 1327 | "outputs": [], 1328 | "source": [ 1329 | "pd.Series(lr_elev.coef_, index=X.columns).sort_values().plot.barh(figsize=(8, 6))" 1330 | ] 1331 | }, 1332 | { 1333 | "cell_type": "code", 1334 | "execution_count": null, 1335 | "metadata": {}, 1336 | "outputs": [], 1337 | "source": [ 1338 | "# score is R^2 - the proportion of variance explained by the model\n", 1339 | "lr_elev.score(X, y)" 1340 | ] 1341 | }, 1342 | { 1343 | "cell_type": "code", 1344 | "execution_count": null, 1345 | "metadata": {}, 1346 | "outputs": [], 1347 | "source": [ 1348 | "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", 1349 | "mean_absolute_error(y, lr_elev.predict(X)), mean_squared_error(y, lr_elev.predict(X))" 1350 | ] 1351 | }, 1352 | { 1353 | "cell_type": "code", 1354 | "execution_count": null, 1355 | "metadata": {}, 1356 | "outputs": [], 1357 | "source": [ 1358 | "lr_elev.predict(X.iloc[[0]])" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "code", 1363 | "execution_count": null, 1364 | "metadata": {}, 1365 | "outputs": [], 1366 | "source": [ 1367 | "y.iloc[0]" 1368 | ] 1369 | }, 1370 | { 1371 | "cell_type": "code", 1372 | "execution_count": null, 1373 | "metadata": {}, 1374 | "outputs": [], 1375 | "source": [] 1376 | }, 1377 | { 1378 | "cell_type": "code", 1379 | "execution_count": null, 1380 | "metadata": {}, 1381 | "outputs": [], 1382 | "source": [] 1383 | }, 1384 | { 1385 | "cell_type": "code", 
1386 | "execution_count": null, 1387 | "metadata": {}, 1388 | "outputs": [], 1389 | "source": [] 1390 | }, 1391 | { 1392 | "cell_type": "markdown", 1393 | "metadata": {}, 1394 | "source": [ 1395 | "## Assumptions of Linear Regression\n", 1396 | "\n", 1397 | "- Linear relationship between the features and target variable\n", 1398 | "- No multicollinearity - no strong correlation between the features\n", 1399 | "- Homoscedasticity - the variance of the residuals is constant across the range of predicted values\n", 1400 | "- Normality of residuals - the residuals are roughly normally distributed, with no extreme outliers\n", 1401 | "\n", 1402 | "Also, generally you will want to scale the features before running linear regression." 1403 | ] 1404 | }, 1405 | { 1406 | "cell_type": "code", 1407 | "execution_count": null, 1408 | "metadata": {}, 1409 | "outputs": [], 1410 | "source": [ 1411 | "# standardize the data\n", 1412 | "from sklearn.preprocessing import StandardScaler\n", 1413 | "scaler = StandardScaler()\n", 1414 | "X_scaled = scaler.fit_transform(X)\n", 1415 | "X_scaled" 1416 | ] 1417 | }, 1418 | { 1419 | "cell_type": "code", 1420 | "execution_count": null, 1421 | "metadata": {}, 1422 | "outputs": [], 1423 | "source": [ 1424 | "X_scaled.describe()" 1425 | ] 1426 | }, 1427 | { 1428 | "cell_type": "code", 1429 | "execution_count": null, 1430 | "metadata": {}, 1431 | "outputs": [], 1432 | "source": [ 1433 | "lr_std = LinearRegression()\n", 1434 | "lr_std.fit(X_scaled, y)\n", 1435 | "lr_std.score(X_scaled, y)" 1436 | ] 1437 | }, 1438 | { 1439 | "cell_type": "code", 1440 | "execution_count": null, 1441 | "metadata": {}, 1442 | "outputs": [], 1443 | "source": [ 1444 | "pd.Series(lr_std.coef_, index=X.columns).sort_values().plot.barh(figsize=(8, 6))" 1445 | ] 1446 | }, 1447 | { 1448 | "cell_type": "code", 1449 | "execution_count": null, 1450 | "metadata": {}, 1451 | "outputs": [], 1452 | "source": [ 1453 | "!pip install xgboost" 1454 | ] 1455 | }, 1456 | { 1457 | "cell_type": "code", 1458 | "execution_count": null, 1459 | "metadata": {}, 1460 | "outputs": [], 1461 | "source": [ 1462 | "# try with XGBoost\n", 1463 | "from xgboost import XGBRegressor\n", 1464 | "xgb = XGBRegressor()\n", 1465 | "xgb.fit(X, y)\n", 1466 | "xgb.score(X, y)" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": null, 1472 | "metadata": {}, 1473 | "outputs": [], 1474 | "source": [] 1475 | }, 1476 | { 1477 | "cell_type": "code", 1478 | "execution_count": null, 1479 | "metadata": {}, 1480 | "outputs": [], 1481 | "source": [] 1482 | }, 1483 | { 1484 | "cell_type": "markdown", 1485 | "metadata": {}, 1486 | "source": [ 1487 | "## Challenge: Linear Regression\n", 1488 | "\n", 1489 | "Make a model to predict how much Titanic passengers paid for their tickets with Linear Regression. 
(Only use the numeric columns for the model.)" 1490 | ] 1491 | }, 1492 | { 1493 | "cell_type": "code", 1494 | "execution_count": null, 1495 | "metadata": {}, 1496 | "outputs": [], 1497 | "source": [] 1498 | }, 1499 | { 1500 | "cell_type": "markdown", 1501 | "metadata": {}, 1502 | "source": [ 1503 | "## Solution: Linear Regression" 1504 | ] 1505 | }, 1506 | { 1507 | "cell_type": "code", 1508 | "execution_count": null, 1509 | "metadata": {}, 1510 | "outputs": [], 1511 | "source": [] 1512 | }, 1513 | { 1514 | "cell_type": "code", 1515 | "execution_count": null, 1516 | "metadata": {}, 1517 | "outputs": [], 1518 | "source": [] 1519 | }, 1520 | { 1521 | "cell_type": "code", 1522 | "execution_count": null, 1523 | "metadata": {}, 1524 | "outputs": [], 1525 | "source": [] 1526 | }, 1527 | { 1528 | "cell_type": "markdown", 1529 | "metadata": {}, 1530 | "source": [ 1531 | "# Logistic Regression\n", 1532 | "\n" 1533 | ] 1534 | }, 1535 | { 1536 | "cell_type": "markdown", 1537 | "metadata": { 1538 | "collapsed": false, 1539 | "jupyter": { 1540 | "outputs_hidden": false 1541 | } 1542 | }, 1543 | "source": [ 1544 | "## Logistic Regression Algorithm\n", 1545 | "\n", 1546 | "Even though logistic regression has \"regression\" in its name, it is used as a classification algorithm. It calculates the probability that a sample belongs to a class. Rather than fitting a line to the data, it fits an \"S\" shaped curve called the sigmoid function.\n", 1547 | "\n", 1548 | "The formula for logistic regression is as follows:\n", 1549 | "\n", 1550 | "$$y = \\frac{1}{1 + e^{-z}}$$\n", 1551 | "\n", 1552 | "where $y$ is the probability that a sample belongs to a class and $z$ is the linear combination of the features.\n", 1553 | "\n", 1554 | "The algorithm is as follows:\n", 1555 | "\n", 1556 | "- Initialize the weights.\n", 1557 | "- Calculate the predicted values.\n", 1558 | "- Calculate the error.\n", 1559 | "- Update the weights.\n", 1560 | "- Repeat the steps above until convergence.\n" 1561 | ] 1562 | }, 1563 | { 1564 | "cell_type": "code", 1565 | "execution_count": null, 1566 | "metadata": {}, 1567 | "outputs": [], 1568 | "source": [ 1569 | "# plot sigmoid function\n", 1570 | "import numpy as np \n", 1571 | "from matplotlib import pyplot as plt\n", 1572 | "x = np.linspace(-10, 10, 100)\n", 1573 | "y = 1 / (1 + np.exp(-x))\n", 1574 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1575 | "ax.plot(x, y)" 1576 | ] 1577 | }, 1578 | { 1579 | "cell_type": "code", 1580 | "execution_count": null, 1581 | "metadata": {}, 1582 | "outputs": [], 1583 | "source": [] 1584 | }, 1585 | { 1586 | "cell_type": "code", 1587 | "execution_count": null, 1588 | "metadata": {}, 1589 | "outputs": [], 1590 | "source": [] 1591 | }, 1592 | { 1593 | "cell_type": "markdown", 1594 | "metadata": {}, 1595 | "source": [ 1596 | "## Basic Example\n", 1597 | "\n", 1598 | "\n", 1599 | "\n" 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "code", 1604 | "execution_count": null, 1605 | "metadata": {}, 1606 | "outputs": [], 1607 | "source": [ 1608 | "import pandas as pd\n", 1609 | "log_data = pd.DataFrame({'x': [-2, -2.3, -2.1, -1, -.5, 0, .5, .7, 1, 2, 3],\n", 1610 | " 'y': [0, 0, 0, 0, 1, 0, 1,1, 1, 1, 1]})\n", 1611 | "\n", 1612 | "log_data.plot.scatter(x='x', y='y')" 1613 | ] 1614 | }, 1615 | { 1616 | "cell_type": "code", 1617 | "execution_count": null, 1618 | "metadata": {}, 1619 | "outputs": [], 1620 | "source": [ 1621 | "from sklearn.linear_model import LogisticRegression\n", 1622 | "log_r = LogisticRegression()\n", 1623 | "log_r.fit(log_data[['x']], 
log_data['y'])\n" 1624 | ] 1625 | }, 1626 | { 1627 | "cell_type": "code", 1628 | "execution_count": null, 1629 | "metadata": {}, 1630 | "outputs": [], 1631 | "source": [ 1632 | "log_r.coef_" 1633 | ] 1634 | }, 1635 | { 1636 | "cell_type": "code", 1637 | "execution_count": null, 1638 | "metadata": {}, 1639 | "outputs": [], 1640 | "source": [ 1641 | "log_r.intercept_" 1642 | ] 1643 | }, 1644 | { 1645 | "cell_type": "code", 1646 | "execution_count": null, 1647 | "metadata": {}, 1648 | "outputs": [], 1649 | "source": [ 1650 | "# plot fitted sigmoid function on top of data\n", 1651 | "x = np.linspace(-3, 4, 100)\n", 1652 | "y = 1 / (1 + np.exp(-(log_r.coef_[0][0] * x + log_r.intercept_[0])))\n", 1653 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1654 | "ax.plot(x, y)\n", 1655 | "log_data.plot.scatter(x='x', y='y', ax=ax)\n", 1656 | "# annotate above .5\n", 1657 | "ax.annotate('Predict 1\\nright of this', xy=(-.31, .5), xytext=(2, .4), arrowprops={'arrowstyle': '->'})" 1658 | ] 1659 | }, 1660 | { 1661 | "cell_type": "code", 1662 | "execution_count": null, 1663 | "metadata": {}, 1664 | "outputs": [], 1665 | "source": [ 1666 | "log_r.predict([[-.3]])" 1667 | ] 1668 | }, 1669 | { 1670 | "cell_type": "code", 1671 | "execution_count": null, 1672 | "metadata": {}, 1673 | "outputs": [], 1674 | "source": [] 1675 | }, 1676 | { 1677 | "cell_type": "code", 1678 | "execution_count": null, 1679 | "metadata": {}, 1680 | "outputs": [], 1681 | "source": [] 1682 | }, 1683 | { 1684 | "cell_type": "code", 1685 | "execution_count": null, 1686 | "metadata": {}, 1687 | "outputs": [], 1688 | "source": [] 1689 | }, 1690 | { 1691 | "cell_type": "markdown", 1692 | "metadata": {}, 1693 | "source": [ 1694 | "## Real World Example with Eye movements\n", 1695 | "\n", 1696 | "From the website:\n", 1697 | "\n", 1698 | "The dataset consist of several assignments. Each assignment consists of a question followed by ten sentences (titles of news articles). One of the sentences is the correct answer to the question (C) and five of the sentences are irrelevant to the question (I). Four of the sentences are relevant to the question (R), but they do not answer it.\n", 1699 | "\n", 1700 | "- Features are in columns, feature vectors in rows.\n", 1701 | "- Each assignment is a time sequence of 22-dimensional feature vectors.\n", 1702 | "- The first column is the line number, second the assignment number and the next 22 columns (3 to 24) are the different features. Columns 25 to 27 contain extra information about the example. The training data set contains the classification label in the 28th column: \"0\" for irrelevant, \"1\" for relevant and \"2\" for the correct answer.\n", 1703 | "- Each example (row) represents a single word. You are asked to return the classification of each read sentence.\n", 1704 | "- The 22 features provided are commonly used in psychological studies on eye movement. 
All of them are not necessarily relevant in this context.\n", 1705 | "\n", 1706 | "The objective of the Challenge is to predict the classification labels (I, R, C).\n" 1707 | ] 1708 | }, 1709 | { 1710 | "cell_type": "code", 1711 | "execution_count": null, 1712 | "metadata": {}, 1713 | "outputs": [], 1714 | "source": [ 1715 | "# https://www.openml.org/da/1044\n", 1716 | "from datasets import load_dataset\n", 1717 | "eye = load_dataset('inria-soda/tabular-benchmark', data_files='clf_num/eye_movements.csv')" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "code", 1722 | "execution_count": null, 1723 | "metadata": {}, 1724 | "outputs": [], 1725 | "source": [ 1726 | "eye_df = eye['train'].to_pandas()\n", 1727 | "eye_df\n" 1728 | ] 1729 | }, 1730 | { 1731 | "cell_type": "code", 1732 | "execution_count": null, 1733 | "metadata": {}, 1734 | "outputs": [], 1735 | "source": [ 1736 | "from sklearn.preprocessing import StandardScaler\n", 1737 | "\n", 1738 | "X = eye_df.drop(columns=['label'])\n", 1739 | "y = eye_df['label']\n", 1740 | "std = StandardScaler()\n", 1741 | "X_scaled = std.fit_transform(X)\n", 1742 | "eye_log = LogisticRegression()\n", 1743 | "eye_log.fit(X_scaled, y)\n", 1744 | "eye_log.score(X_scaled, y)" 1745 | ] 1746 | }, 1747 | { 1748 | "cell_type": "code", 1749 | "execution_count": null, 1750 | "metadata": {}, 1751 | "outputs": [], 1752 | "source": [ 1753 | "pd.Series(eye_log.coef_[0], index=X.columns).sort_values().plot.barh(figsize=(8, 6))" 1754 | ] 1755 | }, 1756 | { 1757 | "cell_type": "code", 1758 | "execution_count": null, 1759 | "metadata": {}, 1760 | "outputs": [], 1761 | "source": [] 1762 | }, 1763 | { 1764 | "cell_type": "code", 1765 | "execution_count": null, 1766 | "metadata": {}, 1767 | "outputs": [], 1768 | "source": [] 1769 | }, 1770 | { 1771 | "cell_type": "code", 1772 | "execution_count": null, 1773 | "metadata": {}, 1774 | "outputs": [], 1775 | "source": [] 1776 | }, 1777 | { 1778 | "cell_type": "markdown", 1779 | "metadata": {}, 1780 | "source": [ 1781 | "## Challenge: Logistic Regression\n", 1782 | "\n", 1783 | "Create a logistic regression model to predict whether a Titanic passenger survives based on the numeric columns.\n" 1784 | ] 1785 | }, 1786 | { 1787 | "cell_type": "code", 1788 | "execution_count": null, 1789 | "metadata": {}, 1790 | "outputs": [], 1791 | "source": [] 1792 | }, 1793 | { 1794 | "cell_type": "code", 1795 | "execution_count": null, 1796 | "metadata": {}, 1797 | "outputs": [], 1798 | "source": [] 1799 | }, 1800 | { 1801 | "cell_type": "markdown", 1802 | "metadata": {}, 1803 | "source": [ 1804 | "## Solution: Logistic Regression" 1805 | ] 1806 | }, 1807 | { 1808 | "cell_type": "code", 1809 | "execution_count": null, 1810 | "metadata": {}, 1811 | "outputs": [], 1812 | "source": [] 1813 | }, 1814 | { 1815 | "cell_type": "code", 1816 | "execution_count": null, 1817 | "metadata": {}, 1818 | "outputs": [], 1819 | "source": [] 1820 | }, 1821 | { 1822 | "cell_type": "code", 1823 | "execution_count": null, 1824 | "metadata": {}, 1825 | "outputs": [], 1826 | "source": [] 1827 | }, 1828 | { 1829 | "cell_type": "markdown", 1830 | "metadata": {}, 1831 | "source": [ 1832 | "# Decision Trees\n" 1833 | ] 1834 | }, 1835 | { 1836 | "cell_type": "markdown", 1837 | "metadata": {}, 1838 | "source": [ 1839 | "## Decision Tree Algorithm\n", 1840 | "\n", 1841 | "Decision trees are a type of supervised learning algorithm that can be used for both classification and regression. They work by splitting the data into subsets based on the features. 
The goal is to split the data in a way that minimizes the impurity of the subsets - entropy or Gini impurity for classification, variance for regression." 1842 | ] 1843 | }, 1844 | { 1845 | "cell_type": "code", 1846 | "execution_count": null, 1847 | "metadata": {}, 1848 | "outputs": [], 1849 | "source": [ 1850 | "## Create \"decision stump\"\n", 1851 | "## fit a tree regressor to Anscombe's quartet, limited to 1 level\n", 1852 | "\n", 1853 | "from sklearn.tree import DecisionTreeRegressor\n", 1854 | "dt = DecisionTreeRegressor(max_depth=1)\n", 1855 | "X = anscombe[['x']]\n", 1856 | "y = anscombe['y1']\n", 1857 | "dt.fit(X, y)\n" 1858 | ] 1859 | }, 1860 | { 1861 | "cell_type": "code", 1862 | "execution_count": null, 1863 | "metadata": {}, 1864 | "outputs": [], 1865 | "source": [ 1866 | "## Plot the tree\n", 1867 | "from sklearn.tree import plot_tree\n", 1868 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1869 | "_ = plot_tree(dt, ax=ax, feature_names=['x'], filled=True, fontsize=10)" 1870 | ] 1871 | }, 1872 | { 1873 | "cell_type": "code", 1874 | "execution_count": null, 1875 | "metadata": {}, 1876 | "outputs": [], 1877 | "source": [ 1878 | "## Plot the data and predictions on the same plot\n", 1879 | "import numpy as np\n", 1880 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1881 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 1882 | "# plot the line\n", 1883 | "x1 = np.linspace(4, 14, 100)\n", 1884 | "y1 = dt.predict(x1.reshape(-1, 1))\n", 1885 | "ax.plot(x1, y1, color='r')\n" 1886 | ] 1887 | }, 1888 | { 1889 | "cell_type": "code", 1890 | "execution_count": null, 1891 | "metadata": {}, 1892 | "outputs": [], 1893 | "source": [ 1894 | "## Now plot to two levels\n", 1895 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1896 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 1897 | "# plot the line\n", 1898 | "dt2 = DecisionTreeRegressor(max_depth=2)\n", 1899 | "dt2.fit(X, y)\n", 1900 | "\n", 1901 | "x1 = np.linspace(4, 14, 100)\n", 1902 | "y1 = dt2.predict(x1.reshape(-1, 1))\n", 1903 | "ax.plot(x1, y1, color='r')\n" 1904 | ] 1905 | }, 1906 | { 1907 | "cell_type": "code", 1908 | "execution_count": null, 1909 | "metadata": {}, 1910 | "outputs": [], 1911 | "source": [ 1912 | "## Now plot unlimited levels\n", 1913 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1914 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 1915 | "# plot the line\n", 1916 | "dt3 = DecisionTreeRegressor(max_depth=None)\n", 1917 | "dt3.fit(X, y)\n", 1918 | "\n", 1919 | "x1 = np.linspace(4, 14, 100)\n", 1920 | "y1 = dt3.predict(x1.reshape(-1, 1))\n", 1921 | "ax.plot(x1, y1, color='r')\n" 1922 | ] 1923 | }, 1924 | { 1925 | "cell_type": "code", 1926 | "execution_count": null, 1927 | "metadata": {}, 1928 | "outputs": [], 1929 | "source": [] 1930 | }, 1931 | { 1932 | "cell_type": "code", 1933 | "execution_count": null, 1934 | "metadata": {}, 1935 | "outputs": [], 1936 | "source": [] 1937 | }, 1938 | { 1939 | "cell_type": "code", 1940 | "execution_count": null, 1941 | "metadata": {}, 1942 | "outputs": [], 1943 | "source": [] 1944 | }, 1945 | { 1946 | "cell_type": "markdown", 1947 | "metadata": {}, 1948 | "source": [ 1949 | "## Real World with Aircraft Elevators" 1950 | ] 1951 | }, 1952 | { 1953 | "cell_type": "code", 1954 | "execution_count": null, 1955 | "metadata": {}, 1956 | "outputs": [], 1957 | "source": [ 1958 | "X_elev = elev.drop(columns=['Goal'])\n", 1959 | "y_elev = elev['Goal']\n", 1960 | "dt_elev = DecisionTreeRegressor(max_depth=3)\n", 1961 | "dt_elev.fit(X_elev, y_elev)\n" 1962 | ] 1963 | }, 1964 | { 1965 | "cell_type": "code", 1966 | "execution_count": null, 
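"metadata": {},
   "outputs": [],
   "source": [
    "# Not in the original notebook: a sketch of feature_importances_,\n",
    "# assuming the dt_elev tree fit in the cell above. Importances show\n",
    "# how much each feature reduces the impurity across its splits.\n",
    "pd.Series(dt_elev.feature_importances_, index=X_elev.columns).sort_values().plot.barh(figsize=(8, 6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,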
1967 | "metadata": {}, 1968 | "outputs": [], 1969 | "source": [ 1970 | "# plot the tree\n", 1971 | "from sklearn.tree import plot_tree\n", 1972 | "fig, ax = plt.subplots(figsize=(12, 8))\n", 1973 | "_ = plot_tree(dt_elev, ax=ax, feature_names=X_elev.columns, filled=True, fontsize=10, precision=4)\n" 1974 | ] 1975 | }, 1976 | { 1977 | "cell_type": "code", 1978 | "execution_count": null, 1979 | "metadata": {}, 1980 | "outputs": [], 1981 | "source": [ 1982 | "dt_elev.score(X_elev, y_elev)" 1983 | ] 1984 | }, 1985 | { 1986 | "cell_type": "code", 1987 | "execution_count": null, 1988 | "metadata": {}, 1989 | "outputs": [], 1990 | "source": [ 1991 | "from sklearn.linear_model import LinearRegression\n", 1992 | "lr_elev = LinearRegression()\n", 1993 | "lr_elev.fit(X_elev, y_elev)\n", 1994 | "lr_elev.score(X_elev, y_elev)" 1995 | ] 1996 | }, 1997 | { 1998 | "cell_type": "code", 1999 | "execution_count": null, 2000 | "metadata": {}, 2001 | "outputs": [], 2002 | "source": [ 2003 | "# loop over depths and plot the results\n", 2004 | "scores = []\n", 2005 | "for i in range(1, 20):\n", 2006 | " dt = DecisionTreeRegressor(max_depth=i)\n", 2007 | " dt.fit(X_elev, y_elev)\n", 2008 | " scores.append(dt.score(X_elev, y_elev))\n", 2009 | "\n", 2010 | "pd.Series(scores).plot.line(figsize=(8, 6))" 2011 | ] 2012 | }, 2013 | { 2014 | "cell_type": "code", 2015 | "execution_count": null, 2016 | "metadata": {}, 2017 | "outputs": [], 2018 | "source": [ 2019 | "# split the data and plot results of train and test\n", 2020 | "from sklearn.model_selection import train_test_split\n", 2021 | "X_train, X_test, y_train, y_test = train_test_split(X_elev, y_elev, random_state=42)\n", 2022 | "test_scores = []\n", 2023 | "train_scores = []\n", 2024 | "for i in range(1, 20):\n", 2025 | " dt = DecisionTreeRegressor(max_depth=i)\n", 2026 | " dt.fit(X_train, y_train)\n", 2027 | " test_scores.append(dt.score(X_test, y_test))\n", 2028 | " train_scores.append(dt.score(X_train, y_train))\n", 2029 | "\n", 2030 | "ax = pd.DataFrame({'train': train_scores, 'test': test_scores}).plot.line(figsize=(8, 6))\n", 2031 | "\n", 2032 | "# annotate overfitting at 10, .7\n", 2033 | "ax.annotate('Overfitting after here', xy=(10, .7), xytext=(12, .5), arrowprops={'arrowstyle': '->'})\n", 2034 | "\n", 2035 | "# set title\n", 2036 | "ax.set_title('Validation Curve for Decision Tree')\n" 2037 | ] 2038 | }, 2039 | { 2040 | "cell_type": "code", 2041 | "execution_count": null, 2042 | "metadata": {}, 2043 | "outputs": [], 2044 | "source": [ 2045 | "# Let's see if our model improves with a deeper tree\n", 2046 | "dt_elev = DecisionTreeRegressor(max_depth=11)\n", 2047 | "dt_elev.fit(X_train, y_train)\n", 2048 | "dt_elev.score(X_test, y_test)" 2049 | ] 2050 | }, 2051 | { 2052 | "cell_type": "code", 2053 | "execution_count": null, 2054 | "metadata": {}, 2055 | "outputs": [], 2056 | "source": [ 2057 | "lr_elev = LinearRegression()\n", 2058 | "lr_elev.fit(X_train, y_train)\n", 2059 | "lr_elev.score(X_test, y_test)" 2060 | ] 2061 | }, 2062 | { 2063 | "cell_type": "code", 2064 | "execution_count": null, 2065 | "metadata": {}, 2066 | "outputs": [], 2067 | "source": [] 2068 | }, 2069 | { 2070 | "cell_type": "markdown", 2071 | "metadata": {}, 2072 | "source": [ 2073 | "## Random Forests and XGBoost" 2074 | ] 2075 | }, 2076 | { 2077 | "cell_type": "code", 2078 | "execution_count": null, 2079 | "metadata": {}, 2080 | "outputs": [], 2081 | "source": [ 2082 | "# create a random forest regressor\n", 2083 | "from sklearn.ensemble import RandomForestRegressor\n", 2084 | "rf = 
RandomForestRegressor(n_estimators=100, max_depth=3)\n", 2085 | "rf.fit(X_train, y_train)\n", 2086 | "rf.score(X_test, y_test)\n" 2087 | ] 2088 | }, 2089 | { 2090 | "cell_type": "code", 2091 | "execution_count": null, 2092 | "metadata": {}, 2093 | "outputs": [], 2094 | "source": [ 2095 | "# sweep over depths and plot results\n", 2096 | "test_scores = []\n", 2097 | "train_scores = []\n", 2098 | "for i in range(1, 20):\n", 2099 | " rf = RandomForestRegressor(n_estimators=100, max_depth=i)\n", 2100 | " rf.fit(X_train, y_train)\n", 2101 | " test_scores.append(rf.score(X_test, y_test))\n", 2102 | " train_scores.append(rf.score(X_train, y_train))\n", 2103 | "\n", 2104 | "ax = pd.DataFrame({'train': train_scores, 'test': test_scores}).plot.line(figsize=(8, 6))" 2105 | ] 2106 | }, 2107 | { 2108 | "cell_type": "code", 2109 | "execution_count": null, 2110 | "metadata": {}, 2111 | "outputs": [], 2112 | "source": [ 2113 | "# create a random forest regressor\n", 2114 | "from sklearn.ensemble import RandomForestRegressor\n", 2115 | "rf = RandomForestRegressor(n_estimators=100, max_depth=13, random_state=42)\n", 2116 | "rf.fit(X_train, y_train)\n", 2117 | "rf.score(X_test, y_test)\n" 2118 | ] 2119 | }, 2120 | { 2121 | "cell_type": "code", 2122 | "execution_count": null, 2123 | "metadata": {}, 2124 | "outputs": [], 2125 | "source": [ 2126 | "# create an xgb regressor\n", 2127 | "from xgboost import XGBRegressor\n", 2128 | "xgb = XGBRegressor()\n", 2129 | "xgb.fit(X_train, y_train)\n", 2130 | "xgb.score(X_test, y_test)" 2131 | ] 2132 | }, 2133 | { 2134 | "cell_type": "code", 2135 | "execution_count": null, 2136 | "metadata": {}, 2137 | "outputs": [], 2138 | "source": [] 2139 | }, 2140 | { 2141 | "cell_type": "code", 2142 | "execution_count": null, 2143 | "metadata": {}, 2144 | "outputs": [], 2145 | "source": [] 2146 | }, 2147 | { 2148 | "cell_type": "code", 2149 | "execution_count": null, 2150 | "metadata": {}, 2151 | "outputs": [], 2152 | "source": [] 2153 | }, 2154 | { 2155 | "cell_type": "markdown", 2156 | "metadata": {}, 2157 | "source": [ 2158 | "## Challenge: Decision Trees\n", 2159 | "\n", 2160 | "Create a decision tree to predict survival on the titanic. See if you can determine the optimal depth of the tree." 2161 | ] 2162 | }, 2163 | { 2164 | "cell_type": "markdown", 2165 | "metadata": {}, 2166 | "source": [ 2167 | "## Solution: Decision Trees" 2168 | ] 2169 | }, 2170 | { 2171 | "cell_type": "code", 2172 | "execution_count": null, 2173 | "metadata": {}, 2174 | "outputs": [], 2175 | "source": [] 2176 | }, 2177 | { 2178 | "cell_type": "code", 2179 | "execution_count": null, 2180 | "metadata": {}, 2181 | "outputs": [], 2182 | "source": [] 2183 | }, 2184 | { 2185 | "cell_type": "code", 2186 | "execution_count": null, 2187 | "metadata": {}, 2188 | "outputs": [], 2189 | "source": [] 2190 | }, 2191 | { 2192 | "cell_type": "code", 2193 | "execution_count": null, 2194 | "metadata": {}, 2195 | "outputs": [], 2196 | "source": [] 2197 | }, 2198 | { 2199 | "cell_type": "code", 2200 | "execution_count": null, 2201 | "metadata": {}, 2202 | "outputs": [], 2203 | "source": [] 2204 | }, 2205 | { 2206 | "cell_type": "markdown", 2207 | "metadata": {}, 2208 | "source": [ 2209 | "# Conclusion - Next Steps\n", 2210 | "\n", 2211 | "- Practice, practice, practice! - I recommend using your own data to practice.\n", 2212 | "- Check out my Feature Engineering course.\n", 2213 | "- Check out my XGBoost course." 
2214 | ] 2215 | }, 2216 | { 2217 | "cell_type": "markdown", 2218 | "metadata": {}, 2219 | "source": [] 2220 | } 2221 | ], 2222 | "metadata": { 2223 | "kernelspec": { 2224 | "display_name": "Python 3 (ipykernel)", 2225 | "language": "python", 2226 | "name": "python3" 2227 | }, 2228 | "language_info": { 2229 | "codemirror_mode": { 2230 | "name": "ipython", 2231 | "version": 3 2232 | }, 2233 | "file_extension": ".py", 2234 | "mimetype": "text/x-python", 2235 | "name": "python", 2236 | "nbconvert_exporter": "python", 2237 | "pygments_lexer": "ipython3", 2238 | "version": "3.10.13" 2239 | } 2240 | }, 2241 | "nbformat": 4, 2242 | "nbformat_minor": 4 2243 | } 2244 | -------------------------------------------------------------------------------- /soln.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning Algorithms\n", 8 | "\n", 9 | "This notebook provides an overview of the most common machine learning algorithms.\n", 10 | "\n", 11 | "You should feel comfortable with:\n", 12 | "\n", 13 | "- Jupyter Notebooks\n", 14 | "- Python\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Clustering" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## K-means clustering\n", 43 | " \n", 44 | "K-means clustering is an unsupervised learning algorithm that partitions the data into $k$ clusters by assigning each data point to the cluster with the nearest centroid. \n", 45 | "\n", 46 | "The algorithm is as follows:\n", 47 | "\n", 48 | "- Choose the number of clusters $k$ and a distance metric.\n", 49 | "- Initialize the $k$ cluster centroids.\n", 50 | "- Assign each sample to the cluster with the nearest centroid.\n", 51 | "- Update each centroid to the mean of the samples assigned to it.\n", 52 | "- Repeat the assignment and update steps above until convergence.\n", 53 | "\n", 54 | "Note that because this algorithm takes distance into account, it is important that the features (columns) are on the same scale. For the iris dataset, they are, but for other datasets, you may need to scale the features. A minimal numpy sketch of the assign/update loop is in the next cell."
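]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Not in the original notebook: a minimal, self-contained numpy sketch of\n",
    "# the assign/update loop described above (ignoring empty-cluster edge cases).\n",
    "import numpy as np\n",
    "rng = np.random.default_rng(42)\n",
    "pts = rng.normal(size=(30, 2))  # toy 2-D data\n",
    "centroids = pts[rng.choice(len(pts), size=3, replace=False)]  # random init\n",
    "for _ in range(5):  # a few iterations toward convergence\n",
    "    dists = ((pts[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)\n",
    "    labels = dists.argmin(axis=1)  # assignment step\n",
    "    centroids = np.array([pts[labels == k].mean(axis=0) for k in range(3)])  # update step\n",
    "centroids"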
55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false, 62 | "jupyter": { 63 | "outputs_hidden": false 64 | } 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.cluster import KMeans\n", 69 | "import sklearn.datasets\n", 70 | "import matplotlib.pyplot as plt\n", 71 | "\n", 72 | "dataset = sklearn.datasets.load_iris(as_frame=True)\n", 73 | "\n", 74 | "X = (dataset\n", 75 | " ['data']\n", 76 | " .loc[:, ['sepal length (cm)', 'sepal width (cm)']]\n", 77 | ")\n", 78 | "y = dataset['target']\n", 79 | "\n", 80 | "# demonstration of k-means clustering with iris dataset\n", 81 | "# keep list of all centroids\n", 82 | "centroids = []\n", 83 | "\n", 84 | "# loop over a few iterations and plot the results\n", 85 | "for i in range(10):\n", 86 | " model = KMeans(n_clusters=3, init='random', n_init=1,\n", 87 | " max_iter=i+1, random_state=42)\n", 88 | " model.fit(X)\n", 89 | " label = model.predict(X)\n", 90 | " # plot the input data color by cluster\n", 91 | " fig, ax = plt.subplots(figsize=(8, 6))\n", 92 | " X.plot.scatter(x='sepal length (cm)', y='sepal width (cm)', c=label, cmap='viridis', ax=ax)\n", 93 | " # plot the centers\n", 94 | " centers = model.cluster_centers_\n", 95 | " ax.scatter(centers[:, 0], centers[:, 1], marker='*', s=250, color='r', alpha=.5)\n", 96 | " ax.set_title('Iteration: ' + str(i))\n", 97 | " # plot previous centroids with reduced alpha value\n", 98 | " if i > 0:\n", 99 | " for centroid in centroids:\n", 100 | " ax.scatter(centroid[:, 0], centroid[:, 1], marker='*', s=250, color='r', alpha=.1)\n", 101 | " # save the current centroids\n", 102 | " centroids.append(model.cluster_centers_)\n", 103 | " \n", 104 | " \n", 105 | " # plot the original data color by target\n", 106 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 107 | "X.plot.scatter(x='sepal length (cm)', y='sepal width (cm)', c=y, cmap='viridis', ax=ax)\n", 108 | "ax.set_title('Original Data')\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## K Evaluation for K-means\n", 137 | "\n", 138 | "We specify the K value for K-means clustering. However, we do not know the best K value for a given dataset. \n", 139 | "\n", 140 | "There are a few methods to evaluate the K value for K-means clustering:\n", 141 | "\n", 142 | "- Elbow method - Track the \"inertia\" of the model as the K value increases. The inertia is the sum of squared distances of samples to their closest cluster center. \n", 143 | "- Silhouette coefficient - The silhouette coefficient is a measure of how similar an object is to its own cluster compared to other clusters. The silhouette coefficient ranges from -1 to 1. 
" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Run elbow method to find optimal number of clusters\n", 153 | "\n", 154 | "inertias = []\n", 155 | "max_clusters = 20\n", 156 | "for i in range(max_clusters):\n", 157 | " km = KMeans(n_clusters=i+1, n_init='auto',\n", 158 | " max_iter=300, random_state=42)\n", 159 | " km.fit(X)\n", 160 | " inertias.append(km.inertia_)\n", 161 | "\n", 162 | "# plot the results\n", 163 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 164 | "ax.plot(range(1, max_clusters+1), inertias, marker='o')\n", 165 | "ax.set_xlabel('Number of clusters')\n", 166 | "ax.set_ylabel('Inertia')\n", 167 | "ax.set_title('Elbow Method')\n" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "!pip install yellowbrick" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "from yellowbrick.cluster import SilhouetteVisualizer\n", 186 | "SilhouetteVisualizer?" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "# list matplotlib fonts available\n", 196 | "import matplotlib.font_manager\n", 197 | "matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# if on linux, set matplotlib font to DejaVuSans\n", 207 | "# get rid of warnings in next cell\n", 208 | "import matplotlib\n", 209 | "matplotlib.rcParams['font.family'] = 'DejaVu Sans'\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# Run silhouette method to find optimal number of clusters\n", 219 | "from yellowbrick.cluster import silhouette_visualizer \n", 220 | "\n", 221 | "max_clusters = 6\n", 222 | "for i in range(max_clusters):\n", 223 | " km = KMeans(n_clusters=i+2, n_init='auto',\n", 224 | " max_iter=300, random_state=42)\n", 225 | "\n", 226 | " fig, ax = plt.subplots(figsize=(8, 6))\n", 227 | " # setting show=False so we can set xlim to same value for all plots\n", 228 | " viz = silhouette_visualizer(km, X, colors='yellowbrick', ax=ax, show=False)\n", 229 | " ax.set_xlim([-0.1, .8]) \n", 230 | "\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Understanding Clustering Results\n", 259 | "\n", 260 | "We can use the following methods to understand the clustering results:\n", 261 | "\n", 262 | "- Create a surrogate model to predict the cluster label for a given sample.\n", 263 | "- Summarize the cluster by the mean of each feature.\n", 264 | "- Visualize the clustering results in 2D or 3D." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "!pip install datasets" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# going to cluster electricity data from Australia\n", 283 | "# https://www.openml.org/search?type=data&sort=runs&id=151&status=active\n", 284 | "from datasets import load_dataset\n", 285 | "electricity = load_dataset('inria-soda/tabular-benchmark', data_files='clf_num/electricity.csv')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "electricity" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "print(dir(electricity['train']))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "elec = electricity['train'].to_pandas()\n", 313 | "elec" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "X = elec.drop(columns=['class'])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# get inertias for different numbers of clusters\n", 332 | "inertias = []\n", 333 | "max_clusters = 20\n", 334 | "for i in range(max_clusters):\n", 335 | " km = KMeans(n_clusters=i+1, n_init='auto',\n", 336 | " max_iter=300, random_state=42)\n", 337 | " km.fit(X)\n", 338 | " inertias.append(km.inertia_)\n", 339 | "\n", 340 | "# plot the results\n", 341 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 342 | "ax.plot(range(1, max_clusters+1), inertias, marker='o')" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "# Takes 2min 20s w/o sampling\n", 352 | "# get silhouette scores for different numbers of clusters\n", 353 | "from yellowbrick.cluster import silhouette_visualizer\n", 354 | "\n", 355 | "max_clusters = 6\n", 356 | "for i in range(max_clusters):\n", 357 | " km = KMeans(n_clusters=i+2, n_init='auto',\n", 358 | " max_iter=300, random_state=42)\n", 359 | "\n", 360 | " fig, ax = plt.subplots(figsize=(8, 6))\n", 361 | " # setting show=False so we can set xlim to same value for all plots\n", 362 | " viz = silhouette_visualizer(km, X.sample(1_000, random_state=42), colors='yellowbrick', ax=ax, show=False)\n", 363 | " ax.set_xlim([-0.1, .8])" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "# going to choose 5 clusters (if 5, 6, or 7 are all close, choose the simpler model)\n", 373 | "# summarize the results by group\n", 374 | "\n", 375 | "km = KMeans(n_clusters=5, n_init='auto',\n", 376 | " max_iter=300, random_state=42)\n", 377 | "km.fit(X)\n", 378 | "label = km.predict(X)\n", 379 | "(elec\n", 380 | " .assign(cluster=label)\n", 381 | " .groupby('cluster')\n", 382 | " .agg('mean', numeric_only=True)\n", 383 | " .T\n", 384 | " .style\n", 385 | " .background_gradient(cmap='RdBu', axis='columns')\n", 386 | ")" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | 
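"outputs": [],
   "source": [
    "# Not in the original notebook: a quick look at how many rows land in\n",
    "# each cluster, to go with the per-cluster means above.\n",
    "elec.assign(cluster=label)['cluster'].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},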
"outputs": [], 394 | "source": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "# Summarize by surrogate model decision tree\n", 403 | "from sklearn.tree import DecisionTreeClassifier\n", 404 | "\n", 405 | "dt = DecisionTreeClassifier(max_depth=3, random_state=42)\n", 406 | "dt.fit(X, label)\n", 407 | "# plot the tree\n", 408 | "from sklearn.tree import plot_tree\n", 409 | "fig, ax = plt.subplots(figsize=(14, 8))\n", 410 | "# make string for class names\n", 411 | "class_names = [str(i) for i in range(0, 5)]\n", 412 | "_ = plot_tree(dt, ax=ax, feature_names=X.columns, class_names=class_names, filled=True, fontsize=10)\n" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Other Clustering Algorithms\n", 441 | "\n", 442 | "- Hierarchical clustering - This clusters by distance. It first clusters the two closest points, then clusters the next closest point to the first cluster, and so on. It is not as efficient as k-means, but it does not require you to specify the number of clusters. It is also more robust to outliers than k-means.\n", 443 | "- DBSCAN - This clusters by density. It finds the densest region of points and clusters them together. It is also more robust to outliers than k-means.\n" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "## Challenge: K-means Clustering\n", 465 | "\n", 466 | "With the Titanic dataset:\n", 467 | "- drop the missing values\n", 468 | "- drop the categorical features\n", 469 | "- scale the features\n", 470 | "- run K-means clustering\n", 471 | "- use the elbow method to find the best K value" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "## Solution: K-means Clustering\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "import pandas as pd\n", 488 | "url = 'https://github.com/mattharrison/datasets/raw/master/data/titanic3.xls'\n", 489 | "raw = pd.read_excel(url)\n", 490 | "raw" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "raw.isna().sum()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "raw.dtypes" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "raw.columns" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "def 
tweak_titanic(df):\n", 527 | " return (df \n", 528 | " .loc[:, ['pclass', 'survived', 'age', 'sibsp', 'parch',\n", 529 | " 'fare']]\n", 530 | " .dropna()\n", 531 | " )\n", 532 | "\n", 533 | "tweak_titanic(raw)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "# standardize the data\n", 543 | "from sklearn.preprocessing import StandardScaler \n", 544 | "# have sklearn output pandas\n", 545 | "from sklearn import set_config\n", 546 | "set_config(transform_output='pandas')\n", 547 | "scaler = StandardScaler()\n", 548 | "X = scaler.fit_transform(tweak_titanic(raw))\n", 549 | "X" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "inertias = []\n", 559 | "for i in range(20):\n", 560 | " km = KMeans(n_clusters=i+1, n_init=1,\n", 561 | " max_iter=300, random_state=42)\n", 562 | " km.fit(X)\n", 563 | " inertias.append(km.inertia_)\n", 564 | "\n", 565 | "# plot the results\n", 566 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 567 | "ax.plot(range(1, 21), inertias, marker='o')\n" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "# PCA" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "## Intro to PCA\n", 603 | "\n", 604 | "Principal component analysis (PCA) is a dimensionality reduction technique. 
It finds the linear combinations of the features that explain the most variance in the data.\n" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "from sklearn.cluster import KMeans\n", 614 | "import sklearn.datasets\n", 615 | "import matplotlib.pyplot as plt\n", 616 | "\n", 617 | "dataset = sklearn.datasets.load_iris(as_frame=True)\n", 618 | "\n", 619 | "X = (dataset\n", 620 | " ['data']\n", 621 | " .loc[:, ['sepal length (cm)', 'sepal width (cm)']]\n", 622 | ")\n", 623 | "y = dataset['target']" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "X" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "X_pca" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "# Run PCA on the first two components of the Iris dataset\n", 651 | "from sklearn.decomposition import PCA\n", 652 | "pca = PCA(n_components=2)\n", 653 | "pca.fit(X)\n", 654 | "X_pca = pca.transform(X)\n", 655 | "\n", 656 | "# plot the results\n", 657 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 658 | "ax.scatter(X_pca['pca0'], X_pca['pca1'], c=y, cmap='viridis')\n", 659 | "ax.set_xlabel('PC1')\n", 660 | "ax.set_ylabel('PC2')\n", 661 | "ax.set_title('PCA on Iris Data')\n" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "X" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": {}, 677 | "outputs": [], 678 | "source": [ 679 | "# Run PCA on all of the Iris dataset\n", 680 | "pca = PCA()\n", 681 | "\n", 682 | "X_all = (dataset\n", 683 | " ['data']\n", 684 | " #.loc[:, ['sepal length (cm)', 'sepal width (cm)']]\n", 685 | ")\n", 686 | "pca.fit(X_all)\n", 687 | "X_pca = pca.transform(X_all)\n", 688 | "\n", 689 | "# Plot to first 2 components\n", 690 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 691 | "ax.scatter(X_pca['pca0'], X_pca['pca1'], c=y, cmap='viridis')\n", 692 | "ax.set_xlabel('PC1')\n", 693 | "ax.set_ylabel('PC2')\n", 694 | "ax.set_title('PCA on Iris Data')" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "## Components of PCA\n", 723 | "\n", 724 | "When we run PCA, we get back the following:\n", 725 | "\n", 726 | "- Principal components (PCs): The PCs are the linear combinations of the features. 
\n", 727 | "- Explained variance ratio: The explained variance ratio tells us how much variance is explained by each PC.\n", 728 | "- Feature weights: The feature weights tell us how much each feature contributes to each PC.\n" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "# tell sklearn to output pandas dataframes\n", 738 | "sklearn.set_config(transform_output='pandas')\n", 739 | "\n", 740 | "pca.transform(X_all)" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "# I don't like the column names, so I'll rename them\n", 750 | "# change pca0 to PC1, pca1 to PC2, etc\n", 751 | "\n", 752 | "def rename_pc0_to_PC1(col):\n", 753 | " num = int(col[3:]) + 1\n", 754 | " return 'PC' + str(num)\n", 755 | "\n", 756 | "pca.transform(X_all).rename(columns=rename_pc0_to_PC1)\n" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": null, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [ 765 | "# Scree plot of explained variance\n", 766 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 767 | "ax.plot(range(1, len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_, marker='o')\n", 768 | "ax.set_xlabel('Principal Component')\n", 769 | "ax.set_ylabel('Explained Variance Ratio')\n", 770 | "ax.set_title('Scree Plot')" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [ 779 | "# The explained variance ratio is the percentage of variance explained by each of the selected components.\n", 780 | "# The first principal component explains 92.5% of the variance in the data, and \n", 781 | "# the second principal component explains 5.3% of the variance in the data.\n", 782 | "pca.explained_variance_ratio_" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": null, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [ 791 | "pca.explained_variance_ratio_.cumsum()" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [] 814 | }, 815 | { 816 | "cell_type": "markdown", 817 | "metadata": {}, 818 | "source": [ 819 | "## Weights\n", 820 | "\n", 821 | "For every PC, we get a set of weights for each feature. The weights tell us how much each feature contributes to the PC." 
822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": null, 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "# convert components to a dataframe\n", 831 | "import pandas as pd\n", 832 | "\n", 833 | "components = pd.DataFrame(pca.components_, columns=X_all.columns,\n", 834 | " index=['PC1', 'PC2', 'PC3', 'PC4'])\n", 835 | " \n", 836 | "components\n" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "# Centered data - for next cell's calculation\n", 846 | "X_all - X_all.mean()" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": {}, 853 | "outputs": [], 854 | "source": [ 855 | "# calculating PC1 by hand for first row - linear combination of \n", 856 | "# centered variables and the first component \n", 857 | "-.743333 * .3613 + 0.4426 * -0.0845 + -2.358 * 0.8566 + -0.9993 * 0.3582" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": {}, 864 | "outputs": [], 865 | "source": [ 866 | "pca.transform(X_all).rename(columns=rename_pc0_to_PC1)" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": null, 872 | "metadata": {}, 873 | "outputs": [], 874 | "source": [ 875 | "# Manually calculating PCA with numpy\n", 876 | "import numpy as np\n", 877 | "nums = X_all - X_all.mean()\n", 878 | "vals, vecs = np.linalg.eig(nums.cov())\n", 879 | "idxs = pd.Series(vals).argsort()\n", 880 | "\n", 881 | "explained_variance = pd.Series(sorted(vals, reverse=True))\n", 882 | "\n", 883 | "def set_columns(df_):\n", 884 | " df_.columns = [f'PC{i+1}' for i in range(len(df_.columns))]\n", 885 | " return df_\n", 886 | "\n", 887 | "comps = (pd.DataFrame(vecs, index=nums.columns)\n", 888 | " .iloc[:, idxs[::-1]]\n", 889 | " .pipe(set_columns)\n", 890 | ")\n", 891 | "\n", 892 | "pcas = (nums.dot(comps))\n", 893 | "pcas" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": {}, 906 | "source": [ 907 | "## Scatter Plot" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": null, 913 | "metadata": {}, 914 | "outputs": [], 915 | "source": [ 916 | "# Use plotly to plot the first three components\n", 917 | "import plotly.express as px\n", 918 | "fig = px.scatter_3d(pcas, x='PC1', y='PC2', z='PC3', color=y)\n", 919 | "fig.show()\n" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": {}, 926 | "outputs": [], 927 | "source": [] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": null, 932 | "metadata": {}, 933 | "outputs": [], 934 | "source": [] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": null, 939 | "metadata": {}, 940 | "outputs": [], 941 | "source": [] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": null, 946 | "metadata": {}, 947 | "outputs": [], 948 | "source": [] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": null, 953 | "metadata": {}, 954 | "outputs": [], 955 | "source": [] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": null, 960 | "metadata": {}, 961 | "outputs": [], 962 | "source": [] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "## Other Dimensionality Reduction Techniques\n", 969 | "\n", 
970 | "- t-SNE - t-distributed stochastic neighbor embedding. Tries to preserve the local structure of the data.\n", 971 | "- UMAP - Uniform Manifold Approximation and Projection. Tries to preserve the both global and structure of the data.\n", 972 | "- Autoencoders\n" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": null, 978 | "metadata": {}, 979 | "outputs": [], 980 | "source": [ 981 | "# run t-SNE on the Iris dataset\n", 982 | "from sklearn.manifold import TSNE\n", 983 | "tsne = TSNE(n_components=3, random_state=42)\n", 984 | "X_tsne = tsne.fit_transform(X_all)\n" 985 | ] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "execution_count": null, 990 | "metadata": {}, 991 | "outputs": [], 992 | "source": [ 993 | "X_tsne" 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": null, 999 | "metadata": {}, 1000 | "outputs": [], 1001 | "source": [ 1002 | "# plot results with plotly\n", 1003 | "import plotly.express as px\n", 1004 | "fig = px.scatter_3d(X_tsne, x='tsne0', y='tsne1', z='tsne2', color=y)\n", 1005 | "fig.show()\n" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "execution_count": null, 1011 | "metadata": {}, 1012 | "outputs": [], 1013 | "source": [ 1014 | "!pip install umap-learn" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": null, 1020 | "metadata": {}, 1021 | "outputs": [], 1022 | "source": [ 1023 | "# run UMAP\n", 1024 | "import umap\n", 1025 | "reducer = umap.UMAP(random_state=42, n_components=3)\n", 1026 | "X_umap = pd.DataFrame(reducer.fit_transform(X_all), columns=['umap0', 'umap1', 'umap2'])\n", 1027 | "X_umap" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": null, 1033 | "metadata": {}, 1034 | "outputs": [], 1035 | "source": [ 1036 | "# plot results with plotly\n", 1037 | "import plotly.express as px\n", 1038 | "fig = px.scatter_3d(X_umap, x='umap0', y='umap1', z='umap2', color=y)\n", 1039 | "fig.show()" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "metadata": {}, 1046 | "outputs": [], 1047 | "source": [] 1048 | }, 1049 | { 1050 | "cell_type": "code", 1051 | "execution_count": null, 1052 | "metadata": {}, 1053 | "outputs": [], 1054 | "source": [] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "metadata": {}, 1060 | "outputs": [], 1061 | "source": [] 1062 | }, 1063 | { 1064 | "cell_type": "markdown", 1065 | "metadata": {}, 1066 | "source": [ 1067 | "## Challenge: PCA\n", 1068 | "\n", 1069 | "Run PCA on the numeric columns of the Titanic data. Plot the result of the first three dimensons using plotly." 
1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "execution_count": null, 1075 | "metadata": {}, 1076 | "outputs": [], 1077 | "source": [] 1078 | }, 1079 | { 1080 | "cell_type": "markdown", 1081 | "metadata": {}, 1082 | "source": [ 1083 | "Solution: PCA" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": null, 1089 | "metadata": {}, 1090 | "outputs": [], 1091 | "source": [ 1092 | "tweak_titanic(raw)" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": null, 1098 | "metadata": {}, 1099 | "outputs": [], 1100 | "source": [ 1101 | "# standardize the data\n", 1102 | "from sklearn.preprocessing import StandardScaler\n", 1103 | "scaler = StandardScaler()\n", 1104 | "\n", 1105 | "X = scaler.fit_transform(tweak_titanic(raw).drop(columns='survived'))\n", 1106 | "X" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "execution_count": null, 1112 | "metadata": {}, 1113 | "outputs": [], 1114 | "source": [ 1115 | "# run pca on the data\n", 1116 | "pca = PCA()\n", 1117 | "\n", 1118 | "X_pca = pca.fit_transform(X) " 1119 | ] 1120 | }, 1121 | { 1122 | "cell_type": "code", 1123 | "execution_count": null, 1124 | "metadata": {}, 1125 | "outputs": [], 1126 | "source": [ 1127 | "X_pca" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "code", 1132 | "execution_count": null, 1133 | "metadata": {}, 1134 | "outputs": [], 1135 | "source": [] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": null, 1140 | "metadata": {}, 1141 | "outputs": [], 1142 | "source": [ 1143 | "# plot the first 3 columns in plotly\n", 1144 | "import plotly.express as px\n", 1145 | "\n", 1146 | "fig = px.scatter_3d(X_pca, x='pca0', y='pca1', z='pca2', color=raw.loc[X_pca.index]['survived'])\n", 1147 | "fig.show()\n" 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": null, 1153 | "metadata": {}, 1154 | "outputs": [], 1155 | "source": [] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": null, 1160 | "metadata": {}, 1161 | "outputs": [], 1162 | "source": [] 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "execution_count": null, 1167 | "metadata": {}, 1168 | "outputs": [], 1169 | "source": [] 1170 | }, 1171 | { 1172 | "cell_type": "markdown", 1173 | "metadata": {}, 1174 | "source": [ 1175 | "# Linear Regression" 1176 | ] 1177 | }, 1178 | { 1179 | "cell_type": "code", 1180 | "execution_count": null, 1181 | "metadata": {}, 1182 | "outputs": [], 1183 | "source": [] 1184 | }, 1185 | { 1186 | "cell_type": "markdown", 1187 | "metadata": {}, 1188 | "source": [ 1189 | "## Linear Regression Algorithm\n", 1190 | "\n", 1191 | "Linear regression calculates an intercept and slope (weights) for a line that minimizes the sum of squared errors between the line and the data points.\n", 1192 | "\n", 1193 | "The formula for linear regression is as follows:\n", 1194 | "\n", 1195 | "$$y = \\beta_0 + \\beta_1 x_1 + \\beta_2 x_2 + ... 
+ \\beta_n x_n$$\n", 1196 | "\n", 1197 | "where $y$ is the target variable, $\\beta_0$ is the intercept, $\\beta_1$ to $\\beta_n$ are the weights, and $x_1$ to $x_n$ are the features.\n", 1198 | "\n", 1199 | "The algorithm is as follows:\n", 1200 | "\n", 1201 | "- Initialize the weights.\n", 1202 | "- Calculate the predicted values.\n", 1203 | "- Calculate the error.\n", 1204 | "- Update the weights.\n", 1205 | "- Repeat the steps above until convergence.\n", 1206 | "\n" 1207 | ] 1208 | }, 1209 | { 1210 | "cell_type": "code", 1211 | "execution_count": null, 1212 | "metadata": {}, 1213 | "outputs": [], 1214 | "source": [ 1215 | "# Load anscombe's quartet\n", 1216 | "x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]\n", 1217 | "y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]\n", 1218 | "y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]\n", 1219 | "y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]\n", 1220 | "x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]\n", 1221 | "y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]\n", 1222 | "anscombe = (pd.DataFrame({'x': x, 'y1': y1, 'y2': y2, 'y3': y3, 'x4': x4, 'y4': y4})\n", 1223 | " )\n", 1224 | "\n", 1225 | "anscombe" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": null, 1231 | "metadata": {}, 1232 | "outputs": [], 1233 | "source": [ 1234 | "# plot x y1\n", 1235 | "fig, ax = plt.subplots(figsize=(3, 3))\n", 1236 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')" 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "markdown", 1241 | "metadata": {}, 1242 | "source": [ 1243 | "Let's run the algorithm on x and y1\n", 1244 | "\n", 1245 | "Calculate the slope:\n", 1246 | "\n", 1247 | "$$\\beta_1 = \\frac{\\sum_{i=1}^{n} (x_i - \\bar{x})(y_i - \\bar{y})}{\\sum_{i=1}^{n} (x_i - \\bar{x})^2}$$\n", 1248 | "\n", 1249 | "Calculate the intercept:\n", 1250 | "\n", 1251 | "$$\\beta_0 = \\bar{y} - \\beta_1 \\bar{x}$$\n", 1252 | "\n", 1253 | "Model Equation:\n", 1254 | "\n", 1255 | "$$y = \\beta_0 + \\beta_1 x$$" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "execution_count": null, 1261 | "metadata": {}, 1262 | "outputs": [], 1263 | "source": [ 1264 | "# slope\n", 1265 | "\n", 1266 | "x1 = anscombe['x']\n", 1267 | "y1 = anscombe['y1']\n", 1268 | "slope = ((x1 - x1.mean())*(y1 - y1.mean())).sum() / ((x1 - x1.mean())**2).sum()\n", 1269 | "slope" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": null, 1275 | "metadata": {}, 1276 | "outputs": [], 1277 | "source": [ 1278 | "# intercept\n", 1279 | "\n", 1280 | "intercept = y1.mean() - slope * x1.mean()\n", 1281 | "intercept" 1282 | ] 1283 | }, 1284 | { 1285 | "cell_type": "code", 1286 | "execution_count": null, 1287 | "metadata": {}, 1288 | "outputs": [], 1289 | "source": [ 1290 | "# plot x y1\n", 1291 | "fig, ax = plt.subplots(figsize=(3, 3))\n", 1292 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 1293 | "# plot the line\n", 1294 | "x1 = np.linspace(4, 14, 100)\n", 1295 | "y1 = slope * x1 + intercept\n", 1296 | "ax.plot(x1, y1, color='r')" 1297 | ] 1298 | }, 1299 | { 1300 | "cell_type": "code", 1301 | "execution_count": null, 1302 | "metadata": {}, 1303 | "outputs": [], 1304 | "source": [] 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "execution_count": null, 1309 | "metadata": {}, 1310 | "outputs": [], 1311 | "source": [] 1312 | }, 1313 | { 1314 | "cell_type": "code", 1315 | "execution_count": null, 1316 | "metadata": {}, 1317 | 
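"outputs": [],
   "source": [
    "# Not in the original notebook: a cross-check - numpy's least-squares line\n",
    "# fit should recover the slope and intercept computed by hand above.\n",
    "import numpy as np\n",
    "np.polyfit(anscombe['x'], anscombe['y1'], 1)  # returns [slope, intercept]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},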
"outputs": [], 1318 | "source": [] 1319 | }, 1320 | { 1321 | "cell_type": "markdown", 1322 | "metadata": {}, 1323 | "source": [ 1324 | "## Examples with Scikit-learn" 1325 | ] 1326 | }, 1327 | { 1328 | "cell_type": "code", 1329 | "execution_count": null, 1330 | "metadata": {}, 1331 | "outputs": [], 1332 | "source": [ 1333 | "from sklearn.linear_model import LinearRegression\n", 1334 | "\n", 1335 | "x1 = anscombe[['x']]\n", 1336 | "y1 = anscombe['y1']\n", 1337 | "y2 = anscombe['y2']\n", 1338 | "y3 = anscombe['y3']\n", 1339 | "\n", 1340 | "lr1 = LinearRegression()\n", 1341 | "lr1.fit(x1, y1)\n", 1342 | "\n" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": null, 1348 | "metadata": {}, 1349 | "outputs": [], 1350 | "source": [ 1351 | "lr1.coef_" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "code", 1356 | "execution_count": null, 1357 | "metadata": {}, 1358 | "outputs": [], 1359 | "source": [ 1360 | "lr1.intercept_" 1361 | ] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "execution_count": null, 1366 | "metadata": {}, 1367 | "outputs": [], 1368 | "source": [ 1369 | "lr2 = LinearRegression()\n", 1370 | "lr2.fit(x1, y2)\n", 1371 | "lr3 = LinearRegression()\n", 1372 | "lr3.fit(x1, y3)" 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "code", 1377 | "execution_count": null, 1378 | "metadata": {}, 1379 | "outputs": [], 1380 | "source": [ 1381 | "# plot 1, 2 and 3 in different colors\n", 1382 | "fig, axs = plt.subplots(1, 3, figsize=(9, 3))\n", 1383 | "anscombe.plot.scatter(x='x', y='y1', ax=axs[0], color='k')\n", 1384 | "axs[0].plot(x1, lr1.predict(x1), color='#aaa')\n", 1385 | "axs[0].set_ylim(3, 13)\n", 1386 | "anscombe.plot.scatter(x='x', y='y2', ax=axs[1], color='b')\n", 1387 | "axs[1].plot(x1, lr2.predict(x1), color='#55a')\n", 1388 | "axs[1].set_ylim(3, 13)\n", 1389 | "anscombe.plot.scatter(x='x', y='y3', ax=axs[2], color='g')\n", 1390 | "axs[2].plot(x1, lr3.predict(x1), color='#5a5')\n", 1391 | "axs[2].set_ylim(3, 13)" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "code", 1396 | "execution_count": null, 1397 | "metadata": {}, 1398 | "outputs": [], 1399 | "source": [] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": null, 1404 | "metadata": {}, 1405 | "outputs": [], 1406 | "source": [] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": null, 1411 | "metadata": {}, 1412 | "outputs": [], 1413 | "source": [] 1414 | }, 1415 | { 1416 | "cell_type": "markdown", 1417 | "metadata": {}, 1418 | "source": [ 1419 | "## Real world example with Aircraft Elevators\n", 1420 | "\n", 1421 | "From website: This data set is also obtained from the task of controlling a F16 aircraft, although the target variable and attributes are different from the ailerons domain. 
In this case the goal variable is related to an action taken on the elevators of the aircraft.\n" 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "code", 1426 | "execution_count": null, 1427 | "metadata": {}, 1428 | "outputs": [], 1429 | "source": [ 1430 | "# https://www.openml.org/search?type=data&sort=runs&id=216&satatus=active \n", 1431 | "from datasets import load_dataset\n", 1432 | "elevators = load_dataset('inria-soda/tabular-benchmark', data_files='reg_num/elevators.csv')" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "code", 1437 | "execution_count": null, 1438 | "metadata": {}, 1439 | "outputs": [], 1440 | "source": [ 1441 | "elev = elevators['train'].to_pandas()\n", 1442 | "elev\n" 1443 | ] 1444 | }, 1445 | { 1446 | "cell_type": "code", 1447 | "execution_count": null, 1448 | "metadata": {}, 1449 | "outputs": [], 1450 | "source": [ 1451 | "X = elev.drop(columns=['Goal'])\n", 1452 | "y = elev['Goal']\n", 1453 | "\n", 1454 | "lr_elev = LinearRegression()\n", 1455 | "lr_elev.fit(X, y)" 1456 | ] 1457 | }, 1458 | { 1459 | "cell_type": "code", 1460 | "execution_count": null, 1461 | "metadata": {}, 1462 | "outputs": [], 1463 | "source": [ 1464 | "lr_elev.coef_\n" 1465 | ] 1466 | }, 1467 | { 1468 | "cell_type": "code", 1469 | "execution_count": null, 1470 | "metadata": {}, 1471 | "outputs": [], 1472 | "source": [ 1473 | "lr_elev.intercept_" 1474 | ] 1475 | }, 1476 | { 1477 | "cell_type": "code", 1478 | "execution_count": null, 1479 | "metadata": {}, 1480 | "outputs": [], 1481 | "source": [ 1482 | "pd.Series(lr_elev.coef_, index=X.columns).sort_values().plot.barh(figsize=(8, 6))" 1483 | ] 1484 | }, 1485 | { 1486 | "cell_type": "code", 1487 | "execution_count": null, 1488 | "metadata": {}, 1489 | "outputs": [], 1490 | "source": [ 1491 | "# score is R^2 - the proportion of variance explained by the model\n", 1492 | "lr_elev.score(X, y)" 1493 | ] 1494 | }, 1495 | { 1496 | "cell_type": "code", 1497 | "execution_count": null, 1498 | "metadata": {}, 1499 | "outputs": [], 1500 | "source": [ 1501 | "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", 1502 | "mean_absolute_error(y, lr_elev.predict(X)), mean_squared_error(y, lr_elev.predict(X))" 1503 | ] 1504 | }, 1505 | { 1506 | "cell_type": "code", 1507 | "execution_count": null, 1508 | "metadata": {}, 1509 | "outputs": [], 1510 | "source": [ 1511 | "lr_elev.predict(X.iloc[[0]])" 1512 | ] 1513 | }, 1514 | { 1515 | "cell_type": "code", 1516 | "execution_count": null, 1517 | "metadata": {}, 1518 | "outputs": [], 1519 | "source": [ 1520 | "y.iloc[0]" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": null, 1526 | "metadata": {}, 1527 | "outputs": [], 1528 | "source": [] 1529 | }, 1530 | { 1531 | "cell_type": "code", 1532 | "execution_count": null, 1533 | "metadata": {}, 1534 | "outputs": [], 1535 | "source": [] 1536 | }, 1537 | { 1538 | "cell_type": "code", 1539 | "execution_count": null, 1540 | "metadata": {}, 1541 | "outputs": [], 1542 | "source": [] 1543 | }, 1544 | { 1545 | "cell_type": "markdown", 1546 | "metadata": {}, 1547 | "source": [ 1548 | "## Assumptions of Linear Regression\n", 1549 | "\n", 1550 | "- Linear relationship between the features and target variable\n", 1551 | "- No multicollinearity - the features are not highly correlated with each other\n", 1552 | "- Homoscedasticity - the variance of the residuals is the same across all predicted values\n", 1553 | "- Normally distributed residuals, with no influential outliers\n", 1554 | "\n", 1555 | "Also, generally you will want to scale the features
before running linear regression." 1556 | ] 1557 | }, 1558 | { 1559 | "cell_type": "code", 1560 | "execution_count": null, 1561 | "metadata": {}, 1562 | "outputs": [], 1563 | "source": [ 1564 | "# standardize the data\n", 1565 | "from sklearn.preprocessing import StandardScaler\n", 1566 | "scaler = StandardScaler()\n", 1567 | "X_scaled = scaler.fit_transform(X)\n", 1568 | "X_scaled" 1569 | ] 1570 | }, 1571 | { 1572 | "cell_type": "code", 1573 | "execution_count": null, 1574 | "metadata": {}, 1575 | "outputs": [], 1576 | "source": [ 1577 | "X_scaled.describe()" 1578 | ] 1579 | }, 1580 | { 1581 | "cell_type": "code", 1582 | "execution_count": null, 1583 | "metadata": {}, 1584 | "outputs": [], 1585 | "source": [ 1586 | "lr_std = LinearRegression()\n", 1587 | "lr_std.fit(X_scaled, y)\n", 1588 | "lr_std.score(X_scaled, y)" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "code", 1593 | "execution_count": null, 1594 | "metadata": {}, 1595 | "outputs": [], 1596 | "source": [ 1597 | "pd.Series(lr_std.coef_, index=X.columns).sort_values().plot.barh(figsize=(8, 6))" 1598 | ] 1599 | }, 1600 | { 1601 | "cell_type": "code", 1602 | "execution_count": null, 1603 | "metadata": {}, 1604 | "outputs": [], 1605 | "source": [ 1606 | "!pip install xgboost" 1607 | ] 1608 | }, 1609 | { 1610 | "cell_type": "code", 1611 | "execution_count": null, 1612 | "metadata": {}, 1613 | "outputs": [], 1614 | "source": [ 1615 | "# try with XGBoost\n", 1616 | "from xgboost import XGBRegressor\n", 1617 | "xgb = XGBRegressor()\n", 1618 | "xgb.fit(X, y)\n", 1619 | "xgb.score(X, y)" 1620 | ] 1621 | }, 1622 | { 1623 | "cell_type": "code", 1624 | "execution_count": null, 1625 | "metadata": {}, 1626 | "outputs": [], 1627 | "source": [] 1628 | }, 1629 | { 1630 | "cell_type": "code", 1631 | "execution_count": null, 1632 | "metadata": {}, 1633 | "outputs": [], 1634 | "source": [] 1635 | }, 1636 | { 1637 | "cell_type": "markdown", 1638 | "metadata": {}, 1639 | "source": [ 1640 | "## Challenge: Linear Regression\n", 1641 | "\n", 1642 | "Make a model to predict how much Titanic passengers paid for their tickets with Linear Regression. 
(Only use the numeric columns for the model.)" 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "code", 1647 | "execution_count": null, 1648 | "metadata": {}, 1649 | "outputs": [], 1650 | "source": [] 1651 | }, 1652 | { 1653 | "cell_type": "markdown", 1654 | "metadata": {}, 1655 | "source": [ 1656 | "## Solution: Linear Regression" 1657 | ] 1658 | }, 1659 | { 1660 | "cell_type": "code", 1661 | "execution_count": null, 1662 | "metadata": {}, 1663 | "outputs": [], 1664 | "source": [ 1665 | "raw" 1666 | ] 1667 | }, 1668 | { 1669 | "cell_type": "code", 1670 | "execution_count": null, 1671 | "metadata": {}, 1672 | "outputs": [], 1673 | "source": [ 1674 | "# predict fare from numeric columns\n", 1675 | "X = tweak_titanic(raw).drop(columns=['fare'])\n", 1676 | "y = tweak_titanic(raw)['fare']" 1677 | ] 1678 | }, 1679 | { 1680 | "cell_type": "code", 1681 | "execution_count": null, 1682 | "metadata": {}, 1683 | "outputs": [], 1684 | "source": [ 1685 | "# make linear regression model\n", 1686 | "lr = LinearRegression()\n", 1687 | "lr.fit(X, y)" 1688 | ] 1689 | }, 1690 | { 1691 | "cell_type": "code", 1692 | "execution_count": null, 1693 | "metadata": {}, 1694 | "outputs": [], 1695 | "source": [ 1696 | "lr.score(X, y)" 1697 | ] 1698 | }, 1699 | { 1700 | "cell_type": "code", 1701 | "execution_count": null, 1702 | "metadata": {}, 1703 | "outputs": [], 1704 | "source": [ 1705 | "# make an xgboost model\n", 1706 | "xgb = XGBRegressor()\n", 1707 | "\n", 1708 | "xgb.fit(X,y)\n", 1709 | "xgb.score(X,y)" 1710 | ] 1711 | }, 1712 | { 1713 | "cell_type": "code", 1714 | "execution_count": null, 1715 | "metadata": {}, 1716 | "outputs": [], 1717 | "source": [] 1718 | }, 1719 | { 1720 | "cell_type": "code", 1721 | "execution_count": null, 1722 | "metadata": {}, 1723 | "outputs": [], 1724 | "source": [] 1725 | }, 1726 | { 1727 | "cell_type": "code", 1728 | "execution_count": null, 1729 | "metadata": {}, 1730 | "outputs": [], 1731 | "source": [] 1732 | }, 1733 | { 1734 | "cell_type": "markdown", 1735 | "metadata": {}, 1736 | "source": [ 1737 | "# Logistic Regression\n", 1738 | "\n" 1739 | ] 1740 | }, 1741 | { 1742 | "cell_type": "markdown", 1743 | "metadata": { 1744 | "collapsed": false, 1745 | "jupyter": { 1746 | "outputs_hidden": false 1747 | } 1748 | }, 1749 | "source": [ 1750 | "## Logistic Regression Algorithm\n", 1751 | "\n", 1752 | "Even though logistic regression has \"regression\" in its name, it is used as a classification algorithm. It calculates the probability that a sample belongs to a class. 
Rather than fitting a line to the data, it fits an \"S\" shaped curve called the sigmoid function.\n", 1753 | "\n", 1754 | "The formula for logistic regression is as follows:\n", 1755 | "\n", 1756 | "$$y = \\frac{1}{1 + e^{-z}}$$\n", 1757 | "\n", 1758 | "where $y$ is the probability that a sample belongs to a class and $z$ is the linear combination of the features.\n", 1759 | "\n", 1760 | "The algorithm is as follows:\n", 1761 | "\n", 1762 | "- Initialize the weights.\n", 1763 | "- Calculate the predicted values.\n", 1764 | "- Calculate the error.\n", 1765 | "- Update the weights.\n", 1766 | "- Repeat the steps above until convergence.\n" 1767 | ] 1768 | }, 1769 | { 1770 | "cell_type": "code", 1771 | "execution_count": null, 1772 | "metadata": {}, 1773 | "outputs": [], 1774 | "source": [ 1775 | "# plot sigmoid function\n", 1776 | "import numpy as np \n", 1777 | "from matplotlib import pyplot as plt\n", 1778 | "x = np.linspace(-10, 10, 100)\n", 1779 | "y = 1 / (1 + np.exp(-x))\n", 1780 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1781 | "ax.plot(x, y)" 1782 | ] 1783 | }, 1784 | { 1785 | "cell_type": "code", 1786 | "execution_count": null, 1787 | "metadata": {}, 1788 | "outputs": [], 1789 | "source": [] 1790 | }, 1791 | { 1792 | "cell_type": "code", 1793 | "execution_count": null, 1794 | "metadata": {}, 1795 | "outputs": [], 1796 | "source": [] 1797 | }, 1798 | { 1799 | "cell_type": "markdown", 1800 | "metadata": {}, 1801 | "source": [ 1802 | "## Basic Example\n", 1803 | "\n", 1804 | "\n", 1805 | "\n" 1806 | ] 1807 | }, 1808 | { 1809 | "cell_type": "code", 1810 | "execution_count": null, 1811 | "metadata": {}, 1812 | "outputs": [], 1813 | "source": [ 1814 | "import pandas as pd\n", 1815 | "log_data = pd.DataFrame({'x': [-2, -2.3, -2.1, -1, -.5, 0, .5, .7, 1, 2, 3],\n", 1816 | " 'y': [0, 0, 0, 0, 1, 0, 1,1, 1, 1, 1]})\n", 1817 | "\n", 1818 | "log_data.plot.scatter(x='x', y='y')" 1819 | ] 1820 | }, 1821 | { 1822 | "cell_type": "code", 1823 | "execution_count": null, 1824 | "metadata": {}, 1825 | "outputs": [], 1826 | "source": [ 1827 | "from sklearn.linear_model import LogisticRegression\n", 1828 | "log_r = LogisticRegression()\n", 1829 | "log_r.fit(log_data[['x']], log_data['y'])\n" 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "code", 1834 | "execution_count": null, 1835 | "metadata": {}, 1836 | "outputs": [], 1837 | "source": [ 1838 | "log_r.coef_" 1839 | ] 1840 | }, 1841 | { 1842 | "cell_type": "code", 1843 | "execution_count": null, 1844 | "metadata": {}, 1845 | "outputs": [], 1846 | "source": [ 1847 | "log_r.intercept_" 1848 | ] 1849 | }, 1850 | { 1851 | "cell_type": "code", 1852 | "execution_count": null, 1853 | "metadata": {}, 1854 | "outputs": [], 1855 | "source": [ 1856 | "# plot fitted sigmoid function on top of data\n", 1857 | "x = np.linspace(-3, 4, 100)\n", 1858 | "y = 1 / (1 + np.exp(-(log_r.coef_[0][0] * x + log_r.intercept_[0])))\n", 1859 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 1860 | "ax.plot(x, y)\n", 1861 | "log_data.plot.scatter(x='x', y='y', ax=ax)\n", 1862 | "# annotate above .5\n", 1863 | "ax.annotate('Predict 1\\nright of this', xy=(-.31, .5), xytext=(2, .4), arrowprops={'arrowstyle': '->'})" 1864 | ] 1865 | }, 1866 | { 1867 | "cell_type": "code", 1868 | "execution_count": null, 1869 | "metadata": {}, 1870 | "outputs": [], 1871 | "source": [ 1872 | "log_r.predict([[-.3]])" 1873 | ] 1874 | }, 1875 | { 1876 | "cell_type": "code", 1877 | "execution_count": null, 1878 | "metadata": {}, 1879 | "outputs": [], 1880 | "source": [] 1881 | }, 1882 | { 1883 | 
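{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "To tie the fitted model back to the formula above, `predict_proba` should agree with plugging $z$ into the sigmoid by hand. A quick check, assuming `log_r` and `np` from the cells above:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# probability from the model vs. the sigmoid formula computed by hand\n",
  "p_model = log_r.predict_proba([[-.3]])[0, 1]       # P(class 1)\n",
  "z = log_r.coef_[0][0] * -.3 + log_r.intercept_[0]\n",
  "p_manual = 1 / (1 + np.exp(-z))\n",
  "p_model, p_manual                                  # the two values should match"
 ]
},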
"cell_type": "code", 1884 | "execution_count": null, 1885 | "metadata": {}, 1886 | "outputs": [], 1887 | "source": [] 1888 | }, 1889 | { 1890 | "cell_type": "code", 1891 | "execution_count": null, 1892 | "metadata": {}, 1893 | "outputs": [], 1894 | "source": [] 1895 | }, 1896 | { 1897 | "cell_type": "markdown", 1898 | "metadata": {}, 1899 | "source": [ 1900 | "## Real World Example with Eye movements\n", 1901 | "\n", 1902 | "From the website:\n", 1903 | "\n", 1904 | "The dataset consist of several assignments. Each assignment consists of a question followed by ten sentences (titles of news articles). One of the sentences is the correct answer to the question (C) and five of the sentences are irrelevant to the question (I). Four of the sentences are relevant to the question (R), but they do not answer it.\n", 1905 | "\n", 1906 | "- Features are in columns, feature vectors in rows.\n", 1907 | "- Each assignment is a time sequence of 22-dimensional feature vectors.\n", 1908 | "- The first column is the line number, second the assignment number and the next 22 columns (3 to 24) are the different features. Columns 25 to 27 contain extra information about the example. The training data set contains the classification label in the 28th column: \"0\" for irrelevant, \"1\" for relevant and \"2\" for the correct answer.\n", 1909 | "- Each example (row) represents a single word. You are asked to return the classification of each read sentence.\n", 1910 | "- The 22 features provided are commonly used in psychological studies on eye movement. All of them are not necessarily relevant in this context.\n", 1911 | "\n", 1912 | "The objective of the Challenge is to predict the classification labels (I, R, C).\n" 1913 | ] 1914 | }, 1915 | { 1916 | "cell_type": "code", 1917 | "execution_count": null, 1918 | "metadata": {}, 1919 | "outputs": [], 1920 | "source": [ 1921 | "# https://www.openml.org/da/1044\n", 1922 | "from datasets import load_dataset\n", 1923 | "eye = load_dataset('inria-soda/tabular-benchmark', data_files='clf_num/eye_movements.csv')" 1924 | ] 1925 | }, 1926 | { 1927 | "cell_type": "code", 1928 | "execution_count": null, 1929 | "metadata": {}, 1930 | "outputs": [], 1931 | "source": [ 1932 | "eye_df = eye['train'].to_pandas()\n", 1933 | "eye_df\n" 1934 | ] 1935 | }, 1936 | { 1937 | "cell_type": "code", 1938 | "execution_count": null, 1939 | "metadata": {}, 1940 | "outputs": [], 1941 | "source": [ 1942 | "from sklearn.preprocessing import StandardScaler\n", 1943 | "\n", 1944 | "X = eye_df.drop(columns=['label'])\n", 1945 | "y = eye_df['label']\n", 1946 | "std = StandardScaler()\n", 1947 | "X_scaled = std.fit_transform(X)\n", 1948 | "eye_log = LogisticRegression()\n", 1949 | "eye_log.fit(X_scaled, y)\n", 1950 | "eye_log.score(X_scaled, y)" 1951 | ] 1952 | }, 1953 | { 1954 | "cell_type": "code", 1955 | "execution_count": null, 1956 | "metadata": {}, 1957 | "outputs": [], 1958 | "source": [ 1959 | "pd.Series(eye_log.coef_[0], index=X.columns).sort_values().plot.barh(figsize=(8, 6))" 1960 | ] 1961 | }, 1962 | { 1963 | "cell_type": "code", 1964 | "execution_count": null, 1965 | "metadata": {}, 1966 | "outputs": [], 1967 | "source": [] 1968 | }, 1969 | { 1970 | "cell_type": "code", 1971 | "execution_count": null, 1972 | "metadata": {}, 1973 | "outputs": [], 1974 | "source": [] 1975 | }, 1976 | { 1977 | "cell_type": "code", 1978 | "execution_count": null, 1979 | "metadata": {}, 1980 | "outputs": [], 1981 | "source": [] 1982 | }, 1983 | { 1984 | "cell_type": "markdown", 1985 | "metadata": {}, 1986 | 
"source": [ 1987 | "## Challenge: Logistic Regression\n", 1988 | "\n", 1989 | "Create a logistic regression model to predict whether a Titanic passenger survives based on the numeric columns.\n" 1990 | ] 1991 | }, 1992 | { 1993 | "cell_type": "code", 1994 | "execution_count": null, 1995 | "metadata": {}, 1996 | "outputs": [], 1997 | "source": [] 1998 | }, 1999 | { 2000 | "cell_type": "code", 2001 | "execution_count": null, 2002 | "metadata": {}, 2003 | "outputs": [], 2004 | "source": [] 2005 | }, 2006 | { 2007 | "cell_type": "markdown", 2008 | "metadata": {}, 2009 | "source": [ 2010 | "## Solution: Logistic Regression" 2011 | ] 2012 | }, 2013 | { 2014 | "cell_type": "code", 2015 | "execution_count": null, 2016 | "metadata": {}, 2017 | "outputs": [], 2018 | "source": [ 2019 | "import pandas as pd\n", 2020 | "url = 'https://github.com/mattharrison/datasets/raw/master/data/titanic3.xls'\n", 2021 | "raw = pd.read_excel(url)\n", 2022 | "raw" 2023 | ] 2024 | }, 2025 | { 2026 | "cell_type": "code", 2027 | "execution_count": null, 2028 | "metadata": {}, 2029 | "outputs": [], 2030 | "source": [ 2031 | "# predicted survived column with logistic regression\n", 2032 | "from sklearn.linear_model import LogisticRegression\n", 2033 | "\n", 2034 | "titanic_X = (raw\n", 2035 | ".loc[:, ['pclass', 'age', 'sibsp', 'parch', 'fare']]\n", 2036 | ".dropna()\n", 2037 | " )\n", 2038 | "\n", 2039 | "# standardize the data\n", 2040 | "from sklearn.preprocessing import StandardScaler\n", 2041 | "from sklearn import set_config\n", 2042 | "set_config(transform_output='pandas')\n", 2043 | "\n", 2044 | "std = StandardScaler()\n", 2045 | "X = std.fit_transform(titanic_X)\n", 2046 | "X" 2047 | ] 2048 | }, 2049 | { 2050 | "cell_type": "code", 2051 | "execution_count": null, 2052 | "metadata": {}, 2053 | "outputs": [], 2054 | "source": [ 2055 | "# load train_test_split\n", 2056 | "from sklearn.model_selection import train_test_split\n", 2057 | "# split the data\n", 2058 | "y = raw.loc[X.index, 'survived']\n", 2059 | "\n", 2060 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 2061 | "lr = LogisticRegression()\n", 2062 | "lr.fit(X_train, y_train)\n", 2063 | "lr.score(X_test, y_test)" 2064 | ] 2065 | }, 2066 | { 2067 | "cell_type": "code", 2068 | "execution_count": null, 2069 | "metadata": {}, 2070 | "outputs": [], 2071 | "source": [ 2072 | "# try an XGBoost model\n", 2073 | "from xgboost import XGBClassifier\n", 2074 | "\n", 2075 | "xgb = XGBClassifier()\n", 2076 | "xgb.fit(X_train, y_train)\n", 2077 | "xgb.score(X_test, y_test)" 2078 | ] 2079 | }, 2080 | { 2081 | "cell_type": "code", 2082 | "execution_count": null, 2083 | "metadata": {}, 2084 | "outputs": [], 2085 | "source": [] 2086 | }, 2087 | { 2088 | "cell_type": "markdown", 2089 | "metadata": {}, 2090 | "source": [ 2091 | "# Decision Trees\n" 2092 | ] 2093 | }, 2094 | { 2095 | "cell_type": "markdown", 2096 | "metadata": {}, 2097 | "source": [ 2098 | "## Decision Tree Algorithm\n", 2099 | "\n", 2100 | "Decision trees are a type of supervised learning algorithm that can be used for both classification and regression. They work by splitting the data into subsets based on the features. The goal is to split the data in a way that minimizes the entropy of the subsets." 
2101 | ] 2102 | }, 2103 | { 2104 | "cell_type": "code", 2105 | "execution_count": null, 2106 | "metadata": {}, 2107 | "outputs": [], 2108 | "source": [ 2109 | "## Create \"decision stump\"\n", 2110 | "## fit tree regressor to anscombe's quartet limit to 1 level\n", 2111 | "\n", 2112 | "from sklearn.tree import DecisionTreeRegressor\n", 2113 | "dt = DecisionTreeRegressor(max_depth=1)\n", 2114 | "X = anscombe[['x']]\n", 2115 | "y = anscombe['y1']\n", 2116 | "dt.fit(X, y)\n" 2117 | ] 2118 | }, 2119 | { 2120 | "cell_type": "code", 2121 | "execution_count": null, 2122 | "metadata": {}, 2123 | "outputs": [], 2124 | "source": [ 2125 | "## Plot the tree\n", 2126 | "from sklearn.tree import plot_tree\n", 2127 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 2128 | "_ = plot_tree(dt, ax=ax, feature_names=['x'], filled=True, fontsize=10)" 2129 | ] 2130 | }, 2131 | { 2132 | "cell_type": "code", 2133 | "execution_count": null, 2134 | "metadata": {}, 2135 | "outputs": [], 2136 | "source": [ 2137 | "## Plot the data and predictions on the same plot\n", 2138 | "import numpy as np\n", 2139 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 2140 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 2141 | "# plot the line\n", 2142 | "x1 = np.linspace(4, 14, 100)\n", 2143 | "y1 = dt.predict(x1.reshape(-1, 1))\n", 2144 | "ax.plot(x1, y1, color='r')\n" 2145 | ] 2146 | }, 2147 | { 2148 | "cell_type": "code", 2149 | "execution_count": null, 2150 | "metadata": {}, 2151 | "outputs": [], 2152 | "source": [ 2153 | "## Now plot to two levels\n", 2154 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 2155 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 2156 | "# plot the line\n", 2157 | "dt2 = DecisionTreeRegressor(max_depth=2)\n", 2158 | "dt2.fit(X, y)\n", 2159 | "\n", 2160 | "x1 = np.linspace(4, 14, 100)\n", 2161 | "y1 = dt2.predict(x1.reshape(-1, 1))\n", 2162 | "ax.plot(x1, y1, color='r')\n" 2163 | ] 2164 | }, 2165 | { 2166 | "cell_type": "code", 2167 | "execution_count": null, 2168 | "metadata": {}, 2169 | "outputs": [], 2170 | "source": [ 2171 | "## Now plot unlimited levels\n", 2172 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 2173 | "anscombe.plot.scatter(x='x', y='y1', ax=ax, color='k')\n", 2174 | "# plot the line\n", 2175 | "dt3 = DecisionTreeRegressor(max_depth=None)\n", 2176 | "dt3.fit(X, y)\n", 2177 | "\n", 2178 | "x1 = np.linspace(4, 14, 100)\n", 2179 | "y1 = dt3.predict(x1.reshape(-1, 1))\n", 2180 | "ax.plot(x1, y1, color='r')\n" 2181 | ] 2182 | }, 2183 | { 2184 | "cell_type": "code", 2185 | "execution_count": null, 2186 | "metadata": {}, 2187 | "outputs": [], 2188 | "source": [] 2189 | }, 2190 | { 2191 | "cell_type": "code", 2192 | "execution_count": null, 2193 | "metadata": {}, 2194 | "outputs": [], 2195 | "source": [] 2196 | }, 2197 | { 2198 | "cell_type": "code", 2199 | "execution_count": null, 2200 | "metadata": {}, 2201 | "outputs": [], 2202 | "source": [] 2203 | }, 2204 | { 2205 | "cell_type": "markdown", 2206 | "metadata": {}, 2207 | "source": [ 2208 | "## Real World with Aircraft Elevators" 2209 | ] 2210 | }, 2211 | { 2212 | "cell_type": "code", 2213 | "execution_count": null, 2214 | "metadata": {}, 2215 | "outputs": [], 2216 | "source": [ 2217 | "X_elev = elev.drop(columns=['Goal'])\n", 2218 | "y_elev = elev['Goal']\n", 2219 | "dt_elev = DecisionTreeRegressor(max_depth=3)\n", 2220 | "dt_elev.fit(X_elev, y_elev)\n" 2221 | ] 2222 | }, 2223 | { 2224 | "cell_type": "code", 2225 | "execution_count": null, 2226 | "metadata": {}, 2227 | "outputs": [], 2228 | "source": [ 2229 | "# plot the 
tree\n", 2230 | "from sklearn.tree import plot_tree\n", 2231 | "fig, ax = plt.subplots(figsize=(12, 8))\n", 2232 | "_ = plot_tree(dt_elev, ax=ax, feature_names=X_elev.columns, filled=True, fontsize=10, precision=4)\n" 2233 | ] 2234 | }, 2235 | { 2236 | "cell_type": "code", 2237 | "execution_count": null, 2238 | "metadata": {}, 2239 | "outputs": [], 2240 | "source": [ 2241 | "dt_elev.score(X_elev, y_elev)" 2242 | ] 2243 | }, 2244 | { 2245 | "cell_type": "code", 2246 | "execution_count": null, 2247 | "metadata": {}, 2248 | "outputs": [], 2249 | "source": [ 2250 | "from sklearn.linear_model import LinearRegression\n", 2251 | "lr_elev = LinearRegression()\n", 2252 | "lr_elev.fit(X_elev, y_elev)\n", 2253 | "lr_elev.score(X_elev, y_elev)" 2254 | ] 2255 | }, 2256 | { 2257 | "cell_type": "code", 2258 | "execution_count": null, 2259 | "metadata": {}, 2260 | "outputs": [], 2261 | "source": [ 2262 | "# loop over depths and plot the results\n", 2263 | "scores = []\n", 2264 | "for i in range(1, 20):\n", 2265 | " dt = DecisionTreeRegressor(max_depth=i)\n", 2266 | " dt.fit(X_elev, y_elev)\n", 2267 | " scores.append(dt.score(X_elev, y_elev))\n", 2268 | "\n", 2269 | "pd.Series(scores).plot.line(figsize=(8, 6))" 2270 | ] 2271 | }, 2272 | { 2273 | "cell_type": "code", 2274 | "execution_count": null, 2275 | "metadata": {}, 2276 | "outputs": [], 2277 | "source": [ 2278 | "# split the data and plot results of train and test\n", 2279 | "from sklearn.model_selection import train_test_split\n", 2280 | "X_train, X_test, y_train, y_test = train_test_split(X_elev, y_elev, random_state=42)\n", 2281 | "test_scores = []\n", 2282 | "train_scores = []\n", 2283 | "for i in range(1, 20):\n", 2284 | " dt = DecisionTreeRegressor(max_depth=i)\n", 2285 | " dt.fit(X_train, y_train)\n", 2286 | " test_scores.append(dt.score(X_test, y_test))\n", 2287 | " train_scores.append(dt.score(X_train, y_train))\n", 2288 | "\n", 2289 | "ax = pd.DataFrame({'train': train_scores, 'test': test_scores}).plot.line(figsize=(8, 6))\n", 2290 | "\n", 2291 | "# annotate overfitting at 10, .7\n", 2292 | "ax.annotate('Overfitting after here', xy=(10, .7), xytext=(12, .5), arrowprops={'arrowstyle': '->'})\n", 2293 | "\n", 2294 | "# set title\n", 2295 | "ax.set_title('Validation Curve for Decision Tree')\n" 2296 | ] 2297 | }, 2298 | { 2299 | "cell_type": "code", 2300 | "execution_count": null, 2301 | "metadata": {}, 2302 | "outputs": [], 2303 | "source": [ 2304 | "# Let's see if our model improves with a deeper tree\n", 2305 | "dt_elev = DecisionTreeRegressor(max_depth=11)\n", 2306 | "dt_elev.fit(X_train, y_train)\n", 2307 | "dt_elev.score(X_test, y_test)" 2308 | ] 2309 | }, 2310 | { 2311 | "cell_type": "code", 2312 | "execution_count": null, 2313 | "metadata": {}, 2314 | "outputs": [], 2315 | "source": [ 2316 | "lr_elev = LinearRegression()\n", 2317 | "lr_elev.fit(X_train, y_train)\n", 2318 | "lr_elev.score(X_test, y_test)" 2319 | ] 2320 | }, 2321 | { 2322 | "cell_type": "code", 2323 | "execution_count": null, 2324 | "metadata": {}, 2325 | "outputs": [], 2326 | "source": [] 2327 | }, 2328 | { 2329 | "cell_type": "markdown", 2330 | "metadata": {}, 2331 | "source": [ 2332 | "## Random Forests and XGBoost" 2333 | ] 2334 | }, 2335 | { 2336 | "cell_type": "code", 2337 | "execution_count": null, 2338 | "metadata": {}, 2339 | "outputs": [], 2340 | "source": [ 2341 | "# create a random forest regressor\n", 2342 | "from sklearn.ensemble import RandomForestRegressor\n", 2343 | "rf = RandomForestRegressor(n_estimators=100, max_depth=3)\n", 2344 | "rf.fit(X_train, 
y_train)\n", 2345 | "rf.score(X_test, y_test)\n" 2346 | ] 2347 | }, 2348 | { 2349 | "cell_type": "code", 2350 | "execution_count": null, 2351 | "metadata": {}, 2352 | "outputs": [], 2353 | "source": [ 2354 | "# sweep over depths and plot results\n", 2355 | "test_scores = []\n", 2356 | "train_scores = []\n", 2357 | "for i in range(1, 20):\n", 2358 | " rf = RandomForestRegressor(n_estimators=100, max_depth=i)\n", 2359 | " rf.fit(X_train, y_train)\n", 2360 | " test_scores.append(rf.score(X_test, y_test))\n", 2361 | " train_scores.append(rf.score(X_train, y_train))\n", 2362 | "\n", 2363 | "ax = pd.DataFrame({'train': train_scores, 'test': test_scores}).plot.line(figsize=(8, 6))" 2364 | ] 2365 | }, 2366 | { 2367 | "cell_type": "code", 2368 | "execution_count": null, 2369 | "metadata": {}, 2370 | "outputs": [], 2371 | "source": [ 2372 | "# create a random forest regressor\n", 2373 | "from sklearn.ensemble import RandomForestRegressor\n", 2374 | "rf = RandomForestRegressor(n_estimators=100, max_depth=13, random_state=42)\n", 2375 | "rf.fit(X_train, y_train)\n", 2376 | "rf.score(X_test, y_test)\n" 2377 | ] 2378 | }, 2379 | { 2380 | "cell_type": "code", 2381 | "execution_count": null, 2382 | "metadata": {}, 2383 | "outputs": [], 2384 | "source": [ 2385 | "# create an xgb regressor\n", 2386 | "from xgboost import XGBRegressor\n", 2387 | "xgb = XGBRegressor()\n", 2388 | "xgb.fit(X_train, y_train)\n", 2389 | "xgb.score(X_test, y_test)" 2390 | ] 2391 | }, 2392 | { 2393 | "cell_type": "code", 2394 | "execution_count": null, 2395 | "metadata": {}, 2396 | "outputs": [], 2397 | "source": [] 2398 | }, 2399 | { 2400 | "cell_type": "code", 2401 | "execution_count": null, 2402 | "metadata": {}, 2403 | "outputs": [], 2404 | "source": [] 2405 | }, 2406 | { 2407 | "cell_type": "code", 2408 | "execution_count": null, 2409 | "metadata": {}, 2410 | "outputs": [], 2411 | "source": [] 2412 | }, 2413 | { 2414 | "cell_type": "markdown", 2415 | "metadata": {}, 2416 | "source": [ 2417 | "## Challenge: Decision Trees\n", 2418 | "\n", 2419 | "Create a decision tree to predict survival on the titanic. See if you can determine the optimal depth of the tree." 
2420 | ] 2421 | }, 2422 | { 2423 | "cell_type": "markdown", 2424 | "metadata": {}, 2425 | "source": [ 2426 | "## Solution: Decision Trees" 2427 | ] 2428 | }, 2429 | { 2430 | "cell_type": "code", 2431 | "execution_count": null, 2432 | "metadata": {}, 2433 | "outputs": [], 2434 | "source": [] 2435 | }, 2436 | { 2437 | "cell_type": "code", 2438 | "execution_count": null, 2439 | "metadata": {}, 2440 | "outputs": [], 2441 | "source": [ 2442 | "from sklearn.tree import DecisionTreeClassifier\n", 2443 | "\n", 2444 | "raw" 2445 | ] 2446 | }, 2447 | { 2448 | "cell_type": "code", 2449 | "execution_count": null, 2450 | "metadata": {}, 2451 | "outputs": [], 2452 | "source": [ 2453 | "titanic_X = (raw\n", 2454 | " .loc[:, ['pclass', 'age', 'sibsp', 'parch', 'fare']]\n", 2455 | " .dropna()\n", 2456 | " )\n", 2457 | "\n", 2458 | "titanic_X\n", 2459 | "titanic_y = (raw\n", 2460 | " ['survived']\n", 2461 | " .loc[titanic_X.index]\n", 2462 | " )" 2463 | ] 2464 | }, 2465 | { 2466 | "cell_type": "code", 2467 | "execution_count": null, 2468 | "metadata": {}, 2469 | "outputs": [], 2470 | "source": [ 2471 | "# split data\n", 2472 | "X_train, X_test, y_train, y_test = train_test_split(titanic_X, titanic_y, random_state=43)" 2473 | ] 2474 | }, 2475 | { 2476 | "cell_type": "code", 2477 | "execution_count": null, 2478 | "metadata": {}, 2479 | "outputs": [], 2480 | "source": [ 2481 | "X_train" 2482 | ] 2483 | }, 2484 | { 2485 | "cell_type": "code", 2486 | "execution_count": null, 2487 | "metadata": {}, 2488 | "outputs": [], 2489 | "source": [ 2490 | "# sweep over depths and plot results\n", 2491 | "test_scores = []\n", 2492 | "train_scores = []\n", 2493 | "for i in range(1, 20):\n", 2494 | " dt = DecisionTreeClassifier(max_depth=i)\n", 2495 | " dt.fit(X_train, y_train)\n", 2496 | " test_scores.append(dt.score(X_test, y_test))\n", 2497 | " train_scores.append(dt.score(X_train, y_train))\n", 2498 | "\n", 2499 | "ax = pd.DataFrame({'train': train_scores, 'test': test_scores}).plot.line(figsize=(8, 6))" 2500 | ] 2501 | }, 2502 | { 2503 | "cell_type": "code", 2504 | "execution_count": null, 2505 | "metadata": {}, 2506 | "outputs": [], 2507 | "source": [] 2508 | }, 2509 | { 2510 | "cell_type": "code", 2511 | "execution_count": null, 2512 | "metadata": {}, 2513 | "outputs": [], 2514 | "source": [] 2515 | }, 2516 | { 2517 | "cell_type": "markdown", 2518 | "metadata": {}, 2519 | "source": [ 2520 | "# Conclusion - Next Steps\n", 2521 | "\n", 2522 | "- Practice, practice, practice! - I recommend using your own data to practice.\n", 2523 | "- Check out my Feature Engineering course.\n", 2524 | "- Check out my XGBoost course." 2525 | ] 2526 | }, 2527 | { 2528 | "cell_type": "markdown", 2529 | "metadata": {}, 2530 | "source": [] 2531 | } 2532 | ], 2533 | "metadata": { 2534 | "kernelspec": { 2535 | "display_name": "Python 3 (ipykernel)", 2536 | "language": "python", 2537 | "name": "python3" 2538 | }, 2539 | "language_info": { 2540 | "codemirror_mode": { 2541 | "name": "ipython", 2542 | "version": 3 2543 | }, 2544 | "file_extension": ".py", 2545 | "mimetype": "text/x-python", 2546 | "name": "python", 2547 | "nbconvert_exporter": "python", 2548 | "pygments_lexer": "ipython3", 2549 | "version": "3.10.13" 2550 | } 2551 | }, 2552 | "nbformat": 4, 2553 | "nbformat_minor": 4 2554 | } 2555 | --------------------------------------------------------------------------------