├── .gitignore ├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ └── main.yml └── ISSUE_TEMPLATE.md ├── .devcontainer └── devcontainer.json ├── CONTRIBUTING.md ├── NOTICE ├── .vscode └── settings.json ├── README.md ├── requirements.txt ├── LICENSE └── ml-foundations.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | .tmp 4 | npm-debug.log 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Codeowners for these exercise files: 2 | # * (asterisk) denotes "all files and folders" 3 | # Example: * @producer @instructor 4 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Copy To Branches 2 | on: 3 | workflow_dispatch: 4 | jobs: 5 | copy-to-branches: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v2 9 | with: 10 | fetch-depth: 0 11 | - name: Copy To Branches Action 12 | uses: planetoftheweb/copy-to-branches@v1.2 13 | env: 14 | key: main 15 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "extensions": [ 3 | "GitHub.github-vscode-theme", 4 | "ms-toolsai.jupyter", 5 | "ms-python.python" 6 | // Additional Extensions Here 7 | ], 8 | "onCreateCommand" : "[ -f requirements.txt ] && pip install -r requirements.txt; echo PS1='\"$ \"' >> ~/.bashrc", //Set Terminal Prompt to $ 9 | } 10 | 11 | // DevContainer Reference: https://code.visualstudio.com/docs/remote/devcontainerjson-reference 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | Contribution Agreement 3 | ====================== 4 | 5 | This repository does not accept pull requests (PRs). All pull requests will be closed. 6 | 7 | However, if any contributions (through pull requests, issues, feedback or otherwise) are provided, as a contributor, you represent that the code you submit is your original work or that of your employer (in which case you represent you have the right to bind your employer). By submitting code (or otherwise providing feedback), you (and, if applicable, your employer) are licensing the submitted code (and/or feedback) to LinkedIn and the open source community subject to the BSD 2-Clause license. 8 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2024 LinkedIn Corporation 2 | All Rights Reserved. 3 | 4 | Licensed under the LinkedIn Learning Exercise File License (the "License"). 5 | See LICENSE in the project root for license information. 6 | 7 | Please note, this project may automatically load third party code from external 8 | repositories (for example, NPM modules, Composer packages, or other dependencies). 
9 | If so, such third party code may be subject to other license terms than as set 10 | forth above. In addition, such third party code may also depend on and load 11 | multiple tiers of dependencies. Please review the applicable licenses of the 12 | additional dependencies. 13 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.bracketPairColorization.enabled": true, 3 | "editor.cursorBlinking": "solid", 4 | "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace", 5 | "editor.fontLigatures": false, 6 | "editor.fontSize": 22, 7 | "editor.formatOnPaste": true, 8 | "editor.formatOnSave": true, 9 | "editor.lineNumbers": "on", 10 | "editor.matchBrackets": "always", 11 | "editor.minimap.enabled": false, 12 | "editor.smoothScrolling": true, 13 | "editor.tabSize": 2, 14 | "editor.useTabStops": true, 15 | "emmet.triggerExpansionOnTab": true, 16 | "explorer.openEditors.visible": 0, 17 | "files.autoSave": "afterDelay", 18 | "screencastMode.onlyKeyboardShortcuts": true, 19 | "terminal.integrated.fontSize": 18, 20 | "workbench.colorTheme": "Visual Studio Dark", 21 | "workbench.fontAliasing": "antialiased", 22 | "workbench.statusBar.visible": true 23 | } 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | ## Issue Overview 9 | 10 | 11 | ## Describe your environment 12 | 13 | 14 | ## Steps to Reproduce 15 | 16 | 1. 17 | 2. 18 | 3. 19 | 4. 20 | 21 | ## Expected Behavior 22 | 23 | 24 | ## Current Behavior 25 | 26 | 27 | ## Possible Solution 28 | 29 | 30 | ## Screenshots / Video 31 | 32 | 33 | ## Related Issues 34 | 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Applied Machine Learning: Foundations 2 | This is the repository for the LinkedIn Learning course Applied Machine Learning: Foundations. The full course is available from [LinkedIn Learning][lil-course-url]. 3 | 4 | ![lil-thumbnail-url] 5 | 6 |
AI models are transforming the workplace. Knowing what’s going on behind those models can help you apply machine learning (ML) techniques more effectively. In this course, instructor Matt Harrison shows you how to get started mastering the essentials of machine learning using the power of the Python programming language.
Explore the fundamentals of an end-to-end machine learning application as you gain hands-on experience with data exploration, data processing, model creation, model evaluation, model tuning, and model deployment with MLflow. Along the way, test out your new coding skills in the practice challenges at the end of each section.
7 | 8 | ## Getting Started 9 | 10 | This project can be set up and run in two ways: using GitHub Codespaces for a cloud-based environment, or locally on your machine by installing the required dependencies. Follow the instructions below to get started with the method that best suits your needs. 11 | 12 | ### Option 1: Using GitHub Codespaces 13 | 14 | GitHub Codespaces provides a complete, configurable dev environment on top of a powerful VS Code interface. It's an excellent option for quickly starting development without the need to set up your local environment. 15 | 16 | 1. **Open the project in Codespaces:** Navigate to the GitHub page of the project and click the "Code" button. Select "Open with Codespaces" > "New codespace". This will set up a new cloud-based development environment pre-configured for this project. 17 | 18 | 2. **Wait for installation:** The installation takes a few minutes after the Codespace launches. The terminal at the bottom of VS Code will be busy for a bit while all of the dependencies are built and installed. 19 | 20 | 3. **Open up `ml-foundations.ipynb` in VS Code:** The video will walk you through this. 21 | 22 | ### Option 2: Local Setup 23 | 24 | If you prefer to work on your local machine, follow these steps to set up the project environment. You'll need Python installed on your system (refer to [python.org](https://www.python.org/) for installation instructions). 25 | 26 | 1. **Clone the repository:** 27 | ```bash 28 | git clone https://github.com/your-username/your-project-name.git 29 | cd your-project-name 30 | ``` 31 | 2. **Create a virtual environment:** Using your favorite mechanism, create a virtual environment for Python (a minimal sketch appears after the Instructor section below). 32 | 33 | 3. **Install dependencies:** 34 | Ensure you have your virtual environment activated. Then, install the required packages using the following command: 35 | ```bash 36 | pip install -r requirements.txt 37 | ``` 38 | 39 | 4. **Launch Jupyter and open `ml-foundations.ipynb`:** 40 | With the dependencies installed, you're ready to launch Jupyter: 41 | ```bash 42 | jupyter lab 43 | ``` 44 | 45 | Navigate to and open the `ml-foundations.ipynb` notebook in Jupyter. 46 | 47 | ### Instructor 48 | 49 | ![lil-avatar] 50 | 51 | Matt Harrison 52 | 53 | Python and Data Science Corporate Trainer, Author, Speaker, Consultant 54 | 55 | 56 | 57 | Check out my other courses on [LinkedIn Learning](https://www.linkedin.com/learning/instructors/matt-harrison?u=104).
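For step 2 of the local setup above, here is a minimal sketch using Python's built-in `venv` module (just one option; conda or any other environment manager works as well):

```bash
python -m venv .venv
source .venv/bin/activate   # on Windows: .venv\Scripts\activate
```

With the environment active, the packages installed in step 3 stay isolated to this project.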
58 | 59 | [lil-course-url]: https://www.linkedin.com/learning/applied-machine-learning-foundations-21404006 60 | [lil-thumbnail-url]: https://media.licdn.com/dms/image/D560DAQG-umFqe1oFDg/learning-public-crop_675_1200/0/1717432957394?e=2147483647&v=beta&t=AGzP3y5jqX0AiSZyW4rB5J3wBome6-i-9XA_h6pq91w 61 | [lil-avatar]: https://media.licdn.com/dms/image/D560DAQGLDZBKwtHv5Q/learning-author-crop_200_200/0/1680625154253?e=1717002000&v=beta&t=vjtUd7bQaz4CR1FeiTQ3nWGvbydzOnHnjKiftJ8bWGg 62 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | adbc-driver-manager==0.11.0 2 | adbc-driver-sqlite==0.11.0 3 | alembic==1.13.1 4 | aniso8601==9.0.1 5 | annotated-types==0.6.0 6 | anyio==4.3.0 7 | argon2-cffi==23.1.0 8 | argon2-cffi-bindings==21.2.0 9 | arrow==1.3.0 10 | asttokens==2.4.1 11 | async-lru==2.0.4 12 | attrs==23.2.0 13 | Babel==2.14.0 14 | beautifulsoup4==4.12.3 15 | bleach==6.1.0 16 | blinker==1.7.0 17 | bokeh==3.4.0 18 | catboost==1.2.3 19 | certifi==2024.2.2 20 | cffi==1.16.0 21 | charset-normalizer==3.3.2 22 | click==8.1.7 23 | cloudpickle==3.0.0 24 | colorama==0.4.6 25 | colorcet==3.1.0 26 | comm==0.2.2 27 | connectorx==0.3.2 28 | contourpy==1.2.0 29 | cycler==0.12.1 30 | debugpy==1.8.1 31 | decorator==5.1.1 32 | defusedxml==0.7.1 33 | deltalake==0.16.4 34 | docker==7.0.0 35 | entrypoints==0.4 36 | exceptiongroup==1.2.0 37 | executing==2.0.1 38 | fastexcel==0.10.2 39 | fastjsonschema==2.19.1 40 | filelock==3.13.1 41 | Flask==3.0.2 42 | fonttools==4.50.0 43 | fqdn==1.5.1 44 | fsspec==2023.12.2 45 | gevent==24.2.1 46 | gitdb==4.0.11 47 | GitPython==3.1.42 48 | graphene==3.3 49 | graphql-core==3.2.3 50 | graphql-relay==3.2.0 51 | graphviz==0.20.3 52 | greenlet==3.0.3 53 | gunicorn==21.2.0 54 | h11==0.14.0 55 | holoviews==1.18.3 56 | httpcore==1.0.4 57 | httpx==0.27.0 58 | hvplot==0.9.2 59 | idna==3.6 60 | importlib_metadata==7.1.0 61 | importlib_resources==6.4.0 62 | ipykernel==6.29.3 63 | ipython==8.22.2 64 | isoduration==20.11.0 65 | itsdangerous==2.1.2 66 | jedi==0.19.1 67 | Jinja2==3.1.3 68 | joblib==1.3.2 69 | json5==0.9.24 70 | jsonpointer==2.4 71 | jsonschema==4.21.1 72 | jsonschema-specifications==2023.12.1 73 | jupyter-events==0.10.0 74 | jupyter-lsp==2.2.4 75 | jupyter-server-mathjax==0.2.6 76 | jupyter_client==8.6.1 77 | jupyter_core==5.7.2 78 | jupyter_server==2.13.0 79 | jupyter_server_terminals==0.5.3 80 | jupyterlab==4.1.5 81 | jupyterlab_git==0.50.0 82 | jupyterlab_pygments==0.3.0 83 | jupyterlab_server==2.25.4 84 | kiwisolver==1.4.5 85 | linkify-it-py==2.0.3 86 | Mako==1.3.2 87 | Markdown==3.6 88 | markdown-it-py==3.0.0 89 | MarkupSafe==2.1.5 90 | matplotlib==3.8.3 91 | matplotlib-inline==0.1.6 92 | mdit-py-plugins==0.4.0 93 | mdurl==0.1.2 94 | mistune==3.0.2 95 | mlflow==2.11.3 96 | mmhash3==3.0.1 97 | mpmath==1.3.0 98 | nbclient==0.10.0 99 | nbconvert==7.16.2 100 | nbdime==4.0.1 101 | nbformat==5.10.3 102 | nest-asyncio==1.6.0 103 | networkx==3.2.1 104 | notebook==7.1.2 105 | notebook_shim==0.2.4 106 | numpy==1.26.4 107 | nvidia-cublas-cu12==12.1.3.1 108 | nvidia-cuda-cupti-cu12==12.1.105 109 | nvidia-cuda-nvrtc-cu12==12.1.105 110 | nvidia-cuda-runtime-cu12==12.1.105 111 | nvidia-cudnn-cu12==8.9.2.26 112 | nvidia-cufft-cu12==11.0.2.54 113 | nvidia-curand-cu12==10.3.2.106 114 | nvidia-cusolver-cu12==11.4.5.107 115 | nvidia-cusparse-cu12==12.1.0.106 116 | nvidia-nccl-cu12==2.19.3 117 | nvidia-nvjitlink-cu12==12.4.99 118 | 
nvidia-nvtx-cu12==12.1.105 119 | overrides==7.7.0 120 | packaging==23.2 121 | pandas==2.2.1 122 | pandocfilters==1.5.1 123 | panel==1.4.0 124 | param==2.1.0 125 | parso==0.8.3 126 | pexpect==4.9.0 127 | pillow==10.2.0 128 | platformdirs==4.2.0 129 | plotly==5.20.0 130 | polars==0.20.18 131 | prometheus_client==0.20.0 132 | prompt-toolkit==3.0.43 133 | protobuf==4.25.3 134 | psutil==5.9.8 135 | ptyprocess==0.7.0 136 | pure-eval==0.2.2 137 | pyarrow==15.0.2 138 | pyarrow-hotfix==0.6 139 | pycparser==2.21 140 | pydantic==2.6.4 141 | pydantic_core==2.16.3 142 | Pygments==2.17.2 143 | pyiceberg==0.6.0 144 | pyparsing==3.1.2 145 | python-dateutil==2.9.0.post0 146 | python-json-logger==2.0.7 147 | pytz==2024.1 148 | pyviz_comms==3.0.2 149 | PyYAML==6.0.1 150 | pyzmq==25.1.2 151 | querystring-parser==1.2.4 152 | referencing==0.34.0 153 | requests==2.31.0 154 | rfc3339-validator==0.1.4 155 | rfc3986-validator==0.1.1 156 | rich==13.7.1 157 | rpds-py==0.18.0 158 | scikit-learn==1.4.1.post1 159 | scipy==1.12.0 160 | seaborn==0.13.2 161 | Send2Trash==1.8.2 162 | six==1.16.0 163 | smmap==5.0.1 164 | sniffio==1.3.1 165 | sortedcontainers==2.4.0 166 | soupsieve==2.5 167 | SQLAlchemy==2.0.29 168 | sqlparse==0.4.4 169 | stack-data==0.6.3 170 | strictyaml==1.7.3 171 | sympy==1.12 172 | tenacity==8.2.3 173 | terminado==0.18.1 174 | threadpoolctl==3.4.0 175 | tinycss2==1.2.1 176 | tomli==2.0.1 177 | torch==2.2.1 178 | tornado==6.4 179 | tqdm==4.66.2 180 | traitlets==5.14.2 181 | triton==2.2.0 182 | types-python-dateutil==2.9.0.20240316 183 | typing_extensions==4.10.0 184 | tzdata==2024.1 185 | uc-micro-py==1.0.3 186 | uri-template==1.3.0 187 | urllib3==2.0.7 188 | wcwidth==0.2.13 189 | webcolors==1.13 190 | webencodings==0.5.1 191 | websocket-client==1.7.0 192 | Werkzeug==3.0.2 193 | xlsx2csv==0.8.2 194 | XlsxWriter==3.2.0 195 | xyzservices==2023.10.1 196 | zipp==3.18.1 197 | zope.event==5.0 198 | zope.interface==6.2 199 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LinkedIn Learning Exercise Files License Agreement 2 | ================================================== 3 | 4 | This License Agreement (the "Agreement") is a binding legal agreement 5 | between you (as an individual or entity, as applicable) and LinkedIn 6 | Corporation (“LinkedIn”). By downloading or using the LinkedIn Learning 7 | exercise files in this repository (“Licensed Materials”), you agree to 8 | be bound by the terms of this Agreement. If you do not agree to these 9 | terms, do not download or use the Licensed Materials. 10 | 11 | 1. License. 12 | - a. Subject to the terms of this Agreement, LinkedIn hereby grants LinkedIn 13 | members during their LinkedIn Learning subscription a non-exclusive, 14 | non-transferable copyright license, for internal use only, to 1) make a 15 | reasonable number of copies of the Licensed Materials, and 2) make 16 | derivative works of the Licensed Materials for the sole purpose of 17 | practicing skills taught in LinkedIn Learning courses. 18 | - b. Distribution. Unless otherwise noted in the Licensed Materials, subject 19 | to the terms of this Agreement, LinkedIn hereby grants LinkedIn members 20 | with a LinkedIn Learning subscription a non-exclusive, non-transferable 21 | copyright license to distribute the Licensed Materials, except the 22 | Licensed Materials may not be included in any product or service (or 23 | otherwise used) to instruct or educate others. 
24 | 25 | 2. Restrictions and Intellectual Property. 26 | - a. You may not to use, modify, copy, make derivative works of, publish, 27 | distribute, rent, lease, sell, sublicense, assign or otherwise transfer the 28 | Licensed Materials, except as expressly set forth above in Section 1. 29 | - b. Linkedin (and its licensors) retains its intellectual property rights 30 | in the Licensed Materials. Except as expressly set forth in Section 1, 31 | LinkedIn grants no licenses. 32 | - c. You indemnify LinkedIn and its licensors and affiliates for i) any 33 | alleged infringement or misappropriation of any intellectual property rights 34 | of any third party based on modifications you make to the Licensed Materials, 35 | ii) any claims arising from your use or distribution of all or part of the 36 | Licensed Materials and iii) a breach of this Agreement. You will defend, hold 37 | harmless, and indemnify LinkedIn and its affiliates (and our and their 38 | respective employees, shareholders, and directors) from any claim or action 39 | brought by a third party, including all damages, liabilities, costs and 40 | expenses, including reasonable attorneys’ fees, to the extent resulting from, 41 | alleged to have resulted from, or in connection with: (a) your breach of your 42 | obligations herein; or (b) your use or distribution of any Licensed Materials. 43 | 44 | 3. Open source. This code may include open source software, which may be 45 | subject to other license terms as provided in the files. 46 | 47 | 4. Warranty Disclaimer. LINKEDIN PROVIDES THE LICENSED MATERIALS ON AN “AS IS” 48 | AND “AS AVAILABLE” BASIS. LINKEDIN MAKES NO REPRESENTATION OR WARRANTY, 49 | WHETHER EXPRESS OR IMPLIED, ABOUT THE LICENSED MATERIALS, INCLUDING ANY 50 | REPRESENTATION THAT THE LICENSED MATERIALS WILL BE FREE OF ERRORS, BUGS OR 51 | INTERRUPTIONS, OR THAT THE LICENSED MATERIALS ARE ACCURATE, COMPLETE OR 52 | OTHERWISE VALID. TO THE FULLEST EXTENT PERMITTED BY LAW, LINKEDIN AND ITS 53 | AFFILIATES DISCLAIM ANY IMPLIED OR STATUTORY WARRANTY OR CONDITION, INCLUDING 54 | ANY IMPLIED WARRANTY OR CONDITION OF MERCHANTABILITY OR FITNESS FOR A 55 | PARTICULAR PURPOSE, AVAILABILITY, SECURITY, TITLE AND/OR NON-INFRINGEMENT. 56 | YOUR USE OF THE LICENSED MATERIALS IS AT YOUR OWN DISCRETION AND RISK, AND 57 | YOU WILL BE SOLELY RESPONSIBLE FOR ANY DAMAGE THAT RESULTS FROM USE OF THE 58 | LICENSED MATERIALS TO YOUR COMPUTER SYSTEM OR LOSS OF DATA. NO ADVICE OR 59 | INFORMATION, WHETHER ORAL OR WRITTEN, OBTAINED BY YOU FROM US OR THROUGH OR 60 | FROM THE LICENSED MATERIALS WILL CREATE ANY WARRANTY OR CONDITION NOT 61 | EXPRESSLY STATED IN THESE TERMS. 62 | 63 | 5. Limitation of Liability. LINKEDIN SHALL NOT BE LIABLE FOR ANY INDIRECT, 64 | INCIDENTAL, SPECIAL, PUNITIVE, CONSEQUENTIAL OR EXEMPLARY DAMAGES, INCLUDING 65 | BUT NOT LIMITED TO, DAMAGES FOR LOSS OF PROFITS, GOODWILL, USE, DATA OR OTHER 66 | INTANGIBLE LOSSES . IN NO EVENT WILL LINKEDIN'S AGGREGATE LIABILITY TO YOU 67 | EXCEED $100. THIS LIMITATION OF LIABILITY SHALL: 68 | - i. APPLY REGARDLESS OF WHETHER (A) YOU BASE YOUR CLAIM ON CONTRACT, TORT, 69 | STATUTE, OR ANY OTHER LEGAL THEORY, (B) WE KNEW OR SHOULD HAVE KNOWN ABOUT 70 | THE POSSIBILITY OF SUCH DAMAGES, OR (C) THE LIMITED REMEDIES PROVIDED IN THIS 71 | SECTION FAIL OF THEIR ESSENTIAL PURPOSE; AND 72 | - ii. 
NOT APPLY TO ANY DAMAGE THAT LINKEDIN MAY CAUSE YOU INTENTIONALLY OR 73 | KNOWINGLY IN VIOLATION OF THESE TERMS OR APPLICABLE LAW, OR AS OTHERWISE 74 | MANDATED BY APPLICABLE LAW THAT CANNOT BE DISCLAIMED IN THESE TERMS. 75 | 76 | 6. Termination. This Agreement automatically terminates upon your breach of 77 | this Agreement or termination of your LinkedIn Learning subscription. On 78 | termination, all licenses granted under this Agreement will terminate 79 | immediately and you will delete the Licensed Materials. Sections 2-7 of this 80 | Agreement survive any termination of this Agreement. LinkedIn may discontinue 81 | the availability of some or all of the Licensed Materials at any time for any 82 | reason. 83 | 84 | 7. Miscellaneous. This Agreement will be governed by and construed in 85 | accordance with the laws of the State of California without regard to conflict 86 | of laws principles. The exclusive forum for any disputes arising out of or 87 | relating to this Agreement shall be an appropriate federal or state court 88 | sitting in the County of Santa Clara, State of California. If LinkedIn does 89 | not act to enforce a breach of this Agreement, that does not mean that 90 | LinkedIn has waived its right to enforce this Agreement. The Agreement does 91 | not create a partnership, agency relationship, or joint venture between the 92 | parties. Neither party has the power or authority to bind the other or to 93 | create any obligation or responsibility on behalf of the other. You may not, 94 | without LinkedIn’s prior written consent, assign or delegate any rights or 95 | obligations under these terms, including in connection with a change of 96 | control. Any purported assignment and delegation shall be ineffective. The 97 | Agreement shall bind and inure to the benefit of the parties, their respective 98 | successors and permitted assigns. If any provision of the Agreement is 99 | unenforceable, that provision will be modified to render it enforceable to the 100 | extent possible to give effect to the parties’ intentions and the remaining 101 | provisions will not be affected. This Agreement is the only agreement between 102 | you and LinkedIn regarding the Licensed Materials, and supersedes all prior 103 | agreements relating to the Licensed Materials. 
104 | 105 | Last Updated: March 2019 106 | -------------------------------------------------------------------------------- /ml-foundations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EDA\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | "## Exploring the Data\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import polars as pl\n", 25 | "import polars.selectors as cs\n", 26 | "import sklearn \n", 27 | "import catboost\n", 28 | "\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings('ignore')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# King County House Sales dataset from OpenML (includes Seattle)\n", 40 | "# this is an ARFF file, which is a text file with a specific format\n", 41 | "url = 'https://www.openml.org/data/download/22044765/dataset'\n", 42 | "cols = ['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n", 43 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',\n", 44 | " 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_year', 'date_month', 'date_day']\n", 45 | "\n", 46 | "raw = pl.read_csv(url, new_columns=cols, skip_rows=31, has_header=False)\n", 47 | "raw" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "raw.describe()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "raw.corr()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "(raw\n", 75 | " .to_pandas(use_pyarrow_extension_array=True)\n", 76 | " .corr()\n", 77 | " .style.background_gradient(cmap='RdBu', vmin=-1, vmax=1)\n", 78 | ")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "(raw\n", 88 | " .plot.scatter('sqft_living', 'price', alpha=0.1)\n", 89 | ")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "(raw\n", 99 | " .group_by('date_month', 'zipcode')\n", 100 | " .agg(pl.col('price').mean())\n", 101 | " .plot.line('date_month', 'price', by='zipcode')\n", 102 | " )" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "(raw\n", 112 | " .group_by('date_month', 'zipcode')\n", 113 | " .agg(pl.col('price').mean())\n", 114 | " .sort('date_month')\n", 115 | " .plot.line('date_month', 'price', by='zipcode', alpha=0.5)\n", 116 | " )" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# lat/long scatter plot\n", 126 | "(raw\n", 127 | " .sort('price')\n", 128 | " .plot.scatter(x='long', y='lat', alpha=0.5, c='price', s=1)\n", 129 | ")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# lat/long 
scatter plot\n", 139 | "(raw\n", 140 | " .filter(pl.col('price') > 1_000_000)\n", 141 | " .sort('price')\n", 142 | " .plot.scatter(x='long', y='lat', alpha=0.5, c='price', s=1)\n", 143 | ")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "\n", 165 | "## Data Preprocessing\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "def tweak_housing(df):\n", 175 | " return (df\n", 176 | " .with_columns(zipcode=pl.col('zipcode').cast(pl.String).cast(pl.Categorical),\n", 177 | " date=pl.date(pl.col('date_year'), pl.col('date_month'), pl.col('date_day')),\n", 178 | " yr_renovated=pl.col('yr_renovated').replace(0, None),\n", 179 | " )\n", 180 | " .select(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', \n", 181 | " 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', \n", 182 | " 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', \n", 183 | " 'sqft_lot15', 'date', #'date_year', 'date_month', 'date_day', \n", 184 | " ])\n", 185 | " )\n", 186 | "\n", 187 | "tweak_housing(raw)\n", 188 | " " 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "\n", 217 | "## Sklearn Pipelines\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "scrolled": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# The difference between sklearn pipelines and transformers is \n", 229 | "# that a pipeline is a sequence of steps. 
A transformer transforms\n", 230 | "# the data, and a pipeline is a sequence of transformers.\n", 231 | "# A ColumnTransformer applies multiple transformers to different\n", 232 | "# columns of the input data.\n", 233 | "\n", 234 | "from sklearn.pipeline import Pipeline\n", 235 | "from sklearn.compose import ColumnTransformer\n", 236 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", 237 | "from sklearn.impute import SimpleImputer\n", 238 | "from sklearn.model_selection import train_test_split\n", 239 | "from sklearn.preprocessing import FunctionTransformer\n", 240 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 241 | "from sklearn import set_config\n", 242 | "set_config(transform_output='polars')" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "print(tweak_housing(raw).select(cs.numeric()).columns)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true, 259 | "jupyter": { 260 | "outputs_hidden": true 261 | }, 262 | "scrolled": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living']\n", 267 | "std = StandardScaler()\n", 268 | "std.fit_transform(tweak_housing(raw).select(numeric_features))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true, 276 | "jupyter": { 277 | "outputs_hidden": true 278 | } 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living']\n", 283 | "\n", 284 | "num_pipeline = Pipeline([\n", 285 | " ('std', StandardScaler())])\n", 286 | "\n", 287 | "num_pipeline.fit_transform(\n", 288 | " tweak_housing(raw)\n", 289 | " .select(numeric_features)\n", 290 | ")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": true, 298 | "jupyter": { 299 | "outputs_hidden": true 300 | }, 301 | "scrolled": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "# add another step\n", 306 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living']\n", 307 | "\n", 308 | "num_pipeline = Pipeline([\n", 309 | " ('imputer', SimpleImputer(strategy='median')),\n", 310 | " ('std', StandardScaler())])\n", 311 | "\n", 312 | "num_pipeline.fit_transform(\n", 313 | " tweak_housing(raw)\n", 314 | " .select(numeric_features)\n", 315 | ")" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "scrolled": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "cat_features = ['zipcode']\n", 327 | "\n", 328 | "ohe = OneHotEncoder(handle_unknown='ignore')\n", 329 | "# sparse_output=False)\n", 330 | "\n", 331 | "ohe.fit_transform(\n", 332 | " tweak_housing(raw)\n", 333 | " .select(cat_features)\n", 334 | ")" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "scrolled": true 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "cat_features = ['zipcode']\n", 346 | "\n", 347 | "ohe = OneHotEncoder(handle_unknown='ignore',\n", 348 | " sparse_output=False)\n", 349 | "\n", 350 | "ohe.fit_transform(\n", 351 | " tweak_housing(raw)\n", 352 | " .select(cat_features)\n", 353 | ")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true, 361 | "jupyter": { 362 | 
"outputs_hidden": true 363 | } 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "cat_features = ['zipcode']\n", 368 | "\n", 369 | "ohe = OneHotEncoder(handle_unknown='ignore',\n", 370 | " sparse_output=False, max_categories=10)\n", 371 | "\n", 372 | "ohe.fit_transform(\n", 373 | " tweak_housing(raw)\n", 374 | " .select(cat_features)\n", 375 | ")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "scrolled": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "cat_features = ['zipcode']\n", 387 | "\n", 388 | "cat_pipeline = Pipeline(steps=[\n", 389 | " ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])\n", 390 | "\n", 391 | "cat_pipeline.set_params(cat__max_categories=10)\n", 392 | "cat_pipeline.fit_transform(\n", 393 | " tweak_housing(raw)\n", 394 | " .select(cat_features)\n", 395 | ")" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# transformer from a function\n", 405 | "tweak_transformer = FunctionTransformer(tweak_housing)\n", 406 | "\n", 407 | "tweak_transformer.fit_transform(raw)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "categorical_features = ['zipcode']\n", 417 | "\n", 418 | "numeric_transformer = Pipeline(steps=[\n", 419 | " ('imputer', SimpleImputer(strategy='median')),\n", 420 | " ('scaler', StandardScaler())])\n", 421 | "\n", 422 | "ct = ColumnTransformer(\n", 423 | " transformers=[\n", 424 | " ('num', numeric_transformer, numeric_features),\n", 425 | " ('cat', OneHotEncoder(handle_unknown='ignore',\n", 426 | " sparse_output=False), categorical_features)])\n", 427 | "\n", 428 | "ct.fit_transform(\n", 429 | " tweak_housing(raw)\n", 430 | " .select([*numeric_features, *cat_features])\n", 431 | ")" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "# Custom transformer \n", 441 | "class ZipAvgPriceAdder(BaseEstimator, TransformerMixin):\n", 442 | " def __init__(self):\n", 443 | " pass\n", 444 | " def fit(self, X, y=None):\n", 445 | " # assume X is a polars dataframe\n", 446 | " self.zip_avg_price = (X\n", 447 | " .group_by('zipcode')\n", 448 | " .agg(zip_mean=pl.col('price').mean())\n", 449 | " )\n", 450 | " return self\n", 451 | " \n", 452 | " def transform(self, X, y=None):\n", 453 | " return X.join(self.zip_avg_price, on='zipcode')\n", 454 | "\n", 455 | "zip_adder = ZipAvgPriceAdder()\n", 456 | "zip_adder.fit_transform(raw.select(['zipcode', 'price']))" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "scrolled": true 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "# make the pipeline\n", 468 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n", 469 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', \n", 470 | " 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'zip_mean']\n", 471 | "numeric_transformer = Pipeline(steps=[\n", 472 | " ('imputer', SimpleImputer(strategy='median')),\n", 473 | " ('scaler', StandardScaler())])\n", 474 | "\n", 475 | "categorical_features = ['zipcode']\n", 476 | "\n", 477 | "preprocessor = ColumnTransformer(\n", 478 | " transformers=[\n", 479 | " ('num', numeric_transformer, numeric_features),\n", 480 
| " ('cat', OneHotEncoder(handle_unknown='ignore',\n", 481 | " sparse_output=False), categorical_features)])\n", 482 | "\n", 483 | "tweak_transformer = FunctionTransformer(tweak_housing)\n", 484 | "\n", 485 | "class ZipAvgPriceAdder(BaseEstimator, TransformerMixin):\n", 486 | " def __init__(self):\n", 487 | " pass\n", 488 | " def fit(self, X, y=None):\n", 489 | " # assume X is a polars dataframe\n", 490 | " self.zip_avg_price = (X\n", 491 | " .group_by('zipcode')\n", 492 | " .agg(zip_mean=pl.col('price').mean())\n", 493 | " )\n", 494 | " return self\n", 495 | " \n", 496 | " def transform(self, X, y=None):\n", 497 | " return X.join(self.zip_avg_price, on='zipcode')\n", 498 | "\n", 499 | "# Append classifier to preprocessing pipeline.\n", 500 | "# Now we have a full prediction pipeline.\n", 501 | "pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 502 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 503 | " ('preprocessor', preprocessor),\n", 504 | " ])\n", 505 | "\n", 506 | "X = raw #.drop('price')\n", 507 | "y = raw.select('price') # Note sklearn wants a Polars dataframe for y\n", 508 | "\n", 509 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", 510 | "\n", 511 | "pipe.fit_transform(raw, raw.select('price'))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "pipe" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# Note sklearn wants a Polars dataframe for y\n", 530 | "X = raw #.drop('price')\n", 531 | "y = raw.select('price') \n", 532 | "#y = raw['price']\n", 533 | "\n", 534 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": { 575 | "jp-MarkdownHeadingCollapsed": true 576 | }, 577 | "source": [ 578 | "\n", 579 | "## Challenge\n", 580 | "\n", 581 | "Make a plot to explore the relationship between the number of bedrooms and the price of the house." 
582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "\n", 624 | "## Solution" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": { 665 | "vscode": { 666 | "languageId": "plaintext" 667 | } 668 | }, 669 | "source": [ 670 | "# Model Creation\n" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": {}, 676 | "source": [ 677 | "\n", 678 | "## Dummy Model\n" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": { 685 | "scrolled": true 686 | }, 687 | "outputs": [], 688 | "source": [ 689 | "from sklearn.dummy import DummyRegressor\n", 690 | "\n", 691 | "dummy = DummyRegressor(strategy='mean')\n", 692 | "y = raw.select('price')\n", 693 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 694 | "dummy_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 695 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 696 | " ('preprocessor', preprocessor),\n", 697 | " ('dummy', dummy),\n", 698 | " ])\n", 699 | "\n", 700 | "dummy_pipe.fit(X_train, y_train)\n", 701 | "dummy_pipe.score(X_test, y_test)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "dummy_pipe" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": { 717 | "scrolled": true 718 | }, 719 | "outputs": [], 720 | "source": [ 721 | "dummy_pipe.predict(X_test)" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "\n", 750 | "## Linear Regression\n" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 
null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "from sklearn.linear_model import LinearRegression\n", 760 | "\n", 761 | "\n", 762 | "lr = LinearRegression()\n", 763 | "y = raw.select('price')\n", 764 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 765 | "lr_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 766 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 767 | " ('preprocessor', preprocessor),\n", 768 | " ('lr', lr),\n", 769 | " ])\n", 770 | "\n", 771 | "lr_pipe.fit(X_train, y_train)\n", 772 | "lr_pipe.score(X_test, y_test)" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": null, 778 | "metadata": {}, 779 | "outputs": [], 780 | "source": [ 781 | "lr_pipe.predict(X_test)" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "\n", 803 | "## Decision Trees\n" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "from sklearn.tree import DecisionTreeRegressor\n", 813 | "\n", 814 | "\n", 815 | "dt = DecisionTreeRegressor()\n", 816 | "y = raw.select('price')\n", 817 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 818 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 819 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 820 | " ('preprocessor', preprocessor),\n", 821 | " ('dt', dt),\n", 822 | " ])\n", 823 | "\n", 824 | "dt_pipe.fit(X_train, y_train)\n", 825 | "dt_pipe.score(X_test, y_test)" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": null, 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "dt_pipe.set_params(dt__max_depth=1)\n", 835 | "dt_pipe.fit(X_train, y_train)\n", 836 | "dt_pipe.score(X_test, y_test)" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "dt_pipe.set_params(dt__max_depth=9)\n", 846 | "dt_pipe.fit(X_train, y_train)\n", 847 | "dt_pipe.score(X_test, y_test)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": {}, 861 | "outputs": [], 862 | "source": [] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": null, 867 | "metadata": {}, 868 | "outputs": [], 869 | "source": [] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "\n", 876 | "## CatBoost\n" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": { 890 | "scrolled": true 891 | }, 892 | "outputs": [], 893 | "source": [ 894 | "from catboost import CatBoostRegressor\n", 895 | "\n", 896 | "\n", 897 | "cat = CatBoostRegressor()\n", 898 | "# has issues with Polars input going to use a pandas_transformer\n", 899 | "def to_pandas(df):\n", 900 | " return df.to_pandas()\n", 901 | 
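"# (A note on the wrapper: FunctionTransformer turns a plain function into a transformer so it can sit in a Pipeline; here it converts the Polars frame to pandas right before it reaches CatBoost.)\n",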
"pandas_transformer = FunctionTransformer(to_pandas)\n", 902 | "\n", 903 | "y = raw.select('price')\n", 904 | "\n", 905 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 906 | "cat_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 907 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 908 | " ('preprocessor', preprocessor),\n", 909 | " ('to_pandas', pandas_transformer),\n", 910 | " ('cat', cat), \n", 911 | " ])\n", 912 | "\n", 913 | "cat_pipe.fit(X_train, y_train.to_numpy()[:,0])\n", 914 | "cat_pipe.score(X_test, y_test.to_numpy()[:,0])" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | "metadata": {}, 921 | "outputs": [], 922 | "source": [] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": null, 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "\n", 943 | "## Challenge\n", 944 | "\n", 945 | "Create a pipeline for a Random Forest model and train it on the data. (see `ensemble.RandomForestRegressor` in scikit-learn). What is the score?" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [] 954 | }, 955 | { 956 | "cell_type": "markdown", 957 | "metadata": {}, 958 | "source": [ 959 | "\n", 960 | "\n", 961 | "## Solution" 962 | ] 963 | }, 964 | { 965 | "cell_type": "code", 966 | "execution_count": null, 967 | "metadata": {}, 968 | "outputs": [], 969 | "source": [] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": {}, 975 | "outputs": [], 976 | "source": [] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": {}, 989 | "outputs": [], 990 | "source": [] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "metadata": {}, 996 | "outputs": [], 997 | "source": [] 998 | }, 999 | { 1000 | "cell_type": "markdown", 1001 | "metadata": { 1002 | "vscode": { 1003 | "languageId": "plaintext" 1004 | } 1005 | }, 1006 | "source": [ 1007 | "# Evaluation\n" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "markdown", 1012 | "metadata": {}, 1013 | "source": [ 1014 | "\n", 1015 | "## R2\n", 1016 | "\n", 1017 | "\n", 1018 | "The Coefficient of Determination, R2, is a measure of how well the model fits the data. It is a value between 0 and 1. It tells us how much of the variance in the target variable is predictable from the features.\n", 1019 | "\n", 1020 | "A value of 0 means that the model explains none of the variability. A value of 1 means that the model explains all the variability.\n", 1021 | "\n", 1022 | "Note that it doesn't indicate whether a model is overfitting or underfitting the data." 
1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "code", 1027 | "execution_count": null, 1028 | "metadata": { 1029 | "scrolled": true 1030 | }, 1031 | "outputs": [], 1032 | "source": [ 1033 | "cat_pipe.score(X_test, y_test.to_numpy()[:,0])" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": null, 1039 | "metadata": {}, 1040 | "outputs": [], 1041 | "source": [] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": null, 1046 | "metadata": {}, 1047 | "outputs": [], 1048 | "source": [] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "execution_count": null, 1053 | "metadata": {}, 1054 | "outputs": [], 1055 | "source": [] 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": null, 1060 | "metadata": {}, 1061 | "outputs": [], 1062 | "source": [] 1063 | }, 1064 | { 1065 | "cell_type": "markdown", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "\n", 1069 | "## Mean Squared/Absolute Error\n" 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "execution_count": null, 1075 | "metadata": {}, 1076 | "outputs": [], 1077 | "source": [ 1078 | "from sklearn.metrics import mean_squared_error\n", 1079 | "\n", 1080 | "mean_squared_error(y_test, cat_pipe.predict(X_test))" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": null, 1086 | "metadata": {}, 1087 | "outputs": [], 1088 | "source": [ 1089 | "# rmse\n", 1090 | "mean_squared_error(y_test, cat_pipe.predict(X_test), squared=False)" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": null, 1096 | "metadata": { 1097 | "scrolled": true 1098 | }, 1099 | "outputs": [], 1100 | "source": [ 1101 | "# absolute error\n", 1102 | "from sklearn.metrics import mean_absolute_error\n", 1103 | "\n", 1104 | "mean_absolute_error(y_test, cat_pipe.predict(X_test))" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": null, 1110 | "metadata": {}, 1111 | "outputs": [], 1112 | "source": [ 1113 | "# compare to lr model\n", 1114 | "from sklearn.metrics import mean_absolute_error\n", 1115 | "\n", 1116 | "mean_absolute_error(y_test, lr_pipe.predict(X_test))" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": {}, 1123 | "outputs": [], 1124 | "source": [] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": null, 1129 | "metadata": {}, 1130 | "outputs": [], 1131 | "source": [] 1132 | }, 1133 | { 1134 | "cell_type": "code", 1135 | "execution_count": null, 1136 | "metadata": {}, 1137 | "outputs": [], 1138 | "source": [] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": null, 1143 | "metadata": {}, 1144 | "outputs": [], 1145 | "source": [] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": null, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": null, 1157 | "metadata": {}, 1158 | "outputs": [], 1159 | "source": [] 1160 | }, 1161 | { 1162 | "cell_type": "markdown", 1163 | "metadata": {}, 1164 | "source": [ 1165 | "\n", 1166 | "## Residuals Plot\n" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "execution_count": null, 1172 | "metadata": {}, 1173 | "outputs": [], 1174 | "source": [ 1175 | "# make a residual plot\n", 1176 | "import matplotlib.pyplot as plt\n", 1177 | "\n", 1178 | "ax = plt.scatter(cat_pipe.predict(X_test), \n", 1179 | " y_test.to_series().to_numpy() - cat_pipe.predict(X_test), 
alpha=0.1)\n", 1180 | "# make labels not be scientific notation\n", 1181 | "plt.ticklabel_format(style='plain', axis='y')\n", 1182 | "plt.ticklabel_format(style='plain', axis='x')\n", 1183 | "plt.ylim(-500_000, 500_000)\n", 1184 | "plt.xlabel('Predicted price')\n", 1185 | "plt.ylabel('Residual')\n", 1186 | "plt.title('Residual plot')" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": null, 1192 | "metadata": {}, 1193 | "outputs": [], 1194 | "source": [ 1195 | "# plot with Polars\n", 1196 | "(y_test\n", 1197 | " .with_columns(predicted_price=cat_pipe.predict(X_test),\n", 1198 | " residual=y_test.to_series().to_numpy() - cat_pipe.predict(X_test))\n", 1199 | " .plot.scatter('predicted_price', 'residual', alpha=0.1, yformatter='$%.0f',\n", 1200 | " xformatter='$%.0f')\n", 1201 | " )" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": null, 1207 | "metadata": {}, 1208 | "outputs": [], 1209 | "source": [ 1210 | "def residuals_plot(model, X_train, y_train, X_test, y_test):\n", 1211 | " return (y_test\n", 1212 | " .with_columns(prediction=model.predict(X_test),\n", 1213 | " residual=y_test.to_series().to_numpy() - model.predict(X_test),\n", 1214 | " type=pl.lit('test'))\n", 1215 | " .vstack(y_train\n", 1216 | " .with_columns(prediction=model.predict(X_train),\n", 1217 | " residual=y_train.to_series().to_numpy() - model.predict(X_train),\n", 1218 | " type=pl.lit('train'))\n", 1219 | " )\n", 1220 | " .reverse()\n", 1221 | " .plot.scatter('prediction', 'residual', alpha=0.1, yformatter='$%.0f',\n", 1222 | " xformatter='$%.0f', by='type')\n", 1223 | " )\n", 1224 | "\n", 1225 | "residuals_plot(cat_pipe, X_train, y_train, X_test, y_test)" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": null, 1231 | "metadata": {}, 1232 | "outputs": [], 1233 | "source": [ 1234 | "residuals_plot(dt_pipe, X_train, y_train, X_test, y_test)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": {}, 1241 | "outputs": [], 1242 | "source": [] 1243 | }, 1244 | { 1245 | "cell_type": "code", 1246 | "execution_count": null, 1247 | "metadata": {}, 1248 | "outputs": [], 1249 | "source": [] 1250 | }, 1251 | { 1252 | "cell_type": "code", 1253 | "execution_count": null, 1254 | "metadata": {}, 1255 | "outputs": [], 1256 | "source": [] 1257 | }, 1258 | { 1259 | "cell_type": "markdown", 1260 | "metadata": {}, 1261 | "source": [ 1262 | "## Challenge\n", 1263 | "\n", 1264 | "What is the mean squared error of the Random Forest model? What is the R2 score? What do these values tell us about the model?" 
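, "\n", "(Hint: the same pattern used for `cat_pipe` above applies; assuming the pipeline you built in the earlier Random Forest challenge is named `rf_pipe`, try `mean_squared_error(y_test, rf_pipe.predict(X_test))` and `rf_pipe.score(X_test, y_test)`.)"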
1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": null, 1270 | "metadata": {}, 1271 | "outputs": [], 1272 | "source": [] 1273 | }, 1274 | { 1275 | "cell_type": "markdown", 1276 | "metadata": {}, 1277 | "source": [ 1278 | "\n", 1279 | "\n", 1280 | "## Solution" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": null, 1286 | "metadata": {}, 1287 | "outputs": [], 1288 | "source": [] 1289 | }, 1290 | { 1291 | "cell_type": "code", 1292 | "execution_count": null, 1293 | "metadata": {}, 1294 | "outputs": [], 1295 | "source": [] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "metadata": {}, 1301 | "outputs": [], 1302 | "source": [] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": null, 1307 | "metadata": {}, 1308 | "outputs": [], 1309 | "source": [] 1310 | }, 1311 | { 1312 | "cell_type": "markdown", 1313 | "metadata": {}, 1314 | "source": [ 1315 | "# Model Tuning\n" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "markdown", 1320 | "metadata": {}, 1321 | "source": [ 1322 | "\n", 1323 | "## Hyperparameters\n", 1324 | "\n", 1325 | "Hyperparameters are the levers we can pull to adjust the behavior of a model. They are set before the model is trained and remain constant during training." 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "markdown", 1330 | "metadata": {}, 1331 | "source": [ 1332 | "\n", 1333 | "## Tuning Linear Regression\n" 1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": null, 1339 | "metadata": {}, 1340 | "outputs": [], 1341 | "source": [ 1342 | "lr_pipe" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": null, 1348 | "metadata": {}, 1349 | "outputs": [], 1350 | "source": [ 1351 | "lr_pipe.named_steps['lr']" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "code", 1356 | "execution_count": null, 1357 | "metadata": { 1358 | "scrolled": true 1359 | }, 1360 | "outputs": [], 1361 | "source": [ 1362 | "help(lr_pipe.named_steps['lr'])" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "code", 1367 | "execution_count": null, 1368 | "metadata": { 1369 | "scrolled": true 1370 | }, 1371 | "outputs": [], 1372 | "source": [ 1373 | "from sklearn.linear_model import Ridge\n", 1374 | "Ridge?" 
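, "\n", "# Context (standard scikit-learn behavior): Ridge is linear regression with an\n", "# L2 penalty; it minimizes ||y - Xw||^2 + alpha * ||w||^2, so larger alpha\n", "# shrinks the coefficients and alpha=0 reduces to ordinary least squares."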
1375 | ] 1376 | }, 1377 | { 1378 | "cell_type": "code", 1379 | "execution_count": null, 1380 | "metadata": {}, 1381 | "outputs": [], 1382 | "source": [ 1383 | "rr = Ridge()\n", 1384 | "y = raw.select('price')\n", 1385 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 1386 | "rr_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1387 | "                       ('zip_avg_price', ZipAvgPriceAdder()),\n", 1388 | "                       ('preprocessor', preprocessor),\n", 1389 | "                       ('rr', rr),\n", 1390 | "                      ])\n", 1391 | "\n", 1392 | "rr_pipe.fit(X_train, y_train)\n", 1393 | "rr_pipe.score(X_test, y_test)" 1394 | ] 1395 | }, 1396 | { 1397 | "cell_type": "code", 1398 | "execution_count": null, 1399 | "metadata": {}, 1400 | "outputs": [], 1401 | "source": [] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "metadata": { 1407 | "scrolled": true 1408 | }, 1409 | "outputs": [], 1410 | "source": [ 1411 | "lr_pipe.score(X_test, y_test)" 1412 | ] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": null, 1417 | "metadata": { 1418 | "collapsed": true, 1419 | "jupyter": { 1420 | "outputs_hidden": true 1421 | } 1422 | }, 1423 | "outputs": [], 1424 | "source": [ 1425 | "from sklearn.model_selection import validation_curve\n", 1426 | "\n", 1427 | "param_range = [0, .01, .05, .1, .5, 1, 2]\n", 1428 | "scores = []\n", 1429 | "for val in param_range:\n", 1430 | "    rr_pipe.set_params(rr__alpha=val)\n", 1431 | "    rr_pipe.fit(X_train, y_train)\n", 1432 | "    scores.append(rr_pipe.score(X_test, y_test))" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "code", 1437 | "execution_count": null, 1438 | "metadata": {}, 1439 | "outputs": [], 1440 | "source": [ 1441 | "# Our best score is at alpha=0 (which is equivalent to ordinary Linear Regression)\n", 1442 | "alpha = pl.DataFrame({'val': param_range,\n", 1443 | "                     'scores': scores})\n", 1444 | "alpha.plot(x='val', y='scores')" 1445 | ] 1446 | }, 1447 | { 1448 | "cell_type": "code", 1449 | "execution_count": null, 1450 | "metadata": {}, 1451 | "outputs": [], 1452 | "source": [] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": null, 1457 | "metadata": {}, 1458 | "outputs": [], 1459 | "source": [] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": null, 1464 | "metadata": {}, 1465 | "outputs": [], 1466 | "source": [] 1467 | }, 1468 | { 1469 | "cell_type": "code", 1470 | "execution_count": null, 1471 | "metadata": {}, 1472 | "outputs": [], 1473 | "source": [] 1474 | }, 1475 | { 1476 | "cell_type": "markdown", 1477 | "metadata": {}, 1478 | "source": [ 1479 | "\n", 1480 | "## Tuning Decision Trees\n" 1481 | ] 1482 | }, 1483 | { 1484 | "cell_type": "code", 1485 | "execution_count": null, 1486 | "metadata": { 1487 | "scrolled": true 1488 | }, 1489 | "outputs": [], 1490 | "source": [ 1491 | "dt_pipe.named_steps['dt']" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "code", 1496 | "execution_count": null, 1497 | "metadata": { 1498 | "scrolled": true 1499 | }, 1500 | "outputs": [], 1501 | "source": [ 1502 | "help(dt_pipe.named_steps['dt'])" 1503 | ] 1504 | }, 1505 | { 1506 | "cell_type": "code", 1507 | "execution_count": null, 1508 | "metadata": { 1509 | "scrolled": true 1510 | }, 1511 | "outputs": [], 1512 | "source": [ 1513 | "# plot a validation curve tracking mse as the max_depth of the decision tree increases\n", 1514 | "from sklearn.model_selection import validation_curve\n", 1515 | "\n", 1516 | "param_range = range(1, 20)\n", 1517 | "train_scores, test_scores = validation_curve(\n",
dt_pipe, X_train, y_train, param_name=\"dt__max_depth\", param_range=param_range,\n", 1519 | " scoring=\"neg_mean_squared_error\", n_jobs=1)" 1520 | ] 1521 | }, 1522 | { 1523 | "cell_type": "code", 1524 | "execution_count": null, 1525 | "metadata": {}, 1526 | "outputs": [], 1527 | "source": [ 1528 | "# make a validation curve from train_scores and test_scores\n", 1529 | "import matplotlib.pyplot as plt\n", 1530 | "import numpy as np\n", 1531 | "\n", 1532 | "train_scores_mean = np.mean(train_scores, axis=1)\n", 1533 | "train_scores_std = np.std(train_scores, axis=1)\n", 1534 | "test_scores_mean = np.mean(test_scores, axis=1)\n", 1535 | "test_scores_std = np.std(test_scores, axis=1)\n", 1536 | "\n", 1537 | "plt.title(\"Validation Curve with Decision Tree\")\n", 1538 | "plt.xlabel(\"max_depth\")\n", 1539 | "plt.ylabel(\"Score\")\n", 1540 | "#plt.ylim(-1, 0)\n", 1541 | "lw = 2\n", 1542 | "plt.plot(param_range, train_scores_mean, label=\"Training score\",\n", 1543 | " color=\"darkorange\", lw=lw)\n", 1544 | "plt.fill_between(param_range, train_scores_mean - train_scores_std,\n", 1545 | " train_scores_mean + train_scores_std, alpha=0.2,\n", 1546 | " color=\"darkorange\", lw=lw)\n", 1547 | "plt.plot(param_range, test_scores_mean, label=\"Cross-validation score\",\n", 1548 | " color=\"navy\", lw=lw)\n", 1549 | "\n", 1550 | "plt.fill_between(param_range, test_scores_mean - test_scores_std, \n", 1551 | " test_scores_mean + test_scores_std, alpha=0.2,\n", 1552 | " color=\"navy\", lw=lw)\n", 1553 | "plt.legend(loc=\"best\")\n", 1554 | "\n", 1555 | "\n" 1556 | ] 1557 | }, 1558 | { 1559 | "cell_type": "code", 1560 | "execution_count": null, 1561 | "metadata": {}, 1562 | "outputs": [], 1563 | "source": [ 1564 | "# train dt_pipe with max_depth=8\n", 1565 | "dt8_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1566 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1567 | " ('to_pandas', pandas_transformer),\n", 1568 | " ('preprocessor', preprocessor),\n", 1569 | " ('dt', DecisionTreeRegressor(max_depth=8)),\n", 1570 | " ])\n", 1571 | "\n", 1572 | "dt8_pipe.fit(X_train, y_train)\n", 1573 | "dt8_pipe.score(X_test, y_test)" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "execution_count": null, 1579 | "metadata": {}, 1580 | "outputs": [], 1581 | "source": [ 1582 | "from sklearn.metrics import mean_squared_error\n", 1583 | "mean_squared_error(y_test, dt8_pipe.predict(X_test), squared=False) " 1584 | ] 1585 | }, 1586 | { 1587 | "cell_type": "code", 1588 | "execution_count": null, 1589 | "metadata": {}, 1590 | "outputs": [], 1591 | "source": [ 1592 | "dt_pipe.score(X_test, y_test)" 1593 | ] 1594 | }, 1595 | { 1596 | "cell_type": "code", 1597 | "execution_count": null, 1598 | "metadata": {}, 1599 | "outputs": [], 1600 | "source": [ 1601 | "mean_squared_error(y_test, dt_pipe.predict(X_test), squared=False) " 1602 | ] 1603 | }, 1604 | { 1605 | "cell_type": "code", 1606 | "execution_count": null, 1607 | "metadata": {}, 1608 | "outputs": [], 1609 | "source": [] 1610 | }, 1611 | { 1612 | "cell_type": "code", 1613 | "execution_count": null, 1614 | "metadata": {}, 1615 | "outputs": [], 1616 | "source": [] 1617 | }, 1618 | { 1619 | "cell_type": "code", 1620 | "execution_count": null, 1621 | "metadata": {}, 1622 | "outputs": [], 1623 | "source": [] 1624 | }, 1625 | { 1626 | "cell_type": "code", 1627 | "execution_count": null, 1628 | "metadata": {}, 1629 | "outputs": [], 1630 | "source": [] 1631 | }, 1632 | { 1633 | "cell_type": "code", 1634 | "execution_count": null, 1635 | "metadata": {}, 1636 | 
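"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A single train/test split gives a noisy estimate. As a quick, hedged sketch (assuming `dt_pipe` and `dt8_pipe` are defined as above), 5-fold cross-validation gives a steadier comparison of the default-depth tree and the depth-8 tree:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: compare the default tree and the depth-8 tree with 5-fold\n",
"# cross-validation instead of a single train/test split\n",
"from sklearn.model_selection import cross_val_score\n",
"import numpy as np\n",
"\n",
"for name, pipe in [('default', dt_pipe), ('max_depth=8', dt8_pipe)]:\n",
"    scores = cross_val_score(pipe, X_train, y_train, cv=5,\n",
"                             scoring='neg_root_mean_squared_error')\n",
"    print(name, f'RMSE {-np.mean(scores):,.0f} +/- {np.std(scores):,.0f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},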
"outputs": [], 1637 | "source": [] 1638 | }, 1639 | { 1640 | "cell_type": "markdown", 1641 | "metadata": {}, 1642 | "source": [ 1643 | "\n", 1644 | "## Tuning CatBoost\n", 1645 | "\n", 1646 | "* Boosting - `iterations` (`num_trees`, `n_estimators`), `learning_rate` (`eta`), `early_stopping_rounds`\n", 1647 | "\n", 1648 | "* Tree based - `depth` (`max_depth`), `grow_policy`, `min_child_samples` (`min_data_in_leaf`), `max_leaves` (`num_leaves`)\n", 1649 | "\n", 1650 | "* Sampling - `subsample`, `sampling_frequency`, `rsm` (`colsample_bylevel`), `random_strength`, `bagging_temperature`\n", 1651 | "\n", 1652 | "* Regularization - `l2_leaf_reg` (`reg_lambda`), `model_shrink_rate`\n", 1653 | "\n", 1654 | "* Constraints - `monotone_constraints`, `feature_weights`" 1655 | ] 1656 | }, 1657 | { 1658 | "cell_type": "code", 1659 | "execution_count": null, 1660 | "metadata": { 1661 | "scrolled": true 1662 | }, 1663 | "outputs": [], 1664 | "source": [ 1665 | "catboost.CatBoostRegressor?" 1666 | ] 1667 | }, 1668 | { 1669 | "cell_type": "code", 1670 | "execution_count": null, 1671 | "metadata": {}, 1672 | "outputs": [], 1673 | "source": [ 1674 | "cr2 = catboost.CatBoostRegressor(iterations=3000, learning_rate=0.1,\n", 1675 | " early_stopping_rounds=10)\n", 1676 | "X_train, X_test, y_train, y_test = train_test_split(raw.drop('price'), y, \n", 1677 | " test_size=0.2, random_state=42)\n", 1678 | "\n", 1679 | "cr2.fit(X_train.to_pandas(), y_train.to_numpy(), cat_features=['zipcode'], verbose=100,\n", 1680 | " early_stopping_rounds=10, eval_set=(X_test.to_pandas(), y_test.to_numpy()))" 1681 | ] 1682 | }, 1683 | { 1684 | "cell_type": "code", 1685 | "execution_count": null, 1686 | "metadata": { 1687 | "scrolled": true 1688 | }, 1689 | "outputs": [], 1690 | "source": [ 1691 | "# plot a validation curve tracking mse as the max_depth of the decision tree increases\n", 1692 | "from sklearn.model_selection import validation_curve\n", 1693 | "\n", 1694 | "param_range = range(1, 10)\n", 1695 | "train_scores, test_scores = validation_curve(\n", 1696 | " cr2, X_train.to_pandas(), y_train.to_numpy(), param_name=\"max_depth\", \n", 1697 | " param_range=param_range,\n", 1698 | " scoring=\"neg_mean_squared_error\", n_jobs=1,\n", 1699 | " fit_params=dict(early_stopping_rounds=10, \n", 1700 | " eval_set=(X_test.to_pandas(), y_test.to_numpy())))" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "code", 1705 | "execution_count": null, 1706 | "metadata": {}, 1707 | "outputs": [], 1708 | "source": [ 1709 | "# make a validation curve from train_scores and test_scores\n", 1710 | "import matplotlib.pyplot as plt\n", 1711 | "import numpy as np\n", 1712 | "\n", 1713 | "train_scores_mean = np.mean(train_scores, axis=1)\n", 1714 | "train_scores_std = np.std(train_scores, axis=1)\n", 1715 | "test_scores_mean = np.mean(test_scores, axis=1)\n", 1716 | "test_scores_std = np.std(test_scores, axis=1)\n", 1717 | "\n", 1718 | "plt.title(\"Validation Curve with CatBoost\")\n", 1719 | "plt.xlabel(\"max_depth\")\n", 1720 | "plt.ylabel(\"Score\")\n", 1721 | "#plt.ylim(-1, 0)\n", 1722 | "lw = 2\n", 1723 | "plt.plot(param_range, train_scores_mean, label=\"Training score\",\n", 1724 | " color=\"darkorange\", lw=lw)\n", 1725 | "plt.fill_between(param_range, train_scores_mean - train_scores_std,\n", 1726 | " train_scores_mean + train_scores_std, alpha=0.2,\n", 1727 | " color=\"darkorange\", lw=lw)\n", 1728 | "plt.plot(param_range, test_scores_mean, label=\"Cross-validation score\",\n", 1729 | " color=\"navy\", lw=lw)\n", 1730 | "\n", 1731 | 
"plt.fill_between(param_range, test_scores_mean - test_scores_std, \n", 1732 | " test_scores_mean + test_scores_std, alpha=0.2,\n", 1733 | " color=\"navy\", lw=lw)\n", 1734 | "plt.legend(loc=\"best\")\n", 1735 | "\n", 1736 | "\n" 1737 | ] 1738 | }, 1739 | { 1740 | "cell_type": "code", 1741 | "execution_count": null, 1742 | "metadata": {}, 1743 | "outputs": [], 1744 | "source": [ 1745 | "# set max_depth to 4\n", 1746 | "cr2_4 = catboost.CatBoostRegressor(iterations=3000, learning_rate=0.1,\n", 1747 | " max_depth=4)\n", 1748 | "\n", 1749 | "X_train, X_test, y_train, y_test = train_test_split(raw.drop('price'), y, \n", 1750 | " test_size=0.2, random_state=42)\n", 1751 | "\n", 1752 | "cr2_4.fit(X_train.to_pandas(), y_train.to_numpy(), cat_features=['zipcode'], verbose=100,\n", 1753 | " early_stopping_rounds=10, eval_set=(X_test.to_pandas(), y_test.to_numpy()))\n", 1754 | "cr2_4.score(X_test.to_pandas(), y_test.to_numpy())" 1755 | ] 1756 | }, 1757 | { 1758 | "cell_type": "code", 1759 | "execution_count": null, 1760 | "metadata": { 1761 | "jupyter": { 1762 | "source_hidden": true 1763 | } 1764 | }, 1765 | "outputs": [], 1766 | "source": [] 1767 | }, 1768 | { 1769 | "cell_type": "code", 1770 | "execution_count": null, 1771 | "metadata": {}, 1772 | "outputs": [], 1773 | "source": [] 1774 | }, 1775 | { 1776 | "cell_type": "code", 1777 | "execution_count": null, 1778 | "metadata": {}, 1779 | "outputs": [], 1780 | "source": [] 1781 | }, 1782 | { 1783 | "cell_type": "code", 1784 | "execution_count": null, 1785 | "metadata": {}, 1786 | "outputs": [], 1787 | "source": [] 1788 | }, 1789 | { 1790 | "cell_type": "code", 1791 | "execution_count": null, 1792 | "metadata": {}, 1793 | "outputs": [], 1794 | "source": [] 1795 | }, 1796 | { 1797 | "cell_type": "code", 1798 | "execution_count": null, 1799 | "metadata": {}, 1800 | "outputs": [], 1801 | "source": [] 1802 | }, 1803 | { 1804 | "cell_type": "code", 1805 | "execution_count": null, 1806 | "metadata": {}, 1807 | "outputs": [], 1808 | "source": [] 1809 | }, 1810 | { 1811 | "cell_type": "markdown", 1812 | "metadata": {}, 1813 | "source": [ 1814 | "\n", 1815 | "## Grid Search\n" 1816 | ] 1817 | }, 1818 | { 1819 | "cell_type": "code", 1820 | "execution_count": null, 1821 | "metadata": { 1822 | "scrolled": true 1823 | }, 1824 | "outputs": [], 1825 | "source": [ 1826 | "from sklearn.tree import DecisionTreeRegressor\n", 1827 | "\n", 1828 | "\n", 1829 | "dt = DecisionTreeRegressor()\n", 1830 | "y = raw.select('price')\n", 1831 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 1832 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1833 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1834 | " ('preprocessor', preprocessor),\n", 1835 | " ('dt', dt),\n", 1836 | " ])\n", 1837 | "\n", 1838 | "dt_pipe.fit(X_train, y_train)\n", 1839 | "dt_pipe.score(X_test, y_test)" 1840 | ] 1841 | }, 1842 | { 1843 | "cell_type": "code", 1844 | "execution_count": null, 1845 | "metadata": { 1846 | "collapsed": true, 1847 | "jupyter": { 1848 | "outputs_hidden": true 1849 | }, 1850 | "scrolled": true 1851 | }, 1852 | "outputs": [], 1853 | "source": [ 1854 | "dt_pipe" 1855 | ] 1856 | }, 1857 | { 1858 | "cell_type": "code", 1859 | "execution_count": null, 1860 | "metadata": { 1861 | "scrolled": true 1862 | }, 1863 | "outputs": [], 1864 | "source": [ 1865 | "# use grid search on decision tree\n", 1866 | "from sklearn.model_selection import GridSearchCV\n", 1867 | "\n", 1868 | "param_grid = {\n", 1869 | " 'dt__max_depth': [3, 6, 
9],\n", 1870 | " 'dt__min_samples_split': [10, 20, 100],\n", 1871 | " 'dt__min_samples_leaf': [10, 20, 100],\n", 1872 | "}\n", 1873 | "\n", 1874 | "grid_search = GridSearchCV(dt_pipe, param_grid, cv=5)#, scoring='neg_mean_squared_error')\n", 1875 | "grid_search.fit(X_train, y_train)" 1876 | ] 1877 | }, 1878 | { 1879 | "cell_type": "code", 1880 | "execution_count": null, 1881 | "metadata": {}, 1882 | "outputs": [], 1883 | "source": [ 1884 | "grid_search.best_params_" 1885 | ] 1886 | }, 1887 | { 1888 | "cell_type": "code", 1889 | "execution_count": null, 1890 | "metadata": { 1891 | "scrolled": true 1892 | }, 1893 | "outputs": [], 1894 | "source": [ 1895 | "# make a tree from the params\n", 1896 | "dt = DecisionTreeRegressor()#max_depth=9, min_samples_leaf=20, min_samples_split=10)\n", 1897 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1898 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1899 | " ('to_pandas', pandas_transformer),\n", 1900 | " ('preprocessor', preprocessor),\n", 1901 | " ('dt', dt),\n", 1902 | " ])\n", 1903 | "dt_pipe.set_params(**grid_search.best_params_)\n", 1904 | "dt_pipe.fit(X_train, y_train)\n", 1905 | "dt_pipe.score(X_test, y_test)" 1906 | ] 1907 | }, 1908 | { 1909 | "cell_type": "code", 1910 | "execution_count": null, 1911 | "metadata": { 1912 | "scrolled": true 1913 | }, 1914 | "outputs": [], 1915 | "source": [ 1916 | "# compare to default\n", 1917 | "dt = DecisionTreeRegressor(random_state=42)\n", 1918 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1919 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1920 | " ('to_pandas', pandas_transformer),\n", 1921 | " ('preprocessor', preprocessor),\n", 1922 | " ('dt', dt),\n", 1923 | " ])\n", 1924 | "\n", 1925 | "dt_pipe.fit(X_train, y_train)\n", 1926 | "dt_pipe.score(X_test, y_test)" 1927 | ] 1928 | }, 1929 | { 1930 | "cell_type": "code", 1931 | "execution_count": null, 1932 | "metadata": {}, 1933 | "outputs": [], 1934 | "source": [] 1935 | }, 1936 | { 1937 | "cell_type": "code", 1938 | "execution_count": null, 1939 | "metadata": {}, 1940 | "outputs": [], 1941 | "source": [] 1942 | }, 1943 | { 1944 | "cell_type": "code", 1945 | "execution_count": null, 1946 | "metadata": {}, 1947 | "outputs": [], 1948 | "source": [] 1949 | }, 1950 | { 1951 | "cell_type": "code", 1952 | "execution_count": null, 1953 | "metadata": {}, 1954 | "outputs": [], 1955 | "source": [] 1956 | }, 1957 | { 1958 | "cell_type": "markdown", 1959 | "metadata": {}, 1960 | "source": [ 1961 | "\n", 1962 | "## Challenge\n", 1963 | "\n", 1964 | "Do a grid search to find the best depth for the random forest model. What is the best depth? What is the score of the model with the best depth?" 
1965 | ] 1966 | }, 1967 | { 1968 | "cell_type": "markdown", 1969 | "metadata": {}, 1970 | "source": [ 1971 | "\n", 1972 | "## Solution" 1973 | ] 1974 | }, 1975 | { 1976 | "cell_type": "code", 1977 | "execution_count": null, 1978 | "metadata": {}, 1979 | "outputs": [], 1980 | "source": [] 1981 | }, 1982 | { 1983 | "cell_type": "code", 1984 | "execution_count": null, 1985 | "metadata": {}, 1986 | "outputs": [], 1987 | "source": [] 1988 | }, 1989 | { 1990 | "cell_type": "code", 1991 | "execution_count": null, 1992 | "metadata": {}, 1993 | "outputs": [], 1994 | "source": [] 1995 | }, 1996 | { 1997 | "cell_type": "code", 1998 | "execution_count": null, 1999 | "metadata": {}, 2000 | "outputs": [], 2001 | "source": [] 2002 | }, 2003 | { 2004 | "cell_type": "markdown", 2005 | "metadata": {}, 2006 | "source": [ 2007 | "# Model Deployment\n" 2008 | ] 2009 | }, 2010 | { 2011 | "cell_type": "markdown", 2012 | "metadata": {}, 2013 | "source": [ 2014 | "\n", 2015 | "## End to end notebook\n" 2016 | ] 2017 | }, 2018 | { 2019 | "cell_type": "code", 2020 | "execution_count": null, 2021 | "metadata": {}, 2022 | "outputs": [], 2023 | "source": [ 2024 | "import polars as pl\n", 2025 | "from sklearn.pipeline import Pipeline\n", 2026 | "from sklearn.compose import ColumnTransformer\n", 2027 | "from sklearn.linear_model import LinearRegression\n", 2028 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", 2029 | "from sklearn.impute import SimpleImputer\n", 2030 | "from sklearn.model_selection import train_test_split\n", 2031 | "from sklearn.preprocessing import FunctionTransformer\n", 2032 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 2033 | "from sklearn import set_config\n", 2034 | "set_config(transform_output='polars')\n", 2035 | "\n", 2036 | "def tweak_housing(df):\n", 2037 | " return (df\n", 2038 | " .with_columns(zipcode=pl.col('zipcode').cast(pl.String).cast(pl.Categorical),\n", 2039 | " date=pl.date(pl.col('date_year'), pl.col('date_month'), pl.col('date_day')),\n", 2040 | " yr_renovated=pl.col('yr_renovated').replace(0, None),\n", 2041 | " )\n", 2042 | " .select(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', \n", 2043 | " 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', \n", 2044 | " 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', \n", 2045 | " 'sqft_lot15', 'date', #'date_year', 'date_month', 'date_day', \n", 2046 | " ])\n", 2047 | " )\n", 2048 | "\n", 2049 | "# make the pipeline\n", 2050 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n", 2051 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', \n", 2052 | " 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'zip_mean']\n", 2053 | "numeric_transformer = Pipeline(steps=[\n", 2054 | " ('imputer', SimpleImputer(strategy='median')),\n", 2055 | " ('scaler', StandardScaler())])\n", 2056 | "\n", 2057 | "categorical_features = ['zipcode']\n", 2058 | "\n", 2059 | "preprocessor = ColumnTransformer(\n", 2060 | " transformers=[\n", 2061 | " ('num', numeric_transformer, numeric_features),\n", 2062 | " ('cat', OneHotEncoder(handle_unknown='ignore',\n", 2063 | " sparse_output=False), categorical_features)])\n", 2064 | "\n", 2065 | "def to_pandas(df):\n", 2066 | " return df.to_pandas()\n", 2067 | "pandas_transformer = FunctionTransformer(to_pandas)\n", 2068 | "\n", 2069 | "tweak_transformer = FunctionTransformer(tweak_housing)\n", 2070 | "\n", 2071 | "class 
ZipAvgPriceAdder(BaseEstimator, TransformerMixin):\n",
2072 | " def __init__(self):\n",
2073 | " pass\n",
2074 | " def fit(self, X, y=None):\n",
2075 | " # assume X is a polars dataframe\n",
2076 | " self.zip_avg_price = (X\n",
2077 | " .group_by('zipcode')\n",
2078 | " .agg(zip_mean=pl.col('price').mean())\n",
2079 | " )\n",
2080 | " return self\n",
2081 | " \n",
2082 | " def transform(self, X, y=None):\n",
2083 | " with pl.StringCache():\n",
2084 | " return X.join(self.zip_avg_price, on='zipcode')\n",
2085 | "\n",
2086 | "\n",
2087 | "# King County House Sales dataset from OpenML (includes Seattle)\n",
2088 | "# this is an ARFF file, which is a text file with a specific format\n",
2089 | "url = 'https://www.openml.org/data/download/22044765/dataset'\n",
2090 | "cols = ['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n",
2091 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',\n",
2092 | " 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_year', 'date_month', 'date_day']\n",
2093 | "\n",
2094 | "raw = pl.read_csv(url, new_columns=cols, skip_rows=31, has_header=False)\n",
2095 | "\n",
2096 | "lr = LinearRegression()\n",
2097 | "y = raw.select('price')\n",
2098 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n",
2099 | "lr_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n",
2100 | " ('zip_avg_price', ZipAvgPriceAdder()),\n",
2101 | " ('preprocessor', preprocessor),\n",
2102 | " ('lr', lr),\n",
2103 | " ])\n",
2104 | "\n",
2105 | "lr_pipe.fit(X_train, y_train)\n",
2106 | "lr_pipe.score(X_test, y_test)\n",
2107 | " "
2108 | ]
2109 | },
2110 | {
2111 | "cell_type": "markdown",
2112 | "metadata": {},
2113 | "source": []
2114 | },
2115 | {
2116 | "cell_type": "code",
2117 | "execution_count": null,
2118 | "metadata": {},
2119 | "outputs": [],
2120 | "source": []
2121 | },
2122 | {
2123 | "cell_type": "code",
2124 | "execution_count": null,
2125 | "metadata": {},
2126 | "outputs": [],
2127 | "source": []
2128 | },
2129 | {
2130 | "cell_type": "code",
2131 | "execution_count": null,
2132 | "metadata": {},
2133 | "outputs": [],
2134 | "source": []
2135 | },
2136 | {
2137 | "cell_type": "code",
2138 | "execution_count": null,
2139 | "metadata": {},
2140 | "outputs": [],
2141 | "source": []
2142 | },
2143 | {
2144 | "cell_type": "code",
2145 | "execution_count": null,
2146 | "metadata": {},
2147 | "outputs": [],
2148 | "source": []
2149 | },
2150 | {
2151 | "cell_type": "markdown",
2152 | "metadata": {},
2153 | "source": [
2154 | "## Using MLFlow\n",
2155 | "\n",
2156 | "We'll show how to persist and load a model, but MLflow can also (see the CLI sketch after the model is logged below):\n",
2157 | "\n",
2158 | "- Start an endpoint to serve predictions\n",
2159 | "- Build a Docker image\n"
2160 | ]
2161 | },
2162 | {
2163 | "cell_type": "code",
2164 | "execution_count": null,
2165 | "metadata": {},
2166 | "outputs": [],
2167 | "source": [
2168 | "import mlflow"
2169 | ]
2170 | },
2171 | {
2172 | "cell_type": "code",
2173 | "execution_count": null,
2174 | "metadata": {},
2175 | "outputs": [],
2176 | "source": [
2177 | "mlflow.__version__"
2178 | ]
2179 | },
2180 | {
2181 | "cell_type": "code",
2182 | "execution_count": null,
2183 | "metadata": {},
2184 | "outputs": [],
2185 | "source": [
2186 | "model_info = mlflow.sklearn.log_model(lr_pipe, artifact_path='lr_pipe')"
2187 | ]
2188 | },
2189 | {
2190 | "cell_type": "code",
2191 | "execution_count": null,
2192 | "metadata": {},
2193 | "outputs": [],
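"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a hedged sketch of the other two options mentioned above (not run here): the MLflow CLI can serve the logged pipeline as a REST endpoint or package it as a Docker image. The `runs:/` URI is built from `model_info.run_id`; the image name `house-price-model` is only illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (commented out so the notebook still runs end to end):\n",
"# serve the logged model as a local REST endpoint on port 5001 ...\n",
"# !mlflow models serve -m runs:/{model_info.run_id}/lr_pipe -p 5001\n",
"# ... or build a Docker image for it (image name is illustrative)\n",
"# !mlflow models build-docker -m runs:/{model_info.run_id}/lr_pipe -n house-price-model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],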
"source": [ 2195 | "model_info.artifact_path " 2196 | ] 2197 | }, 2198 | { 2199 | "cell_type": "code", 2200 | "execution_count": null, 2201 | "metadata": {}, 2202 | "outputs": [], 2203 | "source": [ 2204 | "!tree" 2205 | ] 2206 | }, 2207 | { 2208 | "cell_type": "code", 2209 | "execution_count": null, 2210 | "metadata": {}, 2211 | "outputs": [], 2212 | "source": [ 2213 | "model_info.run_id" 2214 | ] 2215 | }, 2216 | { 2217 | "cell_type": "code", 2218 | "execution_count": null, 2219 | "metadata": {}, 2220 | "outputs": [], 2221 | "source": [ 2222 | "model = mlflow.pyfunc.load_model(f'mlruns/0/{model_info.run_id}/artifacts/lr_pipe')" 2223 | ] 2224 | }, 2225 | { 2226 | "cell_type": "code", 2227 | "execution_count": null, 2228 | "metadata": {}, 2229 | "outputs": [], 2230 | "source": [ 2231 | "model" 2232 | ] 2233 | }, 2234 | { 2235 | "cell_type": "code", 2236 | "execution_count": null, 2237 | "metadata": {}, 2238 | "outputs": [], 2239 | "source": [ 2240 | "model.predict(X_test)" 2241 | ] 2242 | }, 2243 | { 2244 | "cell_type": "code", 2245 | "execution_count": null, 2246 | "metadata": {}, 2247 | "outputs": [], 2248 | "source": [] 2249 | }, 2250 | { 2251 | "cell_type": "markdown", 2252 | "metadata": {}, 2253 | "source": [ 2254 | "## Challenge\n", 2255 | "\n", 2256 | "Reformat your notebook so that you can load the data and create an optimized random forest model in a single cell. Then, use MLFlow to log the model and its parameters." 2257 | ] 2258 | }, 2259 | { 2260 | "cell_type": "code", 2261 | "execution_count": null, 2262 | "metadata": {}, 2263 | "outputs": [], 2264 | "source": [] 2265 | }, 2266 | { 2267 | "cell_type": "markdown", 2268 | "metadata": {}, 2269 | "source": [ 2270 | "\n", 2271 | "## Solution" 2272 | ] 2273 | }, 2274 | { 2275 | "cell_type": "markdown", 2276 | "metadata": {}, 2277 | "source": [] 2278 | }, 2279 | { 2280 | "cell_type": "code", 2281 | "execution_count": null, 2282 | "metadata": {}, 2283 | "outputs": [], 2284 | "source": [] 2285 | }, 2286 | { 2287 | "cell_type": "code", 2288 | "execution_count": null, 2289 | "metadata": {}, 2290 | "outputs": [], 2291 | "source": [] 2292 | } 2293 | ], 2294 | "metadata": { 2295 | "kernelspec": { 2296 | "display_name": "Python 3 (ipykernel)", 2297 | "language": "python", 2298 | "name": "python3" 2299 | }, 2300 | "language_info": { 2301 | "codemirror_mode": { 2302 | "name": "ipython", 2303 | "version": 3 2304 | }, 2305 | "file_extension": ".py", 2306 | "mimetype": "text/x-python", 2307 | "name": "python", 2308 | "nbconvert_exporter": "python", 2309 | "pygments_lexer": "ipython3", 2310 | "version": "3.10.13" 2311 | } 2312 | }, 2313 | "nbformat": 4, 2314 | "nbformat_minor": 4 2315 | } 2316 | --------------------------------------------------------------------------------