├── .gitignore ├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ └── main.yml └── ISSUE_TEMPLATE.md ├── .devcontainer └── devcontainer.json ├── CONTRIBUTING.md ├── NOTICE ├── .vscode └── settings.json ├── README.md ├── requirements.txt ├── LICENSE └── ml-foundations.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | .tmp 4 | npm-debug.log 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Codeowners for these exercise files: 2 | # * (asterisk) denotes "all files and folders" 3 | # Example: * @producer @instructor 4 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Copy To Branches 2 | on: 3 | workflow_dispatch: 4 | jobs: 5 | copy-to-branches: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v2 9 | with: 10 | fetch-depth: 0 11 | - name: Copy To Branches Action 12 | uses: planetoftheweb/copy-to-branches@v1.2 13 | env: 14 | key: main 15 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "extensions": [ 3 | "GitHub.github-vscode-theme", 4 | "ms-toolsai.jupyter", 5 | "ms-python.python" 6 | // Additional Extensions Here 7 | ], 8 | "onCreateCommand" : "[ -f requirements.txt ] && pip install -r requirements.txt; echo PS1='\"$ \"' >> ~/.bashrc", //Set Terminal Prompt to $ 9 | } 10 | 11 | // DevContainer Reference: https://code.visualstudio.com/docs/remote/devcontainerjson-reference 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | Contribution Agreement 3 | ====================== 4 | 5 | This repository does not accept pull requests (PRs). All pull requests will be closed. 6 | 7 | However, if any contributions (through pull requests, issues, feedback or otherwise) are provided, as a contributor, you represent that the code you submit is your original work or that of your employer (in which case you represent you have the right to bind your employer). By submitting code (or otherwise providing feedback), you (and, if applicable, your employer) are licensing the submitted code (and/or feedback) to LinkedIn and the open source community subject to the BSD 2-Clause license. 8 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2024 LinkedIn Corporation 2 | All Rights Reserved. 3 | 4 | Licensed under the LinkedIn Learning Exercise File License (the "License"). 5 | See LICENSE in the project root for license information. 6 | 7 | Please note, this project may automatically load third party code from external 8 | repositories (for example, NPM modules, Composer packages, or other dependencies). 
9 | If so, such third party code may be subject to other license terms than as set 10 | forth above. In addition, such third party code may also depend on and load 11 | multiple tiers of dependencies. Please review the applicable licenses of the 12 | additional dependencies. 13 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.bracketPairColorization.enabled": true, 3 | "editor.cursorBlinking": "solid", 4 | "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace", 5 | "editor.fontLigatures": false, 6 | "editor.fontSize": 22, 7 | "editor.formatOnPaste": true, 8 | "editor.formatOnSave": true, 9 | "editor.lineNumbers": "on", 10 | "editor.matchBrackets": "always", 11 | "editor.minimap.enabled": false, 12 | "editor.smoothScrolling": true, 13 | "editor.tabSize": 2, 14 | "editor.useTabStops": true, 15 | "emmet.triggerExpansionOnTab": true, 16 | "explorer.openEditors.visible": 0, 17 | "files.autoSave": "afterDelay", 18 | "screencastMode.onlyKeyboardShortcuts": true, 19 | "terminal.integrated.fontSize": 18, 20 | "workbench.colorTheme": "Visual Studio Dark", 21 | "workbench.fontAliasing": "antialiased", 22 | "workbench.statusBar.visible": true 23 | } 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | ## Issue Overview 9 | 10 | 11 | ## Describe your environment 12 | 13 | 14 | ## Steps to Reproduce 15 | 16 | 1. 17 | 2. 18 | 3. 19 | 4. 20 | 21 | ## Expected Behavior 22 | 23 | 24 | ## Current Behavior 25 | 26 | 27 | ## Possible Solution 28 | 29 | 30 | ## Screenshots / Video 31 | 32 | 33 | ## Related Issues 34 | 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Applied Machine Learning: Foundations 2 | This is the repository for the LinkedIn Learning course Applied Machine Learning: Foundations. The full course is available from [LinkedIn Learning][lil-course-url]. 3 | 4 | ![lil-thumbnail-url] 5 | 6 |
AI models are transforming the workplace. Knowing what’s going on behind those models can help you apply machine learning (ML) techniques more effectively. In this course, instructor Matt Harrison shows you how to get started mastering the essentials of machine learning using the power of the Python programming language.
Explore the fundamentals of an end-to-end machine learning application as you gain hands-on experience with data exploration, data processing, model creation, model evaluation, model tuning, and model deployment with MLflow. Along the way, test out your new coding skills in the practice challenges at the end of each section.
7 | 8 | ## Getting Started 9 | 10 | This project can be set up and run in two ways: using GitHub Codespaces for a cloud-based environment, or locally on your machine by installing the required dependencies. Follow the instructions below to get started with the method that best suits your needs. 11 | 12 | ### Option 1: Using GitHub Codespaces 13 | 14 | GitHub Codespaces provides a complete, configurable dev environment on top of a powerful VS Code interface. It's an excellent option for quickly starting development without the need to set up your local environment. 15 | 16 | 1. **Open the project in Codespaces:** Navigate to the GitHub page of the project and click the "Code" button. Select "Open with Codespaces" > "New codespace". This will set up a new cloud-based development environment pre-configured for this project. 17 | 18 | 2. **Wait for installation:** The installation takes a few minutes after the Codespace launches. The terminal at the bottom of VS Code will be busy for a bit while all of the dependencies are built and installed. 19 | 20 | 3. **Open up `ml-foundations.ipynb` in VS Code:** The video will walk you through this. 21 | 22 | ### Option 2: Local Setup 23 | 24 | If you prefer to work on your local machine, follow these steps to set up the project environment. You'll need Python installed on your system (refer to [python.org](https://www.python.org/) for installation instructions). 25 | 26 | 1. **Clone the repository:** 27 | ```bash 28 | git clone https://github.com/your-username/your-project-name.git 29 | cd your-project-name 30 | ``` 31 | 2. **Create a virtual environment:** Using your favorite mechanism, create a virtual environment for Python (a minimal sketch appears after the Instructor section below). 32 | 33 | 3. **Install dependencies:** 34 | Ensure you have your virtual environment activated. Then, install the required packages using the following command: 35 | ```bash 36 | pip install -r requirements.txt 37 | ``` 38 | 39 | 4. **Launch Jupyter and open `ml-foundations.ipynb`:** 40 | With the dependencies installed, you're ready to launch Jupyter: 41 | ```bash 42 | jupyter lab 43 | ``` 44 | 45 | Navigate to and open the `ml-foundations.ipynb` notebook in Jupyter. 46 | 47 | ### Instructor 48 | 49 | ![lil-avatar] 50 | 51 | Matt Harrison 52 | 53 | Python and Data Science Corporate Trainer, Author, Speaker, Consultant 54 | 55 | 56 | 57 | Check out my other courses on [LinkedIn Learning](https://www.linkedin.com/learning/instructors/matt-harrison?u=104).
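For step 2 of the local setup above, here is a minimal sketch using Python's built-in `venv` module (just one option; conda or any other environment manager works as well):

```bash
python -m venv .venv
source .venv/bin/activate   # on Windows: .venv\Scripts\activate
```

With the environment active, the packages installed in step 3 stay isolated to this project.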
58 | 59 | [lil-course-url]: https://www.linkedin.com/learning/applied-machine-learning-foundations-21404006 60 | [lil-thumbnail-url]: https://media.licdn.com/dms/image/D560DAQG-umFqe1oFDg/learning-public-crop_675_1200/0/1717432957394?e=2147483647&v=beta&t=AGzP3y5jqX0AiSZyW4rB5J3wBome6-i-9XA_h6pq91w 61 | [lil-avatar]: https://media.licdn.com/dms/image/D560DAQGLDZBKwtHv5Q/learning-author-crop_200_200/0/1680625154253?e=1717002000&v=beta&t=vjtUd7bQaz4CR1FeiTQ3nWGvbydzOnHnjKiftJ8bWGg 62 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | adbc-driver-manager==0.11.0 2 | adbc-driver-sqlite==0.11.0 3 | alembic==1.13.1 4 | aniso8601==9.0.1 5 | annotated-types==0.6.0 6 | anyio==4.3.0 7 | argon2-cffi==23.1.0 8 | argon2-cffi-bindings==21.2.0 9 | arrow==1.3.0 10 | asttokens==2.4.1 11 | async-lru==2.0.4 12 | attrs==23.2.0 13 | Babel==2.14.0 14 | beautifulsoup4==4.12.3 15 | bleach==6.1.0 16 | blinker==1.7.0 17 | bokeh==3.4.0 18 | catboost==1.2.3 19 | certifi==2024.2.2 20 | cffi==1.16.0 21 | charset-normalizer==3.3.2 22 | click==8.1.7 23 | cloudpickle==3.0.0 24 | colorama==0.4.6 25 | colorcet==3.1.0 26 | comm==0.2.2 27 | connectorx==0.3.2 28 | contourpy==1.2.0 29 | cycler==0.12.1 30 | debugpy==1.8.1 31 | decorator==5.1.1 32 | defusedxml==0.7.1 33 | deltalake==0.16.4 34 | docker==7.0.0 35 | entrypoints==0.4 36 | exceptiongroup==1.2.0 37 | executing==2.0.1 38 | fastexcel==0.10.2 39 | fastjsonschema==2.19.1 40 | filelock==3.13.1 41 | Flask==3.0.2 42 | fonttools==4.50.0 43 | fqdn==1.5.1 44 | fsspec==2023.12.2 45 | gevent==24.2.1 46 | gitdb==4.0.11 47 | GitPython==3.1.42 48 | graphene==3.3 49 | graphql-core==3.2.3 50 | graphql-relay==3.2.0 51 | graphviz==0.20.3 52 | greenlet==3.0.3 53 | gunicorn==21.2.0 54 | h11==0.14.0 55 | holoviews==1.18.3 56 | httpcore==1.0.4 57 | httpx==0.27.0 58 | hvplot==0.9.2 59 | idna==3.6 60 | importlib_metadata==7.1.0 61 | importlib_resources==6.4.0 62 | ipykernel==6.29.3 63 | ipython==8.22.2 64 | isoduration==20.11.0 65 | itsdangerous==2.1.2 66 | jedi==0.19.1 67 | Jinja2==3.1.3 68 | joblib==1.3.2 69 | json5==0.9.24 70 | jsonpointer==2.4 71 | jsonschema==4.21.1 72 | jsonschema-specifications==2023.12.1 73 | jupyter-events==0.10.0 74 | jupyter-lsp==2.2.4 75 | jupyter-server-mathjax==0.2.6 76 | jupyter_client==8.6.1 77 | jupyter_core==5.7.2 78 | jupyter_server==2.13.0 79 | jupyter_server_terminals==0.5.3 80 | jupyterlab==4.1.5 81 | jupyterlab_git==0.50.0 82 | jupyterlab_pygments==0.3.0 83 | jupyterlab_server==2.25.4 84 | kiwisolver==1.4.5 85 | linkify-it-py==2.0.3 86 | Mako==1.3.2 87 | Markdown==3.6 88 | markdown-it-py==3.0.0 89 | MarkupSafe==2.1.5 90 | matplotlib==3.8.3 91 | matplotlib-inline==0.1.6 92 | mdit-py-plugins==0.4.0 93 | mdurl==0.1.2 94 | mistune==3.0.2 95 | mlflow==2.11.3 96 | mmhash3==3.0.1 97 | mpmath==1.3.0 98 | nbclient==0.10.0 99 | nbconvert==7.16.2 100 | nbdime==4.0.1 101 | nbformat==5.10.3 102 | nest-asyncio==1.6.0 103 | networkx==3.2.1 104 | notebook==7.1.2 105 | notebook_shim==0.2.4 106 | numpy==1.26.4 107 | nvidia-cublas-cu12==12.1.3.1 108 | nvidia-cuda-cupti-cu12==12.1.105 109 | nvidia-cuda-nvrtc-cu12==12.1.105 110 | nvidia-cuda-runtime-cu12==12.1.105 111 | nvidia-cudnn-cu12==8.9.2.26 112 | nvidia-cufft-cu12==11.0.2.54 113 | nvidia-curand-cu12==10.3.2.106 114 | nvidia-cusolver-cu12==11.4.5.107 115 | nvidia-cusparse-cu12==12.1.0.106 116 | nvidia-nccl-cu12==2.19.3 117 | nvidia-nvjitlink-cu12==12.4.99 118 | 
nvidia-nvtx-cu12==12.1.105 119 | overrides==7.7.0 120 | packaging==23.2 121 | pandas==2.2.1 122 | pandocfilters==1.5.1 123 | panel==1.4.0 124 | param==2.1.0 125 | parso==0.8.3 126 | pexpect==4.9.0 127 | pillow==10.2.0 128 | platformdirs==4.2.0 129 | plotly==5.20.0 130 | polars==0.20.18 131 | prometheus_client==0.20.0 132 | prompt-toolkit==3.0.43 133 | protobuf==4.25.3 134 | psutil==5.9.8 135 | ptyprocess==0.7.0 136 | pure-eval==0.2.2 137 | pyarrow==15.0.2 138 | pyarrow-hotfix==0.6 139 | pycparser==2.21 140 | pydantic==2.6.4 141 | pydantic_core==2.16.3 142 | Pygments==2.17.2 143 | pyiceberg==0.6.0 144 | pyparsing==3.1.2 145 | python-dateutil==2.9.0.post0 146 | python-json-logger==2.0.7 147 | pytz==2024.1 148 | pyviz_comms==3.0.2 149 | PyYAML==6.0.1 150 | pyzmq==25.1.2 151 | querystring-parser==1.2.4 152 | referencing==0.34.0 153 | requests==2.31.0 154 | rfc3339-validator==0.1.4 155 | rfc3986-validator==0.1.1 156 | rich==13.7.1 157 | rpds-py==0.18.0 158 | scikit-learn==1.4.1.post1 159 | scipy==1.12.0 160 | seaborn==0.13.2 161 | Send2Trash==1.8.2 162 | six==1.16.0 163 | smmap==5.0.1 164 | sniffio==1.3.1 165 | sortedcontainers==2.4.0 166 | soupsieve==2.5 167 | SQLAlchemy==2.0.29 168 | sqlparse==0.4.4 169 | stack-data==0.6.3 170 | strictyaml==1.7.3 171 | sympy==1.12 172 | tenacity==8.2.3 173 | terminado==0.18.1 174 | threadpoolctl==3.4.0 175 | tinycss2==1.2.1 176 | tomli==2.0.1 177 | torch==2.2.1 178 | tornado==6.4 179 | tqdm==4.66.2 180 | traitlets==5.14.2 181 | triton==2.2.0 182 | types-python-dateutil==2.9.0.20240316 183 | typing_extensions==4.10.0 184 | tzdata==2024.1 185 | uc-micro-py==1.0.3 186 | uri-template==1.3.0 187 | urllib3==2.0.7 188 | wcwidth==0.2.13 189 | webcolors==1.13 190 | webencodings==0.5.1 191 | websocket-client==1.7.0 192 | Werkzeug==3.0.2 193 | xlsx2csv==0.8.2 194 | XlsxWriter==3.2.0 195 | xyzservices==2023.10.1 196 | zipp==3.18.1 197 | zope.event==5.0 198 | zope.interface==6.2 199 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LinkedIn Learning Exercise Files License Agreement 2 | ================================================== 3 | 4 | This License Agreement (the "Agreement") is a binding legal agreement 5 | between you (as an individual or entity, as applicable) and LinkedIn 6 | Corporation (“LinkedIn”). By downloading or using the LinkedIn Learning 7 | exercise files in this repository (“Licensed Materials”), you agree to 8 | be bound by the terms of this Agreement. If you do not agree to these 9 | terms, do not download or use the Licensed Materials. 10 | 11 | 1. License. 12 | - a. Subject to the terms of this Agreement, LinkedIn hereby grants LinkedIn 13 | members during their LinkedIn Learning subscription a non-exclusive, 14 | non-transferable copyright license, for internal use only, to 1) make a 15 | reasonable number of copies of the Licensed Materials, and 2) make 16 | derivative works of the Licensed Materials for the sole purpose of 17 | practicing skills taught in LinkedIn Learning courses. 18 | - b. Distribution. Unless otherwise noted in the Licensed Materials, subject 19 | to the terms of this Agreement, LinkedIn hereby grants LinkedIn members 20 | with a LinkedIn Learning subscription a non-exclusive, non-transferable 21 | copyright license to distribute the Licensed Materials, except the 22 | Licensed Materials may not be included in any product or service (or 23 | otherwise used) to instruct or educate others. 
24 | 25 | 2. Restrictions and Intellectual Property. 26 | - a. You may not to use, modify, copy, make derivative works of, publish, 27 | distribute, rent, lease, sell, sublicense, assign or otherwise transfer the 28 | Licensed Materials, except as expressly set forth above in Section 1. 29 | - b. Linkedin (and its licensors) retains its intellectual property rights 30 | in the Licensed Materials. Except as expressly set forth in Section 1, 31 | LinkedIn grants no licenses. 32 | - c. You indemnify LinkedIn and its licensors and affiliates for i) any 33 | alleged infringement or misappropriation of any intellectual property rights 34 | of any third party based on modifications you make to the Licensed Materials, 35 | ii) any claims arising from your use or distribution of all or part of the 36 | Licensed Materials and iii) a breach of this Agreement. You will defend, hold 37 | harmless, and indemnify LinkedIn and its affiliates (and our and their 38 | respective employees, shareholders, and directors) from any claim or action 39 | brought by a third party, including all damages, liabilities, costs and 40 | expenses, including reasonable attorneys’ fees, to the extent resulting from, 41 | alleged to have resulted from, or in connection with: (a) your breach of your 42 | obligations herein; or (b) your use or distribution of any Licensed Materials. 43 | 44 | 3. Open source. This code may include open source software, which may be 45 | subject to other license terms as provided in the files. 46 | 47 | 4. Warranty Disclaimer. LINKEDIN PROVIDES THE LICENSED MATERIALS ON AN “AS IS” 48 | AND “AS AVAILABLE” BASIS. LINKEDIN MAKES NO REPRESENTATION OR WARRANTY, 49 | WHETHER EXPRESS OR IMPLIED, ABOUT THE LICENSED MATERIALS, INCLUDING ANY 50 | REPRESENTATION THAT THE LICENSED MATERIALS WILL BE FREE OF ERRORS, BUGS OR 51 | INTERRUPTIONS, OR THAT THE LICENSED MATERIALS ARE ACCURATE, COMPLETE OR 52 | OTHERWISE VALID. TO THE FULLEST EXTENT PERMITTED BY LAW, LINKEDIN AND ITS 53 | AFFILIATES DISCLAIM ANY IMPLIED OR STATUTORY WARRANTY OR CONDITION, INCLUDING 54 | ANY IMPLIED WARRANTY OR CONDITION OF MERCHANTABILITY OR FITNESS FOR A 55 | PARTICULAR PURPOSE, AVAILABILITY, SECURITY, TITLE AND/OR NON-INFRINGEMENT. 56 | YOUR USE OF THE LICENSED MATERIALS IS AT YOUR OWN DISCRETION AND RISK, AND 57 | YOU WILL BE SOLELY RESPONSIBLE FOR ANY DAMAGE THAT RESULTS FROM USE OF THE 58 | LICENSED MATERIALS TO YOUR COMPUTER SYSTEM OR LOSS OF DATA. NO ADVICE OR 59 | INFORMATION, WHETHER ORAL OR WRITTEN, OBTAINED BY YOU FROM US OR THROUGH OR 60 | FROM THE LICENSED MATERIALS WILL CREATE ANY WARRANTY OR CONDITION NOT 61 | EXPRESSLY STATED IN THESE TERMS. 62 | 63 | 5. Limitation of Liability. LINKEDIN SHALL NOT BE LIABLE FOR ANY INDIRECT, 64 | INCIDENTAL, SPECIAL, PUNITIVE, CONSEQUENTIAL OR EXEMPLARY DAMAGES, INCLUDING 65 | BUT NOT LIMITED TO, DAMAGES FOR LOSS OF PROFITS, GOODWILL, USE, DATA OR OTHER 66 | INTANGIBLE LOSSES . IN NO EVENT WILL LINKEDIN'S AGGREGATE LIABILITY TO YOU 67 | EXCEED $100. THIS LIMITATION OF LIABILITY SHALL: 68 | - i. APPLY REGARDLESS OF WHETHER (A) YOU BASE YOUR CLAIM ON CONTRACT, TORT, 69 | STATUTE, OR ANY OTHER LEGAL THEORY, (B) WE KNEW OR SHOULD HAVE KNOWN ABOUT 70 | THE POSSIBILITY OF SUCH DAMAGES, OR (C) THE LIMITED REMEDIES PROVIDED IN THIS 71 | SECTION FAIL OF THEIR ESSENTIAL PURPOSE; AND 72 | - ii. 
NOT APPLY TO ANY DAMAGE THAT LINKEDIN MAY CAUSE YOU INTENTIONALLY OR 73 | KNOWINGLY IN VIOLATION OF THESE TERMS OR APPLICABLE LAW, OR AS OTHERWISE 74 | MANDATED BY APPLICABLE LAW THAT CANNOT BE DISCLAIMED IN THESE TERMS. 75 | 76 | 6. Termination. This Agreement automatically terminates upon your breach of 77 | this Agreement or termination of your LinkedIn Learning subscription. On 78 | termination, all licenses granted under this Agreement will terminate 79 | immediately and you will delete the Licensed Materials. Sections 2-7 of this 80 | Agreement survive any termination of this Agreement. LinkedIn may discontinue 81 | the availability of some or all of the Licensed Materials at any time for any 82 | reason. 83 | 84 | 7. Miscellaneous. This Agreement will be governed by and construed in 85 | accordance with the laws of the State of California without regard to conflict 86 | of laws principles. The exclusive forum for any disputes arising out of or 87 | relating to this Agreement shall be an appropriate federal or state court 88 | sitting in the County of Santa Clara, State of California. If LinkedIn does 89 | not act to enforce a breach of this Agreement, that does not mean that 90 | LinkedIn has waived its right to enforce this Agreement. The Agreement does 91 | not create a partnership, agency relationship, or joint venture between the 92 | parties. Neither party has the power or authority to bind the other or to 93 | create any obligation or responsibility on behalf of the other. You may not, 94 | without LinkedIn’s prior written consent, assign or delegate any rights or 95 | obligations under these terms, including in connection with a change of 96 | control. Any purported assignment and delegation shall be ineffective. The 97 | Agreement shall bind and inure to the benefit of the parties, their respective 98 | successors and permitted assigns. If any provision of the Agreement is 99 | unenforceable, that provision will be modified to render it enforceable to the 100 | extent possible to give effect to the parties’ intentions and the remaining 101 | provisions will not be affected. This Agreement is the only agreement between 102 | you and LinkedIn regarding the Licensed Materials, and supersedes all prior 103 | agreements relating to the Licensed Materials. 
104 | 105 | Last Updated: March 2019 106 | -------------------------------------------------------------------------------- /ml-foundations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EDA\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | "## Exploring the Data\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import polars as pl\n", 25 | "import polars.selectors as cs\n", 26 | "import sklearn \n", 27 | "import catboost\n", 28 | "\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings('ignore')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# King County House Sales dataset from OpenML (includes Seattle)\n", 40 | "# this is an ARFF file, which is a text file with a specific format\n", 41 | "url = 'https://www.openml.org/data/download/22044765/dataset'\n", 42 | "cols = ['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n", 43 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',\n", 44 | " 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_year', 'date_month', 'date_day']\n", 45 | "\n", 46 | "raw = pl.read_csv(url, new_columns=cols, skip_rows=31, has_header=False)\n", 47 | "raw" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "raw.describe()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "raw.corr()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "(raw\n", 75 | " .to_pandas(use_pyarrow_extension_array=True)\n", 76 | " .corr()\n", 77 | " .style.background_gradient(cmap='RdBu', vmin=-1, vmax=1)\n", 78 | ")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "(raw\n", 88 | " .plot.scatter('sqft_living', 'price', alpha=0.1)\n", 89 | ")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "(raw\n", 99 | " .group_by('date_month', 'zipcode')\n", 100 | " .agg(pl.col('price').mean())\n", 101 | " .plot.line('date_month', 'price', by='zipcode')\n", 102 | " )" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "(raw\n", 112 | " .group_by('date_month', 'zipcode')\n", 113 | " .agg(pl.col('price').mean())\n", 114 | " .sort('date_month')\n", 115 | " .plot.line('date_month', 'price', by='zipcode', alpha=0.5)\n", 116 | " )" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# lat/long scatter plot\n", 126 | "(raw\n", 127 | " .sort('price')\n", 128 | " .plot.scatter(x='long', y='lat', alpha=0.5, c='price', s=1)\n", 129 | ")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# lat/long 
scatter plot\n", 139 | "(raw\n", 140 | " .filter(pl.col('price') > 1_000_000)\n", 141 | " .sort('price')\n", 142 | " .plot.scatter(x='long', y='lat', alpha=0.5, c='price', s=1)\n", 143 | ")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "\n", 165 | "## Data Preprocessing\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "def tweak_housing(df):\n", 175 | " return (df\n", 176 | " .with_columns(zipcode=pl.col('zipcode').cast(pl.String).cast(pl.Categorical),\n", 177 | " date=pl.date(pl.col('date_year'), pl.col('date_month'), pl.col('date_day')),\n", 178 | " yr_renovated=pl.col('yr_renovated').replace(0, None),\n", 179 | " )\n", 180 | " .select(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', \n", 181 | " 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', \n", 182 | " 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', \n", 183 | " 'sqft_lot15', 'date', #'date_year', 'date_month', 'date_day', \n", 184 | " ])\n", 185 | " )\n", 186 | "\n", 187 | "tweak_housing(raw)\n", 188 | " " 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "\n", 217 | "## Sklearn Pipelines\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "scrolled": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# The difference between sklearn pipelines and transformers is \n", 229 | "# that a pipeline is a sequence of steps. 
A transformer transforms\n", 230 | "# the data, and a pipeline is a sequence of transformers.\n", 231 | "# A ColumnTransformer applies multiple transformers to different\n", 232 | "# columns of the input data.\n", 233 | "\n", 234 | "from sklearn.pipeline import Pipeline\n", 235 | "from sklearn.compose import ColumnTransformer\n", 236 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", 237 | "from sklearn.impute import SimpleImputer\n", 238 | "from sklearn.model_selection import train_test_split\n", 239 | "from sklearn.preprocessing import FunctionTransformer\n", 240 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 241 | "from sklearn import set_config\n", 242 | "set_config(transform_output='polars')" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "print(tweak_housing(raw).select(cs.numeric()).columns)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true, 259 | "jupyter": { 260 | "outputs_hidden": true 261 | }, 262 | "scrolled": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living']\n", 267 | "std = StandardScaler()\n", 268 | "std.fit_transform(tweak_housing(raw).select(numeric_features))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true, 276 | "jupyter": { 277 | "outputs_hidden": true 278 | } 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living']\n", 283 | "\n", 284 | "num_pipeline = Pipeline([\n", 285 | " ('std', StandardScaler())])\n", 286 | "\n", 287 | "num_pipeline.fit_transform(\n", 288 | " tweak_housing(raw)\n", 289 | " .select(numeric_features)\n", 290 | ")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": true, 298 | "jupyter": { 299 | "outputs_hidden": true 300 | }, 301 | "scrolled": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "# add another step\n", 306 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living']\n", 307 | "\n", 308 | "num_pipeline = Pipeline([\n", 309 | " ('imputer', SimpleImputer(strategy='median')),\n", 310 | " ('std', StandardScaler())])\n", 311 | "\n", 312 | "num_pipeline.fit_transform(\n", 313 | " tweak_housing(raw)\n", 314 | " .select(numeric_features)\n", 315 | ")" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "scrolled": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "cat_features = ['zipcode']\n", 327 | "\n", 328 | "ohe = OneHotEncoder(handle_unknown='ignore')\n", 329 | "# sparse_output=False)\n", 330 | "\n", 331 | "ohe.fit_transform(\n", 332 | " tweak_housing(raw)\n", 333 | " .select(cat_features)\n", 334 | ")" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "scrolled": true 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "cat_features = ['zipcode']\n", 346 | "\n", 347 | "ohe = OneHotEncoder(handle_unknown='ignore',\n", 348 | " sparse_output=False)\n", 349 | "\n", 350 | "ohe.fit_transform(\n", 351 | " tweak_housing(raw)\n", 352 | " .select(cat_features)\n", 353 | ")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true, 361 | "jupyter": { 362 | 
"outputs_hidden": true 363 | } 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "cat_features = ['zipcode']\n", 368 | "\n", 369 | "ohe = OneHotEncoder(handle_unknown='ignore',\n", 370 | " sparse_output=False, max_categories=10)\n", 371 | "\n", 372 | "ohe.fit_transform(\n", 373 | " tweak_housing(raw)\n", 374 | " .select(cat_features)\n", 375 | ")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "scrolled": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "cat_features = ['zipcode']\n", 387 | "\n", 388 | "cat_pipeline = Pipeline(steps=[\n", 389 | " ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])\n", 390 | "\n", 391 | "cat_pipeline.set_params(cat__max_categories=10)\n", 392 | "cat_pipeline.fit_transform(\n", 393 | " tweak_housing(raw)\n", 394 | " .select(cat_features)\n", 395 | ")" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# transformer from a function\n", 405 | "tweak_transformer = FunctionTransformer(tweak_housing)\n", 406 | "\n", 407 | "tweak_transformer.fit_transform(raw)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "categorical_features = ['zipcode']\n", 417 | "\n", 418 | "numeric_transformer = Pipeline(steps=[\n", 419 | " ('imputer', SimpleImputer(strategy='median')),\n", 420 | " ('scaler', StandardScaler())])\n", 421 | "\n", 422 | "ct = ColumnTransformer(\n", 423 | " transformers=[\n", 424 | " ('num', numeric_transformer, numeric_features),\n", 425 | " ('cat', OneHotEncoder(handle_unknown='ignore',\n", 426 | " sparse_output=False), categorical_features)])\n", 427 | "\n", 428 | "ct.fit_transform(\n", 429 | " tweak_housing(raw)\n", 430 | " .select([*numeric_features, *cat_features])\n", 431 | ")" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "# Custom transformer \n", 441 | "class ZipAvgPriceAdder(BaseEstimator, TransformerMixin):\n", 442 | " def __init__(self):\n", 443 | " pass\n", 444 | " def fit(self, X, y=None):\n", 445 | " # assume X is a polars dataframe\n", 446 | " self.zip_avg_price = (X\n", 447 | " .group_by('zipcode')\n", 448 | " .agg(zip_mean=pl.col('price').mean())\n", 449 | " )\n", 450 | " return self\n", 451 | " \n", 452 | " def transform(self, X, y=None):\n", 453 | " return X.join(self.zip_avg_price, on='zipcode')\n", 454 | "\n", 455 | "zip_adder = ZipAvgPriceAdder()\n", 456 | "zip_adder.fit_transform(raw.select(['zipcode', 'price']))" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "scrolled": true 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "# make the pipeline\n", 468 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n", 469 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', \n", 470 | " 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'zip_mean']\n", 471 | "numeric_transformer = Pipeline(steps=[\n", 472 | " ('imputer', SimpleImputer(strategy='median')),\n", 473 | " ('scaler', StandardScaler())])\n", 474 | "\n", 475 | "categorical_features = ['zipcode']\n", 476 | "\n", 477 | "preprocessor = ColumnTransformer(\n", 478 | " transformers=[\n", 479 | " ('num', numeric_transformer, numeric_features),\n", 480 
| " ('cat', OneHotEncoder(handle_unknown='ignore',\n", 481 | " sparse_output=False), categorical_features)])\n", 482 | "\n", 483 | "tweak_transformer = FunctionTransformer(tweak_housing)\n", 484 | "\n", 485 | "class ZipAvgPriceAdder(BaseEstimator, TransformerMixin):\n", 486 | " def __init__(self):\n", 487 | " pass\n", 488 | " def fit(self, X, y=None):\n", 489 | " # assume X is a polars dataframe\n", 490 | " self.zip_avg_price = (X\n", 491 | " .group_by('zipcode')\n", 492 | " .agg(zip_mean=pl.col('price').mean())\n", 493 | " )\n", 494 | " return self\n", 495 | " \n", 496 | " def transform(self, X, y=None):\n", 497 | " return X.join(self.zip_avg_price, on='zipcode')\n", 498 | "\n", 499 | "# Append classifier to preprocessing pipeline.\n", 500 | "# Now we have a full prediction pipeline.\n", 501 | "pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 502 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 503 | " ('preprocessor', preprocessor),\n", 504 | " ])\n", 505 | "\n", 506 | "X = raw #.drop('price')\n", 507 | "y = raw.select('price') # Note sklearn wants a Polars dataframe for y\n", 508 | "\n", 509 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", 510 | "\n", 511 | "pipe.fit_transform(raw, raw.select('price'))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "pipe" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# Note sklearn wants a Polars dataframe for y\n", 530 | "X = raw #.drop('price')\n", 531 | "y = raw.select('price') \n", 532 | "#y = raw['price']\n", 533 | "\n", 534 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": { 575 | "jp-MarkdownHeadingCollapsed": true 576 | }, 577 | "source": [ 578 | "\n", 579 | "## Challenge\n", 580 | "\n", 581 | "Make a plot to explore the relationship between the number of bedrooms and the price of the house." 
582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "\n", 624 | "## Solution" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": { 665 | "vscode": { 666 | "languageId": "plaintext" 667 | } 668 | }, 669 | "source": [ 670 | "# Model Creation\n" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": {}, 676 | "source": [ 677 | "\n", 678 | "## Dummy Model\n" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": { 685 | "scrolled": true 686 | }, 687 | "outputs": [], 688 | "source": [ 689 | "from sklearn.dummy import DummyRegressor\n", 690 | "\n", 691 | "dummy = DummyRegressor(strategy='mean')\n", 692 | "y = raw.select('price')\n", 693 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 694 | "dummy_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 695 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 696 | " ('preprocessor', preprocessor),\n", 697 | " ('dummy', dummy),\n", 698 | " ])\n", 699 | "\n", 700 | "dummy_pipe.fit(X_train, y_train)\n", 701 | "dummy_pipe.score(X_test, y_test)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "dummy_pipe" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": { 717 | "scrolled": true 718 | }, 719 | "outputs": [], 720 | "source": [ 721 | "dummy_pipe.predict(X_test)" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "\n", 750 | "## Linear Regression\n" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 
null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "from sklearn.linear_model import LinearRegression\n", 760 | "\n", 761 | "\n", 762 | "lr = LinearRegression()\n", 763 | "y = raw.select('price')\n", 764 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 765 | "lr_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 766 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 767 | " ('preprocessor', preprocessor),\n", 768 | " ('lr', lr),\n", 769 | " ])\n", 770 | "\n", 771 | "lr_pipe.fit(X_train, y_train)\n", 772 | "lr_pipe.score(X_test, y_test)" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": null, 778 | "metadata": {}, 779 | "outputs": [], 780 | "source": [ 781 | "lr_pipe.predict(X_test)" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "\n", 803 | "## Decision Trees\n" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "from sklearn.tree import DecisionTreeRegressor\n", 813 | "\n", 814 | "\n", 815 | "dt = DecisionTreeRegressor()\n", 816 | "y = raw.select('price')\n", 817 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 818 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 819 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 820 | " ('preprocessor', preprocessor),\n", 821 | " ('dt', dt),\n", 822 | " ])\n", 823 | "\n", 824 | "dt_pipe.fit(X_train, y_train)\n", 825 | "dt_pipe.score(X_test, y_test)" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": null, 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "dt_pipe.set_params(dt__max_depth=1)\n", 835 | "dt_pipe.fit(X_train, y_train)\n", 836 | "dt_pipe.score(X_test, y_test)" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "dt_pipe.set_params(dt__max_depth=9)\n", 846 | "dt_pipe.fit(X_train, y_train)\n", 847 | "dt_pipe.score(X_test, y_test)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": {}, 861 | "outputs": [], 862 | "source": [] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": null, 867 | "metadata": {}, 868 | "outputs": [], 869 | "source": [] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "\n", 876 | "## CatBoost\n" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": { 890 | "scrolled": true 891 | }, 892 | "outputs": [], 893 | "source": [ 894 | "from catboost import CatBoostRegressor\n", 895 | "\n", 896 | "\n", 897 | "cat = CatBoostRegressor()\n", 898 | "# has issues with Polars input going to use a pandas_transformer\n", 899 | "def to_pandas(df):\n", 900 | " return df.to_pandas()\n", 901 | 
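"# (A note on the wrapper: FunctionTransformer turns a plain function into a transformer so it can sit in a Pipeline; here it converts the Polars frame to pandas right before it reaches CatBoost.)\n",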
"pandas_transformer = FunctionTransformer(to_pandas)\n", 902 | "\n", 903 | "y = raw.select('price')\n", 904 | "\n", 905 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 906 | "cat_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 907 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 908 | " ('preprocessor', preprocessor),\n", 909 | " ('to_pandas', pandas_transformer),\n", 910 | " ('cat', cat), \n", 911 | " ])\n", 912 | "\n", 913 | "cat_pipe.fit(X_train, y_train.to_numpy()[:,0])\n", 914 | "cat_pipe.score(X_test, y_test.to_numpy()[:,0])" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | "metadata": {}, 921 | "outputs": [], 922 | "source": [] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": null, 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "\n", 943 | "## Challenge\n", 944 | "\n", 945 | "Create a pipeline for a Random Forest model and train it on the data. (see `ensemble.RandomForestRegressor` in scikit-learn). What is the score?" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [] 954 | }, 955 | { 956 | "cell_type": "markdown", 957 | "metadata": {}, 958 | "source": [ 959 | "\n", 960 | "\n", 961 | "## Solution" 962 | ] 963 | }, 964 | { 965 | "cell_type": "code", 966 | "execution_count": null, 967 | "metadata": {}, 968 | "outputs": [], 969 | "source": [] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": {}, 975 | "outputs": [], 976 | "source": [] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": {}, 989 | "outputs": [], 990 | "source": [] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "metadata": {}, 996 | "outputs": [], 997 | "source": [] 998 | }, 999 | { 1000 | "cell_type": "markdown", 1001 | "metadata": { 1002 | "vscode": { 1003 | "languageId": "plaintext" 1004 | } 1005 | }, 1006 | "source": [ 1007 | "# Evaluation\n" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "markdown", 1012 | "metadata": {}, 1013 | "source": [ 1014 | "\n", 1015 | "## R2\n", 1016 | "\n", 1017 | "\n", 1018 | "The Coefficient of Determination, R2, is a measure of how well the model fits the data. It is a value between 0 and 1. It tells us how much of the variance in the target variable is predictable from the features.\n", 1019 | "\n", 1020 | "A value of 0 means that the model explains none of the variability. A value of 1 means that the model explains all the variability.\n", 1021 | "\n", 1022 | "Note that it doesn't indicate whether a model is overfitting or underfitting the data." 
1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "code", 1027 | "execution_count": null, 1028 | "metadata": { 1029 | "scrolled": true 1030 | }, 1031 | "outputs": [], 1032 | "source": [ 1033 | "cat_pipe.score(X_test, y_test.to_numpy()[:,0])" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": null, 1039 | "metadata": {}, 1040 | "outputs": [], 1041 | "source": [] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": null, 1046 | "metadata": {}, 1047 | "outputs": [], 1048 | "source": [] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "execution_count": null, 1053 | "metadata": {}, 1054 | "outputs": [], 1055 | "source": [] 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": null, 1060 | "metadata": {}, 1061 | "outputs": [], 1062 | "source": [] 1063 | }, 1064 | { 1065 | "cell_type": "markdown", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "\n", 1069 | "## Mean Squared/Absolute Error\n" 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "execution_count": null, 1075 | "metadata": {}, 1076 | "outputs": [], 1077 | "source": [ 1078 | "from sklearn.metrics import mean_squared_error\n", 1079 | "\n", 1080 | "mean_squared_error(y_test, cat_pipe.predict(X_test))" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": null, 1086 | "metadata": {}, 1087 | "outputs": [], 1088 | "source": [ 1089 | "# rmse\n", 1090 | "mean_squared_error(y_test, cat_pipe.predict(X_test), squared=False)" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": null, 1096 | "metadata": { 1097 | "scrolled": true 1098 | }, 1099 | "outputs": [], 1100 | "source": [ 1101 | "# absolute error\n", 1102 | "from sklearn.metrics import mean_absolute_error\n", 1103 | "\n", 1104 | "mean_absolute_error(y_test, cat_pipe.predict(X_test))" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": null, 1110 | "metadata": {}, 1111 | "outputs": [], 1112 | "source": [ 1113 | "# compare to lr model\n", 1114 | "from sklearn.metrics import mean_absolute_error\n", 1115 | "\n", 1116 | "mean_absolute_error(y_test, lr_pipe.predict(X_test))" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": {}, 1123 | "outputs": [], 1124 | "source": [] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": null, 1129 | "metadata": {}, 1130 | "outputs": [], 1131 | "source": [] 1132 | }, 1133 | { 1134 | "cell_type": "code", 1135 | "execution_count": null, 1136 | "metadata": {}, 1137 | "outputs": [], 1138 | "source": [] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": null, 1143 | "metadata": {}, 1144 | "outputs": [], 1145 | "source": [] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": null, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": null, 1157 | "metadata": {}, 1158 | "outputs": [], 1159 | "source": [] 1160 | }, 1161 | { 1162 | "cell_type": "markdown", 1163 | "metadata": {}, 1164 | "source": [ 1165 | "\n", 1166 | "## Residuals Plot\n" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "execution_count": null, 1172 | "metadata": {}, 1173 | "outputs": [], 1174 | "source": [ 1175 | "# make a residual plot\n", 1176 | "import matplotlib.pyplot as plt\n", 1177 | "\n", 1178 | "ax = plt.scatter(cat_pipe.predict(X_test), \n", 1179 | " y_test.to_series().to_numpy() - cat_pipe.predict(X_test), 
alpha=0.1)\n", 1180 | "# make labels not be scientific notation\n", 1181 | "plt.ticklabel_format(style='plain', axis='y')\n", 1182 | "plt.ticklabel_format(style='plain', axis='x')\n", 1183 | "plt.ylim(-500_000, 500_000)\n", 1184 | "plt.xlabel('Predicted price')\n", 1185 | "plt.ylabel('Residual')\n", 1186 | "plt.title('Residual plot')" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": null, 1192 | "metadata": {}, 1193 | "outputs": [], 1194 | "source": [ 1195 | "# plot with Polars\n", 1196 | "(y_test\n", 1197 | " .with_columns(predicted_price=cat_pipe.predict(X_test),\n", 1198 | " residual=y_test.to_series().to_numpy() - cat_pipe.predict(X_test))\n", 1199 | " .plot.scatter('predicted_price', 'residual', alpha=0.1, yformatter='$%.0f',\n", 1200 | " xformatter='$%.0f')\n", 1201 | " )" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": null, 1207 | "metadata": {}, 1208 | "outputs": [], 1209 | "source": [ 1210 | "def residuals_plot(model, X_train, y_train, X_test, y_test):\n", 1211 | " return (y_test\n", 1212 | " .with_columns(prediction=model.predict(X_test),\n", 1213 | " residual=y_test.to_series().to_numpy() - model.predict(X_test),\n", 1214 | " type=pl.lit('test'))\n", 1215 | " .vstack(y_train\n", 1216 | " .with_columns(prediction=model.predict(X_train),\n", 1217 | " residual=y_train.to_series().to_numpy() - model.predict(X_train),\n", 1218 | " type=pl.lit('train'))\n", 1219 | " )\n", 1220 | " .reverse()\n", 1221 | " .plot.scatter('prediction', 'residual', alpha=0.1, yformatter='$%.0f',\n", 1222 | " xformatter='$%.0f', by='type')\n", 1223 | " )\n", 1224 | "\n", 1225 | "residuals_plot(cat_pipe, X_train, y_train, X_test, y_test)" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": null, 1231 | "metadata": {}, 1232 | "outputs": [], 1233 | "source": [ 1234 | "residuals_plot(dt_pipe, X_train, y_train, X_test, y_test)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": {}, 1241 | "outputs": [], 1242 | "source": [] 1243 | }, 1244 | { 1245 | "cell_type": "code", 1246 | "execution_count": null, 1247 | "metadata": {}, 1248 | "outputs": [], 1249 | "source": [] 1250 | }, 1251 | { 1252 | "cell_type": "code", 1253 | "execution_count": null, 1254 | "metadata": {}, 1255 | "outputs": [], 1256 | "source": [] 1257 | }, 1258 | { 1259 | "cell_type": "markdown", 1260 | "metadata": {}, 1261 | "source": [ 1262 | "## Challenge\n", 1263 | "\n", 1264 | "What is the mean squared error of the Random Forest model? What is the R2 score? What do these values tell us about the model?" 
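, "\n", "(Hint: the same pattern used for `cat_pipe` above applies; assuming the pipeline you built in the earlier Random Forest challenge is named `rf_pipe`, try `mean_squared_error(y_test, rf_pipe.predict(X_test))` and `rf_pipe.score(X_test, y_test)`.)"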
1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": null, 1270 | "metadata": {}, 1271 | "outputs": [], 1272 | "source": [] 1273 | }, 1274 | { 1275 | "cell_type": "markdown", 1276 | "metadata": {}, 1277 | "source": [ 1278 | "\n", 1279 | "\n", 1280 | "## Solution" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": null, 1286 | "metadata": {}, 1287 | "outputs": [], 1288 | "source": [] 1289 | }, 1290 | { 1291 | "cell_type": "code", 1292 | "execution_count": null, 1293 | "metadata": {}, 1294 | "outputs": [], 1295 | "source": [] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "metadata": {}, 1301 | "outputs": [], 1302 | "source": [] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": null, 1307 | "metadata": {}, 1308 | "outputs": [], 1309 | "source": [] 1310 | }, 1311 | { 1312 | "cell_type": "markdown", 1313 | "metadata": {}, 1314 | "source": [ 1315 | "# Model Tuning\n" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "markdown", 1320 | "metadata": {}, 1321 | "source": [ 1322 | "\n", 1323 | "## Hyperparameters\n", 1324 | "\n", 1325 | "Hyperparameters are the levers we can pull to adjust the behavior of a model. They are set before the model is trained and remain constant during training." 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "markdown", 1330 | "metadata": {}, 1331 | "source": [ 1332 | "\n", 1333 | "## Tuning Linear Regression\n" 1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": null, 1339 | "metadata": {}, 1340 | "outputs": [], 1341 | "source": [ 1342 | "lr_pipe" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": null, 1348 | "metadata": {}, 1349 | "outputs": [], 1350 | "source": [ 1351 | "lr_pipe.named_steps['lr']" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "code", 1356 | "execution_count": null, 1357 | "metadata": { 1358 | "scrolled": true 1359 | }, 1360 | "outputs": [], 1361 | "source": [ 1362 | "help(lr_pipe.named_steps['lr'])" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "code", 1367 | "execution_count": null, 1368 | "metadata": { 1369 | "scrolled": true 1370 | }, 1371 | "outputs": [], 1372 | "source": [ 1373 | "from sklearn.linear_model import Ridge\n", 1374 | "Ridge?" 
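, "\n", "# Context (standard scikit-learn behavior): Ridge is linear regression with an\n", "# L2 penalty; it minimizes ||y - Xw||^2 + alpha * ||w||^2, so larger alpha\n", "# shrinks the coefficients and alpha=0 reduces to ordinary least squares."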
1375 | ] 1376 | }, 1377 | { 1378 | "cell_type": "code", 1379 | "execution_count": null, 1380 | "metadata": {}, 1381 | "outputs": [], 1382 | "source": [ 1383 | "rr = Ridge()\n", 1384 | "y = raw.select('price')\n", 1385 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 1386 | "rr_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1387 | "                       ('zip_avg_price', ZipAvgPriceAdder()),\n", 1388 | "                       ('preprocessor', preprocessor),\n", 1389 | "                       ('rr', rr),\n", 1390 | "                      ])\n", 1391 | "\n", 1392 | "rr_pipe.fit(X_train, y_train)\n", 1393 | "rr_pipe.score(X_test, y_test)" 1394 | ] 1395 | }, 1396 | { 1397 | "cell_type": "code", 1398 | "execution_count": null, 1399 | "metadata": {}, 1400 | "outputs": [], 1401 | "source": [] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "metadata": { 1407 | "scrolled": true 1408 | }, 1409 | "outputs": [], 1410 | "source": [ 1411 | "lr_pipe.score(X_test, y_test)" 1412 | ] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": null, 1417 | "metadata": { 1418 | "collapsed": true, 1419 | "jupyter": { 1420 | "outputs_hidden": true 1421 | } 1422 | }, 1423 | "outputs": [], 1424 | "source": [ 1425 | "from sklearn.model_selection import validation_curve\n", 1426 | "\n", 1427 | "param_range = [0, .01, .05, .1, .5, 1, 2]\n", 1428 | "scores = []\n", 1429 | "for val in param_range:\n", 1430 | "    rr_pipe.set_params(rr__alpha=val)\n", 1431 | "    rr_pipe.fit(X_train, y_train)\n", 1432 | "    scores.append(rr_pipe.score(X_test, y_test))" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "code", 1437 | "execution_count": null, 1438 | "metadata": {}, 1439 | "outputs": [], 1440 | "source": [ 1441 | "# Our best score is at alpha=0 (which is equivalent to ordinary Linear Regression)\n", 1442 | "alpha = pl.DataFrame({'val': param_range,\n", 1443 | "                     'scores': scores})\n", 1444 | "alpha.plot(x='val', y='scores')" 1445 | ] 1446 | }, 1447 | { 1448 | "cell_type": "code", 1449 | "execution_count": null, 1450 | "metadata": {}, 1451 | "outputs": [], 1452 | "source": [] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": null, 1457 | "metadata": {}, 1458 | "outputs": [], 1459 | "source": [] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": null, 1464 | "metadata": {}, 1465 | "outputs": [], 1466 | "source": [] 1467 | }, 1468 | { 1469 | "cell_type": "code", 1470 | "execution_count": null, 1471 | "metadata": {}, 1472 | "outputs": [], 1473 | "source": [] 1474 | }, 1475 | { 1476 | "cell_type": "markdown", 1477 | "metadata": {}, 1478 | "source": [ 1479 | "\n", 1480 | "## Tuning Decision Trees\n" 1481 | ] 1482 | }, 1483 | { 1484 | "cell_type": "code", 1485 | "execution_count": null, 1486 | "metadata": { 1487 | "scrolled": true 1488 | }, 1489 | "outputs": [], 1490 | "source": [ 1491 | "dt_pipe.named_steps['dt']" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "code", 1496 | "execution_count": null, 1497 | "metadata": { 1498 | "scrolled": true 1499 | }, 1500 | "outputs": [], 1501 | "source": [ 1502 | "help(dt_pipe.named_steps['dt'])" 1503 | ] 1504 | }, 1505 | { 1506 | "cell_type": "code", 1507 | "execution_count": null, 1508 | "metadata": { 1509 | "scrolled": true 1510 | }, 1511 | "outputs": [], 1512 | "source": [ 1513 | "# plot a validation curve tracking mse as the max_depth of the decision tree increases\n", 1514 | "from sklearn.model_selection import validation_curve\n", 1515 | "\n", 1516 | "param_range = range(1, 20)\n", 1517 | "train_scores, test_scores = validation_curve(\n",
dt_pipe, X_train, y_train, param_name=\"dt__max_depth\", param_range=param_range,\n", 1519 | " scoring=\"neg_mean_squared_error\", n_jobs=1)" 1520 | ] 1521 | }, 1522 | { 1523 | "cell_type": "code", 1524 | "execution_count": null, 1525 | "metadata": {}, 1526 | "outputs": [], 1527 | "source": [ 1528 | "# make a validation curve from train_scores and test_scores\n", 1529 | "import matplotlib.pyplot as plt\n", 1530 | "import numpy as np\n", 1531 | "\n", 1532 | "train_scores_mean = np.mean(train_scores, axis=1)\n", 1533 | "train_scores_std = np.std(train_scores, axis=1)\n", 1534 | "test_scores_mean = np.mean(test_scores, axis=1)\n", 1535 | "test_scores_std = np.std(test_scores, axis=1)\n", 1536 | "\n", 1537 | "plt.title(\"Validation Curve with Decision Tree\")\n", 1538 | "plt.xlabel(\"max_depth\")\n", 1539 | "plt.ylabel(\"Score\")\n", 1540 | "#plt.ylim(-1, 0)\n", 1541 | "lw = 2\n", 1542 | "plt.plot(param_range, train_scores_mean, label=\"Training score\",\n", 1543 | " color=\"darkorange\", lw=lw)\n", 1544 | "plt.fill_between(param_range, train_scores_mean - train_scores_std,\n", 1545 | " train_scores_mean + train_scores_std, alpha=0.2,\n", 1546 | " color=\"darkorange\", lw=lw)\n", 1547 | "plt.plot(param_range, test_scores_mean, label=\"Cross-validation score\",\n", 1548 | " color=\"navy\", lw=lw)\n", 1549 | "\n", 1550 | "plt.fill_between(param_range, test_scores_mean - test_scores_std, \n", 1551 | " test_scores_mean + test_scores_std, alpha=0.2,\n", 1552 | " color=\"navy\", lw=lw)\n", 1553 | "plt.legend(loc=\"best\")\n", 1554 | "\n", 1555 | "\n" 1556 | ] 1557 | }, 1558 | { 1559 | "cell_type": "code", 1560 | "execution_count": null, 1561 | "metadata": {}, 1562 | "outputs": [], 1563 | "source": [ 1564 | "# train dt_pipe with max_depth=8\n", 1565 | "dt8_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1566 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1567 | " ('to_pandas', pandas_transformer),\n", 1568 | " ('preprocessor', preprocessor),\n", 1569 | " ('dt', DecisionTreeRegressor(max_depth=8)),\n", 1570 | " ])\n", 1571 | "\n", 1572 | "dt8_pipe.fit(X_train, y_train)\n", 1573 | "dt8_pipe.score(X_test, y_test)" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "execution_count": null, 1579 | "metadata": {}, 1580 | "outputs": [], 1581 | "source": [ 1582 | "from sklearn.metrics import mean_squared_error\n", 1583 | "mean_squared_error(y_test, dt8_pipe.predict(X_test), squared=False) " 1584 | ] 1585 | }, 1586 | { 1587 | "cell_type": "code", 1588 | "execution_count": null, 1589 | "metadata": {}, 1590 | "outputs": [], 1591 | "source": [ 1592 | "dt_pipe.score(X_test, y_test)" 1593 | ] 1594 | }, 1595 | { 1596 | "cell_type": "code", 1597 | "execution_count": null, 1598 | "metadata": {}, 1599 | "outputs": [], 1600 | "source": [ 1601 | "mean_squared_error(y_test, dt_pipe.predict(X_test), squared=False) " 1602 | ] 1603 | }, 1604 | { 1605 | "cell_type": "code", 1606 | "execution_count": null, 1607 | "metadata": {}, 1608 | "outputs": [], 1609 | "source": [] 1610 | }, 1611 | { 1612 | "cell_type": "code", 1613 | "execution_count": null, 1614 | "metadata": {}, 1615 | "outputs": [], 1616 | "source": [] 1617 | }, 1618 | { 1619 | "cell_type": "code", 1620 | "execution_count": null, 1621 | "metadata": {}, 1622 | "outputs": [], 1623 | "source": [] 1624 | }, 1625 | { 1626 | "cell_type": "code", 1627 | "execution_count": null, 1628 | "metadata": {}, 1629 | "outputs": [], 1630 | "source": [] 1631 | }, 1632 | { 1633 | "cell_type": "code", 1634 | "execution_count": null, 1635 | "metadata": {}, 1636 | 
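"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A single train/test split gives a noisy estimate. As a quick, hedged sketch (assuming `dt_pipe` and `dt8_pipe` are defined as above), 5-fold cross-validation gives a steadier comparison of the default-depth tree and the depth-8 tree:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: compare the default tree and the depth-8 tree with 5-fold\n",
"# cross-validation instead of a single train/test split\n",
"from sklearn.model_selection import cross_val_score\n",
"import numpy as np\n",
"\n",
"for name, pipe in [('default', dt_pipe), ('max_depth=8', dt8_pipe)]:\n",
"    scores = cross_val_score(pipe, X_train, y_train, cv=5,\n",
"                             scoring='neg_root_mean_squared_error')\n",
"    print(name, f'RMSE {-np.mean(scores):,.0f} +/- {np.std(scores):,.0f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},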
"outputs": [], 1637 | "source": [] 1638 | }, 1639 | { 1640 | "cell_type": "markdown", 1641 | "metadata": {}, 1642 | "source": [ 1643 | "\n", 1644 | "## Tuning CatBoost\n", 1645 | "\n", 1646 | "* Boosting - `iterations` (`num_trees`, `n_estimators`), `learning_rate` (`eta`), `early_stopping_rounds`\n", 1647 | "\n", 1648 | "* Tree based - `depth` (`max_depth`), `grow_policy`, `min_child_samples` (`min_data_in_leaf`), `max_leaves` (`num_leaves`)\n", 1649 | "\n", 1650 | "* Sampling - `subsample`, `sampling_frequency`, `rsm` (`colsample_bylevel`), `random_strength`, `bagging_temperature`\n", 1651 | "\n", 1652 | "* Regularization - `l2_leaf_reg` (`reg_lambda`), `model_shrink_rate`\n", 1653 | "\n", 1654 | "* Constraints - `monotone_constraints`, `feature_weights`" 1655 | ] 1656 | }, 1657 | { 1658 | "cell_type": "code", 1659 | "execution_count": null, 1660 | "metadata": { 1661 | "scrolled": true 1662 | }, 1663 | "outputs": [], 1664 | "source": [ 1665 | "catboost.CatBoostRegressor?" 1666 | ] 1667 | }, 1668 | { 1669 | "cell_type": "code", 1670 | "execution_count": null, 1671 | "metadata": {}, 1672 | "outputs": [], 1673 | "source": [ 1674 | "cr2 = catboost.CatBoostRegressor(iterations=3000, learning_rate=0.1,\n", 1675 | " early_stopping_rounds=10)\n", 1676 | "X_train, X_test, y_train, y_test = train_test_split(raw.drop('price'), y, \n", 1677 | " test_size=0.2, random_state=42)\n", 1678 | "\n", 1679 | "cr2.fit(X_train.to_pandas(), y_train.to_numpy(), cat_features=['zipcode'], verbose=100,\n", 1680 | " early_stopping_rounds=10, eval_set=(X_test.to_pandas(), y_test.to_numpy()))" 1681 | ] 1682 | }, 1683 | { 1684 | "cell_type": "code", 1685 | "execution_count": null, 1686 | "metadata": { 1687 | "scrolled": true 1688 | }, 1689 | "outputs": [], 1690 | "source": [ 1691 | "# plot a validation curve tracking mse as the max_depth of the decision tree increases\n", 1692 | "from sklearn.model_selection import validation_curve\n", 1693 | "\n", 1694 | "param_range = range(1, 10)\n", 1695 | "train_scores, test_scores = validation_curve(\n", 1696 | " cr2, X_train.to_pandas(), y_train.to_numpy(), param_name=\"max_depth\", \n", 1697 | " param_range=param_range,\n", 1698 | " scoring=\"neg_mean_squared_error\", n_jobs=1,\n", 1699 | " fit_params=dict(early_stopping_rounds=10, \n", 1700 | " eval_set=(X_test.to_pandas(), y_test.to_numpy())))" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "code", 1705 | "execution_count": null, 1706 | "metadata": {}, 1707 | "outputs": [], 1708 | "source": [ 1709 | "# make a validation curve from train_scores and test_scores\n", 1710 | "import matplotlib.pyplot as plt\n", 1711 | "import numpy as np\n", 1712 | "\n", 1713 | "train_scores_mean = np.mean(train_scores, axis=1)\n", 1714 | "train_scores_std = np.std(train_scores, axis=1)\n", 1715 | "test_scores_mean = np.mean(test_scores, axis=1)\n", 1716 | "test_scores_std = np.std(test_scores, axis=1)\n", 1717 | "\n", 1718 | "plt.title(\"Validation Curve with CatBoost\")\n", 1719 | "plt.xlabel(\"max_depth\")\n", 1720 | "plt.ylabel(\"Score\")\n", 1721 | "#plt.ylim(-1, 0)\n", 1722 | "lw = 2\n", 1723 | "plt.plot(param_range, train_scores_mean, label=\"Training score\",\n", 1724 | " color=\"darkorange\", lw=lw)\n", 1725 | "plt.fill_between(param_range, train_scores_mean - train_scores_std,\n", 1726 | " train_scores_mean + train_scores_std, alpha=0.2,\n", 1727 | " color=\"darkorange\", lw=lw)\n", 1728 | "plt.plot(param_range, test_scores_mean, label=\"Cross-validation score\",\n", 1729 | " color=\"navy\", lw=lw)\n", 1730 | "\n", 1731 | 
"plt.fill_between(param_range, test_scores_mean - test_scores_std, \n", 1732 | " test_scores_mean + test_scores_std, alpha=0.2,\n", 1733 | " color=\"navy\", lw=lw)\n", 1734 | "plt.legend(loc=\"best\")\n", 1735 | "\n", 1736 | "\n" 1737 | ] 1738 | }, 1739 | { 1740 | "cell_type": "code", 1741 | "execution_count": null, 1742 | "metadata": {}, 1743 | "outputs": [], 1744 | "source": [ 1745 | "# set max_depth to 4\n", 1746 | "cr2_4 = catboost.CatBoostRegressor(iterations=3000, learning_rate=0.1,\n", 1747 | " max_depth=4)\n", 1748 | "\n", 1749 | "X_train, X_test, y_train, y_test = train_test_split(raw.drop('price'), y, \n", 1750 | " test_size=0.2, random_state=42)\n", 1751 | "\n", 1752 | "cr2_4.fit(X_train.to_pandas(), y_train.to_numpy(), cat_features=['zipcode'], verbose=100,\n", 1753 | " early_stopping_rounds=10, eval_set=(X_test.to_pandas(), y_test.to_numpy()))\n", 1754 | "cr2_4.score(X_test.to_pandas(), y_test.to_numpy())" 1755 | ] 1756 | }, 1757 | { 1758 | "cell_type": "code", 1759 | "execution_count": null, 1760 | "metadata": { 1761 | "jupyter": { 1762 | "source_hidden": true 1763 | } 1764 | }, 1765 | "outputs": [], 1766 | "source": [] 1767 | }, 1768 | { 1769 | "cell_type": "code", 1770 | "execution_count": null, 1771 | "metadata": {}, 1772 | "outputs": [], 1773 | "source": [] 1774 | }, 1775 | { 1776 | "cell_type": "code", 1777 | "execution_count": null, 1778 | "metadata": {}, 1779 | "outputs": [], 1780 | "source": [] 1781 | }, 1782 | { 1783 | "cell_type": "code", 1784 | "execution_count": null, 1785 | "metadata": {}, 1786 | "outputs": [], 1787 | "source": [] 1788 | }, 1789 | { 1790 | "cell_type": "code", 1791 | "execution_count": null, 1792 | "metadata": {}, 1793 | "outputs": [], 1794 | "source": [] 1795 | }, 1796 | { 1797 | "cell_type": "code", 1798 | "execution_count": null, 1799 | "metadata": {}, 1800 | "outputs": [], 1801 | "source": [] 1802 | }, 1803 | { 1804 | "cell_type": "code", 1805 | "execution_count": null, 1806 | "metadata": {}, 1807 | "outputs": [], 1808 | "source": [] 1809 | }, 1810 | { 1811 | "cell_type": "markdown", 1812 | "metadata": {}, 1813 | "source": [ 1814 | "\n", 1815 | "## Grid Search\n" 1816 | ] 1817 | }, 1818 | { 1819 | "cell_type": "code", 1820 | "execution_count": null, 1821 | "metadata": { 1822 | "scrolled": true 1823 | }, 1824 | "outputs": [], 1825 | "source": [ 1826 | "from sklearn.tree import DecisionTreeRegressor\n", 1827 | "\n", 1828 | "\n", 1829 | "dt = DecisionTreeRegressor()\n", 1830 | "y = raw.select('price')\n", 1831 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n", 1832 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1833 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1834 | " ('preprocessor', preprocessor),\n", 1835 | " ('dt', dt),\n", 1836 | " ])\n", 1837 | "\n", 1838 | "dt_pipe.fit(X_train, y_train)\n", 1839 | "dt_pipe.score(X_test, y_test)" 1840 | ] 1841 | }, 1842 | { 1843 | "cell_type": "code", 1844 | "execution_count": null, 1845 | "metadata": { 1846 | "collapsed": true, 1847 | "jupyter": { 1848 | "outputs_hidden": true 1849 | }, 1850 | "scrolled": true 1851 | }, 1852 | "outputs": [], 1853 | "source": [ 1854 | "dt_pipe" 1855 | ] 1856 | }, 1857 | { 1858 | "cell_type": "code", 1859 | "execution_count": null, 1860 | "metadata": { 1861 | "scrolled": true 1862 | }, 1863 | "outputs": [], 1864 | "source": [ 1865 | "# use grid search on decision tree\n", 1866 | "from sklearn.model_selection import GridSearchCV\n", 1867 | "\n", 1868 | "param_grid = {\n", 1869 | " 'dt__max_depth': [3, 6, 
9],\n", 1870 | " 'dt__min_samples_split': [10, 20, 100],\n", 1871 | " 'dt__min_samples_leaf': [10, 20, 100],\n", 1872 | "}\n", 1873 | "\n", 1874 | "grid_search = GridSearchCV(dt_pipe, param_grid, cv=5)#, scoring='neg_mean_squared_error')\n", 1875 | "grid_search.fit(X_train, y_train)" 1876 | ] 1877 | }, 1878 | { 1879 | "cell_type": "code", 1880 | "execution_count": null, 1881 | "metadata": {}, 1882 | "outputs": [], 1883 | "source": [ 1884 | "grid_search.best_params_" 1885 | ] 1886 | }, 1887 | { 1888 | "cell_type": "code", 1889 | "execution_count": null, 1890 | "metadata": { 1891 | "scrolled": true 1892 | }, 1893 | "outputs": [], 1894 | "source": [ 1895 | "# make a tree from the params\n", 1896 | "dt = DecisionTreeRegressor()#max_depth=9, min_samples_leaf=20, min_samples_split=10)\n", 1897 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1898 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1899 | " ('to_pandas', pandas_transformer),\n", 1900 | " ('preprocessor', preprocessor),\n", 1901 | " ('dt', dt),\n", 1902 | " ])\n", 1903 | "dt_pipe.set_params(**grid_search.best_params_)\n", 1904 | "dt_pipe.fit(X_train, y_train)\n", 1905 | "dt_pipe.score(X_test, y_test)" 1906 | ] 1907 | }, 1908 | { 1909 | "cell_type": "code", 1910 | "execution_count": null, 1911 | "metadata": { 1912 | "scrolled": true 1913 | }, 1914 | "outputs": [], 1915 | "source": [ 1916 | "# compare to default\n", 1917 | "dt = DecisionTreeRegressor(random_state=42)\n", 1918 | "dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n", 1919 | " ('zip_avg_price', ZipAvgPriceAdder()),\n", 1920 | " ('to_pandas', pandas_transformer),\n", 1921 | " ('preprocessor', preprocessor),\n", 1922 | " ('dt', dt),\n", 1923 | " ])\n", 1924 | "\n", 1925 | "dt_pipe.fit(X_train, y_train)\n", 1926 | "dt_pipe.score(X_test, y_test)" 1927 | ] 1928 | }, 1929 | { 1930 | "cell_type": "code", 1931 | "execution_count": null, 1932 | "metadata": {}, 1933 | "outputs": [], 1934 | "source": [] 1935 | }, 1936 | { 1937 | "cell_type": "code", 1938 | "execution_count": null, 1939 | "metadata": {}, 1940 | "outputs": [], 1941 | "source": [] 1942 | }, 1943 | { 1944 | "cell_type": "code", 1945 | "execution_count": null, 1946 | "metadata": {}, 1947 | "outputs": [], 1948 | "source": [] 1949 | }, 1950 | { 1951 | "cell_type": "code", 1952 | "execution_count": null, 1953 | "metadata": {}, 1954 | "outputs": [], 1955 | "source": [] 1956 | }, 1957 | { 1958 | "cell_type": "markdown", 1959 | "metadata": {}, 1960 | "source": [ 1961 | "\n", 1962 | "## Challenge\n", 1963 | "\n", 1964 | "Do a grid search to find the best depth for the random forest model. What is the best depth? What is the score of the model with the best depth?" 
1965 | ] 1966 | }, 1967 | { 1968 | "cell_type": "markdown", 1969 | "metadata": {}, 1970 | "source": [ 1971 | "\n", 1972 | "## Solution" 1973 | ] 1974 | }, 1975 | { 1976 | "cell_type": "code", 1977 | "execution_count": null, 1978 | "metadata": {}, 1979 | "outputs": [], 1980 | "source": [] 1981 | }, 1982 | { 1983 | "cell_type": "code", 1984 | "execution_count": null, 1985 | "metadata": {}, 1986 | "outputs": [], 1987 | "source": [] 1988 | }, 1989 | { 1990 | "cell_type": "code", 1991 | "execution_count": null, 1992 | "metadata": {}, 1993 | "outputs": [], 1994 | "source": [] 1995 | }, 1996 | { 1997 | "cell_type": "code", 1998 | "execution_count": null, 1999 | "metadata": {}, 2000 | "outputs": [], 2001 | "source": [] 2002 | }, 2003 | { 2004 | "cell_type": "markdown", 2005 | "metadata": {}, 2006 | "source": [ 2007 | "# Model Deployment\n" 2008 | ] 2009 | }, 2010 | { 2011 | "cell_type": "markdown", 2012 | "metadata": {}, 2013 | "source": [ 2014 | "\n", 2015 | "## End to end notebook\n" 2016 | ] 2017 | }, 2018 | { 2019 | "cell_type": "code", 2020 | "execution_count": null, 2021 | "metadata": {}, 2022 | "outputs": [], 2023 | "source": [ 2024 | "import polars as pl\n", 2025 | "from sklearn.pipeline import Pipeline\n", 2026 | "from sklearn.compose import ColumnTransformer\n", 2027 | "from sklearn.linear_model import LinearRegression\n", 2028 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", 2029 | "from sklearn.impute import SimpleImputer\n", 2030 | "from sklearn.model_selection import train_test_split\n", 2031 | "from sklearn.preprocessing import FunctionTransformer\n", 2032 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 2033 | "from sklearn import set_config\n", 2034 | "set_config(transform_output='polars')\n", 2035 | "\n", 2036 | "def tweak_housing(df):\n", 2037 | " return (df\n", 2038 | " .with_columns(zipcode=pl.col('zipcode').cast(pl.String).cast(pl.Categorical),\n", 2039 | " date=pl.date(pl.col('date_year'), pl.col('date_month'), pl.col('date_day')),\n", 2040 | " yr_renovated=pl.col('yr_renovated').replace(0, None),\n", 2041 | " )\n", 2042 | " .select(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', \n", 2043 | " 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', \n", 2044 | " 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', \n", 2045 | " 'sqft_lot15', 'date', #'date_year', 'date_month', 'date_day', \n", 2046 | " ])\n", 2047 | " )\n", 2048 | "\n", 2049 | "# make the pipeline\n", 2050 | "numeric_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n", 2051 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', \n", 2052 | " 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'zip_mean']\n", 2053 | "numeric_transformer = Pipeline(steps=[\n", 2054 | " ('imputer', SimpleImputer(strategy='median')),\n", 2055 | " ('scaler', StandardScaler())])\n", 2056 | "\n", 2057 | "categorical_features = ['zipcode']\n", 2058 | "\n", 2059 | "preprocessor = ColumnTransformer(\n", 2060 | " transformers=[\n", 2061 | " ('num', numeric_transformer, numeric_features),\n", 2062 | " ('cat', OneHotEncoder(handle_unknown='ignore',\n", 2063 | " sparse_output=False), categorical_features)])\n", 2064 | "\n", 2065 | "def to_pandas(df):\n", 2066 | " return df.to_pandas()\n", 2067 | "pandas_transformer = FunctionTransformer(to_pandas)\n", 2068 | "\n", 2069 | "tweak_transformer = FunctionTransformer(tweak_housing)\n", 2070 | "\n", 2071 | "class 
ZipAvgPriceAdder(BaseEstimator, TransformerMixin):\n",
2072 | " def __init__(self):\n",
2073 | " pass\n",
2074 | " def fit(self, X, y=None):\n",
2075 | " # assume X is a polars dataframe\n",
2076 | " self.zip_avg_price = (X\n",
2077 | " .group_by('zipcode')\n",
2078 | " .agg(zip_mean=pl.col('price').mean())\n",
2079 | " )\n",
2080 | " return self\n",
2081 | " \n",
2082 | " def transform(self, X, y=None):\n",
2083 | " with pl.StringCache():\n",
2084 | " return X.join(self.zip_avg_price, on='zipcode')\n",
2085 | "\n",
2086 | "\n",
2087 | "# King County House Sales dataset from OpenML (includes Seattle)\n",
2088 | "# this is an ARFF file, which is a text file with a specific format\n",
2089 | "url = 'https://www.openml.org/data/download/22044765/dataset'\n",
2090 | "cols = ['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \n",
2091 | " 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',\n",
2092 | " 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_year', 'date_month', 'date_day']\n",
2093 | "\n",
2094 | "raw = pl.read_csv(url, new_columns=cols, skip_rows=31, has_header=False)\n",
2095 | "\n",
2096 | "lr = LinearRegression()\n",
2097 | "y = raw.select('price')\n",
2098 | "X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)\n",
2099 | "lr_pipe = Pipeline(steps=[('tweak', tweak_transformer),\n",
2100 | " ('zip_avg_price', ZipAvgPriceAdder()),\n",
2101 | " ('preprocessor', preprocessor),\n",
2102 | " ('lr', lr),\n",
2103 | " ])\n",
2104 | "\n",
2105 | "lr_pipe.fit(X_train, y_train)\n",
2106 | "lr_pipe.score(X_test, y_test)\n",
2107 | " "
2108 | ]
2109 | },
2110 | {
2111 | "cell_type": "markdown",
2112 | "metadata": {},
2113 | "source": []
2114 | },
2115 | {
2116 | "cell_type": "code",
2117 | "execution_count": null,
2118 | "metadata": {},
2119 | "outputs": [],
2120 | "source": []
2121 | },
2122 | {
2123 | "cell_type": "code",
2124 | "execution_count": null,
2125 | "metadata": {},
2126 | "outputs": [],
2127 | "source": []
2128 | },
2129 | {
2130 | "cell_type": "code",
2131 | "execution_count": null,
2132 | "metadata": {},
2133 | "outputs": [],
2134 | "source": []
2135 | },
2136 | {
2137 | "cell_type": "code",
2138 | "execution_count": null,
2139 | "metadata": {},
2140 | "outputs": [],
2141 | "source": []
2142 | },
2143 | {
2144 | "cell_type": "code",
2145 | "execution_count": null,
2146 | "metadata": {},
2147 | "outputs": [],
2148 | "source": []
2149 | },
2150 | {
2151 | "cell_type": "markdown",
2152 | "metadata": {},
2153 | "source": [
2154 | "## Using MLFlow\n",
2155 | "\n",
2156 | "We'll show how to persist and load a model, but MLflow can also (see the CLI sketch after the model is logged below):\n",
2157 | "\n",
2158 | "- Start an endpoint to serve predictions\n",
2159 | "- Build a Docker image\n"
2160 | ]
2161 | },
2162 | {
2163 | "cell_type": "code",
2164 | "execution_count": null,
2165 | "metadata": {},
2166 | "outputs": [],
2167 | "source": [
2168 | "import mlflow"
2169 | ]
2170 | },
2171 | {
2172 | "cell_type": "code",
2173 | "execution_count": null,
2174 | "metadata": {},
2175 | "outputs": [],
2176 | "source": [
2177 | "mlflow.__version__"
2178 | ]
2179 | },
2180 | {
2181 | "cell_type": "code",
2182 | "execution_count": null,
2183 | "metadata": {},
2184 | "outputs": [],
2185 | "source": [
2186 | "model_info = mlflow.sklearn.log_model(lr_pipe, artifact_path='lr_pipe')"
2187 | ]
2188 | },
2189 | {
2190 | "cell_type": "code",
2191 | "execution_count": null,
2192 | "metadata": {},
2193 | "outputs": [],
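"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a hedged sketch of the other two options mentioned above (not run here): the MLflow CLI can serve the logged pipeline as a REST endpoint or package it as a Docker image. The `runs:/` URI is built from `model_info.run_id`; the image name `house-price-model` is only illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (commented out so the notebook still runs end to end):\n",
"# serve the logged model as a local REST endpoint on port 5001 ...\n",
"# !mlflow models serve -m runs:/{model_info.run_id}/lr_pipe -p 5001\n",
"# ... or build a Docker image for it (image name is illustrative)\n",
"# !mlflow models build-docker -m runs:/{model_info.run_id}/lr_pipe -n house-price-model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],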
"source": [ 2195 | "model_info.artifact_path " 2196 | ] 2197 | }, 2198 | { 2199 | "cell_type": "code", 2200 | "execution_count": null, 2201 | "metadata": {}, 2202 | "outputs": [], 2203 | "source": [ 2204 | "!tree" 2205 | ] 2206 | }, 2207 | { 2208 | "cell_type": "code", 2209 | "execution_count": null, 2210 | "metadata": {}, 2211 | "outputs": [], 2212 | "source": [ 2213 | "model_info.run_id" 2214 | ] 2215 | }, 2216 | { 2217 | "cell_type": "code", 2218 | "execution_count": null, 2219 | "metadata": {}, 2220 | "outputs": [], 2221 | "source": [ 2222 | "model = mlflow.pyfunc.load_model(f'mlruns/0/{model_info.run_id}/artifacts/lr_pipe')" 2223 | ] 2224 | }, 2225 | { 2226 | "cell_type": "code", 2227 | "execution_count": null, 2228 | "metadata": {}, 2229 | "outputs": [], 2230 | "source": [ 2231 | "model" 2232 | ] 2233 | }, 2234 | { 2235 | "cell_type": "code", 2236 | "execution_count": null, 2237 | "metadata": {}, 2238 | "outputs": [], 2239 | "source": [ 2240 | "model.predict(X_test)" 2241 | ] 2242 | }, 2243 | { 2244 | "cell_type": "code", 2245 | "execution_count": null, 2246 | "metadata": {}, 2247 | "outputs": [], 2248 | "source": [] 2249 | }, 2250 | { 2251 | "cell_type": "markdown", 2252 | "metadata": {}, 2253 | "source": [ 2254 | "## Challenge\n", 2255 | "\n", 2256 | "Reformat your notebook so that you can load the data and create an optimized random forest model in a single cell. Then, use MLFlow to log the model and its parameters." 2257 | ] 2258 | }, 2259 | { 2260 | "cell_type": "code", 2261 | "execution_count": null, 2262 | "metadata": {}, 2263 | "outputs": [], 2264 | "source": [] 2265 | }, 2266 | { 2267 | "cell_type": "markdown", 2268 | "metadata": {}, 2269 | "source": [ 2270 | "\n", 2271 | "## Solution" 2272 | ] 2273 | }, 2274 | { 2275 | "cell_type": "markdown", 2276 | "metadata": {}, 2277 | "source": [] 2278 | }, 2279 | { 2280 | "cell_type": "code", 2281 | "execution_count": null, 2282 | "metadata": {}, 2283 | "outputs": [], 2284 | "source": [] 2285 | }, 2286 | { 2287 | "cell_type": "code", 2288 | "execution_count": null, 2289 | "metadata": {}, 2290 | "outputs": [], 2291 | "source": [] 2292 | } 2293 | ], 2294 | "metadata": { 2295 | "kernelspec": { 2296 | "display_name": "Python 3 (ipykernel)", 2297 | "language": "python", 2298 | "name": "python3" 2299 | }, 2300 | "language_info": { 2301 | "codemirror_mode": { 2302 | "name": "ipython", 2303 | "version": 3 2304 | }, 2305 | "file_extension": ".py", 2306 | "mimetype": "text/x-python", 2307 | "name": "python", 2308 | "nbconvert_exporter": "python", 2309 | "pygments_lexer": "ipython3", 2310 | "version": "3.10.13" 2311 | } 2312 | }, 2313 | "nbformat": 4, 2314 | "nbformat_minor": 4 2315 | } 2316 | --------------------------------------------------------------------------------