├── .github
│   └── workflows
│       └── test.yaml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── data
│   ├── adult-census.csv
│   ├── ames.csv
│   ├── ames_raw.csv
│   └── planes.csv
├── environment.yaml
├── notebooks
│   ├── 00-Introduction.ipynb
│   ├── 00-Setting-the-Stage.ipynb
│   ├── 01-git.ipynb
│   ├── 02-explore_data.ipynb
│   ├── 03-first_model.ipynb
│   ├── 04-modular_code.ipynb
│   ├── 05-feat_eng.ipynb
│   ├── 06-model_eval.ipynb
│   ├── 07-modularity-pt2.ipynb
│   ├── 08-testing.ipynb
│   ├── 09-ml_lifecycle_mgt.ipynb
│   ├── Case Study Solutions.ipynb
│   ├── Case Study.ipynb
│   ├── images
│   │   ├── all-split-1.png
│   │   ├── api_diagram-columntransformer.svg
│   │   ├── api_diagram-predictor.fit.svg
│   │   ├── api_diagram-predictor.predict.svg
│   │   ├── api_diagram-predictor.score.svg
│   │   ├── api_diagram-transformer.fit.svg
│   │   ├── api_diagram-transformer.fit_transform.svg
│   │   ├── api_diagram-transformer.transform.svg
│   │   ├── assert-false.png
│   │   ├── bias-model-1.png
│   │   ├── brad.jpg
│   │   ├── clone-repo.png
│   │   ├── clustering_vs_pca.jpeg
│   │   ├── commit.png
│   │   ├── create-repo.png
│   │   ├── cross_validation_diagram.png
│   │   ├── cv.png
│   │   ├── engineering-icon.jpeg
│   │   ├── ethan-headshot.gif
│   │   ├── ethan.jpg
│   │   ├── example-knn-1.png
│   │   ├── explore-icon.jpeg
│   │   ├── gh-desktop.png
│   │   ├── grid_search_cross_validation.png
│   │   ├── gus.jpg
│   │   ├── jay.jpg
│   │   ├── justice-icon.jpg
│   │   ├── legos.jpg
│   │   ├── local_url.png
│   │   ├── ls.png
│   │   ├── machine_learning.png
│   │   ├── ml_types.png
│   │   ├── ml_types2.jpeg
│   │   ├── mlflow_capabilities.png
│   │   ├── mlflow_tech_icon.png
│   │   ├── mlruns_directory.png
│   │   ├── model_registry_governance.png
│   │   ├── model_registry_mlops.png
│   │   ├── model_registry_visibility.png
│   │   ├── modeling-process-bias-model-1.png
│   │   ├── modeling-process-bias-model-2.png
│   │   ├── modeling-process-knn-options-1.png
│   │   ├── modeling-process-variance-model-1.png
│   │   ├── modeling-process-variance-model-2.png
│   │   ├── modeling_process.png
│   │   ├── my-module.png
│   │   ├── new-text-file.png
│   │   ├── pattern1.jpeg
│   │   ├── pattern2.jpeg
│   │   ├── patterns.png
│   │   ├── prefix-prompt.png
│   │   ├── process-icon.svg
│   │   ├── push-origin.png
│   │   ├── random_search.png
│   │   ├── registered_models.png
│   │   ├── rename-script.png
│   │   ├── resampling.png
│   │   ├── resampling.svg
│   │   ├── scikit-learn-logo.png
│   │   ├── sidebar-script.png
│   │   ├── variance-model-1.png
│   │   └── what_is_ml.jpeg
│   ├── my_module.py
│   └── tests.py
└── scripts
    ├── generate_slides.sh
    └── prep_nb_for_ci.py
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on: push
4 |
5 | jobs:
6 | validate-notebooks:
7 | name: Validate Notebooks
8 | runs-on: ubuntu-latest
9 | defaults:
10 | run:
11 | # Required for "run" commands to execute in the conda env.
12 | shell: bash -l {0}
13 | steps:
14 | - name: Checkout Code
15 | uses: actions/checkout@v2
16 | - name: Set Up Conda Environment
17 | uses: conda-incubator/setup-miniconda@v2
18 | with:
19 | activate-environment: uc-python
20 | environment-file: environment.yaml
21 | - name: Set Up Jupyter Kernel
22 | run: |
23 | python -m ipykernel install --user --name uc-python
24 | - name: Install Papermill
25 | run: |
26 | conda install papermill
27 | - name: Prep notebooks
28 | run: |
29 | # Remove nb cells that should be skipped in CI.
30 | for nb in notebooks/*.ipynb; do
31 | python scripts/prep_nb_for_ci.py "$nb"
32 | done
33 | - name: Run notebooks
34 | run: |
35 | for nb in notebooks/*.ipynb; do
36 | echo "running $nb"
37 | output=$(papermill --cwd notebooks/ "$nb" -)
38 | done
39 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | slides
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # SageMath parsed files
83 | *.sage.py
84 |
85 | # Environments
86 | .env
87 | .venv
88 | env/
89 | venv/
90 | ENV/
91 | env.bak/
92 | venv.bak/
93 |
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 |
98 | # Rope project settings
99 | .ropeproject
100 |
101 | # mkdocs documentation
102 | /site
103 |
104 | # mypy
105 | .mypy_cache/
106 |
107 | .DS_Store
108 |
109 | notebooks/mlruns
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Python for Data Science @ University of Cincinnati
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | IMAGESIN=$(wildcard notebooks/images/*)
2 | IMAGESOUT=$(patsubst notebooks/%,slides/%,$(IMAGESIN))
3 | NBFILES=$(wildcard notebooks/*-*.ipynb)
4 | HTMLFILES=$(patsubst notebooks/%.ipynb,slides/%.slides.html,$(NBFILES))
5 |
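# Copy notebook images into slides/ and render each numbered notebook to
# HTML slides via scripts/generate_slides.sh (comments added for clarity).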
6 | slides: html
7 |
8 | images: slides/images $(IMAGESOUT)
9 |
10 | slides/images:
11 | mkdir -p slides/images
12 |
13 | $(IMAGESOUT): slides/images/%: notebooks/images/%
14 | cp -a $< $@
15 |
16 | html: images $(HTMLFILES)
17 |
18 | $(HTMLFILES): slides/%.slides.html: notebooks/%.ipynb
19 | bash scripts/generate_slides.sh $<
20 |
21 | clean:
22 | rm -rf slides/
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Advanced Python for Data Science
2 |
3 | [![Test](https://github.com/uc-python/advanced-python-datasci/workflows/Test/badge.svg)](https://github.com/uc-python/advanced-python-datasci/actions?query=workflow%3ATest)
4 |
5 | ### Course Description
6 |
7 | This two-day course introduces the use of Python for advanced machine learning applications.
8 | Most of the time will be spent working through example problems end-to-end in the classroom.
9 | Students will learn the fundamentals of the scikit-learn library and explore several other tools and methodologies for implementing a robust end-to-end machine learning workflow.
10 | Some additional time will be reserved for discussing real programming challenges students have encountered, and for an overview of relevant technologies students may need in an industry setting (e.g. Git and GitHub).
11 |
12 | ### Objectives
13 |
14 | 1. Develop an intuition for the machine learning workflow and Python tooling.
15 | 2. Build familiarity with common software engineering tooling and methodologies for implementing a machine learning project.
16 | 3. Gain a high-level understanding of the function of data science-adjacent technologies that students will encounter in the workplace, focusing on [Git](https://git-scm.com) and [GitHub](https://github.com).
17 |
18 | ### Prerequisites
19 |
20 | - Strong understanding of core Python concepts: variables, loops, conditionals, and functions
21 | - Some experience using Jupyter Notebooks or Jupyter Lab
22 | - Solid grasp of Pandas and how to use it for data manipulation: filtering, selecting, aggregating, slicing (indexing), and updating
23 | - High-level understanding of modeling concepts: training and test data, model accuracy, and overfitting
24 |
25 | ### Agenda
26 | **This workshop will be 100% virtual over 4 half-days.**
27 |
28 | | Day | Topic | Time |
29 | | :-: | :----------------------------------------------------------------------------- | :-----------: |
30 | | 1 | Introductions | 12:45 - 1:00 |
31 | | | Setting the Stage | 1:00 - 1:15 |
32 | | | Git & version control | 1:15 - 2:00 |
33 | | | Break | 2:00 - 2:15 |
34 | | | EDA & Our First scikit-learn Model | 2:15 - 3:45 |
35 | | | Q&A | 3:45 - 4:15 |
36 | | 2 | Q&A | 12:45 - 1:00 |
37 | | | Modular Code | 1:00 - 2:00 |
38 | | | Feature Engineering | 2:00 - 3:00 |
39 | | | Break | 3:00 - 3:15 |
40 | | | Case Study, pt. 1 | 3:15 - 4:00 |
41 | | | Q&A | 4:00 - 4:15 |
42 | | 3 | Q&A | 12:45 - 1:00 |
43 | | | Model Evaluation & Selection | 1:00 - 2:15 |
44 | | | Break | 2:15 - 2:30 |
45 | | | More on Modular Code | 2:30 - 3:15 |
46 | | | Unit Tests | 3:15 - 4:00 |
47 | | | Q&A | 4:00 - 4:15 |
48 | | 4 | Q&A | 12:45 - 1:00 |
49 | | | More on Unit Tests | 1:00 - 1:30 |
50 | | | ML lifecycle management | 1:30 - 2:30 |
51 | | | Break | 2:30 - 2:45 |
52 | | | Case Study, pt. 2 | 2:45 - 3:45 |
53 | | | Case Study Review, pt. 2 and Q&A | 3:45 - 4:15 |
54 |
55 | ### Course Preparation
56 |
57 | You will need to install Python, Jupyter, and the relevant libraries on your personal computer for this workshop. We also recommend downloading the course materials.
58 |
59 | See below for instructions on doing so.
60 |
61 | #### 1. Install Python, Jupyter and Needed Packages
62 |
63 | The easiest way to install Python, Jupyter, and the necessary packages is through Anaconda. To download and install Anaconda and its graphical interface, Anaconda Navigator, follow these steps:
64 |
65 | 1. Visit the [Anaconda download page](https://www.anaconda.com/products/individual).
66 | 2. Select your appropriate operating system.
67 | 3. Click the "Download" button for Anaconda Individual Edition, Python 3.9; this will begin downloading the Anaconda installer.
68 | - If a popup appears, asking you to sign up for anything, you can close the window.
69 | 4. Open the installer when the download completes, and then follow the prompts. If you are prompted about installing PyCharm, elect **not** to do so.
70 | 5. Once installed, open the Anaconda Navigator and launch a Jupyter Notebook to ensure it works.
71 | 6. Download the class materials (see the section below) and use the included `environment.yaml` file to create a new environment from Anaconda Navigator, using these steps:
72 | - In the tabs along the left side, select "Environments".
73 | - At the bottom of the list of environments (you will likely have just one, "base"), look for the "Import" button. Click it.
74 | - In the dialog box that appears, click on the folder icon and then navigate your computer's files in order to select the `environment.yaml` file you downloaded earlier. Click "Open" once you've selected it.
75 | - Wait for Anaconda Navigator to finish fetching and installing the needed packages. When it finishes, a new environment called "uc-python" should show up in the list.
76 |
77 |
78 | #### 2. Download Class Materials
79 |
80 | There are two ways to download the class materials:
81 |
82 | 1. Clone it - If you're familiar with using Git, we recommend cloning the repo.
83 | 2. Download the files as a zip - This will allow you to download a static copy of the files here, but in order to get any updates you'll need to redownload the entire repo. Use [this link](https://github.com/uc-python/advanced-python-datasci/archive/master.zip).
84 |
85 | ### Your Instructors
86 |
87 | If you have any specific questions prior to the class you can reach out to us directly via GitHub or email:
88 |
89 | * Ethan Swan: [GitHub](https://www.github.com/eswan18) & [Email](mailto:ethanpswan@gmail.com)
90 | * Bradley Boehmke: [GitHub](https://www.github.com/bradleyboehmke) & [Email](mailto:bradleyboehmke@gmail.com)
91 | * Gus Powers: [GitHub](https://www.github.com/augustopher) & [Email](mailto:guspowers0@gmail.com)
92 | * Jay Cunningham: [GitHub](https://github.com/cunningjames) & [Email](mailto:james@notbadafterall.com)
93 |
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | name: uc-python
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.11
7 | - category_encoders>=2.6
8 | - ipykernel>=6.28
9 | - matplotlib>=3.8
10 | - missingno>=0.4
11 | - mlflow=2.9
12 | - nbconvert>=7.14
13 | - numpy>=1.26
14 | - pandas>=2.1
15 | - pip>=23.3
16 | - plotnine>=0.12
17 | - pytest>=7.4
18 | - scikit-learn>=1.3
19 | - seaborn>=0.13
20 |
--------------------------------------------------------------------------------
/notebooks/00-Introduction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "slide"
8 | }
9 | },
10 | "source": [
11 | "# Advanced Python for Data Science\n",
12 | "\n",
13 | "Gus Powers and Jay Cunningham\n",
14 | "\n",
15 | "January 2023"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "slideshow": {
22 | "slide_type": "slide"
23 | }
24 | },
25 | "source": [
26 | "## Introductions"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "slideshow": {
33 | "slide_type": "slide"
34 | }
35 | },
36 | "source": [
37 | "## Gus Powers\n",
38 |     "\n",
39 |     "**Lead Data Scientist at 84.51°**\n",
40 |     "\n",
41 |     "- Creating and maintaining data science tools for internal use\n",
42 |     "- Python, Bash (shell), & R\n",
43 |     "\n",
44 |     "**Academic**\n",
45 |     "\n",
46 |     "- BS, Chemistry, Thomas More College\n",
47 |     "- MS, Chemistry, University of Cincinnati\n",
48 |     "- MS, Business Analytics, University of Cincinnati\n",
49 |     "\n",
50 |     "**Contact**\n",
51 |     "\n",
52 |     "- [GitHub](https://www.github.com/augustopher)\n",
53 |     "- [Email](mailto:guspowers0@gmail.com)"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {
65 | "slideshow": {
66 | "slide_type": "slide"
67 | }
68 | },
69 | "source": [
70 | "## Jay Cunningham\n",
71 | "\n",
72 |     "**Lead Data Scientist at 84.51°**\n",
73 |     "\n",
74 |     "- Researching and developing forecasting models\n",
75 |     "- Machine learning, Python\n",
76 |     "\n",
77 |     "**Academic**\n",
78 |     "\n",
79 |     "- BA, Mathematics, University of Kentucky\n",
80 |     "- MA, Economics, University of North Carolina (Greensboro)\n",
81 |     "\n",
82 |     "**Contact**\n",
83 |     "\n",
84 |     "- [GitHub](https://github.com/cunningjames)\n",
85 |     "- [Email](mailto:james@notbadafterall.com)"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {
96 | "slideshow": {
97 | "slide_type": "skip"
98 | }
99 | },
100 | "source": [
101 | "## Brad Boehmke\n",
102 | "\n",
103 |     "**Director, Data Science at 84.51°**\n",
104 |     "\n",
105 |     "- Productionizing models and science solutions\n",
106 |     "- R&D and prototyping new solutions\n",
107 |     "- Python, R, & MLOps toolchain\n",
108 |     "\n",
109 |     "**Academic**\n",
110 |     "\n",
111 |     "- BS, Kinesiology, North Dakota State University\n",
112 |     "- MS, Cost Analytics, Air Force Institute of Technology\n",
113 |     "- PhD, Logistics, Air Force Institute of Technology\n",
114 |     "\n",
115 |     "**Contact**\n",
116 |     "\n",
117 |     "- [GitHub](https://www.github.com/bradleyboehmke)\n",
118 |     "- [Email](mailto:bradleyboehmke@gmail.com)"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {
132 | "cell_style": "split",
133 | "slideshow": {
134 | "slide_type": "skip"
135 | }
136 | },
137 | "source": [
138 | "## Ethan Swan\n",
139 | "\n",
140 |     "**Senior Backend Engineer at ReviewTrackers**\n",
141 |     "\n",
142 |     "- REST API development\n",
143 |     "- Putting ML models in production\n",
144 |     "- Python, Go, Ruby, & ReactJS (JavaScript)\n",
145 |     "\n",
146 |     "**Academic**\n",
147 |     "\n",
148 |     "- BS, Computer Science, University of Notre Dame\n",
149 |     "- MBA, Business Analytics, University of Notre Dame\n",
150 |     "\n",
151 |     "**Contact**\n",
152 |     "\n",
153 |     "- [GitHub](https://www.github.com/eswan18)\n",
154 |     "- [Email](mailto:ethanpswan@gmail.com)"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {
168 | "slideshow": {
169 | "slide_type": "slide"
170 | }
171 | },
172 | "source": [
173 | "## Your Turn\n",
174 | "\n",
175 | "We'll go around the room. Please share:\n",
176 | "- Your name\n",
177 | "- Your job or field\n",
178 | "- How you use Python now or would like to in the future"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {
184 | "slideshow": {
185 | "slide_type": "slide"
186 | }
187 | },
188 | "source": [
189 | "## Course"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {
195 | "slideshow": {
196 | "slide_type": "slide"
197 | }
198 | },
199 | "source": [
200 | "## Course Objectives\n",
201 | "\n",
202 | "The following are the primary learning objectives of this course:"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {
208 | "slideshow": {
209 | "slide_type": "fragment"
210 | }
211 | },
212 | "source": [
213 | "- Develop an intuition for the machine learning workflow and Python tooling."
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {
219 | "slideshow": {
220 | "slide_type": "fragment"
221 | }
222 | },
223 | "source": [
224 | "- Build familiarity with common software engineering tooling and methodologies for implementing a machine learning project."
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {
230 | "slideshow": {
231 | "slide_type": "fragment"
232 | }
233 | },
234 | "source": [
235 | "- Gain hands-on experience with the tools and processes discussed with applied case study work."
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {
241 | "slideshow": {
242 | "slide_type": "slide"
243 | }
244 | },
245 | "source": [
246 | "## Course Agenda"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {
252 | "slideshow": {
253 | "slide_type": "slide"
254 | }
255 | },
256 | "source": [
257 | "| Day | Topic | Time |\n",
258 | "| :-: | :----------------------------------------------------------------------------- | :-----------: |\n",
259 | "| 1 | Introductions | 12:45 - 1:00 |\n",
260 | "| | Setting the Stage | 1:00 - 1:15 |\n",
261 | "| | Git & version control | 1:15 - 2:00 |\n",
262 | "| | Break | 2:00 - 2:15 |\n",
263 | "| | EDA & Our First scikit-learn Model | 2:15 - 3:45 |\n",
264 | "| | Q&A | 3:45 - 4:15 |\n",
265 | "| 2 | Q&A | 12:45 - 1:00 |\n",
266 | "| | Modular Code | 1:00 - 2:00 |\n",
267 | "| | Feature Engineering | 2:00 - 3:00 |\n",
268 | "| | Break | 3:00 - 3:15 |\n",
269 | "| | Case Study, pt. 1 | 3:15 - 4:00 |\n",
270 | "| | Q&A | 4:00 - 4:15 |"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {
276 | "slideshow": {
277 | "slide_type": "slide"
278 | }
279 | },
280 | "source": [
281 | "| Day | Topic | Time |\n",
282 | "| :-: | :----------------------------------------------------------------------------- | :-----------: |\n",
283 | "| 3 | Q&A | 12:45 - 1:00 |\n",
284 | "| | Model Evaluation & Selection | 1:00 - 2:15 |\n",
285 | "| | Break | 2:15 - 2:30 |\n",
286 | "| | More on Modular Code | 2:30 - 3:15 |\n",
287 | "| | Unit Tests | 3:15 - 4:00 |\n",
288 | "| | Q&A | 4:00 - 4:15 |\n",
289 | "| 4 | Q&A | 12:45 - 1:00 |\n",
290 | "| | More on Unit Tests | 1:00 - 1:30 |\n",
291 | "| | ML lifecycle management | 1:30 - 2:30 |\n",
292 | "| | Break | 2:30 - 2:45 |\n",
293 | "| | Case Study, pt. 2 | 2:45 - 3:45 |\n",
294 | "| | Case Study Review, pt. 2 and Q&A | 3:45 - 4:15 |"
295 | ]
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {
300 | "slideshow": {
301 | "slide_type": "slide"
302 | }
303 | },
304 | "source": [
305 | "## Course Philosophy"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "slideshow": {
312 | "slide_type": "fragment"
313 | }
314 | },
315 | "source": [
316 | "Beginners typically need the instructor to make connections and solve problems for them.\n",
317 | "\n",
318 | "*Why is this code not running?\n",
319 |     "What types of real-world problems could I use this package for?*"
320 | ]
321 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {
330 | "slideshow": {
331 | "slide_type": "fragment"
332 | }
333 | },
334 | "source": [
335 | "But as intermediate to advanced users, we believe you'll be more capable of seeing those connections yourselves.\n",
336 | "Instead of diving into details and working through small code examples, this advanced workshop takes a slightly different approach..."
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {
342 | "slideshow": {
343 | "slide_type": "slide"
344 | }
345 | },
346 | "source": [
347 | "- **Give you an overview of the tools you might need to solve a problem**. We can't teach you machine learning in just two days, but we *can* give you a foundation. And as experienced coders, you'll be able to fill in the details yourselves when the time comes to use these tools."
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {
353 | "slideshow": {
354 | "slide_type": "fragment"
355 | }
356 | },
357 | "source": [
358 |     "- **Explain more of the intuition behind tools and techniques**. Beginners can't yet see the forest for the trees -- they are caught up in small problems and not yet ready to understand the big picture. But in this class we will talk more about general design patterns of Python and its libraries, in a way that should help you *learn them* instead of simply memorizing functions."
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {
364 | "slideshow": {
365 | "slide_type": "fragment"
366 | }
367 | },
368 | "source": [
369 |     "- **Expect you to help yourself**. We'll still be here to answer questions and help with hard problems, but the mark of an experienced programmer is that they consult references often (Google, documentation, etc.) and can find answers there. You'll need to do that during this course and afterward when you apply the techniques we discuss."
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {
375 | "slideshow": {
376 | "slide_type": "slide"
377 | }
378 | },
379 | "source": [
380 | "## Prerequisites"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {
386 | "slideshow": {
387 | "slide_type": "slide"
388 | }
389 | },
390 | "source": [
391 | "### Python\n",
392 | "\n",
393 | "- If you're attending this class, it's assumed you're comfortable with the material covered in the [Introduction to Python for Data Science](https://github.com/uc-python/intro-python-datasci) and [Intermediate Python for Data Science](https://github.com/uc-python/intermediate-python-datasci) classes.\n",
394 | "- At a very high level, those courses covered:\n",
395 | " - Importing data into and exporting data out of Python, via Pandas\n",
396 | " - Wrangling data in Python with Pandas\n",
397 | " - Basics of visualization with Seaborn\n",
398 | " - Control flow\n",
399 | " - Writing functions\n",
400 | " - Conda environments\n",
401 | " - Running Python outside of Jupyter notebooks\n",
402 | " - Basics of modeling with scikit-learn"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "slideshow": {
409 | "slide_type": "slide"
410 | }
411 | },
412 | "source": [
413 | "### Jupyter\n",
414 | "\n",
415 | "* If you're attending this class, it's assumed you're comfortable with launching and using Python via Jupyter Notebooks -- and ideally outside of Jupyter as well.\n",
416 | "* Course materials (slides, case studies, etc.) will be in Jupyter Notebooks, but you're free to use your IDE of choice when completing exercises and case studies."
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {
422 | "slideshow": {
423 | "slide_type": "slide"
424 | }
425 | },
426 | "source": [
427 | "## Technology Setup"
428 | ]
429 | },
430 | {
431 | "cell_type": "markdown",
432 | "metadata": {
433 | "slideshow": {
434 | "slide_type": "slide"
435 | }
436 | },
437 | "source": [
438 | "- Unlike our other courses, Advanced Python is not designed with Binder in mind.\n",
439 | "- This means that you'll need to use your personal laptop to run today's code.\n",
440 |     "- Why? We're going to be working with bigger data and more computationally intensive algorithms, for which Binder is not well-equipped.\n",
441 | " - In an industry setting, using these techniques would best be done on a *server*, not a personal computer."
442 | ]
443 | },
444 | {
445 | "cell_type": "markdown",
446 | "metadata": {
447 | "slideshow": {
448 | "slide_type": "slide"
449 | }
450 | },
451 | "source": [
452 | "### Anaconda\n",
453 | "\n",
454 | "* Anaconda is the easiest way to install Python 3 and Jupyter.\n",
455 | "* If you have not yet installed Anaconda, please follow the [directions in the course README](https://github.com/uc-python/advanced-python-datasci).\n",
456 | "* Be sure that all Python packages listed in the [environment.yaml](https://github.com/uc-python/advanced-python-datasci/blob/master/environment.yaml) are installed. See [here](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-from-an-environment-yml-file) for instructions on creating a Conda environment from an environment.yaml file.\n",
457 |     "* This Anaconda installation will not be able to natively display the course content as slides, but we recommend using it for completing exercises and the case studies."
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "metadata": {
463 | "slideshow": {
464 | "slide_type": "slide"
465 | }
466 | },
467 | "source": [
468 | "### JupyterLab\n",
469 | "- If you took the introductory and/or intermediate courses, you may have used Jupyter Notebooks to write Python.\n",
470 | "- Jupyter Notebooks are slowly being deprecated in favor of a new, more featureful product called JupyterLab.\n",
471 |     "- JupyterLab is extremely similar to use, while the classic Notebook interface is no longer being updated.\n",
472 |     "- We recommend using JupyterLab today even if you haven't used it before -- it comes packaged with Anaconda and should feel very familiar!"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "metadata": {
478 | "slideshow": {
479 | "slide_type": "slide"
480 | }
481 | },
482 | "source": [
483 | "## Course Materials"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {
489 | "slideshow": {
490 | "slide_type": "fragment"
491 | }
492 | },
493 | "source": [
494 | "* All of the material for this course can be reached from the [GitHub repository](https://github.com/uc-python/advanced-python-datasci).\n",
495 | "* This repository has access to the slides and notebooks.\n",
496 | "* You should download the material -- available via [this link](https://github.com/uc-python/advanced-python-datasci/archive/master.zip) -- and open it via Anaconda Navigator and Jupyter Notebooks/Lab."
497 | ]
498 | },
499 | {
500 | "cell_type": "markdown",
501 | "metadata": {
502 | "slideshow": {
503 | "slide_type": "slide"
504 | }
505 | },
506 | "source": [
507 | "### Slides *are* Notebooks"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {
513 | "slideshow": {
514 | "slide_type": "fragment"
515 | }
516 | },
517 | "source": [
518 | "- We'll be showing the material in slide format most of the time.\n",
519 | "- These slides contain the same content as your notebooks, so you can follow along and run cells as we go."
520 | ]
521 | },
522 | {
523 | "cell_type": "markdown",
524 | "metadata": {
525 | "slideshow": {
526 | "slide_type": "slide"
527 | }
528 | },
529 | "source": [
530 | "### Source Code"
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {
536 | "slideshow": {
537 | "slide_type": "fragment"
538 | }
539 | },
540 | "source": [
541 | "* Source code for the training can be found on [GitHub](https://github.com/uc-python/advanced-python-datasci)\n",
542 | "* This repository is public so you can clone (download) and/or refer to the materials at any point in the future"
543 | ]
544 | },
545 | {
546 | "cell_type": "markdown",
547 | "metadata": {
548 | "slideshow": {
549 | "slide_type": "slide"
550 | }
551 | },
552 | "source": [
553 | "## Questions\n",
554 | "\n",
555 | "Are there any questions before moving on?"
556 | ]
557 | }
558 | ],
559 | "metadata": {
560 | "celltoolbar": "Slideshow",
561 | "kernelspec": {
562 | "display_name": "Python 3 (ipykernel)",
563 | "language": "python",
564 | "name": "python3"
565 | },
566 | "language_info": {
567 | "codemirror_mode": {
568 | "name": "ipython",
569 | "version": 3
570 | },
571 | "file_extension": ".py",
572 | "mimetype": "text/x-python",
573 | "name": "python",
574 | "nbconvert_exporter": "python",
575 | "pygments_lexer": "ipython3",
576 | "version": "3.7.11"
577 | },
578 | "rise": {
579 | "autolaunch": true,
580 | "transition": "none"
581 | }
582 | },
583 | "nbformat": 4,
584 | "nbformat_minor": 4
585 | }
586 |
--------------------------------------------------------------------------------
/notebooks/01-git.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "slide"
8 | },
9 | "tags": []
10 | },
11 | "source": [
12 | "# Git and GitHub"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {
18 | "jp-MarkdownHeadingCollapsed": true,
19 | "slideshow": {
20 | "slide_type": "slide"
21 | },
22 | "tags": []
23 | },
24 | "source": [
25 | "## Lesson Goals\n",
26 | "- Understand the purpose of version control systems\n",
27 | "- Create a GitHub account\n",
28 | "- Upload your first code to GitHub"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "tags": []
35 | },
36 | "source": [
37 | "## Prerequisites\n",
38 | "- None"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "slideshow": {
45 | "slide_type": "slide"
46 | },
47 | "tags": []
48 | },
49 | "source": [
50 | "## Code Management\n",
51 | "\n",
52 | "Some new considerations come up as we build larger projects that eventually go into production..."
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "slideshow": {
59 | "slide_type": "fragment"
60 | },
61 | "tags": []
62 | },
63 | "source": [
64 | "- What happens if our computer breaks or is lost? Will we lose all of our code?"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {
70 | "slideshow": {
71 | "slide_type": "fragment"
72 | },
73 | "tags": []
74 | },
75 | "source": [
76 | "- What if we make changes to our code that we later want to roll back? For example, trying a different method of cleaning our data and eventually discovering that it yields worse results."
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {
82 | "slideshow": {
83 | "slide_type": "fragment"
84 | },
85 | "tags": []
86 | },
87 | "source": [
88 |     "- How do multiple people work on the same project without sending files back and forth, and stepping on each other's toes?"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "\n",
96 | "
Discussion
\n",
97 | " Has anyone faced problems like these before?\n",
98 | "
"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {
104 | "slideshow": {
105 | "slide_type": "slide"
106 | },
107 | "tags": []
108 | },
109 | "source": [
110 | "## Version Control Systems (VCS)\n",
111 | "The most common industry solution is a **Version Control System**, which:\n",
112 | "- Provides a backup of your code on a separate computer\n",
113 | "- Tracks every change made to the code, allowing you to see when certain code was updated and to roll back to an earlier state if necessary\n",
114 | "- Helps with collaboration by letting contributors work on different things in parallel and then \"merge\" their changes together later"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "slideshow": {
121 | "slide_type": "fragment"
122 | },
123 | "tags": []
124 | },
125 | "source": [
126 | "Typically, an organization will have one central VCS where all projects are managed."
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {
132 | "slideshow": {
133 | "slide_type": "slide"
134 | },
135 | "tags": []
136 | },
137 | "source": [
138 | "## Common VCS Options"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "Far and away the most popular VCS tool is Git, which is notable for its scalability and performance.\n",
146 | "\n",
147 | "Unfortunately, it's often challenging for beginners; its interface can be overwhelming."
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {
153 | "slideshow": {
154 | "slide_type": "fragment"
155 | },
156 | "tags": []
157 | },
158 | "source": [
159 | "Other VCSes exist, and were more common until Git became dominant around 2010.\n",
160 | "You may have heard of these or even used them:\n",
161 | "- Mercurial\n",
162 | "- Subversion"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {
168 | "slideshow": {
169 | "slide_type": "slide"
170 | },
171 | "tags": []
172 | },
173 | "source": [
174 | "## Git and GitHub\n",
175 | "Git is generally used in tandem with a website where your code can be kept, viewed, and managed.\n",
176 | "There are several, but the most commonly used site is **GitHub**."
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {
182 | "slideshow": {
183 | "slide_type": "fragment"
184 | },
185 | "tags": []
186 | },
187 | "source": [
188 | "Not only does GitHub provide a good interface for viewing code, it also features:\n",
189 | "- Project management tools\n",
190 | "- Collaboration tools\n",
191 | "\n",
192 |     "Both are tightly integrated with your code -- very convenient for developer and data science teams.\n",
193 | "\n",
194 | "GitHub offers most of its tools for free and has become the home of most popular open source projects (such as Python itself and the pandas library)."
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {
200 | "slideshow": {
201 | "slide_type": "fragment"
202 | },
203 | "tags": []
204 | },
205 | "source": [
206 | "\n",
207 | "
Note
\n",
208 | " There are competing services to GitHub, such as
GitLab and
Bitbucket , but GitHub is by far the most popular tool -- to the point that employers sometimes ask for your GitHub profile to see your portfolio.\n",
209 | "
"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {
215 | "slideshow": {
216 | "slide_type": "slide"
217 | },
218 | "tags": []
219 | },
220 | "source": [
221 | "## Creating a GitHub Account\n",
222 | "\n",
223 | "*(If you already have a GitHub account, you may skip these steps. Just log into your account so we can push code to it later.)*"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {
229 | "slideshow": {
230 | "slide_type": "fragment"
231 | },
232 | "tags": []
233 | },
234 | "source": [
235 | "1. Go to `github.com` and find the **Sign Up** button.\n",
236 | " - When prompted, enter your email address, a new password, and a username\n",
237 | " - This username will be your identifier on GitHub, so make sure you'd be comfortable sharing it with an employer or colleague\n",
238 | "2. You may then need to solve a Captcha-like puzzle and verify your email address. Do so.\n",
239 | "\n",
240 | "3. Once the account is created, you may be asked whether to create a new project, or \"repository\". We'll do that, but not yet!"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {
246 | "slideshow": {
247 | "slide_type": "slide"
248 | },
249 | "tags": []
250 | },
251 | "source": [
252 | "## GitHub Tour\n",
253 | "\n",
254 | "*Demo of Profile and Repos*"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {
260 | "slideshow": {
261 | "slide_type": "slide"
262 | },
263 | "tags": []
264 | },
265 | "source": [
266 | "## Repositories\n",
267 | "\n",
268 | "- As we saw, repositories are just projects.\n",
269 | "- For short, we usually call them **repos**.\n",
270 | "- Generally, it's good to have a unique repository in GitHub for every project you work on.\n",
271 | "- Let's create a repo for the code we write in this workshop!"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "id": "803ff344",
277 | "metadata": {
278 | "slideshow": {
279 | "slide_type": "slide"
280 | }
281 | },
282 | "source": [
283 | "## Creating a Repo\n",
284 | "\n",
285 | "1. Go back to GitHub.\n",
286 | "2. If today is the first time you've used GitHub, the site may immediately prompt you to create a repo. If so, click that.\n",
287 | " - If not, look in the left sidebar for a \"New\" button and click that -- it should take you to a repo creation page.\n",
288 | "3. In the _Name_ field enter \"advanced-python-datasci\", and in the _Description_ enter \"Working through the Advanced Python for Data Science workshop\".\n",
289 | "4. There are three boxes below; check the first two. Those should be \"Add a readme\" and \"Add a gitignore\".\n",
290 | " - The gitignore checkbox should show a dropdown below it, \"gitignore template\". Look through that list and select _Python_.\n",
291 | "5. Then press the \"Create Repository\" button!"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "id": "92d0d89e",
297 | "metadata": {
298 | "slideshow": {
299 | "slide_type": "slide"
300 | }
301 | },
302 | "source": [
303 |     "![Creating a new repository](images/create-repo.png)"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {
309 | "slideshow": {
310 | "slide_type": "slide"
311 | },
312 | "tags": []
313 | },
314 | "source": [
315 | "## GitHub Desktop\n",
316 | "\n",
317 | "Next, we'll download a piece of software from GitHub that handles syncing our code with our repository: *GitHub Desktop*\n",
318 | "\n",
319 | "_Note: if you're already comfortable using Git from the command line, you can skip this part; just clone the repo we've created as we'll be using it for the rest of the workshop._\n",
320 | "\n",
321 | "1. Go to https://desktop.github.com and download the application.\n",
322 | "2. Once it's downloaded and installed, open it.\n",
323 |     "\n",
    "![GitHub Desktop](images/gh-desktop.png)"
324 | ]
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "metadata": {
329 | "slideshow": {
330 | "slide_type": "slide"
331 | },
332 | "tags": []
333 | },
334 | "source": [
335 | "## Connecting Our Repo to GH Desktop\n",
336 | "\n",
337 | "The last bit of direction-following!"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {
343 | "slideshow": {
344 | "slide_type": "slide"
345 | },
346 | "tags": []
347 | },
348 | "source": [
349 | "1. In GitHub Desktop, you should see an option like _Clone a Repository from the Internet..._. Click this.\n",
350 | "2. At this point, the application should prompt you to sign into GitHub. Follow its instructions to do so, which may involve it redirecting you to the browser.\n",
351 | "3. Once done signing in, you may have to press _Clone a Repository from the Internet..._ again.\n",
352 | "4. Choose the _advanced-python-datasci_ project we just created.\n",
353 | " - Optionally, you may change the _Local Path_ -- this is where the repository will be saved on your computer. You'll need to be able to open the code here in JupyterLab, so if you're more comfortable keeping your code somewhere else, change this to a different location on your computer.\n",
354 | "5. Then press Clone to pull down the repository we created."
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "id": "b5f2197b",
360 | "metadata": {
361 | "slideshow": {
362 | "slide_type": "slide"
363 | }
364 | },
365 | "source": [
366 |     "![Cloning the repository in GitHub Desktop](images/clone-repo.png)"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {
372 | "slideshow": {
373 | "slide_type": "slide"
374 | },
375 | "tags": []
376 | },
377 | "source": [
378 | "Congrats! You've set up your first GitHub repository, and now you're ready to work in it."
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "id": "ef6fb25d",
384 | "metadata": {
385 | "slideshow": {
386 | "slide_type": "slide"
387 | }
388 | },
389 | "source": [
390 | "## Questions\n",
391 | "\n",
392 | "Are there any questions before we move on?"
393 | ]
394 | }
395 | ],
396 | "metadata": {
397 | "kernelspec": {
398 | "display_name": "uc-python",
399 | "language": "python",
400 | "name": "uc-python"
401 | },
402 | "language_info": {
403 | "codemirror_mode": {
404 | "name": "ipython",
405 | "version": 3
406 | },
407 | "file_extension": ".py",
408 | "mimetype": "text/x-python",
409 | "name": "python",
410 | "nbconvert_exporter": "python",
411 | "pygments_lexer": "ipython3",
412 | "version": "3.9.1"
413 | }
414 | },
415 | "nbformat": 4,
416 | "nbformat_minor": 5
417 | }
418 |
--------------------------------------------------------------------------------
/notebooks/03-first_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "ecbf9fcb-cd85-41ba-b24b-4ec0b3818293",
6 | "metadata": {
7 | "slideshow": {
8 | "slide_type": "slide"
9 | },
10 | "tags": []
11 | },
12 | "source": [
13 | "# First model with scikit-learn\n",
14 | "\n",
15 | ""
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "b16847ba-b980-4bd7-bf77-b8c635b20b10",
21 | "metadata": {
22 | "slideshow": {
23 | "slide_type": "slide"
24 | },
25 | "tags": []
26 | },
27 | "source": [
28 | "## Objective\n",
29 | "\n",
30 |     "In this module, we present how to build predictive models on tabular datasets using only numerical features.\n",
31 | "\n",
32 | "In particular we will highlight:\n",
33 | "\n",
34 | "* the scikit-learn API: `.fit(X, y)`/`.predict(X)`/`.score(X, y)`;\n",
35 | "* how to evaluate the generalization performance of a model with a train-test\n",
36 | " split."
37 | ]
38 | },
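  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a preview, here is a minimal sketch of that shared API on a tiny synthetic dataset (the dataset is illustrative only; we will use real census data below):\n",
    "\n",
    "```python\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "# tiny synthetic dataset, just to demonstrate the estimator API\n",
    "X, y = make_classification(n_samples=100, random_state=0)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
    "\n",
    "model = KNeighborsClassifier()           # 1. instantiate the algorithm\n",
    "model.fit(X_train, y_train)              # 2. learn state from training data\n",
    "preds = model.predict(X_test)            # 3. predict on unseen data\n",
    "accuracy = model.score(X_test, y_test)   # 4. evaluate generalization\n",
    "```"
   ]
  },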
39 | {
40 | "cell_type": "markdown",
41 | "id": "a6fbebbe-a5e4-4616-b70f-897376bc8f8e",
42 | "metadata": {
43 | "slideshow": {
44 | "slide_type": "slide"
45 | },
46 | "tags": []
47 | },
48 | "source": [
49 | "## Data\n",
50 | "\n",
51 |     "We will use the same \"adult_census\" dataset described in the previous\n",
52 |     "module; see that module for more details about the dataset."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 1,
58 | "id": "b7d5d794-7240-4e76-874d-f4f83ab8476b",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "import pandas as pd\n",
63 | "\n",
64 | "adult_census = pd.read_csv(\"../data/adult-census.csv\")"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "id": "5cf478ca-cb9b-4a20-8f50-6b591eb972cc",
70 | "metadata": {
71 | "slideshow": {
72 | "slide_type": "slide"
73 | },
74 | "tags": []
75 | },
76 | "source": [
77 | "## Separating features from target\n",
78 | "\n",
79 |     "Scikit-learn expects our features ($X$) to be kept separate from our target ($y$).\n",
80 | "\n",
81 | "Numerical data is the most natural type of data used in machine learning and can (often) be directly fed into predictive models. Consequently, for this module we will use a subset of the original data with only the numerical columns."
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 2,
87 | "id": "d5bc74af-6a9e-44f5-a7f6-b71c50a2b620",
88 | "metadata": {
89 | "slideshow": {
90 | "slide_type": "fragment"
91 | },
92 | "tags": []
93 | },
94 | "outputs": [],
95 | "source": [
96 | "import numpy as np\n",
97 | "\n",
98 | "# create column names of interest\n",
99 | "target_col = \"class\"\n",
100 | "feature_col = (\n",
101 | " adult_census.drop(columns=target_col)\n",
102 | " .select_dtypes(np.number).columns.values\n",
103 | ")"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 3,
109 | "id": "3a2e0692-05e9-40b5-9821-f710e3138535",
110 | "metadata": {
111 | "slideshow": {
112 | "slide_type": "slide"
113 | },
114 | "tags": []
115 | },
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/plain": [
120 | "0 <=50K\n",
121 | "1 <=50K\n",
122 | "2 >50K\n",
123 | "3 >50K\n",
124 | "4 <=50K\n",
125 | " ... \n",
126 | "48837 <=50K\n",
127 | "48838 >50K\n",
128 | "48839 <=50K\n",
129 | "48840 <=50K\n",
130 | "48841 >50K\n",
131 | "Name: class, Length: 48842, dtype: object"
132 | ]
133 | },
134 | "execution_count": 3,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "target = adult_census[target_col]\n",
141 | "target"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 4,
147 | "id": "ac2a9461-b480-4e3b-9d1d-25adb9644a11",
148 | "metadata": {
149 | "slideshow": {
150 | "slide_type": "slide"
151 | },
152 | "tags": []
153 | },
154 | "outputs": [
155 | {
156 | "data": {
157 | "text/html": [
158 | "\n",
159 | "\n",
172 | "
\n",
173 | " \n",
174 | " \n",
175 | " \n",
176 | " age \n",
177 | " education-num \n",
178 | " capital-gain \n",
179 | " capital-loss \n",
180 | " hours-per-week \n",
181 | " \n",
182 | " \n",
183 | " \n",
184 | " \n",
185 | " 0 \n",
186 | " 25 \n",
187 | " 7 \n",
188 | " 0 \n",
189 | " 0 \n",
190 | " 40 \n",
191 | " \n",
192 | " \n",
193 | " 1 \n",
194 | " 38 \n",
195 | " 9 \n",
196 | " 0 \n",
197 | " 0 \n",
198 | " 50 \n",
199 | " \n",
200 | " \n",
201 | " 2 \n",
202 | " 28 \n",
203 | " 12 \n",
204 | " 0 \n",
205 | " 0 \n",
206 | " 40 \n",
207 | " \n",
208 | " \n",
209 | " 3 \n",
210 | " 44 \n",
211 | " 10 \n",
212 | " 7688 \n",
213 | " 0 \n",
214 | " 40 \n",
215 | " \n",
216 | " \n",
217 | " 4 \n",
218 | " 18 \n",
219 | " 10 \n",
220 | " 0 \n",
221 | " 0 \n",
222 | " 30 \n",
223 | " \n",
224 | " \n",
225 | " ... \n",
226 | " ... \n",
227 | " ... \n",
228 | " ... \n",
229 | " ... \n",
230 | " ... \n",
231 | " \n",
232 | " \n",
233 | " 48837 \n",
234 | " 27 \n",
235 | " 12 \n",
236 | " 0 \n",
237 | " 0 \n",
238 | " 38 \n",
239 | " \n",
240 | " \n",
241 | " 48838 \n",
242 | " 40 \n",
243 | " 9 \n",
244 | " 0 \n",
245 | " 0 \n",
246 | " 40 \n",
247 | " \n",
248 | " \n",
249 | " 48839 \n",
250 | " 58 \n",
251 | " 9 \n",
252 | " 0 \n",
253 | " 0 \n",
254 | " 40 \n",
255 | " \n",
256 | " \n",
257 | " 48840 \n",
258 | " 22 \n",
259 | " 9 \n",
260 | " 0 \n",
261 | " 0 \n",
262 | " 20 \n",
263 | " \n",
264 | " \n",
265 | " 48841 \n",
266 | " 52 \n",
267 | " 9 \n",
268 | " 15024 \n",
269 | " 0 \n",
270 | " 40 \n",
271 | " \n",
272 | " \n",
273 | "
\n",
274 | "
48842 rows × 5 columns
\n",
275 | "
"
276 | ],
277 | "text/plain": [
278 | " age education-num capital-gain capital-loss hours-per-week\n",
279 | "0 25 7 0 0 40\n",
280 | "1 38 9 0 0 50\n",
281 | "2 28 12 0 0 40\n",
282 | "3 44 10 7688 0 40\n",
283 | "4 18 10 0 0 30\n",
284 | "... ... ... ... ... ...\n",
285 | "48837 27 12 0 0 38\n",
286 | "48838 40 9 0 0 40\n",
287 | "48839 58 9 0 0 40\n",
288 | "48840 22 9 0 0 20\n",
289 | "48841 52 9 15024 0 40\n",
290 | "\n",
291 | "[48842 rows x 5 columns]"
292 | ]
293 | },
294 | "execution_count": 4,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": [
300 | "features = adult_census[feature_col]\n",
301 | "features"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "id": "576a46c7-fe30-475f-a772-6ff043910847",
307 | "metadata": {
308 | "slideshow": {
309 | "slide_type": "slide"
310 | },
311 | "tags": []
312 | },
313 | "source": [
314 | "\n",
315 | "
Question
\n",
316 | "
\n",
317 | " \n",
318 | "What type of object is the target data set? \n",
319 | "What type of object is the feature data set?\n",
320 | "
\n",
321 | "
"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 5,
327 | "id": "ef443bc4-9dfb-4ecb-a61b-79ca2abf5f46",
328 | "metadata": {
329 | "slideshow": {
330 | "slide_type": "skip"
331 | },
332 | "tags": []
333 | },
334 | "outputs": [
335 | {
336 | "name": "stdout",
337 | "output_type": "stream",
338 | "text": [
339 | "The dataset contains 48842 samples and 5 features\n"
340 | ]
341 | }
342 | ],
343 | "source": [
344 | "print(\n",
345 | " f\"The dataset contains {features.shape[0]} samples and \"\n",
346 | " f\"{features.shape[1]} features\"\n",
347 | ")"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "id": "44ed09b5-0a37-40bd-a4ba-5e18d337f3e4",
353 | "metadata": {
354 | "slideshow": {
355 | "slide_type": "slide"
356 | },
357 | "tags": []
358 | },
359 | "source": [
360 | "## Fit a model\n",
361 | "\n",
362 | "We will build a classification model using the \"K-nearest neighbors\"\n",
363 |     "strategy. To predict the target of a new sample, the k-nearest neighbors algorithm\n",
364 |     "finds the `k` closest samples in the training set and predicts the\n",
365 |     "majority target among those samples.\n",
366 | "\n",
367 | "\n",
368 | "
Note
\n",
369 | "
We use K-nearest neighbors here. However, be aware that it is seldom useful\n",
370 | "in practice. We use it because it is an intuitive algorithm. In future modules, we will introduce alternative algorithms.
\n",
371 | "
"
372 | ]
373 | },
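  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The number of neighbors is a hyperparameter we choose when defining the algorithm. A minimal sketch of overriding it (by default, `KNeighborsClassifier` uses `n_neighbors=5`):\n",
    "\n",
    "```python\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "# consider the 10 closest training samples instead of the default 5\n",
    "model = KNeighborsClassifier(n_neighbors=10)\n",
    "```"
   ]
  },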
374 | {
375 | "cell_type": "code",
376 | "execution_count": 6,
377 | "id": "43f64570-8607-47dd-807b-9d4db67be554",
378 | "metadata": {
379 | "slideshow": {
380 | "slide_type": "fragment"
381 | },
382 | "tags": []
383 | },
384 | "outputs": [],
385 | "source": [
386 | "# to display nice model diagram\n",
387 | "from sklearn import set_config\n",
388 | "set_config(display='diagram')"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 7,
394 | "id": "fbd15beb-5573-455d-9288-bf7f91685fd9",
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "data": {
399 | "text/html": [
400 | "KNeighborsClassifier() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
401 | ],
402 | "text/plain": [
403 | "KNeighborsClassifier()"
404 | ]
405 | },
406 | "execution_count": 7,
407 | "metadata": {},
408 | "output_type": "execute_result"
409 | }
410 | ],
411 | "source": [
412 | "from sklearn.neighbors import KNeighborsClassifier\n",
413 | "\n",
414 | "# 1. define the algorithm\n",
415 | "model = KNeighborsClassifier()\n",
416 | "\n",
417 | "# 2. fit the model\n",
418 | "model.fit(features, target)"
419 | ]
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "id": "ef9d8fc4-9bda-4fc2-8f47-17dc3c48fd56",
424 | "metadata": {
425 | "slideshow": {
426 | "slide_type": "slide"
427 | },
428 | "tags": []
429 | },
430 | "source": [
431 | "Learning can be represented as follows:\n",
432 | "\n",
433 |     "![Fit diagram](images/api_diagram-predictor.fit.svg)\n",
434 | "\n",
435 | "The method `fit` is based on two important elements: (i) **learning algorithm**\n",
436 | "and (ii) **model state**. The model state can be used later to either predict (for classifiers and regressors) or transform data (for transformers)."
437 | ]
438 | },
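  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick way to peek at that model state: scikit-learn stores everything learned during `fit` in attributes whose names end with an underscore. A minimal sketch, using the `model` fitted above:\n",
    "\n",
    "```python\n",
    "# attributes ending in \"_\" are set during fit and hold the learned state\n",
    "model.classes_        # target labels seen during training\n",
    "model.n_features_in_  # number of features the model was fit on\n",
    "```"
   ]
  },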
439 | {
440 | "cell_type": "markdown",
441 | "id": "f9a022bb-27d3-4740-8a7b-f93e2d87ba93",
442 | "metadata": {},
443 | "source": [
444 | "\n",
445 | "
Note
\n",
446 | "
Here and later, we use the names features and target to be explicit. In\n",
447 |     "scikit-learn documentation, the features are commonly named X and the target is\n",
448 |     "commonly called y .
\n",
449 | "
"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "id": "2072be72-ee1b-43b3-9de8-87601e20a6ce",
455 | "metadata": {
456 | "slideshow": {
457 | "slide_type": "slide"
458 | },
459 | "tags": []
460 | },
461 | "source": [
462 | "## Make predictions\n",
463 | "\n",
464 | "Let's use our model to make some predictions using the same dataset. To predict, a model uses a **prediction function** that will use the input data together with the model states.\n",
465 | "\n",
466 |     "\n",
    "![Predict diagram](images/api_diagram-predictor.predict.svg)"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 8,
472 | "id": "91c65c5a-0dd9-4ac3-a1b4-915f9b477b4a",
473 | "metadata": {},
474 | "outputs": [
475 | {
476 | "data": {
477 | "text/plain": [
478 | "array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],\n",
479 | " dtype=object)"
480 | ]
481 | },
482 | "execution_count": 8,
483 | "metadata": {},
484 | "output_type": "execute_result"
485 | }
486 | ],
487 | "source": [
488 | "target_predicted = model.predict(features)\n",
489 | "target_predicted"
490 | ]
491 | },
492 | {
493 | "cell_type": "markdown",
494 | "id": "bb503eff-ed37-43c4-85e2-ca627b370e24",
495 | "metadata": {
496 | "slideshow": {
497 | "slide_type": "slide"
498 | },
499 | "tags": []
500 | },
501 | "source": [
502 | "...and we could even check if the predictions agree with the real targets:"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 9,
508 | "id": "2cb0488b-b174-4c01-9fd3-c381de028255",
509 | "metadata": {},
510 | "outputs": [
511 | {
512 | "data": {
513 | "text/plain": [
514 | "0 True\n",
515 | "1 True\n",
516 | "2 False\n",
517 | "3 True\n",
518 | "4 True\n",
519 | "Name: class, dtype: bool"
520 | ]
521 | },
522 | "execution_count": 9,
523 | "metadata": {},
524 | "output_type": "execute_result"
525 | }
526 | ],
527 | "source": [
528 |     "# do the first 5 predictions match the true labels?\n",
529 | "target[:5] == target_predicted[:5]"
530 | ]
531 | },
532 | {
533 | "cell_type": "markdown",
534 | "id": "07b52345-7803-4548-bfc9-f96b11463297",
535 | "metadata": {},
536 | "source": [
537 | "\n",
538 | "
Note
\n",
539 | "
Here, we see that our model makes a mistake when predicting the third observation.
\n",
540 | "
"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "id": "56d00cfb-b280-4ae2-8ba2-fd01c9d3e729",
546 | "metadata": {
547 | "slideshow": {
548 | "slide_type": "slide"
549 | },
550 | "tags": []
551 | },
552 | "source": [
553 | "To get a better assessment, we can compute the average success rate."
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 10,
559 | "id": "880dfdb8-492d-45ae-a117-1a154f75e7eb",
560 | "metadata": {},
561 | "outputs": [
562 | {
563 | "data": {
564 | "text/plain": [
565 | "0.8479791982310306"
566 | ]
567 | },
568 | "execution_count": 10,
569 | "metadata": {},
570 | "output_type": "execute_result"
571 | }
572 | ],
573 | "source": [
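    "# fraction of predictions that match the true labels\n",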
574 | "(target == target_predicted).mean()"
575 | ]
576 | },
577 | {
578 | "cell_type": "markdown",
579 | "id": "2535d2eb-8710-45ae-9d7a-fc7708a60231",
580 | "metadata": {
581 | "slideshow": {
582 | "slide_type": "fragment"
583 | },
584 | "tags": []
585 | },
586 | "source": [
587 | "\n",
588 | "
Warning!
\n",
589 | "
But, can this evaluation be trusted, or is it too good to be true?
\n",
590 | "
"
591 | ]
592 | },
593 | {
594 | "cell_type": "markdown",
595 | "id": "642852e2-a665-45c0-bb9d-7093c18f3564",
596 | "metadata": {
597 | "slideshow": {
598 | "slide_type": "slide"
599 | },
600 | "tags": []
601 | },
602 | "source": [
603 | "## Train-test data split\n",
604 | "\n",
605 | "When building a machine learning model, it is important to evaluate the\n",
606 | "trained model on data that was not used to fit it, as **generalization** is\n",
607 | "our primary concern -- meaning we want a rule that generalizes to new data.\n",
608 | "\n",
609 | "Correct evaluation is easily done by leaving out a subset of the data when\n",
610 | "training the model and using it afterwards for model evaluation.\n",
611 | "\n",
612 | "The data used to fit a model is called training data while the data used to\n",
613 | "assess a model is called testing data ."
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "id": "abf3ba92-89ab-4140-9e92-2fc5560e8fa5",
619 | "metadata": {
620 | "slideshow": {
621 | "slide_type": "slide"
622 | },
623 | "tags": []
624 | },
625 | "source": [
626 | "Scikit-learn provides the helper function `sklearn.model_selection.train_test_split` which is used to automatically split the dataset into two subsets."
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 11,
632 | "id": "2b1d046b-d298-4495-b66d-ce096a438c2b",
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "from sklearn.model_selection import train_test_split\n",
637 | "\n",
638 | "X_train, X_test, y_train, y_test = train_test_split(\n",
639 | " features, \n",
640 | " target, \n",
641 | " random_state=123, \n",
642 | " test_size=0.25,\n",
643 | " stratify=target\n",
644 | ")"
645 | ]
646 | },
647 | {
648 | "cell_type": "markdown",
649 | "id": "6d94c391-0265-4f25-91ac-2d22b95e83e2",
650 | "metadata": {
651 | "slideshow": {
652 | "slide_type": "fragment"
653 | },
654 | "tags": []
655 | },
656 | "source": [
657 | "\n",
658 | "
Tip
\n",
659 | "
In scikit-learn setting the random_state parameter allows to get\n",
660 | "deterministic results when we use a random number generator. In the\n",
661 | "train_test_split case the randomness comes from shuffling the data, which\n",
662 | "decides how the dataset is split into a train and a test set).\n",
663 | " \n",
664 | "And as your target becomes more imbalanced it is important to use the stratify parameter.\n",
665 | "
\n",
666 | "
"
667 | ]
668 | },
669 | {
670 | "cell_type": "markdown",
671 | "id": "ae299fe5-ebef-4000-a8c0-29aec3147311",
672 | "metadata": {
673 | "slideshow": {
674 | "slide_type": "slide"
675 | },
676 | "tags": []
677 | },
678 | "source": [
679 | "\n",
680 | "
Your Turn
\n",
681 | "
\n",
682 | "\n",
683 | "1. How many observations are in your train and test data sets? \n",
684 | "\n",
685 | "2. What is the proportion of response values in your y_train and y_test ? \n",
686 | "
\n",
687 | "
"
688 | ]
689 | },
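{
 "cell_type": "markdown",
 "id": "b7e4f2aa-1111-4e2e-9c3d-5a6b7c8d9e0f",
 "metadata": {},
 "source": [
  "One way to check (a sketch, using the split created above):\n",
  "\n",
  "```python\n",
  "# Number of observations in each split\n",
  "print(X_train.shape[0], X_test.shape[0])\n",
  "\n",
  "# Class proportions in each split; stratify keeps these nearly identical\n",
  "print(y_train.value_counts(normalize=True))\n",
  "print(y_test.value_counts(normalize=True))\n",
  "```"
 ]
},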
690 | {
691 | "cell_type": "markdown",
692 | "id": "0abf73e1-7212-4b66-99ce-ac6819d7935e",
693 | "metadata": {
694 | "slideshow": {
695 | "slide_type": "slide"
696 | },
697 | "tags": []
698 | },
699 | "source": [
700 | "Instead of computing the prediction and manually computing the average\n",
701 | "success rate, we can use the method `score`. When dealing with classifiers\n",
702 | "this method returns their performance metric.\n",
703 | "\n",
704 | ""
705 | ]
706 | },
707 | {
708 | "cell_type": "code",
709 | "execution_count": 12,
710 | "id": "afbf4632-2a5a-497d-ab32-9950c7ba84dd",
711 | "metadata": {
712 | "slideshow": {
713 | "slide_type": "slide"
714 | },
715 | "tags": []
716 | },
717 | "outputs": [
718 | {
719 | "name": "stdout",
720 | "output_type": "stream",
721 | "text": [
722 | "The test accuracy using KNeighborsClassifier is 82.59%\n"
723 | ]
724 | }
725 | ],
726 | "source": [
727 | "# 1. define the algorithm\n",
728 | "model = KNeighborsClassifier()\n",
729 | "\n",
730 | "# 2. fit the model\n",
731 | "model.fit(X_train, y_train)\n",
732 | "\n",
733 | "# 3. score our model on test data\n",
734 | "accuracy = model.score(X_test, y_test)\n",
735 | "\n",
736 | "print(f'The test accuracy using {model.__class__.__name__} is {round(accuracy, 4) * 100}%')"
737 | ]
738 | },
739 | {
740 | "cell_type": "markdown",
741 | "id": "9070e86f-803c-449a-922a-b9b290f7e820",
742 | "metadata": {
743 | "slideshow": {
744 | "slide_type": "slide"
745 | },
746 | "tags": []
747 | },
748 | "source": [
749 | "\n",
750 | "
Important!
\n",
751 | "
\n",
752 | "If we compare with the accuracy obtained by wrongly evaluating the model\n",
753 | "on the training set, we find that this evaluation was indeed optimistic\n",
754 | "compared to the score obtained on a held-out test set.\n",
755 | "\n",
756 | "This illustrates the importance of always testing the generalization performance of\n",
757 | "predictive models on a different set than the one used to train these models.\n",
758 | "
\n",
759 | "
"
760 | ]
761 | },
762 | {
763 | "cell_type": "markdown",
764 | "id": "faa7a034-264e-4987-9208-f3641de93930",
765 | "metadata": {
766 | "slideshow": {
767 | "slide_type": "slide"
768 | },
769 | "tags": []
770 | },
771 | "source": [
772 | "## Wrapping up\n",
773 | "\n",
774 | "In this module we learned how to:\n",
775 | "\n",
776 | "* fit a predictive machine learning algorithm (**k-nearest neighbors**) on a training dataset;\n",
777 | "* evaluate its generalization performance on the testing data;\n",
778 | "* introduced the scikit-learn API `.fit(X, y)` (to train a model),\n",
779 | " `.predict(X)` (to make predictions) and `.score(X, y)`\n",
780 | " (to evaluate a model)."
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "id": "76dd4527-19e4-441e-8d31-54319fee200b",
786 | "metadata": {
787 | "slideshow": {
788 | "slide_type": "slide"
789 | },
790 | "tags": []
791 | },
792 | "source": [
793 | "\n",
794 | "
Your Turn
\n",
795 | "
\n",
796 | "Scikit-learn provides a logistic regression algorithm, which is another type of algorithm for making binary classification predictions. This algorithm is available at sklearn.linear_model.LogisticRegression . \n",
797 | " \n",
798 | "Fill in the blanks below to import the LogisticRegression module, define the algorithm, fit the model, and score on the test data.\n",
799 | "
\n",
800 | "
"
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "execution_count": null,
806 | "id": "8139c016-d63b-44ed-a725-d613f6c8bafb",
807 | "metadata": {
808 | "tags": [
809 | "ci-skip"
810 | ]
811 | },
812 | "outputs": [],
813 | "source": [
814 | "# 1. import the LogisticRegression module\n",
815 | "from sklearn.linear_model import __________\n",
816 | "\n",
817 | "# 2. define the algorithm\n",
818 | "model = __________\n",
819 | "\n",
820 | "# 3. fit the model\n",
821 | "model.fit(______, ______)\n",
822 | "\n",
823 | "# 4. score our model on test data\n",
824 | "model.score(______, ______)"
825 | ]
826 | }
827 | ],
828 | "metadata": {
829 | "kernelspec": {
830 | "display_name": "Python 3 (ipykernel)",
831 | "language": "python",
832 | "name": "python3"
833 | },
834 | "language_info": {
835 | "codemirror_mode": {
836 | "name": "ipython",
837 | "version": 3
838 | },
839 | "file_extension": ".py",
840 | "mimetype": "text/x-python",
841 | "name": "python",
842 | "nbconvert_exporter": "python",
843 | "pygments_lexer": "ipython3",
844 | "version": "3.11.7"
845 | }
846 | },
847 | "nbformat": 4,
848 | "nbformat_minor": 5
849 | }
850 |
--------------------------------------------------------------------------------
/notebooks/04-modular_code.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8508a2d2-e00b-40fd-a36e-5bb6a7246fc1",
6 | "metadata": {
7 | "slideshow": {
8 | "slide_type": "slide"
9 | },
10 | "tags": []
11 | },
12 | "source": [
13 | "# Modular Code\n",
14 | "\n",
15 | ""
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "189cdd58-cf78-4d6c-84b9-bff84b233919",
21 | "metadata": {
22 | "slideshow": {
23 | "slide_type": "slide"
24 | },
25 | "tags": []
26 | },
27 | "source": [
28 | "# What's Modularity?\n",
29 | "\n",
30 | "- Building our code in discrete, clearly separated chunks\n",
31 | "- So we can...\n",
32 | " - **Modify one piece** without breaking the others\n",
33 | " - **Combine pieces** in different ways, as we need them"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "id": "63602da2-d767-47b3-873c-8b5e2d398a63",
39 | "metadata": {
40 | "slideshow": {
41 | "slide_type": "fragment"
42 | },
43 | "tags": []
44 | },
45 | "source": [
46 | "Often, modularity implies breaking our code down into different **functions** which live in different **modules**."
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "id": "375fcb2a-12f6-47da-a01a-f75c8abcafab",
52 | "metadata": {
53 | "tags": []
54 | },
55 | "source": [
56 | "\n",
57 | "
Note
\n",
58 | "
Generally, a module in Python is just a file that ends in .py
\n",
59 | "
"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "id": "719240b2-c828-45fa-afc4-83c8c089d72b",
65 | "metadata": {
66 | "slideshow": {
67 | "slide_type": "slide"
68 | },
69 | "tags": []
70 | },
71 | "source": [
72 | "## How do we achieve modularity in Python?\n",
73 | "- Move code chunks into their own **functions**\n",
74 | "- Move functions into their own **files**"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "id": "a445cc43-d99f-49a5-97f6-456a686ed502",
80 | "metadata": {
81 | "slideshow": {
82 | "slide_type": "slide"
83 | },
84 | "tags": []
85 | },
86 | "source": [
87 | "## Example: Functions\n",
88 | "In the last section, we imported some data from a CSV, kept just its numeric columns, and separated the target from the features..."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 1,
94 | "id": "8fbccadb-ef77-4fbb-bebb-9306f80f9efe",
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "import numpy as np\n",
99 | "import pandas as pd\n",
100 | "\n",
101 | "adult_census = pd.read_csv(\"../data/adult-census.csv\")\n",
102 | "\n",
103 | "# create column names of interest\n",
104 | "target_col = \"class\"\n",
105 | "\n",
106 | "raw_features = adult_census.drop(columns=target_col)\n",
107 | "numeric_features = raw_features.select_dtypes(np.number)\n",
108 | "feature_cols = numeric_features.columns.values\n",
109 | "\n",
110 | "features = adult_census[feature_cols]\n",
111 | "target = adult_census[target_col]"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "e5325c45-2c2c-4c22-8701-84c0ebd074cf",
117 | "metadata": {
118 | "slideshow": {
119 | "slide_type": "fragment"
120 | },
121 | "tags": []
122 | },
123 | "source": [
124 | "This is a lot of code though, and we might want to do this again in the future **with different data sets**.\n",
125 | "\n",
126 | "That makes it a perfect case to be its own function."
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 2,
132 | "id": "9f9d8082-a6c0-44fc-b07c-2cc546026811",
133 | "metadata": {
134 | "slideshow": {
135 | "slide_type": "slide"
136 | },
137 | "tags": []
138 | },
139 | "outputs": [],
140 | "source": [
141 | "def get_features_and_target():\n",
142 | " '''Split a CSV into a DF of numeric features and a target column.'''\n",
143 | " adult_census = pd.read_csv(\"../data/adult-census.csv\")\n",
144 | "\n",
145 | " target_col = \"class\"\n",
146 | " \n",
147 | " raw_features = adult_census.drop(columns=target_col)\n",
148 | " numeric_features = raw_features.select_dtypes(np.number)\n",
149 | " feature_cols = numeric_features.columns.values\n",
150 | "\n",
151 | " features = adult_census[feature_cols]\n",
152 | " target = adult_census[target_col]\n",
153 | " \n",
154 | " return (features, target)"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "id": "d7742630-3832-452b-bfcb-0c0e5b8f0211",
160 | "metadata": {
161 | "slideshow": {
162 | "slide_type": "fragment"
163 | },
164 | "tags": []
165 | },
166 | "source": [
167 | "Let's test it!"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 3,
173 | "id": "9d968ac2-5f84-4380-9080-f67eba43103c",
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/html": [
179 | "\n",
180 | "\n",
193 | "
\n",
194 | " \n",
195 | " \n",
196 | " \n",
197 | " age \n",
198 | " education-num \n",
199 | " capital-gain \n",
200 | " capital-loss \n",
201 | " hours-per-week \n",
202 | " \n",
203 | " \n",
204 | " \n",
205 | " \n",
206 | " 0 \n",
207 | " 25 \n",
208 | " 7 \n",
209 | " 0 \n",
210 | " 0 \n",
211 | " 40 \n",
212 | " \n",
213 | " \n",
214 | " 1 \n",
215 | " 38 \n",
216 | " 9 \n",
217 | " 0 \n",
218 | " 0 \n",
219 | " 50 \n",
220 | " \n",
221 | " \n",
222 | " 2 \n",
223 | " 28 \n",
224 | " 12 \n",
225 | " 0 \n",
226 | " 0 \n",
227 | " 40 \n",
228 | " \n",
229 | " \n",
230 | " 3 \n",
231 | " 44 \n",
232 | " 10 \n",
233 | " 7688 \n",
234 | " 0 \n",
235 | " 40 \n",
236 | " \n",
237 | " \n",
238 | " 4 \n",
239 | " 18 \n",
240 | " 10 \n",
241 | " 0 \n",
242 | " 0 \n",
243 | " 30 \n",
244 | " \n",
245 | " \n",
246 | "
\n",
247 | "
"
248 | ],
249 | "text/plain": [
250 | " age education-num capital-gain capital-loss hours-per-week\n",
251 | "0 25 7 0 0 40\n",
252 | "1 38 9 0 0 50\n",
253 | "2 28 12 0 0 40\n",
254 | "3 44 10 7688 0 40\n",
255 | "4 18 10 0 0 30"
256 | ]
257 | },
258 | "execution_count": 3,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "f, t = get_features_and_target()\n",
265 | "f.head()"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 4,
271 | "id": "065fec64-a8c7-4264-8d2d-64f8a6bf6e5e",
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "0 <=50K\n",
278 | "1 <=50K\n",
279 | "2 >50K\n",
280 | "3 >50K\n",
281 | "4 <=50K\n",
282 | "Name: class, dtype: object"
283 | ]
284 | },
285 | "execution_count": 4,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "t.head()"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "id": "80a80a7d-5996-4045-a172-55290974c843",
297 | "metadata": {},
298 | "source": [
299 | "Looks like it worked!!"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 5,
305 | "id": "65753a10-5b76-42f1-a970-a71b0677ed7a",
306 | "metadata": {
307 | "slideshow": {
308 | "slide_type": "slide"
309 | },
310 | "tags": []
311 | },
312 | "outputs": [],
313 | "source": [
314 | "def get_features_and_target():\n",
315 | " '''Split a CSV into a DF of numeric features and a target column.'''\n",
316 | " adult_census = pd.read_csv(\"../data/adult-census.csv\")\n",
317 | "\n",
318 | " target_col = \"class\"\n",
319 | " \n",
320 | " raw_features = adult_census.drop(columns=target_col)\n",
321 | " numeric_features = raw_features.select_dtypes(np.number)\n",
322 | " feature_cols = numeric_features.columns.values\n",
323 | "\n",
324 | " features = adult_census[feature_cols]\n",
325 | " target = adult_census[target_col]\n",
326 | " \n",
327 | " return (features, target)"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "id": "044fa257-a6b9-4189-ae7b-8324e64f68f3",
333 | "metadata": {},
334 | "source": [
335 | "\n",
336 | "
Discussion
\n",
337 | " How would we apply this function to new data?\n",
338 | "
"
339 | ]
340 | },
341 | {
342 | "cell_type": "markdown",
343 | "id": "50d64911-785e-4019-93ca-6a0504dea0cb",
344 | "metadata": {
345 | "slideshow": {
346 | "slide_type": "fragment"
347 | },
348 | "tags": []
349 | },
350 | "source": [
351 | "We can't! We didn't **parametrize** it."
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "id": "cec58ea8-cae1-43c0-8af7-957ab8cbf16c",
357 | "metadata": {
358 | "slideshow": {
359 | "slide_type": "slide"
360 | },
361 | "tags": []
362 | },
363 | "source": [
364 | "## Parametrizing Functions\n",
365 | "\n",
366 | "While functions are about reusing code, we rarely want to rerun *exactly* the same code.\n",
367 | "\n",
368 | "Usually, there are a small number of things that should change from run to run. These are called **parameters**."
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "id": "b1646049-4f65-4868-84f1-331d6a3943d2",
374 | "metadata": {
375 | "slideshow": {
376 | "slide_type": "fragment"
377 | },
378 | "tags": []
379 | },
380 | "source": [
381 | "Common things that might be used parameters:\n",
382 | "- threshold values\n",
383 | "- filenames\n",
384 | "- column names"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 6,
390 | "id": "02479640-a0d9-4b84-9114-20515c5c756d",
391 | "metadata": {
392 | "slideshow": {
393 | "slide_type": "slide"
394 | },
395 | "tags": []
396 | },
397 | "outputs": [],
398 | "source": [
399 | "def get_features_and_target():\n",
400 | " '''Split a CSV into a DF of numeric features and a target column.'''\n",
401 | " adult_census = pd.read_csv(\"../data/adult-census.csv\")\n",
402 | "\n",
403 | " target_col = \"class\"\n",
404 | " \n",
405 | " raw_features = adult_census.drop(columns=target_col)\n",
406 | " numeric_features = raw_features.select_dtypes(np.number)\n",
407 | " feature_cols = numeric_features.columns.values\n",
408 | "\n",
409 | " features = adult_census[feature_cols]\n",
410 | " target = adult_census[target_col]\n",
411 | " \n",
412 | " return (features, target)"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "id": "d80267a4-5441-4642-8998-2d97ff50b55d",
418 | "metadata": {},
419 | "source": [
420 | "\n",
421 | "
Discussion
\n",
422 | " What should be the parameters of our
get_features_and_target
function?\n",
423 | "
"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 7,
429 | "id": "7ded97b6-9595-416f-ab26-d98301a7d714",
430 | "metadata": {
431 | "slideshow": {
432 | "slide_type": "slide"
433 | },
434 | "tags": []
435 | },
436 | "outputs": [],
437 | "source": [
438 | "def get_features_and_target(csv_file, target_col):\n",
439 | " '''Split a CSV into a DF of numeric features and a target column.'''\n",
440 | " \n",
441 | " adult_census = pd.read_csv(csv_file)\n",
442 | " \n",
443 | " raw_features = adult_census.drop(columns=target_col)\n",
444 | " numeric_features = raw_features.select_dtypes(np.number)\n",
445 | " feature_cols = numeric_features.columns.values\n",
446 | "\n",
447 | " features = adult_census[feature_cols]\n",
448 | " target = adult_census[target_col]\n",
449 | " \n",
450 | " return (features, target)"
451 | ]
452 | },
453 | {
454 | "cell_type": "markdown",
455 | "id": "ce36e103-c5fa-4818-a28b-ba97e4e4907d",
456 | "metadata": {
457 | "slideshow": {
458 | "slide_type": "slide"
459 | },
460 | "tags": []
461 | },
462 | "source": [
463 | "Now if we call our function without passing `csv_file` and `target_col`, we get an error:"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 8,
469 | "id": "9c8ffbcb-1870-43f9-9204-3149e1f1940c",
470 | "metadata": {
471 | "tags": [
472 | "ci-skip"
473 | ]
474 | },
475 | "outputs": [
476 | {
477 | "ename": "TypeError",
478 | "evalue": "get_features_and_target() missing 2 required positional arguments: 'csv_file' and 'target_col'",
479 | "output_type": "error",
480 | "traceback": [
481 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
482 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
483 | "\u001b[0;32m/var/folders/j3/v1318ng94fvdpq7kzr0hq9kw0000gn/T/ipykernel_2533/3218846325.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_features_and_target\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
484 | "\u001b[0;31mTypeError\u001b[0m: get_features_and_target() missing 2 required positional arguments: 'csv_file' and 'target_col'"
485 | ]
486 | }
487 | ],
488 | "source": [
489 | "f, t = get_features_and_target()"
490 | ]
491 | },
492 | {
493 | "cell_type": "markdown",
494 | "id": "7aa651cd-1ab3-4211-9253-263f5f15ea2e",
495 | "metadata": {
496 | "slideshow": {
497 | "slide_type": "slide"
498 | },
499 | "tags": []
500 | },
501 | "source": [
502 | "But we can still use it if we pass in those parameters:"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 9,
508 | "id": "7c5239ae-be56-42e7-a9fc-c36a084fef6c",
509 | "metadata": {},
510 | "outputs": [],
511 | "source": [
512 | "# In Python, linebreaks and spaces inside parentheses are ignored.\n",
513 | "f, t = get_features_and_target(\n",
514 | " csv_file='../data/adult-census.csv',\n",
515 | " target_col='class',\n",
516 | ")"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": 10,
522 | "id": "35209624-3af9-45e3-b803-578570b76ea3",
523 | "metadata": {},
524 | "outputs": [
525 | {
526 | "data": {
527 | "text/html": [
528 | "\n",
529 | "\n",
542 | "
\n",
543 | " \n",
544 | " \n",
545 | " \n",
546 | " age \n",
547 | " education-num \n",
548 | " capital-gain \n",
549 | " capital-loss \n",
550 | " hours-per-week \n",
551 | " \n",
552 | " \n",
553 | " \n",
554 | " \n",
555 | " 0 \n",
556 | " 25 \n",
557 | " 7 \n",
558 | " 0 \n",
559 | " 0 \n",
560 | " 40 \n",
561 | " \n",
562 | " \n",
563 | " 1 \n",
564 | " 38 \n",
565 | " 9 \n",
566 | " 0 \n",
567 | " 0 \n",
568 | " 50 \n",
569 | " \n",
570 | " \n",
571 | " 2 \n",
572 | " 28 \n",
573 | " 12 \n",
574 | " 0 \n",
575 | " 0 \n",
576 | " 40 \n",
577 | " \n",
578 | " \n",
579 | " 3 \n",
580 | " 44 \n",
581 | " 10 \n",
582 | " 7688 \n",
583 | " 0 \n",
584 | " 40 \n",
585 | " \n",
586 | " \n",
587 | " 4 \n",
588 | " 18 \n",
589 | " 10 \n",
590 | " 0 \n",
591 | " 0 \n",
592 | " 30 \n",
593 | " \n",
594 | " \n",
595 | "
\n",
596 | "
"
597 | ],
598 | "text/plain": [
599 | " age education-num capital-gain capital-loss hours-per-week\n",
600 | "0 25 7 0 0 40\n",
601 | "1 38 9 0 0 50\n",
602 | "2 28 12 0 0 40\n",
603 | "3 44 10 7688 0 40\n",
604 | "4 18 10 0 0 30"
605 | ]
606 | },
607 | "execution_count": 10,
608 | "metadata": {},
609 | "output_type": "execute_result"
610 | }
611 | ],
612 | "source": [
613 | "f.head()"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "id": "ad799973-7438-45f7-b41b-9aedea313403",
619 | "metadata": {
620 | "slideshow": {
621 | "slide_type": "slide"
622 | },
623 | "tags": []
624 | },
625 | "source": [
626 | "Now, imagine we want to build a model on the Ames data instead, using the \"Sale_Price\" column as our target..."
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 11,
632 | "id": "9c0bd0af-8eeb-45fd-aa7f-bbfce9788841",
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "ames_features, ames_target = get_features_and_target(\n",
637 | " csv_file='../data/ames.csv',\n",
638 | " target_col='Sale_Price',\n",
639 | ")"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 12,
645 | "id": "a7211f46-69f9-4de2-831a-0e8c900e359d",
646 | "metadata": {
647 | "slideshow": {
648 | "slide_type": "fragment"
649 | },
650 | "tags": []
651 | },
652 | "outputs": [
653 | {
654 | "data": {
655 | "text/html": [
656 | "\n",
657 | "\n",
670 | "
\n",
671 | " \n",
672 | " \n",
673 | " \n",
674 | " Lot_Frontage \n",
675 | " Lot_Area \n",
676 | " Year_Built \n",
677 | " Year_Remod_Add \n",
678 | " Mas_Vnr_Area \n",
679 | " BsmtFin_SF_1 \n",
680 | " BsmtFin_SF_2 \n",
681 | " Bsmt_Unf_SF \n",
682 | " Total_Bsmt_SF \n",
683 | " First_Flr_SF \n",
684 | " ... \n",
685 | " Open_Porch_SF \n",
686 | " Enclosed_Porch \n",
687 | " Three_season_porch \n",
688 | " Screen_Porch \n",
689 | " Pool_Area \n",
690 | " Misc_Val \n",
691 | " Mo_Sold \n",
692 | " Year_Sold \n",
693 | " Longitude \n",
694 | " Latitude \n",
695 | " \n",
696 | " \n",
697 | " \n",
698 | " \n",
699 | " 0 \n",
700 | " 141 \n",
701 | " 31770 \n",
702 | " 1960 \n",
703 | " 1960 \n",
704 | " 112 \n",
705 | " 2 \n",
706 | " 0 \n",
707 | " 441 \n",
708 | " 1080 \n",
709 | " 1656 \n",
710 | " ... \n",
711 | " 62 \n",
712 | " 0 \n",
713 | " 0 \n",
714 | " 0 \n",
715 | " 0 \n",
716 | " 0 \n",
717 | " 5 \n",
718 | " 2010 \n",
719 | " -93.619754 \n",
720 | " 42.054035 \n",
721 | " \n",
722 | " \n",
723 | " 1 \n",
724 | " 80 \n",
725 | " 11622 \n",
726 | " 1961 \n",
727 | " 1961 \n",
728 | " 0 \n",
729 | " 6 \n",
730 | " 144 \n",
731 | " 270 \n",
732 | " 882 \n",
733 | " 896 \n",
734 | " ... \n",
735 | " 0 \n",
736 | " 0 \n",
737 | " 0 \n",
738 | " 120 \n",
739 | " 0 \n",
740 | " 0 \n",
741 | " 6 \n",
742 | " 2010 \n",
743 | " -93.619756 \n",
744 | " 42.053014 \n",
745 | " \n",
746 | " \n",
747 | " 2 \n",
748 | " 81 \n",
749 | " 14267 \n",
750 | " 1958 \n",
751 | " 1958 \n",
752 | " 108 \n",
753 | " 1 \n",
754 | " 0 \n",
755 | " 406 \n",
756 | " 1329 \n",
757 | " 1329 \n",
758 | " ... \n",
759 | " 36 \n",
760 | " 0 \n",
761 | " 0 \n",
762 | " 0 \n",
763 | " 0 \n",
764 | " 12500 \n",
765 | " 6 \n",
766 | " 2010 \n",
767 | " -93.619387 \n",
768 | " 42.052659 \n",
769 | " \n",
770 | " \n",
771 | " 3 \n",
772 | " 93 \n",
773 | " 11160 \n",
774 | " 1968 \n",
775 | " 1968 \n",
776 | " 0 \n",
777 | " 1 \n",
778 | " 0 \n",
779 | " 1045 \n",
780 | " 2110 \n",
781 | " 2110 \n",
782 | " ... \n",
783 | " 0 \n",
784 | " 0 \n",
785 | " 0 \n",
786 | " 0 \n",
787 | " 0 \n",
788 | " 0 \n",
789 | " 4 \n",
790 | " 2010 \n",
791 | " -93.617320 \n",
792 | " 42.051245 \n",
793 | " \n",
794 | " \n",
795 | " 4 \n",
796 | " 74 \n",
797 | " 13830 \n",
798 | " 1997 \n",
799 | " 1998 \n",
800 | " 0 \n",
801 | " 3 \n",
802 | " 0 \n",
803 | " 137 \n",
804 | " 928 \n",
805 | " 928 \n",
806 | " ... \n",
807 | " 34 \n",
808 | " 0 \n",
809 | " 0 \n",
810 | " 0 \n",
811 | " 0 \n",
812 | " 0 \n",
813 | " 3 \n",
814 | " 2010 \n",
815 | " -93.638933 \n",
816 | " 42.060899 \n",
817 | " \n",
818 | " \n",
819 | "
\n",
820 | "
5 rows × 34 columns
\n",
821 | "
"
822 | ],
823 | "text/plain": [
824 | " Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area \\\n",
825 | "0 141 31770 1960 1960 112 \n",
826 | "1 80 11622 1961 1961 0 \n",
827 | "2 81 14267 1958 1958 108 \n",
828 | "3 93 11160 1968 1968 0 \n",
829 | "4 74 13830 1997 1998 0 \n",
830 | "\n",
831 | " BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF ... \\\n",
832 | "0 2 0 441 1080 1656 ... \n",
833 | "1 6 144 270 882 896 ... \n",
834 | "2 1 0 406 1329 1329 ... \n",
835 | "3 1 0 1045 2110 2110 ... \n",
836 | "4 3 0 137 928 928 ... \n",
837 | "\n",
838 | " Open_Porch_SF Enclosed_Porch Three_season_porch Screen_Porch Pool_Area \\\n",
839 | "0 62 0 0 0 0 \n",
840 | "1 0 0 0 120 0 \n",
841 | "2 36 0 0 0 0 \n",
842 | "3 0 0 0 0 0 \n",
843 | "4 34 0 0 0 0 \n",
844 | "\n",
845 | " Misc_Val Mo_Sold Year_Sold Longitude Latitude \n",
846 | "0 0 5 2010 -93.619754 42.054035 \n",
847 | "1 0 6 2010 -93.619756 42.053014 \n",
848 | "2 12500 6 2010 -93.619387 42.052659 \n",
849 | "3 0 4 2010 -93.617320 42.051245 \n",
850 | "4 0 3 2010 -93.638933 42.060899 \n",
851 | "\n",
852 | "[5 rows x 34 columns]"
853 | ]
854 | },
855 | "execution_count": 12,
856 | "metadata": {},
857 | "output_type": "execute_result"
858 | }
859 | ],
860 | "source": [
861 | "ames_features.head()"
862 | ]
863 | },
864 | {
865 | "cell_type": "markdown",
866 | "id": "33481d5d-92fa-4b3e-a8b4-2f075847cd52",
867 | "metadata": {
868 | "slideshow": {
869 | "slide_type": "slide"
870 | },
871 | "tags": []
872 | },
873 | "source": [
874 | "We've successfully **abstracted** some of our code logic, moving it to a separate function that we can use without having to think too much about how it works.\n",
875 | "\n",
876 | "This is the foundation of building larger projects in Python."
877 | ]
878 | },
879 | {
880 | "cell_type": "markdown",
881 | "id": "aec7e39a-a6d1-47d9-a8e3-fb6d4659ef58",
882 | "metadata": {
883 | "slideshow": {
884 | "slide_type": "slide"
885 | },
886 | "tags": []
887 | },
888 | "source": [
889 | "## Example: Files\n",
890 | "\n",
891 | "As we write more and more functions, it can be nice to move them outside of the script or notebook where we're currently working.\n",
892 | "\n",
893 | "Let's move our new function into its own file, or **module**, and then use it from Jupyter."
894 | ]
895 | },
896 | {
897 | "cell_type": "markdown",
898 | "id": "92db7ba2-987d-4421-b1b5-d3b88cf10c0b",
899 | "metadata": {
900 | "slideshow": {
901 | "slide_type": "slide"
902 | },
903 | "tags": []
904 | },
905 | "source": [
906 | "We'll start by creating a new text file in Jupyter:\n",
907 | "\n",
908 | ""
909 | ]
910 | },
911 | {
912 | "cell_type": "markdown",
913 | "id": "486d1858-ba21-493c-8589-e8b28a1d92f8",
914 | "metadata": {
915 | "slideshow": {
916 | "slide_type": "slide"
917 | },
918 | "tags": []
919 | },
920 | "source": [
921 | "Then we'll give our new module a sensible name. Right-click on the `untitled.txt` tab and rename it to `my_module.py`.\n",
922 | "\n",
923 | "Don't forget to make sure it ends in `.py`, not `.txt`!\n",
924 | "\n",
925 | ""
926 | ]
927 | },
928 | {
929 | "cell_type": "markdown",
930 | "id": "2dd4a1ab-73ac-45d5-b8dd-b775494aa6ea",
931 | "metadata": {
932 | "slideshow": {
933 | "slide_type": "slide"
934 | },
935 | "tags": []
936 | },
937 | "source": [
938 | "Then paste the function we wrote, along with lines to import numpy and pandas:\n",
939 | "\n",
940 | "\n",
941 | "\n",
942 | "Save the file and close the `my_module.py` tab."
943 | ]
944 | },
945 | {
946 | "cell_type": "markdown",
947 | "id": "8831a2d5-2c41-4ec4-a02a-ab410b22e0ce",
948 | "metadata": {
949 | "slideshow": {
950 | "slide_type": "slide"
951 | },
952 | "tags": []
953 | },
954 | "source": [
955 | "Notice how that file is now in your sidebar:\n",
956 | "\n",
957 | ""
958 | ]
959 | },
960 | {
961 | "cell_type": "markdown",
962 | "id": "c9796596-539f-45a6-bc99-5d0cbfb15795",
963 | "metadata": {
964 | "slideshow": {
965 | "slide_type": "slide"
966 | },
967 | "tags": []
968 | },
969 | "source": [
970 | "Now that our module is saved, we can import it in any notebook (or script) that's saved in the same folder as the module."
971 | ]
972 | },
973 | {
974 | "cell_type": "code",
975 | "execution_count": 13,
976 | "id": "39f0e643-9702-4bd0-92ff-f0a942678413",
977 | "metadata": {},
978 | "outputs": [],
979 | "source": [
980 | "import my_module"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 14,
986 | "id": "cde0043e-bbb4-4253-88d7-e2eb97916d78",
987 | "metadata": {
988 | "tags": [
989 | "ci-skip"
990 | ]
991 | },
992 | "outputs": [
993 | {
994 | "data": {
995 | "text/plain": [
996 | "\u001b[0;31mSignature:\u001b[0m \u001b[0mmy_module\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_features_and_target\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcsv_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_col\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
997 | "\u001b[0;31mDocstring:\u001b[0m Split a CSV into a DF of numeric features and a target column.\n",
998 | "\u001b[0;31mFile:\u001b[0m ~/Teaching/advanced-python-datasci/notebooks/my_module.py\n",
999 | "\u001b[0;31mType:\u001b[0m function\n"
1000 | ]
1001 | },
1002 | "metadata": {},
1003 | "output_type": "display_data"
1004 | }
1005 | ],
1006 | "source": [
1007 | "my_module.get_features_and_target?"
1008 | ]
1009 | },
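{
 "cell_type": "markdown",
 "id": "c8f5a3bb-2222-4f3f-8d4e-6b7c8d9e0f1a",
 "metadata": {},
 "source": [
  "The trailing `?` is IPython/Jupyter syntax for viewing a function's signature and docstring. In a plain Python script, the built-in `help` gives similar information:\n",
  "\n",
  "```python\n",
  "help(my_module.get_features_and_target)\n",
  "```"
 ]
},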
1010 | {
1011 | "cell_type": "code",
1012 | "execution_count": 16,
1013 | "id": "eca99ff9-44c8-4264-8fb0-79e22b2f8034",
1014 | "metadata": {
1015 | "slideshow": {
1016 | "slide_type": "slide"
1017 | },
1018 | "tags": []
1019 | },
1020 | "outputs": [],
1021 | "source": [
1022 | "f, t = my_module.get_features_and_target(\n",
1023 | " csv_file='../data/adult-census.csv',\n",
1024 | " target_col='class',\n",
1025 | ")"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": 17,
1031 | "id": "28a8765e-1ba3-4d2c-8f90-3e7d9b9f3254",
1032 | "metadata": {},
1033 | "outputs": [
1034 | {
1035 | "data": {
1036 | "text/html": [
1037 | "\n",
1038 | "\n",
1051 | "
\n",
1052 | " \n",
1053 | " \n",
1054 | " \n",
1055 | " age \n",
1056 | " education-num \n",
1057 | " capital-gain \n",
1058 | " capital-loss \n",
1059 | " hours-per-week \n",
1060 | " \n",
1061 | " \n",
1062 | " \n",
1063 | " \n",
1064 | " 0 \n",
1065 | " 25 \n",
1066 | " 7 \n",
1067 | " 0 \n",
1068 | " 0 \n",
1069 | " 40 \n",
1070 | " \n",
1071 | " \n",
1072 | " 1 \n",
1073 | " 38 \n",
1074 | " 9 \n",
1075 | " 0 \n",
1076 | " 0 \n",
1077 | " 50 \n",
1078 | " \n",
1079 | " \n",
1080 | " 2 \n",
1081 | " 28 \n",
1082 | " 12 \n",
1083 | " 0 \n",
1084 | " 0 \n",
1085 | " 40 \n",
1086 | " \n",
1087 | " \n",
1088 | " 3 \n",
1089 | " 44 \n",
1090 | " 10 \n",
1091 | " 7688 \n",
1092 | " 0 \n",
1093 | " 40 \n",
1094 | " \n",
1095 | " \n",
1096 | " 4 \n",
1097 | " 18 \n",
1098 | " 10 \n",
1099 | " 0 \n",
1100 | " 0 \n",
1101 | " 30 \n",
1102 | " \n",
1103 | " \n",
1104 | "
\n",
1105 | "
"
1106 | ],
1107 | "text/plain": [
1108 | " age education-num capital-gain capital-loss hours-per-week\n",
1109 | "0 25 7 0 0 40\n",
1110 | "1 38 9 0 0 50\n",
1111 | "2 28 12 0 0 40\n",
1112 | "3 44 10 7688 0 40\n",
1113 | "4 18 10 0 0 30"
1114 | ]
1115 | },
1116 | "execution_count": 17,
1117 | "metadata": {},
1118 | "output_type": "execute_result"
1119 | }
1120 | ],
1121 | "source": [
1122 | "f.head()"
1123 | ]
1124 | },
1125 | {
1126 | "cell_type": "markdown",
1127 | "id": "21e629c7-202d-4201-b08d-cdb3b7a7c310",
1128 | "metadata": {
1129 | "slideshow": {
1130 | "slide_type": "fragment"
1131 | },
1132 | "tags": []
1133 | },
1134 | "source": [
1135 | "Notice that we call our function as `my_module.get_features_and_target`, not just `get_features_and_target`.\n",
1136 | "\n",
1137 | "\n",
1138 | "
Discussion
\n",
1139 | " Does this syntax remind you of anything we've seen before?\n",
1140 | "
"
1141 | ]
1142 | },
1143 | {
1144 | "cell_type": "markdown",
1145 | "id": "8091578a-ae4b-49a7-94e1-e2af53f0f7fb",
1146 | "metadata": {
1147 | "slideshow": {
1148 | "slide_type": "slide"
1149 | },
1150 | "tags": []
1151 | },
1152 | "source": [
1153 | "Ultimately, modules you create aren't any different from numpy, pandas, or any other Python libraries.\n",
1154 | "You can build them and use them just the same way.\n",
1155 | "\n",
1156 | "It's a good idea to put related functions into a module, which you can then reuse within a project or even across different projects."
1157 | ]
1158 | },
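{
 "cell_type": "markdown",
 "id": "d9a6b4cc-3333-4a4a-9e5f-7c8d9e0f1a2b",
 "metadata": {},
 "source": [
  "For example, all the usual import styles work on our own module (a sketch, assuming `my_module.py` sits next to this notebook):\n",
  "\n",
  "```python\n",
  "import my_module                                 # my_module.get_features_and_target(...)\n",
  "import my_module as mm                           # mm.get_features_and_target(...)\n",
  "from my_module import get_features_and_target    # get_features_and_target(...)\n",
  "```"
 ]
},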
1159 | {
1160 | "cell_type": "markdown",
1161 | "id": "84b8caf1-a5ed-4c59-91d7-218842767182",
1162 | "metadata": {
1163 | "slideshow": {
1164 | "slide_type": "slide"
1165 | },
1166 | "tags": []
1167 | },
1168 | "source": [
1169 | "## Committing to GitHub\n",
1170 | "\n",
1171 | "Before we end this section, let's commit our code so far to GitHub."
1172 | ]
1173 | },
1174 | {
1175 | "cell_type": "markdown",
1176 | "id": "63206154-7760-4f8b-8787-94a5cd44623c",
1177 | "metadata": {
1178 | "slideshow": {
1179 | "slide_type": "slide"
1180 | },
1181 | "tags": []
1182 | },
1183 | "source": [
1184 | "Open GitHub Desktop.\n",
1185 | "It should show you what files you've added and changed in the project.\n",
1186 | "In the *summary* box, write a message that encapsulates what we've done so far.\n",
1187 | "\n",
1188 | ""
1189 | ]
1190 | },
1191 | {
1192 | "cell_type": "markdown",
1193 | "id": "5fa4a35f-a8cf-4eb9-a672-60eceb8e3a93",
1194 | "metadata": {},
1195 | "source": [
1196 | "Then press \"Commit to **main**\"."
1197 | ]
1198 | },
1199 | {
1200 | "cell_type": "markdown",
1201 | "id": "755085d2-f89e-413e-8905-69550f056e66",
1202 | "metadata": {
1203 | "slideshow": {
1204 | "slide_type": "slide"
1205 | },
1206 | "tags": []
1207 | },
1208 | "source": [
1209 | "At this point, we've *committed* our code but haven't synced it with GitHub.\n",
1210 | "GitHub Desktop will inform us that we need to \"push\" our new commit:\n",
1211 | "\n",
1212 | "\n",
1213 | "\n",
1214 | "Click the \"Push Origin\" button. Done!"
1215 | ]
1216 | },
1217 | {
1218 | "cell_type": "markdown",
1219 | "id": "7a5ec477-5b33-431b-925b-a0a2eb48b05a",
1220 | "metadata": {},
1221 | "source": [
1222 | "If you go to your *advanced-python-datasci* repo in GitHub, you should see your new files!"
1223 | ]
1224 | },
1225 | {
1226 | "cell_type": "markdown",
1227 | "id": "019b0e01",
1228 | "metadata": {
1229 | "slideshow": {
1230 | "slide_type": "slide"
1231 | }
1232 | },
1233 | "source": [
1234 | "## Questions\n",
1235 | "\n",
1236 | "Are there any questions before we move on?"
1237 | ]
1238 | }
1239 | ],
1240 | "metadata": {
1241 | "kernelspec": {
1242 | "display_name": "uc-python",
1243 | "language": "python",
1244 | "name": "python3"
1245 | },
1246 | "language_info": {
1247 | "codemirror_mode": {
1248 | "name": "ipython",
1249 | "version": 3
1250 | },
1251 | "file_extension": ".py",
1252 | "mimetype": "text/x-python",
1253 | "name": "python",
1254 | "nbconvert_exporter": "python",
1255 | "pygments_lexer": "ipython3",
1256 | "version": "3.11.7"
1257 | }
1258 | },
1259 | "nbformat": 4,
1260 | "nbformat_minor": 5
1261 | }
1262 |
--------------------------------------------------------------------------------
/notebooks/07-modularity-pt2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "dd9d2e0d-6b11-454d-a3e4-3e6bdf6af479",
6 | "metadata": {
7 | "slideshow": {
8 | "slide_type": "slide"
9 | },
10 | "tags": []
11 | },
12 | "source": [
13 | "# Modular Code, Part 2"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "0bc36274-6ff4-430d-bd49-a8054cd7625e",
19 | "metadata": {
20 | "slideshow": {
21 | "slide_type": "slide"
22 | },
23 | "tags": []
24 | },
25 | "source": [
26 | "- In our coverage of modular code, we talked about abstracting reusable code chunks into their own **functions**\n",
27 | " - And, in turn, grouping those functions together into separate **modules**\n",
28 | " - We created a function that splits a data set into its features (a DataFrame) and target (a Series)\n",
29 | " \n",
30 | "- In our discussion of feature engineering, we showed how one might make a \"preprocessor\": a column transformer that one-hot encodes categorical features and applies standard scaling to numeric columns\n",
31 | " - We then chained this preprocessor together with a logistic regression model in order to form a scikit-learn **pipeline**"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "id": "f2874f42-6c62-4228-bee3-952a11414532",
37 | "metadata": {
38 | "slideshow": {
39 | "slide_type": "fragment"
40 | },
41 | "tags": []
42 | },
43 | "source": [
44 | "- We might use the same approach in preprocessing other datasets, so **let's move that logic to its own function and add it to our personal module**"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "id": "238ba661-113d-44e8-8828-0530eba34a34",
50 | "metadata": {
51 | "slideshow": {
52 | "slide_type": "slide"
53 | },
54 | "tags": []
55 | },
56 | "source": [
57 | "## Writing a Preprocessor Function"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "id": "84276dd0-5d1c-4991-86a3-3d3c6f32f8b5",
63 | "metadata": {},
64 | "source": [
65 | "Sometimes it's easiest to write a function's definition, or *signature*, before actually writing its code.\n",
66 | "\n",
67 | "Our function is going to give us a column transformer that we can use in pipelines.\n",
68 | "The only parameter will be the features DataFrame (at least, for right now)."
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "id": "11bfc1a8-4fa8-4b44-adba-518baae1c4f7",
74 | "metadata": {
75 | "slideshow": {
76 | "slide_type": "fragment"
77 | },
78 | "tags": []
79 | },
80 | "source": [
81 | "One possible function signature looks like this:\n",
82 | "\n",
83 | "```python\n",
84 | "def make_preprocessor(features):\n",
85 | " ...\n",
86 | "```"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "b65503a5-a248-4f0a-914e-70bb2985143c",
92 | "metadata": {
93 | "slideshow": {
94 | "slide_type": "slide"
95 | },
96 | "tags": []
97 | },
98 | "source": [
99 | "Now that we have our definition, we can add code to it.\n",
100 | "In this case, we can reuse the code we wrote in the feature engineering section.\n",
101 | "\n",
102 | "```python\n",
103 | "from sklearn.compose import ColumnTransformer\n",
104 | "\n",
105 | "preprocessor = ColumnTransformer([\n",
106 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
107 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
108 | "])\n",
109 | "```"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "id": "f1be5273-c16b-4de4-92e1-f1ddf1ca2faa",
115 | "metadata": {
116 | "slideshow": {
117 | "slide_type": "slide"
118 | },
119 | "tags": []
120 | },
121 | "source": [
122 | "Can we just put all of that code into our function without any changes?"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 1,
128 | "id": "92be057e-57d5-405d-91f1-d1419c03e6df",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "def make_preprocessor(features):\n",
133 | " from sklearn.compose import ColumnTransformer\n",
134 | "\n",
135 | " preprocessor = ColumnTransformer([\n",
136 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
137 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
138 | " ])"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "id": "e99da5b7-bc3b-4401-924f-d1f2fe5af86b",
144 | "metadata": {},
145 | "source": [
146 | "\n",
147 | "
Discussion
\n",
148 | " Does anyone see any issues with this?\n",
149 | "
"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 2,
155 | "id": "218afb3f-cf49-4f1c-8fb7-c21367888811",
156 | "metadata": {
157 | "slideshow": {
158 | "slide_type": "slide"
159 | },
160 | "tags": []
161 | },
162 | "outputs": [],
163 | "source": [
164 | "import pandas as pd\n",
165 | "fake_features = pd.read_csv('../data/planes.csv')"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 3,
171 | "id": "deb5e1d0-e198-4d47-9dd5-773a56219005",
172 | "metadata": {
173 | "tags": [
174 | "ci-skip"
175 | ]
176 | },
177 | "outputs": [
178 | {
179 | "ename": "NameError",
180 | "evalue": "name 'categorical_preprocessor' is not defined",
181 | "output_type": "error",
182 | "traceback": [
183 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
184 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
185 | "\u001b[0;32m/var/folders/9w/9m3mzyd96fbdm8q4sy2pjpdw0000gn/T/ipykernel_61981/3965947682.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpreprocessor\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_preprocessor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfake_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
186 | "\u001b[0;32m/var/folders/9w/9m3mzyd96fbdm8q4sy2pjpdw0000gn/T/ipykernel_61981/2727407406.py\u001b[0m in \u001b[0;36mmake_preprocessor\u001b[0;34m(features)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m preprocessor = ColumnTransformer([\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0;34m'one-hot-encoder'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcategorical_preprocessor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcategorical_columns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'standard_scaler'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnumeric_preprocessor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnumeric_columns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m ])\n",
187 | "\u001b[0;31mNameError\u001b[0m: name 'categorical_preprocessor' is not defined"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "preprocessor = make_preprocessor(fake_features)"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "id": "c517343b-3883-49e7-8beb-929c2bcc2a00",
198 | "metadata": {},
199 | "source": [
200 | "Our code is missing some context.\n",
201 | "`categorical_preprocessor`, `categorical_columns`, `numeric_preprocessor`, and `numeric_columns` aren't defined yet."
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "id": "02d8e82a-3f3d-4ec4-9633-f10b41067760",
207 | "metadata": {
208 | "slideshow": {
209 | "slide_type": "slide"
210 | },
211 | "tags": []
212 | },
213 | "source": [
214 | "Here's an updated version in which we assign to those variables before using them."
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 4,
220 | "id": "405b65a4-fd13-490b-b713-cde9e8243b30",
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "def make_preprocessor(features):\n",
225 | " from sklearn.compose import ColumnTransformer\n",
226 | " from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
227 | " \n",
228 | " categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
229 | " numeric_preprocessor = StandardScaler()\n",
230 | " \n",
231 | " numeric_columns = features.select_dtypes(exclude=object).columns\n",
232 | " categorical_columns = features.select_dtypes(include=object).columns\n",
233 | "\n",
234 | " preprocessor = ColumnTransformer([\n",
235 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
236 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
237 | " ])"
238 | ]
239 | },
240 | {
241 | "cell_type": "markdown",
242 | "id": "804f7c2e-e468-4886-96f2-f845c60c189b",
243 | "metadata": {
244 | "slideshow": {
245 | "slide_type": "slide"
246 | },
247 | "tags": []
248 | },
249 | "source": [
250 | "Things run without error now!"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 5,
256 | "id": "e9c97464-cb3b-47f1-a721-8e256e1dfda7",
257 | "metadata": {
258 | "tags": []
259 | },
260 | "outputs": [],
261 | "source": [
262 | "preprocessor = make_preprocessor(fake_features)"
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "id": "0efbec89-5229-437d-abc3-412911d5ff7f",
268 | "metadata": {},
269 | "source": [
270 | "But there are a couple of other issues.\n",
271 | "\n",
272 | "What does our resulting preprocessor object look like?"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 6,
278 | "id": "c1cda4d1-0cd3-4c96-abab-6485a80ae77d",
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "preprocessor"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 7,
288 | "id": "d4a2f7bb-0302-45ff-9686-bdcba63e3ff2",
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "data": {
293 | "text/plain": [
294 | "NoneType"
295 | ]
296 | },
297 | "execution_count": 7,
298 | "metadata": {},
299 | "output_type": "execute_result"
300 | }
301 | ],
302 | "source": [
303 | "type(preprocessor)"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "id": "340778e2-cb1e-4e85-95a3-d9c670842050",
309 | "metadata": {
310 | "slideshow": {
311 | "slide_type": "slide"
312 | },
313 | "tags": []
314 | },
315 | "source": [
316 | "- We need to remember to *return a value* -- otherwise we can't get anything useful out of the function.\n",
317 | "\n",
318 | "- Generally, Python best practice is to import libraries *outside* functions.\n",
319 | "All imports, even if they're to be used in different functions, are usually placed at the top of the Python module.\n",
320 | "\n",
321 | "Let's make those changes..."
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 8,
327 | "id": "85c223b7-b181-49f8-ad3b-982a3a8b4577",
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "from sklearn.compose import ColumnTransformer\n",
332 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
333 | "\n",
334 | "def make_preprocessor(features):\n",
335 | " categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
336 | " numeric_preprocessor = StandardScaler()\n",
337 | " \n",
338 | " numeric_columns = features.select_dtypes(exclude=object).columns\n",
339 | " categorical_columns = features.select_dtypes(include=object).columns\n",
340 | "\n",
341 | " preprocessor = ColumnTransformer([\n",
342 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
343 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
344 | " ])\n",
345 | " \n",
346 | " return preprocessor"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "id": "c2968aae-f721-4e3b-bd82-848075421116",
352 | "metadata": {
353 | "slideshow": {
354 | "slide_type": "slide"
355 | },
356 | "tags": []
357 | },
358 | "source": [
359 | "And then make sure it works..."
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 9,
365 | "id": "78249c2c-e50f-469a-9b40-9037e922cb33",
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "data": {
370 | "text/plain": [
371 | "ColumnTransformer(transformers=[('one-hot-encoder',\n",
372 | " OneHotEncoder(handle_unknown='ignore'),\n",
373 | " Index(['tailnum', 'type', 'manufacturer', 'model', 'engine'], dtype='object')),\n",
374 | " ('standard_scaler', StandardScaler(),\n",
375 | " Index(['year', 'engines', 'seats', 'speed'], dtype='object'))])"
376 | ]
377 | },
378 | "execution_count": 9,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "preprocessor = make_preprocessor(fake_features)\n",
385 | "preprocessor"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 10,
391 | "id": "3655b1f9-63b9-4d3b-b60a-9b779b02d784",
392 | "metadata": {},
393 | "outputs": [
394 | {
395 | "data": {
396 | "text/plain": [
397 | "sklearn.compose._column_transformer.ColumnTransformer"
398 | ]
399 | },
400 | "execution_count": 10,
401 | "metadata": {},
402 | "output_type": "execute_result"
403 | }
404 | ],
405 | "source": [
406 | "type(preprocessor)"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "id": "087548dc-5a61-442f-bbc8-88291b377bf9",
412 | "metadata": {
413 | "slideshow": {
414 | "slide_type": "slide"
415 | },
416 | "tags": []
417 | },
418 | "source": [
419 | "Now that our function is ready, we can add it to our module!\n",
420 | "Reopen `my_module.py` and add our imports to the top and our new function at the end:"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 11,
426 | "id": "11c199c1-b22b-455d-8a97-56cb8c4f04ae",
427 | "metadata": {
428 | "slideshow": {
429 | "slide_type": "slide"
430 | }
431 | },
432 | "outputs": [],
433 | "source": [
434 | "import numpy as np\n",
435 | "import pandas as pd\n",
436 | "from sklearn.compose import ColumnTransformer\n",
437 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
438 | "\n",
439 | "def get_features_and_target(csv_file, target_col):\n",
440 | " '''Split a CSV into a DF of numeric features and a target column.'''\n",
441 | " adult_census = pd.read_csv(csv_file)\n",
442 | " \n",
443 | " raw_features = adult_census.drop(columns=target_col)\n",
444 | " numeric_features = raw_features.select_dtypes(np.number)\n",
445 | " feature_cols = numeric_features.columns.values\n",
446 | "\n",
447 | " features = adult_census[feature_cols]\n",
448 | " target = adult_census[target_col]\n",
449 | " return (features, target)\n",
450 | "\n",
451 | "def make_preprocessor(features):\n",
452 | " '''Create a column transformer that applies sensible preprocessing procedures.'''\n",
453 | " categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
454 | " numeric_preprocessor = StandardScaler()\n",
455 | " \n",
456 | " numeric_columns = features.select_dtypes(exclude=object).columns\n",
457 | " categorical_columns = features.select_dtypes(include=object).columns\n",
458 | "\n",
459 | " preprocessor = ColumnTransformer([\n",
460 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
461 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
462 | " ])\n",
463 | " return preprocessor"
464 | ]
465 | },
466 | {
467 | "cell_type": "markdown",
468 | "id": "029f3248-ef60-4159-944b-66f9feb17d9a",
469 | "metadata": {
470 | "slideshow": {
471 | "slide_type": "slide"
472 | },
473 | "tags": []
474 | },
475 | "source": [
476 | "Our functions can work together now..."
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 12,
482 | "id": "25688ca6-8c11-4a26-b03e-363ad8f4d6d7",
483 | "metadata": {
484 | "tags": []
485 | },
486 | "outputs": [],
487 | "source": [
488 | "import my_module\n",
489 | "\n",
490 | "features, target = my_module.get_features_and_target(\n",
491 | " csv_file='../data/adult-census.csv',\n",
492 | " target_col='class',\n",
493 | ")\n",
494 | "\n",
495 | "# Drop education-num as discussed before, because it's redundant.\n",
496 | "features = features.drop('education-num', axis=1)\n",
497 | "\n",
498 | "preprocessor = my_module.make_preprocessor(features)"
499 | ]
500 | },
501 | {
502 | "cell_type": "markdown",
503 | "id": "0dfd9dab-40c9-4bff-9bec-2a02107c82a5",
504 | "metadata": {
505 | "slideshow": {
506 | "slide_type": "slide"
507 | },
508 | "tags": []
509 | },
510 | "source": [
511 | "And we could make this preprocessor part of a scikit-learn pipeline, as we saw before:"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 13,
517 | "id": "251aeca0-0c7c-4b67-aa7e-3171143fac5c",
518 | "metadata": {},
519 | "outputs": [],
520 | "source": [
521 | "from sklearn.pipeline import make_pipeline\n",
522 | "from sklearn.linear_model import LogisticRegression\n",
523 | "from sklearn.ensemble import RandomForestRegressor\n",
524 | "\n",
525 | "# If we want a logistic regression\n",
526 | "model = make_pipeline(preprocessor, LogisticRegression())\n",
527 | "# or perhaps we prefer a random forest?\n",
528 | "#model = make_pipeline(RandomForestRegressor())"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "id": "0e2a02e4-1b72-4c92-97f9-eba814588ee4",
534 | "metadata": {},
535 | "source": [
536 | "If we were even more ambitious, we could build a function that just took `features` and a model class (such as `LogisticRegression`) and returned a pipeline.\n",
537 | "But that wouldn't simplify things much beyond what we already have, so we'll leave that as an exercise you can try if you want to experiment more with modularizing your code."
538 | ]
539 | },
540 | {
541 | "cell_type": "markdown",
542 | "id": "07e4cb31-5a7e-464b-b158-eca91589c612",
543 | "metadata": {
544 | "slideshow": {
545 | "slide_type": "slide"
546 | },
547 | "tags": []
548 | },
549 | "source": [
550 | "We can use our pipeline on real data, just as we did before."
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": 14,
556 | "id": "0f64a285-d1af-443d-8c85-8df5c45db02e",
557 | "metadata": {},
558 | "outputs": [
559 | {
560 | "data": {
561 | "text/plain": [
562 | "0.7988698714274015"
563 | ]
564 | },
565 | "execution_count": 14,
566 | "metadata": {},
567 | "output_type": "execute_result"
568 | }
569 | ],
570 | "source": [
571 | "from sklearn.model_selection import train_test_split\n",
572 | "\n",
573 | "# one small addition: the target column is encoded as a string in our data so we need to convert to 1s and 0s.\n",
574 | "target = target.str.contains('>50K').astype(int)\n",
575 | "\n",
576 | "X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)\n",
577 | "\n",
578 | "# fit our model\n",
579 | "_ = model.fit(X_train, y_train)\n",
580 | "\n",
581 | "# score on test set\n",
582 | "model.score(X_test, y_test)"
583 | ]
584 | },
585 | {
586 | "cell_type": "markdown",
587 | "id": "be669d43-e908-4489-9478-a7862c36baf0",
588 | "metadata": {
589 | "slideshow": {
590 | "slide_type": "slide"
591 | },
592 | "tags": []
593 | },
594 | "source": [
595 | "\n",
596 | "
Discussion
\n",
597 | " What if we wanted to make our function more flexible, such that users could determine what kind of categorical and numeric encoding schemes should be used?\n",
598 | "
"
599 | ]
600 | },
601 | {
602 | "cell_type": "code",
603 | "execution_count": 15,
604 | "id": "2f78f15e-a6dc-407d-baf5-51ff5f860bbf",
605 | "metadata": {
606 | "slideshow": {
607 | "slide_type": "slide"
608 | }
609 | },
610 | "outputs": [],
611 | "source": [
612 | "def make_preprocessor(features):\n",
613 | " '''Create a column transformer that applies sensible preprocessing procedures.'''\n",
614 | " categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
615 | " numeric_preprocessor = StandardScaler()\n",
616 | " numeric_columns = features.select_dtypes(exclude=object).columns\n",
617 | " categorical_columns = features.select_dtypes(include=object).columns\n",
618 | " preprocessor = ColumnTransformer([\n",
619 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
620 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
621 | " ])\n",
622 | " return preprocessor"
623 | ]
624 | },
625 | {
626 | "cell_type": "markdown",
627 | "id": "90e0a838-69c5-45dd-b2bd-93eea82983ae",
628 | "metadata": {
629 | "slideshow": {
630 | "slide_type": "fragment"
631 | },
632 | "tags": []
633 | },
634 | "source": [
635 | "One approach would be to add \"categorical_preprocessor\" and \"numeric_preprocessor\" parameters..."
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 16,
641 | "id": "87c46182-6c3e-4c86-bcac-267ae75bb034",
642 | "metadata": {},
643 | "outputs": [],
644 | "source": [
645 | "def make_preprocessor(features, categorical_preprocessor, numeric_preprocessor):\n",
646 | " '''Create a column transformer that applies sensible preprocessing procedures.'''\n",
647 | " numeric_columns = features.select_dtypes(exclude=object).columns\n",
648 | " categorical_columns = features.select_dtypes(include=object).columns\n",
649 | " preprocessor = ColumnTransformer([\n",
650 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
651 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
652 | " ])\n",
653 | " return preprocessor"
654 | ]
655 | },
656 | {
657 | "cell_type": "markdown",
658 | "id": "5c6da51b-913f-4f23-83bb-a56fdc901c08",
659 | "metadata": {
660 | "slideshow": {
661 | "slide_type": "slide"
662 | },
663 | "tags": []
664 | },
665 | "source": [
666 | "This allows us to specify the precise transformations we want:"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 17,
672 | "id": "18560d32-557b-499c-8ea2-8b77375727e4",
673 | "metadata": {},
674 | "outputs": [],
675 | "source": [
676 | "# Will work the same as the original\n",
677 | "preprocessor = make_preprocessor(\n",
678 | " fake_features,\n",
679 | " categorical_preprocessor=OneHotEncoder(handle_unknown=\"ignore\"),\n",
680 | " numeric_preprocessor=StandardScaler(),\n",
681 | ")"
682 | ]
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": 18,
687 | "id": "5684fea7-0154-4685-84f9-898ff2ec5abc",
688 | "metadata": {},
689 | "outputs": [],
690 | "source": [
691 | "from sklearn.preprocessing import Normalizer, OrdinalEncoder\n",
692 | "# Uses different strategies\n",
693 | "preprocessor = make_preprocessor(\n",
694 | " fake_features,\n",
695 | " categorical_preprocessor=OrdinalEncoder(),\n",
696 | " numeric_preprocessor=Normalizer(),\n",
697 | ")"
698 | ]
699 | },
700 | {
701 | "cell_type": "markdown",
702 | "id": "7a296ab1-b132-4626-bd6a-df4769a81414",
703 | "metadata": {
704 | "slideshow": {
705 | "slide_type": "slide"
706 | },
707 | "tags": []
708 | },
709 | "source": [
710 | "But this is a bit cumbersome - we have to specify all three arguments every time:"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 19,
716 | "id": "f1eb74ab-40b3-4876-b509-b76f74751272",
717 | "metadata": {
718 | "tags": [
719 | "ci-skip"
720 | ]
721 | },
722 | "outputs": [
723 | {
724 | "ename": "TypeError",
725 | "evalue": "make_preprocessor() missing 2 required positional arguments: 'categorical_preprocessor' and 'numeric_preprocessor'",
726 | "output_type": "error",
727 | "traceback": [
728 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
729 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
730 | "\u001b[0;32m/var/folders/9w/9m3mzyd96fbdm8q4sy2pjpdw0000gn/T/ipykernel_61981/3965947682.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpreprocessor\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_preprocessor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfake_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
731 | "\u001b[0;31mTypeError\u001b[0m: make_preprocessor() missing 2 required positional arguments: 'categorical_preprocessor' and 'numeric_preprocessor'"
732 | ]
733 | }
734 | ],
735 | "source": [
736 | "preprocessor = make_preprocessor(fake_features)"
737 | ]
738 | },
739 | {
740 | "cell_type": "markdown",
741 | "id": "7b511487-1e15-450e-a988-65fbb95cea27",
742 | "metadata": {},
743 | "source": [
744 | "It would be nicer if these arguments were optional, and *defaulted* to the original choices..."
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": 20,
750 | "id": "f0ba98d6-2911-4f21-9d45-dcbc1ee420f7",
751 | "metadata": {
752 | "slideshow": {
753 | "slide_type": "slide"
754 | },
755 | "tags": []
756 | },
757 | "outputs": [],
758 | "source": [
759 | "def make_preprocessor(features, categorical_preprocessor=None, numeric_preprocessor=None):\n",
760 | " '''Create a column transformer that applies sensible preprocessing procedures.'''\n",
761 | " \n",
762 | " if categorical_preprocessor is None:\n",
763 | " categorical_preprocessor = OneHotEncoder(handle_unknown='ignore')\n",
764 | " if numeric_preprocessor is None:\n",
765 | " numeric_preprocessor = StandardScaler()\n",
766 | " \n",
767 | " numeric_columns = features.select_dtypes(exclude=object).columns\n",
768 | " categorical_columns = features.select_dtypes(include=object).columns\n",
769 | " preprocessor = ColumnTransformer([\n",
770 | " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n",
771 | " ('standard_scaler', numeric_preprocessor, numeric_columns)\n",
772 | " ])\n",
773 | " return preprocessor"
774 | ]
775 | },
776 | {
777 | "cell_type": "code",
778 | "execution_count": 21,
779 | "id": "b1618821-d5ba-40f8-b5b7-832cc5cf3d0b",
780 | "metadata": {},
781 | "outputs": [],
782 | "source": [
783 | "preprocessor = make_preprocessor(fake_features)"
784 | ]
785 | },
786 | {
787 | "cell_type": "markdown",
788 | "id": "25a0d4a7-ee7b-4eec-a908-9905a96dd32d",
789 | "metadata": {
790 | "slideshow": {
791 | "slide_type": "slide"
792 | },
793 | "tags": []
794 | },
795 | "source": [
796 | "\n",
797 | "
Your Turn
\n",
798 | "
Update your my_module.py
file to reflect the changes we made above. Try testing out the new version with the below code:
\n",
799 | "
"
800 | ]
801 | },
802 | {
803 | "cell_type": "code",
804 | "execution_count": 22,
805 | "id": "bc23b739-d556-4eae-a306-36ad463eb8a0",
806 | "metadata": {},
807 | "outputs": [
808 | {
809 | "data": {
810 | "text/plain": [
811 | "0.7806076488412087"
812 | ]
813 | },
814 | "execution_count": 22,
815 | "metadata": {},
816 | "output_type": "execute_result"
817 | }
818 | ],
819 | "source": [
820 | "import my_module\n",
821 | "\n",
822 | "features, target = my_module.get_features_and_target(\n",
823 | " csv_file='../data/adult-census.csv',\n",
824 | " target_col='class',\n",
825 | ")\n",
826 | "features = features.drop('education-num', axis=1)\n",
827 | "target = target.str.contains('>50K').astype(int)\n",
828 | "\n",
829 | "preprocessor = my_module.make_preprocessor(features, numeric_preprocessor=Normalizer())\n",
830 | "model = make_pipeline(preprocessor, LogisticRegression())\n",
831 | "\n",
832 | "X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)\n",
833 | "\n",
834 | "_ = model.fit(X_train, y_train)\n",
835 | "model.score(X_test, y_test)"
836 | ]
837 | },
838 | {
839 | "cell_type": "markdown",
840 | "id": "af64ffd4-6437-4cd3-a9fa-5beaf52a427a",
841 | "metadata": {
842 | "slideshow": {
843 | "slide_type": "slide"
844 | },
845 | "tags": []
846 | },
847 | "source": [
848 | "## Remember GitHub?\n",
849 | "\n",
850 | "We always commit signicant code updates to GitHub, so let's stop now and push our changes."
851 | ]
852 | },
853 | {
854 | "cell_type": "markdown",
855 | "id": "9d2314db",
856 | "metadata": {
857 | "slideshow": {
858 | "slide_type": "slide"
859 | }
860 | },
861 | "source": [
862 | "## Questions\n",
863 | "\n",
864 | "Are there any questions before we move on?"
865 | ]
866 | }
867 | ],
868 | "metadata": {
869 | "kernelspec": {
870 | "display_name": "uc-python",
871 | "language": "python",
872 | "name": "python3"
873 | },
874 | "language_info": {
875 | "codemirror_mode": {
876 | "name": "ipython",
877 | "version": 3
878 | },
879 | "file_extension": ".py",
880 | "mimetype": "text/x-python",
881 | "name": "python",
882 | "nbconvert_exporter": "python",
883 | "pygments_lexer": "ipython3",
884 | "version": "3.11.7"
885 | }
886 | },
887 | "nbformat": 4,
888 | "nbformat_minor": 5
889 | }
890 |
--------------------------------------------------------------------------------
/notebooks/Case Study.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "443c6dcb",
6 | "metadata": {},
7 | "source": [
8 | "# Case Study\n",
9 | "\n",
10 | "## Part 1"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "f7cada4e",
16 | "metadata": {},
17 | "source": [
18 | "### Git & version control\n",
19 | "\n",
20 | "1. Create a Github repository called \"ames-housing-analysis\".\n",
21 | "1. Copy the ames.csv data from the `data/` directory into this repository.\n",
22 | "1. Update the README with a short synopsis of this repo.\n",
23 | "1. Create a folder called `notebooks/`\n",
24 | "1. Add, commit, and push what you have so far. Verify in that it appears in GitHub on your repository page."
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "d43125c4",
30 | "metadata": {},
31 | "source": [
32 | "### Exploratory data analysis\n",
33 | "\n",
34 | "1. In the repo's `notebooks/` folder, create a new notebook: `eda.ipynb`.\n",
35 | "2. Load the ames.csv data.\n",
36 | "3. Assess the distribution of the response variable (`Sale_Price`).\n",
37 | "4. How many features are numeric vs. categorical? (Make sure to create two variables: `num_features` and `cat_features`, to use later)\n",
38 | "5. Pick a numeric feature that you believe would be influential on a home's `Sale_Price`. Assess the distribution of the numeric feature. Assess the relationship between that feature and the `Sale_Price`.\n",
39 | "6. Pick a categorical feature that you believe would be influential on a home's `Sale_Price`. Assess the distribution of the categorical feature. Assess the relationship between that feature and the `Sale_Price`."
40 | ]
41 | },
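A minimal sketch of one way to approach steps 2-4, assuming the notebook lives in the repo's `notebooks/` folder and `ames.csv` sits at the repo root (adjust the path to your layout):

```python
import pandas as pd

# 2. Load the data.
ames = pd.read_csv('../ames.csv')

# 3. Distribution of the response variable.
ames['Sale_Price'].hist(bins=50)

# 4. Split the features by dtype and count them.
raw_features = ames.drop(columns='Sale_Price')
num_features = raw_features.select_dtypes('number')
cat_features = raw_features.select_dtypes(exclude='number')
print(f'{num_features.shape[1]} numeric, {cat_features.shape[1]} categorical')
```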
42 | {
43 | "cell_type": "markdown",
44 | "id": "90db6075",
45 | "metadata": {},
46 | "source": [
47 | "### Modular code & Scikit-learn model"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "626052ab",
53 | "metadata": {},
54 | "source": [
55 | "1. Copy `my_module.py` (that we created together) into the notebooks folder.\n",
56 | "2. Import your module and use `get_features_and_target` to load the numeric features of the Ames data, along with the \"Sale_Price\" as a target column.\n",
57 | "\n",
58 | "With your features and target prepared:\n",
59 | "1. Split the data into training and test sets. Use 75% of the data for training and 25% for testing.\n",
60 | "2. Fit a default `sklearn.neighbors.KNeighborsRegressor` model on the training data and score on the test data. Note that scoring on regression models provides the $R^2$.\n",
61 | "3. Fit a default `sklearn.linear_model.LinearRegression` model on the training data and score on the test data.\n",
62 | "4. Fit a default `sklearn.ensemble.RandomForestRegressor` model on the training data and score on the test data."
63 | ]
64 | },
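A sketch of one possible solution, assuming `my_module.py` sits beside this notebook and `ames.csv` is at the repo root:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

import my_module

# Numeric features plus the Sale_Price target, via our module.
features, target = my_module.get_features_and_target(
    csv_file='../ames.csv', target_col='Sale_Price',
)

# 75% train / 25% test split.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=123,
)

# Fit and score each default model; .score() reports R^2 for regressors.
for Model in (KNeighborsRegressor, LinearRegression, RandomForestRegressor):
    model = Model().fit(X_train, y_train)
    print(Model.__name__, model.score(X_test, y_test))
```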
65 | {
66 | "cell_type": "markdown",
67 | "id": "1737cd83",
68 | "metadata": {
69 | "tags": []
70 | },
71 | "source": [
72 | "### Feature engineering"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "4b1b0caf",
78 | "metadata": {},
79 | "source": [
80 | "1. Fill in the blanks to standardize the numeric features and then apply a linear regression model. Does standardizing the numeric features improve the linear regression's $R^2$?"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "6b6be7a3",
87 | "metadata": {
88 | "tags": [
89 | "ci-skip"
90 | ]
91 | },
92 | "outputs": [],
93 | "source": [
94 | "from sklearn.pipeline import make_pipeline\n",
95 | "from sklearn.preprocessing import ________\n",
96 | "\n",
97 | "lm_model_scaled = make_pipeline(__________, LinearRegression())\n",
98 | "lm_model_scaled.fit(X_train, y_train)\n",
99 | "lm_model_scaled.score(X_test, y_test)"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "id": "c57871c5",
105 | "metadata": {},
106 | "source": [
107 | "2. Using the code chunks below, which computes the following:\n",
108 | "\n",
109 | "- identifies numeric, categorical, and ordinal columns in our full feature set,\n",
110 | "- replaces unique values in our ordinal columns (i.e. \"No_basement\", \"No_garage\"), and\n",
111 | "- creates our encoders for the numeric, categorical, and ordinal columns.\n",
112 | "\n",
113 | "\n",
114 | "
Note
\n",
115 | "
Run the following two code cells without changing anything.
\n",
116 | "
"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "3f0cf7c8",
123 | "metadata": {
124 | "tags": [
125 | "ci-skip"
126 | ]
127 | },
128 | "outputs": [],
129 | "source": [
130 | "######## RUN THIS CODE CELL AS-IS ########\n",
131 | "\n",
132 | "# get columns of interest\n",
133 | "numerical_columns = num_features.columns\n",
134 | "ordinal_columns = cat_features.filter(regex='Qual').columns\n",
135 | "categorical_columns = cat_features.drop(columns=ordinal_columns).columns\n",
136 | "\n",
137 | "# replace unique values in our ordinal columns (i.e. \"No_basement\", \"No_garage\") with 'NA'\n",
138 | "for col in ordinal_columns:\n",
139 | " features[col] = features[col].replace(to_replace='No_.*', value='NA', regex=True)\n",
140 | " \n",
141 | "# split full feature set (numeric, categorical, & ordinal features) into train & test sets\n",
142 | "X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "id": "713f1d19",
149 | "metadata": {
150 | "tags": [
151 | "ci-skip"
152 | ]
153 | },
154 | "outputs": [],
155 | "source": [
156 | "######## RUN THIS CODE CELL AS-IS ########\n",
157 | "\n",
158 | "from sklearn.preprocessing import OneHotEncoder\n",
159 | "from sklearn.preprocessing import OrdinalEncoder\n",
160 | "\n",
161 | "# create our numeric, categorical, and ordinal preprocessor encoders\n",
162 | "numerical_preprocessor = StandardScaler()\n",
163 | "categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
164 | "\n",
165 | "ordinal_categories = [\n",
166 | " \"NA\", \"Very_Poor\", \"Poor\", \"Fair\", \"Below_Average\", \"Average\", \"Typical\",\n",
167 | " \"Above_Average\", \"Good\", \"Very_Good\", \"Excellent\", \"Very_Excellent\"\n",
168 | "]\n",
169 | "list_of_ord_cats = [ordinal_categories for col in ordinal_columns]\n",
170 | "ordinal_preprocessor = OrdinalEncoder(categories=list_of_ord_cats)"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "id": "da0d4d19",
176 | "metadata": {},
177 | "source": [
178 | "2. Continued...\n",
179 | "\n",
180 | "Now fill in the blanks to create our `ColumnTransformer` that:\n",
181 | "\n",
182 | "- standardizes numerical columns (preprocessor: `numerical_preprocessor`; columns of interest: `numerical_columns`) \n",
183 | "- one-hot encodes categorical columns (preprocessor: `categorical_preprocessor`; columns of interest: `categorical_columns`) \n",
184 | "- ordinal encodes ordinal columns (preprocessor: `ordinal_preprocessor`; columns of interest: `ordinal_columns`) "
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "id": "3079666b",
191 | "metadata": {
192 | "tags": [
193 | "ci-skip"
194 | ]
195 | },
196 | "outputs": [],
197 | "source": [
198 | "from sklearn.compose import ColumnTransformer\n",
199 | "\n",
200 | "preprocessor = ColumnTransformer([\n",
201 | " ('standard_scaler', __________, __________),\n",
202 | " ('one_hot_encoder', __________, __________),\n",
203 | " ('ordinal_encoder', __________, __________),\n",
204 | "])"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "id": "d4c8487e",
210 | "metadata": {},
211 | "source": [
212 | "3. Now create a pipeline that includes the preprocessing step and applies a linear regression model. Does this improve the linear regression's $R^2$?"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "id": "a25c4be5",
219 | "metadata": {
220 | "tags": [
221 | "ci-skip"
222 | ]
223 | },
224 | "outputs": [],
225 | "source": [
226 | "lm_full = make_pipeline(___________, ___________)\n",
227 | "_ = lm_full.fit(X_train, y_train)\n",
228 | "lm_full.score(X_test, y_test)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "id": "8174aabe",
234 | "metadata": {},
235 | "source": [
236 | "4. If time allows, create a pipeline that applies these preprocessing steps with a default random forest model and see if performance improves."
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "id": "101c4572-0ce6-4436-9621-835ee6b5c872",
242 | "metadata": {},
243 | "source": [
244 | "### GitHub Check-in\n",
245 | "\n",
246 | "Add, commit (with a good message!), and push your code to this point."
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "id": "622f7037",
252 | "metadata": {},
253 | "source": [
254 | "## Part 2"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "id": "377cd13e",
260 | "metadata": {},
261 | "source": [
262 | "### Model evaluation & selection\n",
263 | "\n",
264 | "1. Using same preprocessing pipeline you created in Part 1, fit a default random forest model using a 5-fold cross validation procedure using the root mean squared error metric (`'neg_root_mean_squared_error'`)."
265 | ]
266 | },
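One way this step might look (a sketch, assuming `preprocessor`, `X_train`, and `y_train` from Part 1 are still in scope):

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

rf_pipeline = make_pipeline(preprocessor, RandomForestRegressor(random_state=123))
scores = cross_val_score(
    rf_pipeline, X_train, y_train,
    cv=5, scoring='neg_root_mean_squared_error',
)
# Scores are negated RMSEs, so flip the sign to report the RMSE.
print(-scores.mean())
```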
267 | {
268 | "cell_type": "markdown",
269 | "id": "96f5f15f",
270 | "metadata": {},
271 | "source": [
272 | "2. Run the following two code chunks as is without making any changes. This will create a random forest model pipeline and create specified hyperparameter distributions to draw from."
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "id": "2294d7b1",
279 | "metadata": {
280 | "tags": [
281 | "ci-skip"
282 | ]
283 | },
284 | "outputs": [],
285 | "source": [
286 | "######## RUN THIS CODE CELL AS-IS ########\n",
287 | "\n",
288 | "from scipy.stats import loguniform\n",
289 | "\n",
290 | "\n",
291 | "class loguniform_int:\n",
292 | " \"\"\"Integer valued version of the log-uniform distribution\"\"\"\n",
293 | " def __init__(self, a, b):\n",
294 | " self._distribution = loguniform(a, b)\n",
295 | "\n",
296 | " def rvs(self, *args, **kwargs):\n",
297 | " \"\"\"Random variable sample\"\"\"\n",
298 | " return self._distribution.rvs(*args, **kwargs).astype(int)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "id": "6b2903d1",
305 | "metadata": {
306 | "tags": [
307 | "ci-skip"
308 | ]
309 | },
310 | "outputs": [],
311 | "source": [
312 | "######## RUN THIS CODE CELL AS-IS ########\n",
313 | "\n",
314 | "from sklearn.pipeline import Pipeline\n",
315 | "\n",
316 | "# create preprocessor & modeling pipeline\n",
317 | "rf = RandomForestRegressor(random_state=123)\n",
318 | "pipeline = Pipeline([('prep', preprocessor), ('rf', rf)])\n",
319 | "\n",
320 | "# specify hyperparameter distributions to randomly sample from\n",
321 | "param_distributions = {\n",
322 | " 'rf__n_estimators': loguniform_int(50, 1000),\n",
323 | " 'rf__max_features': loguniform(.1, .8),\n",
324 | " 'rf__max_depth': loguniform_int(2, 30),\n",
325 | " 'rf__min_samples_leaf': loguniform_int(1, 100),\n",
326 | " 'rf__max_samples': loguniform(.5, 1),\n",
327 | "}"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "id": "f6771c59",
333 | "metadata": {},
334 | "source": [
335 | "2. Continued...\n",
336 | "\n",
337 | "Fill in the blanks to perform a random hyperparameter search based on the following:\n",
338 | "\n",
339 | "- use the parameter distributions specified above,\n",
340 | "- perform 25 random searches,\n",
341 | "- use a 5-fold cross-validation procedure, and\n",
342 | "- use root mean squared error (RMSE) as our scoring metric.\n",
343 | "\n",
344 | "What are the hyperparameters that provide the lowest RMSE? What is the lowest cross validated RMSE?"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "id": "40410327",
351 | "metadata": {
352 | "tags": [
353 | "ci-skip"
354 | ]
355 | },
356 | "outputs": [],
357 | "source": [
358 | "%%time\n",
359 | "from sklearn.model_selection import ___________\n",
360 | "\n",
361 | "random_search = RandomizedSearchCV(\n",
362 | " pipeline, \n",
363 | " param_distributions=___________, \n",
364 | " n_iter=__,\n",
365 | " cv=__, \n",
366 | " scoring='___________',\n",
367 | " verbose=1,\n",
368 | " n_jobs=-1,\n",
369 | ")\n",
370 | "\n",
371 | "results = random_search.___________"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "id": "ed954c73-b660-4edc-93f4-b6869c0dc9d3",
377 | "metadata": {},
378 | "source": [
379 | "### Modular code & unit tests\n",
380 | "\n",
381 | "1. Move the `loguniform_int` class we defined above into a new module, `loguniform_int.py`. We haven't put classes into modules before, but it's no different than a function; just paste it along with any imports it needs."
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "id": "4bd495e6-eb4b-4ddb-83b5-dde8e586ef36",
387 | "metadata": {},
388 | "source": [
389 | "Your new module should contain something like:\n",
390 | "\n",
391 | "```python\n",
392 | "from scipy.stats import loguniform\n",
393 | "\n",
394 | "class loguniform_int:\n",
395 | " \"\"\"Integer valued version of the log-uniform distribution\"\"\"\n",
396 | " def __init__(self, a, b):\n",
397 | " self._distribution = loguniform(a, b)\n",
398 | "\n",
399 | " def rvs(self, *args, **kwargs):\n",
400 | " \"\"\"Random variable sample\"\"\"\n",
401 | " return self._distribution.rvs(*args, **kwargs).astype(int)\n",
402 | "```"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "id": "7375923a-f673-4c83-abd3-cf4f099a5b9c",
408 | "metadata": {},
409 | "source": [
410 | "2. Import your module and make sure you can use it in code by (re)running the below:"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 34,
416 | "id": "31f07d64-f468-4b4a-a60e-e338f2f00cb2",
417 | "metadata": {
418 | "tags": [
419 | "ci-skip"
420 | ]
421 | },
422 | "outputs": [
423 | {
424 | "name": "stdout",
425 | "output_type": "stream",
426 | "text": [
427 | "Fitting 5 folds for each of 10 candidates, totalling 50 fits\n"
428 | ]
429 | }
430 | ],
431 | "source": [
432 | "from loguniform_int import loguniform_int\n",
433 | "\n",
434 | "param_distributions = {\n",
435 | " 'rf__n_estimators': loguniform_int(50, 1000),\n",
436 | " 'rf__max_features': loguniform(.1, .8),\n",
437 | " 'rf__max_depth': loguniform_int(2, 30),\n",
438 | " 'rf__min_samples_leaf': loguniform_int(1, 100),\n",
439 | " 'rf__max_samples': loguniform(.5, 1),\n",
440 | "}\n",
441 | "\n",
442 | "random_search = RandomizedSearchCV(\n",
443 | " pipeline, \n",
444 | " param_distributions=param_distributions, \n",
445 | " n_iter=10, # lower this to 10 so it's faster\n",
446 | " cv=5, \n",
447 | " scoring='neg_root_mean_squared_error',\n",
448 | " verbose=1,\n",
449 | " n_jobs=-1,\n",
450 | ")\n",
451 | "\n",
452 | "results2 = random_search.fit(X_train, y_train)"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "id": "ca9dc10a-42dd-4cc4-b957-0451046cc5f9",
458 | "metadata": {},
459 | "source": [
460 | "3. Create a `tests.py` file in which you add the tests we already create for `get_features_and_target` (you can just copy them), along with a new test that asserts that `loguniform` objects have a `._distribution.args` attribute that holds the original numbers passed into them -- confirming that we did indeed create the kind of distribution we expected. Run the tests when finished.\n",
461 | "\n",
462 | "```python\n",
463 | ">>> lu = loguniform_int(2, 30)\n",
464 | ">>> lu._distribution.args\n",
465 | "(2, 30)\n",
466 | "```"
467 | ]
468 | },
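One way the new test might look (a sketch; the test name is our own):

```python
from loguniform_int import loguniform_int

def test_distribution_args():
    lu = loguniform_int(2, 30)
    # The wrapped scipy distribution should remember its original bounds.
    assert lu._distribution.args == (2, 30)
```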
469 | {
470 | "cell_type": "markdown",
471 | "id": "c7d3dd7d-11c9-471f-8391-c5a23219acd6",
472 | "metadata": {},
473 | "source": [
474 | "4. Parametrize this test. Create one `loguniform_int` with `(2, 30)` as the arguments and another with `(1, 100)` as the arguments. Confirm that in both cases, the resulting `._distribution.args` attribute holds a tuple with the same numbers that were supplied initially. Rerun your tests."
475 | ]
476 | },
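A sketch of the parametrized version the exercise asks for:

```python
import pytest
from loguniform_int import loguniform_int

@pytest.mark.parametrize('a,b', [(2, 30), (1, 100)])
def test_distribution_args(a, b):
    lu = loguniform_int(a, b)
    # The args tuple should match whatever bounds were supplied.
    assert lu._distribution.args == (a, b)
```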
477 | {
478 | "cell_type": "markdown",
479 | "id": "98334504",
480 | "metadata": {},
481 | "source": [
482 | "### ML lifecycle management"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "id": "42c8a06f",
488 | "metadata": {},
489 | "source": [
490 | "1. Create and set an MLflow experiment titled \"UC Advanced Python Case Study\"\n",
491 | "2. Re-perform the random hyperparameter search executed above while logging the hyperparameter search experiment with MLflow's autologging. Title this run \"rf_hyperparameter_tuning\"."
492 | ]
493 | }
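A sketch of how these two steps might look with MLflow's high-level API, assuming `random_search`, `X_train`, and `y_train` from earlier are in scope:

```python
import mlflow
import mlflow.sklearn  # explicit import for older MLflow versions

# set_experiment creates the experiment if it doesn't exist, then makes it active.
mlflow.set_experiment('UC Advanced Python Case Study')
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='rf_hyperparameter_tuning'):
    results = random_search.fit(X_train, y_train)
```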
494 | ],
495 | "metadata": {
496 | "kernelspec": {
497 | "display_name": "Python 3",
498 | "language": "python",
499 | "name": "python3"
500 | },
501 | "language_info": {
502 | "codemirror_mode": {
503 | "name": "ipython",
504 | "version": 3
505 | },
506 | "file_extension": ".py",
507 | "mimetype": "text/x-python",
508 | "name": "python",
509 | "nbconvert_exporter": "python",
510 | "pygments_lexer": "ipython3",
511 | "version": "3.9.2"
512 | }
513 | },
514 | "nbformat": 4,
515 | "nbformat_minor": 5
516 | }
517 |
--------------------------------------------------------------------------------
/notebooks/images/all-split-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/all-split-1.png
--------------------------------------------------------------------------------
/notebooks/images/assert-false.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/assert-false.png
--------------------------------------------------------------------------------
/notebooks/images/bias-model-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/bias-model-1.png
--------------------------------------------------------------------------------
/notebooks/images/brad.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/brad.jpg
--------------------------------------------------------------------------------
/notebooks/images/clone-repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/clone-repo.png
--------------------------------------------------------------------------------
/notebooks/images/clustering_vs_pca.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/clustering_vs_pca.jpeg
--------------------------------------------------------------------------------
/notebooks/images/commit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/commit.png
--------------------------------------------------------------------------------
/notebooks/images/create-repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/create-repo.png
--------------------------------------------------------------------------------
/notebooks/images/cross_validation_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/cross_validation_diagram.png
--------------------------------------------------------------------------------
/notebooks/images/cv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/cv.png
--------------------------------------------------------------------------------
/notebooks/images/engineering-icon.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/engineering-icon.jpeg
--------------------------------------------------------------------------------
/notebooks/images/ethan-headshot.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/ethan-headshot.gif
--------------------------------------------------------------------------------
/notebooks/images/ethan.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/ethan.jpg
--------------------------------------------------------------------------------
/notebooks/images/example-knn-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/example-knn-1.png
--------------------------------------------------------------------------------
/notebooks/images/explore-icon.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/explore-icon.jpeg
--------------------------------------------------------------------------------
/notebooks/images/gh-desktop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/gh-desktop.png
--------------------------------------------------------------------------------
/notebooks/images/grid_search_cross_validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/grid_search_cross_validation.png
--------------------------------------------------------------------------------
/notebooks/images/gus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/gus.jpg
--------------------------------------------------------------------------------
/notebooks/images/jay.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/jay.jpg
--------------------------------------------------------------------------------
/notebooks/images/justice-icon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/justice-icon.jpg
--------------------------------------------------------------------------------
/notebooks/images/legos.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/legos.jpg
--------------------------------------------------------------------------------
/notebooks/images/local_url.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/local_url.png
--------------------------------------------------------------------------------
/notebooks/images/ls.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/ls.png
--------------------------------------------------------------------------------
/notebooks/images/machine_learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/machine_learning.png
--------------------------------------------------------------------------------
/notebooks/images/ml_types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/ml_types.png
--------------------------------------------------------------------------------
/notebooks/images/ml_types2.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/ml_types2.jpeg
--------------------------------------------------------------------------------
/notebooks/images/mlflow_capabilities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/mlflow_capabilities.png
--------------------------------------------------------------------------------
/notebooks/images/mlflow_tech_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/mlflow_tech_icon.png
--------------------------------------------------------------------------------
/notebooks/images/mlruns_directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/mlruns_directory.png
--------------------------------------------------------------------------------
/notebooks/images/model_registry_governance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/model_registry_governance.png
--------------------------------------------------------------------------------
/notebooks/images/model_registry_mlops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/model_registry_mlops.png
--------------------------------------------------------------------------------
/notebooks/images/model_registry_visibility.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/model_registry_visibility.png
--------------------------------------------------------------------------------
/notebooks/images/modeling-process-bias-model-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/modeling-process-bias-model-1.png
--------------------------------------------------------------------------------
/notebooks/images/modeling-process-bias-model-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/modeling-process-bias-model-2.png
--------------------------------------------------------------------------------
/notebooks/images/modeling-process-knn-options-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/modeling-process-knn-options-1.png
--------------------------------------------------------------------------------
/notebooks/images/modeling-process-variance-model-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/modeling-process-variance-model-1.png
--------------------------------------------------------------------------------
/notebooks/images/modeling-process-variance-model-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/modeling-process-variance-model-2.png
--------------------------------------------------------------------------------
/notebooks/images/modeling_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/modeling_process.png
--------------------------------------------------------------------------------
/notebooks/images/my-module.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/my-module.png
--------------------------------------------------------------------------------
/notebooks/images/new-text-file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/new-text-file.png
--------------------------------------------------------------------------------
/notebooks/images/pattern1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/pattern1.jpeg
--------------------------------------------------------------------------------
/notebooks/images/pattern2.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/pattern2.jpeg
--------------------------------------------------------------------------------
/notebooks/images/patterns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/patterns.png
--------------------------------------------------------------------------------
/notebooks/images/prefix-prompt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/prefix-prompt.png
--------------------------------------------------------------------------------
/notebooks/images/process-icon.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/notebooks/images/push-origin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/push-origin.png
--------------------------------------------------------------------------------
/notebooks/images/random_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/random_search.png
--------------------------------------------------------------------------------
/notebooks/images/registered_models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/registered_models.png
--------------------------------------------------------------------------------
/notebooks/images/rename-script.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/rename-script.png
--------------------------------------------------------------------------------
/notebooks/images/resampling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/resampling.png
--------------------------------------------------------------------------------
/notebooks/images/resampling.svg:
--------------------------------------------------------------------------------
1 | [SVG markup stripped during extraction. Recoverable diagram labels: "All Data" splits into "Training" and "Testing"; "Resample 1", "Resample 2", ... "Resample B", each divided into "Analysis" and "Assessment" sets. Produced by OmniGraffle 7.9.4.]
--------------------------------------------------------------------------------
/notebooks/images/scikit-learn-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/scikit-learn-logo.png
--------------------------------------------------------------------------------
/notebooks/images/sidebar-script.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/sidebar-script.png
--------------------------------------------------------------------------------
/notebooks/images/variance-model-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/variance-model-1.png
--------------------------------------------------------------------------------
/notebooks/images/what_is_ml.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/advanced-python-datasci/9b4bb1aabd20c9bd210232860d5205d85d82986f/notebooks/images/what_is_ml.jpeg
--------------------------------------------------------------------------------
/notebooks/my_module.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.compose import ColumnTransformer
4 | from sklearn.preprocessing import OneHotEncoder, StandardScaler
5 |
6 | def get_features_and_target(csv_file, target_col):
7 | '''Split a CSV into a DF of numeric features and a target column.'''
8 |
9 | adult_census = pd.read_csv(csv_file)
10 |
11 | raw_features = adult_census.drop(columns=target_col)
12 | numeric_features = raw_features.select_dtypes(np.number)
13 | feature_cols = numeric_features.columns.values
14 |
15 | features = adult_census[feature_cols]
16 | target = adult_census[target_col]
17 |
18 | return (features, target)
19 |
20 | def make_preprocessor(features, categorical_preprocessor=None, numeric_preprocessor=None):
21 | '''Create a column transformer that applies sensible preprocessing procedures.'''
22 |
23 | if categorical_preprocessor is None:
24 | categorical_preprocessor = OneHotEncoder(handle_unknown='ignore')
25 | if numeric_preprocessor is None:
26 | numeric_preprocessor = StandardScaler()
27 |
28 | numeric_columns = features.select_dtypes(exclude=object).columns
29 | categorical_columns = features.select_dtypes(include=object).columns
30 | preprocessor = ColumnTransformer([
31 | ('one-hot-encoder', categorical_preprocessor, categorical_columns),
32 | ('standard_scaler', numeric_preprocessor, numeric_columns)
33 | ])
34 | return preprocessor
--------------------------------------------------------------------------------
/notebooks/tests.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import my_module
3 | import pandas as pd
4 |
5 | def test_invocation():
6 | features, target = my_module.get_features_and_target(
7 | csv_file='../data/adult-census.csv',
8 | target_col='class'
9 | )
10 |
11 | @pytest.mark.parametrize(
12 |     'csv,target_col',
13 |     [
14 |         ('../data/adult-census.csv', 'class'),
15 |         ('../data/ames.csv', 'Sale_Price')
16 |     ]
17 | )
18 | def test_return_types(csv, target_col):
19 |     features, target = my_module.get_features_and_target(
20 |         csv_file=csv,
21 |         target_col=target_col
22 |     )
23 | assert isinstance(features, pd.DataFrame)
24 | assert isinstance(target, pd.Series)
25 |
26 | def test_cols_make_sense():
27 | features, target = my_module.get_features_and_target(
28 | csv_file='../data/adult-census.csv',
29 | target_col='class'
30 | )
31 | # Load the data ourselves so we can double-check the columns
32 | df = pd.read_csv('../data/adult-census.csv')
33 | assert target.name in df.columns
34 | # Use a list comprehension to check all the feature columns
35 | assert all([feature_col in df.columns for feature_col in features])
36 |
37 | @pytest.mark.parametrize(
38 | 'csv', [ ['a', 'b', 'c'], 123 ]
39 | )
40 | def test_bad_input_error(csv):
41 | with pytest.raises(ValueError):
42 | features, target = my_module.get_features_and_target(
43 | csv_file=csv,
44 | target_col='Sale_Price'
45 | )
46 |
--------------------------------------------------------------------------------
/scripts/generate_slides.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ ! -d ".git" ]; then
4 | echo "Error: no .git directory detected"
5 | echo "This script should be run from the base of the repo"
6 | echo 'e.g. `bash scripts/generate_slides.sh`'
7 | exit 1
8 | fi
9 |
10 | # We must be *in* the notebooks folder for relative links (e.g. to images) to work
11 | # correctly.
12 | cd notebooks
13 | # images are copied over to slides/ by the Makefile
14 | NB_PATH="$1"
15 | REL_NB=${NB_PATH/#notebooks\//}
16 | # Quote the path: some notebook names (e.g. "Case Study.ipynb") contain spaces.
17 | jupyter nbconvert --to slides "$REL_NB" --output-dir=../slides
17 |
--------------------------------------------------------------------------------
/scripts/prep_nb_for_ci.py:
--------------------------------------------------------------------------------
1 | # Replace a notebook, with cells tagged ci-skip removed.
2 | # Adapted from https://stackoverflow.com/questions/62022603/how-to-delete-a-jupyter-notebook-input-cell-programmatically-using-its-tag
3 |
4 | import sys
5 | import nbformat
6 |
7 | SKIP_TAG = 'ci-skip'
8 |
9 | if len(sys.argv) != 2:
10 | raise Exception('Usage: prep_nb_for_ci [notebook.ipynb]')
11 | nb_file = sys.argv[1]
12 |
13 | nb = nbformat.read(nb_file, as_version=nbformat.NO_CONVERT)
14 |
15 | tagged_cell_indices = []
16 |
17 | # Find the indices of cells tagged with the skip tag.
18 | for idx, cell in enumerate(nb.cells):
19 | cell_tags = cell.metadata.get('tags')
20 | if cell_tags:
21 | if SKIP_TAG in cell_tags:
22 | tagged_cell_indices.append(idx)
23 |
24 | # Remove tagged cells.
25 | # Iterate in reverse because deleting an earlier index will change what cell
26 | # is at a later one.
27 | for idx in reversed(tagged_cell_indices):
28 | nb.cells.pop(idx)
29 |
30 | # Overwrite the original.
31 | nbformat.write(nb, nb_file)
32 |
--------------------------------------------------------------------------------