├── README.md
├── notebooks
│   ├── .gitignore
│   ├── 1-colab-tour.ipynb
│   ├── 1-colab-tour.md
│   ├── 1-colab-tour.pdf
│   ├── 1-data-detective.ipynb
│   ├── 1-data-detective.md
│   ├── 1-data-detective.pdf
│   ├── 1-exploratory-data-analysis.ipynb
│   ├── 1-exploratory-data-analysis.md
│   ├── 1-exploratory-data-analysis.pdf
│   ├── 1-explore-candidate-datasets.ipynb
│   ├── 1-explore-hw.ipynb
│   ├── 1-explore-hw.md
│   ├── 1-explore-hw.pdf
│   ├── 1-print-colab.ipynb
│   ├── 1-print-colab.md
│   ├── 1-print-colab.pdf
│   ├── 1-python-numpy-tutorial.ipynb
│   ├── 1-python-numpy-tutorial.md
│   ├── 1-python-numpy-tutorial.pdf
│   ├── 2-advertising-hw.ipynb
│   ├── 2-advertising-hw.md
│   ├── 2-advertising-hw.pdf
│   ├── 2-compute-by-hand.ipynb
│   ├── 2-compute-by-hand.md
│   ├── 2-compute-by-hand.pdf
│   ├── 2-linear-regression-case-study.ipynb
│   ├── 2-linear-regression-case-study.md
│   ├── 2-linear-regression-case-study.pdf
│   ├── 2-linear-regression-deep-dive.ipynb
│   ├── 2-linear-regression-deep-dive.md
│   ├── 2-linear-regression-deep-dive.pdf
│   ├── 2-regression-r2.ipynb
│   ├── 2-regression-r2.md
│   ├── 2-regression-r2.pdf
│   ├── 3-bias-variance-deep-dive.ipynb
│   ├── 3-bias-variance-deep-dive.md
│   ├── 3-bias-variance-deep-dive.pdf
│   ├── 3-gradient-descent-deep-dive.ipynb
│   ├── 3-gradient-descent-deep-dive.md
│   ├── 3-gradient-descent-deep-dive.pdf
│   ├── 3-gradient-descent-hw.ipynb
│   ├── 3-gradient-descent-hw.md
│   ├── 3-gradient-descent-hw.pdf
│   ├── 4-linear-regression-case-study-part-2.ipynb
│   ├── 4-linear-regression-case-study-part-2.md
│   ├── 4-linear-regression-case-study-part-2.pdf
│   ├── 4-model-selection.ipynb
│   ├── 4-model-selection.md
│   ├── 4-model-selection.pdf
│   ├── 4-neural-model-selection-hw.ipynb
│   ├── 4-neural-model-selection-hw.md
│   ├── 4-regularization-deep-dive.ipynb
│   ├── 4-regularization-deep-dive.md
│   ├── 4-regularization-deep-dive.pdf
│   ├── 5-compas-case-study.ipynb
│   ├── 5-compas-case-study.md
│   ├── 5-compas-case-study.pdf
│   ├── 5-demo-adaboost.ipynb
│   ├── 5-hw-logistic-regression.ipynb
│   ├── 5-hw-logistic-regression.md
│   ├── 5-hw-voter-classification.ipynb
│   ├── 5-hw-voter-classification.md
│   ├── 5-logistic-regression-digits.ipynb
│   ├── 5-logistic-regression-digits.md
│   ├── 5-logistic-regression-digits.pdf
│   ├── 5-logistic-regression-in-depth.ipynb
│   ├── 5-logistic-regression-in-depth.md
│   ├── 5-logistic-regression-in-depth.pdf
│   ├── 6-decision-trees.ipynb
│   ├── 6-decision-trees.md
│   ├── 6-decision-trees.pdf
│   ├── 6-k-nearest-neighbors-in-depth.ipynb
│   ├── 6-k-nearest-neighbors-in-depth.md
│   ├── 6-k-nearest-neighbors-in-depth.pdf
│   ├── 6-knn-tree-bias-variance.ipynb
│   ├── 6-knn-tree-bias-variance.md
│   ├── 6-knn-tree-bias-variance.pdf
│   ├── 6-knn-voter-classification-2020-hw.ipynb
│   ├── 6-knn-voter-classification-2020-hw.md
│   ├── 6-knn-voter-classification-hw.ipynb
│   ├── 6-knn-voter-classification-hw.md
│   ├── 6-knn-voter-classification-hw.pdf
│   ├── 6-knn-voter-classification-hw.py
│   ├── 7-demo-adaboost.ipynb
│   ├── 7-demo-adaboost.md
│   ├── 7-demo-adaboost.pdf
│   ├── 7-demo-digits-classifiers.ipynb
│   ├── 7-demo-digits-classifiers.md
│   ├── 7-demo-digits-classifiers.pdf
│   ├── 7-knn-tree-bias-variance.ipynb
│   ├── 7-knn-tree-bias-variance.md
│   ├── 7-svm-pre-kernel.ipynb
│   ├── 7-svm-pre-kernel.md
│   ├── 7-svm-pre-kernel.pdf
│   ├── 7-trees-ensembles-in-depth.ipynb
│   ├── 7-trees-ensembles-in-depth.md
│   ├── 7-trees-ensembles-in-depth.pdf
│   ├── 8-demo-backprop.ipynb
│   ├── 8-demo-backprop.md
│   ├── 8-demo-backprop.pdf
│   ├── 8-hyperparameter.ipynb
│   ├── 8-hyperparameter.md
│   ├── 8-lab-neural-net-music-classification.ipynb
│   ├── 8-lab-neural-net-music-classification.md
│   ├── 8-neural-net-demo-draw-torch.ipynb
│   ├── 8-neural-net-demo-draw-torch.md
│   ├── 8-neural-net-demo-draw.ipynb
│   ├── 8-neural-net-demo-draw.md
│   ├── 8-svm-bias-variance.ipynb
│   ├── 8-svm-bias-variance.md
│   ├── 8-svm-bias-variance.pdf
│   ├── 8-svm-with-kernel.ipynb
│   ├── 8-svm-with-kernel.md
│   ├── 8-svm-with-kernel.pdf
│   ├── 9-convolutional-neural-networks-old.md
│   ├── 9-convolutional-neural-networks.ipynb
│   ├── 9-convolutional-neural-networks.md
│   ├── 9-convolutional-neural-networks.pdf
│   ├── 9-fine-tune-rock-paper-scissors.ipynb
│   ├── 9-fine-tune-rock-paper-scissors.md
│   ├── 9-slash-dataset.ipynb
│   ├── 9-slash-dataset.md
│   ├── 9-slash-dataset.pdf
│   ├── Makefile
│   ├── images
│   │   ├── colab-tour-auto-0.png
│   │   ├── colab-tour-auto-1.png
│   │   ├── colab-tour-cell-order-0.png
│   │   ├── colab-tour-connect-0.png
│   │   ├── colab-tour-connect-1.png
│   │   ├── colab-tour-delete-0.png
│   │   ├── colab-tour-delete-1.png
│   │   ├── colab-tour-file-upload-0.png
│   │   ├── colab-tour-filexplore-0.png
│   │   ├── colab-tour-filexplore-1.png
│   │   ├── colab-tour-filexplore-2.png
│   │   ├── colab-tour-gdrive-0.png
│   │   ├── colab-tour-gdrive-1.png
│   │   ├── colab-tour-gdrive-2.png
│   │   ├── colab-tour-gdrive-3.png
│   │   ├── colab-tour-newcell-0.png
│   │   ├── colab-tour-newcell-1.png
│   │   ├── colab-tour-newcell-2.png
│   │   ├── colab-tour-run-0.png
│   │   ├── colab-tour-run-1.png
│   │   ├── exit-poll-nan-distance.png
│   │   ├── exit-poll-survey-versions-2020.png
│   │   └── exit-poll-survey-versions.png
│   └── style
│       ├── default.latex
│       ├── includes.tex
│       ├── keystroke-setup.tex
│       └── listings-setup.tex
└── projects
    ├── Audio (music).pdf
    ├── Audio (speech).pdf
    ├── Generating images.pdf
    ├── Generating text.pdf
    ├── ML and society_ Fairness, privacy, explainability.pdf
    ├── Reinforcement learning.pdf
    ├── Security and robustness.pdf
    ├── Understanding images.pdf
    └── Understanding text.pdf
/README.md:
--------------------------------------------------------------------------------
1 | ## Machine learning notebooks
2 |
3 | These are notebooks developed for ECE-GY 6143 Intro to Machine Learning at NYU Tandon School of Engineering.
4 |
5 |
6 | ### Intro ML, exploratory data analysis
7 |
8 | * Notebook: Python + numpy tutorial
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/1-python-numpy-tutorial.ipynb)
 9 | * Notebook: Colab tutorial [Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/1-colab-tour.ipynb)
10 | * Notebook: Printing from Colab [Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/1-print-colab.ipynb)
11 | * Notebook: Exploratory data analysis (in-class)
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/1-exploratory-data-analysis.ipynb)
12 | * Notebook: Exploratory data analysis (homework)
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/1-explore-hw.ipynb)
13 | * Notebook: Data detective challenge (optional homework)
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/1-data-detective.ipynb)
14 |
15 |
16 | ### Linear regression
17 |
18 | * Notebook: Linear regression in depth
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/2-linear-regression-deep-dive.ipynb)
19 | * Notebook: Compute regression coefficients by hand
20 | * Notebook: Regression metrics
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/2-regression-r2.ipynb)
21 | * Notebook: Case study on "Beauty in the Classroom"
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/2-linear-regression-case-study.ipynb)
22 | * Notebook: Residual analysis on Advertising data (homework)
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/2-advertising-hw.ipynb)
23 |
24 | ### Gradient descent
25 |
26 | * Notebook: Gradient descent in depth
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/3-gradient-descent-deep-dive.ipynb)
27 |
28 | ### Bias-variance tradeoff, model selection
29 |
30 | * Notebook: Bias-variance tradeoff and model selection in depth
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/3-bias-variance-model-selection-deep-dive.ipynb)
31 | * Notebook: Model order selection for neural data (homework) [Open in Colab](https://colab.research.google.com/drive/1RKVHfezDfY0ar6KB6QVkJNRpcJt20Tgy?usp=sharing)
32 |
33 | ### Regularization
34 |
35 | * Notebook: Regularization in depth
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/3-regularization-deep-dive.ipynb)
36 |
37 | ### Logistic regression
38 |
39 | * Notebook: Logistic regression in depth
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/4-logistic-regression-in-depth.ipynb)
40 | * Notebook: Logistic regression for handwritten digits classification
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/4-logistic-regression-digits.ipynb)
41 | * Notebook: COMPAS case study
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/4-compas-case-study.ipynb)
42 | * Notebook: Classifying your own handwritten digit (homework)
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/4-hw-logistic-regression.ipynb)
43 |
44 | ### K nearest neighbor
45 |
46 | * Notebook: K nearest neighbor in depth
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/5-k-nearest-neighbors-in-depth.ipynb)
47 | * Notebook: Voter classification with K nearest neighbor (homework) [Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/5-hw-voter-classification.ipynb)
48 |
49 | ### Decision tree, ensemble methods
50 |
51 | * Notebook: Decision trees and ensembles
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/5-trees-ensembles-in-depth.ipynb)
52 | * Notebook: AdaBoost
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/5-demo-adaboost.ipynb)
53 | * Notebook: Bias and variance of KNN and decision tree models [Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/5-knn-tree-bias-variance.ipynb)
54 |
55 | ### Support vector machines
56 |
57 | * Notebook: Support vector machines
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/6-svm-pre-kernel.ipynb)
58 | * Notebook: Handwritten digits classification
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/6-demo-digits-classifiers.ipynb)
59 | * Notebook: Bias and variance of SVM
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/7-svm-bias-variance.ipynb)
60 | * Homework: [Grid search for SVM hyperparameter tuning](https://colab.research.google.com/drive/1lpudIgi9VqxIjckUVNhXbOssrtd7ah3Z?usp=sharing) (use NYU Google account to open)
61 |
62 | ### Neural networks
63 |
64 | * Notebook: Backpropagation from scratch
[Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/7-demo-backprop.ipynb)
65 | * Notebook: Draw your own classification problem for a neural network [Open in Colab](https://colab.research.google.com/github/ffund/ml-notebooks/blob/master/notebooks/7-neural-net-demo-draw.ipynb)
66 | * Homework: Neural network for musical instrument classification
67 |
68 |
69 | ### Convolutional neural networks
70 |
71 | * Deep dive: [Convolutional neural networks](https://colab.research.google.com/drive/1gHQEEIDkmgExueDe_LtM24dN2PN8Wvs0?usp=sharing) (use NYU Google account to open)
72 | * [Transfer learning](https://colab.research.google.com/drive/1tm5ZxeN8uVqj6veLKarFizD0eW9cRd4e#scrollTo=_yz-QM1sLt0Cusp=sharing) (use NYU Google account to open)
73 | * Homework: [Transfer learning](https://colab.research.google.com/drive/16w-mLZ4tSxwH7bZof-1Baota-TIYv19B?usp=sharing)
74 |
75 |
76 | ### Unsupervised learning
77 |
78 | ### Project topics
79 |
80 | For your project, you will replicate and then extend an existing ML project (typically a recent publication in a major ML conference). See the [projects](projects) list for examples.
81 |
--------------------------------------------------------------------------------
/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | *.nbconvert.ipynb
2 | *.csv
3 |
--------------------------------------------------------------------------------
/notebooks/1-colab-tour.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/1-colab-tour.pdf
--------------------------------------------------------------------------------
/notebooks/1-data-detective.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Data detective challenge!\n",
8 | "=========================\n",
9 | "\n",
10 | "*Fraida Fund*"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "Introduction\n",
18 | "------------\n",
19 | "\n",
20 | "In this notebook, we will consider several machine learning tasks, and\n",
21 | "candidate data sets for them. We will explore the following questions:\n",
22 | "\n",
23 | "- Do these data sets seem appropriate for the task?\n",
24 | "- Are there any important limitations of the datasets, or problems\n",
25 | " that need to be addressed before we use them to train a machine\n",
26 | " learning model?\n",
27 | "\n",
28 | "In fact, each of these datasets has a significant problem that - if not\n",
29 | "detected early on - would create a “Garbage In, Garbage Out” situation.\n",
30 | "See if you can identify the problem with each dataset!\n",
31 | "\n",
32 | "To get you started, I included some code to show you how to read in the\n",
33 | "data. You can add additional code and text cells to explore the data.\n",
34 | "\n",
35 | "Your work on this challenge won’t be submitted or graded. If you think\n",
36 | "you found the problem with a dataset, share your findings with the class\n",
37 | "by posting on Ed! (In your post, show evidence from your exploratory\n",
38 | "data analysis to support your claims.)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "import pandas as pd\n",
48 | "import matplotlib.pyplot as plt\n",
49 | "import seaborn as sns"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "Taxi tip prediction\n",
57 | "-------------------"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "### Scenario\n",
65 | "\n",
66 | "You are developing an app for NYC taxi drivers that will predict what\n",
67 | "the typical tip would be for a given fare.\n",
68 | "\n",
69 | "You consider using data collected by the NYC Taxi and Limousine\n",
70 | "Commission on taxi trips. These links are for 2019 data (2020 was\n",
71 | "probably an atypical year, so we won’t use that). Previous years are\n",
72 | "also available.\n",
73 | "\n",
74 | "- [Data link for yellow (Manhattan) taxi\n",
75 | " trips](https://data.cityofnewyork.us/Transportation/2019-Yellow-Taxi-Trip-Data/2upf-qytp)\n",
76 | "- [Data link for green (non-Manhattan) taxi\n",
77 | " trips](https://data.cityofnewyork.us/Transportation/2019-Green-Taxi-Trip-Data/q5mz-t52e)"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "### Read in data\n",
85 | "\n",
86 | "We’ll start by reading in the 2019 Green Taxi trip data. It’s a large\n",
87 | "file and takes a long time to download, so we may interrupt the download\n",
88 | "in middle (using the Runtime menu in Colab) and just work with the\n",
 89 |         "in the middle (using the Runtime menu in Colab) and just work with the\n",
90 | "\n",
91 | "In the next couple of cells, `wget` and `wc` are not Python code -\n",
92 | "they’re Linux commands. We can run some basic Linux commands inside our\n",
93 | "Colab runtime, and it’s often helpful to do so. For example, we may use\n",
94 | "Linux commands to install extra software libraries that are not\n",
95 | "pre-installed in our runtime, clone a source code repository from\n",
96 | "Github, or download data from the Internet."
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "!wget \"https://data.cityofnewyork.us/api/views/q5mz-t52e/rows.csv?accessType=DOWNLOAD\" -O 2019-Green-Taxi-Trip-Data.csv"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Is the cell above taking a long time to run? That’s because this data\n",
113 | "set is very large, and the server from which it is retrieved is not very\n",
114 |         "fast. Since we don’t necessarily need to explore the whole dataset, we\n",
115 | "can interrupt the partial download by using the Runtime \\> Interrupt\n",
116 | "Execution menu option.\n",
117 | "\n",
118 | "Then, we can read in just 10,000 rows of data."
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "df_taxi = pd.read_csv('2019-Green-Taxi-Trip-Data.csv', nrows=10000) \n",
128 | "df_taxi.head()"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "Use additional cells as needed to explore this data. Answer the\n",
136 | "following questions:\n",
137 | "\n",
138 | "- How is the data collected? Is it automatic, or is there human\n",
139 | " involvement?\n",
140 | "- What variable should be the *target variable* for this machine\n",
141 | " learning problem?\n",
142 | "- What variable(s) could potentially be used as *features* to train\n",
143 | " the model?\n",
144 | "- What are our assumptions about the features and the target variable,\n",
145 | " and the relationships between these? (For example: in NYC, what is a\n",
146 | " conventional tip amount, as a percent of the total fare? If you are\n",
147 | " not from NYC, you can find information about this online!) Are any\n",
148 | " of these assumptions violated in this data?\n",
149 | "- Are there variables that should *not* be used as features to train\n",
150 | " the model, because of potential for data leakage?\n",
151 | "- Are there any serious data problems that we need to correct before\n",
152 | " using the data for this purpose? Explain."
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "Highway traffic prediction\n",
160 | "--------------------------"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "### Scenario\n",
168 | "\n",
169 | "You are working for the state of New York to develop a traffic\n",
170 | "prediction model for the NYS Thruway. The following Thruway data is\n",
171 | "available: Number and types of vehicles that entered from each entry\n",
172 | "point on the Thruway, along with their exit points, at 15 minute\n",
173 | "intervals.\n",
174 | "\n",
175 | "The link points to the most recent week’s worth of available data, but\n",
176 | "this data is available through 2014. [Link to NYS Thruway\n",
177 | "data](https://data.ny.gov/Transportation/NYS-Thruway-Origin-and-Destination-Points-for-All-/4dbf-24u2)"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "### Read in data"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "url = 'https://data.ny.gov/api/views/4dbf-24u2/rows.csv?accessType=DOWNLOAD&sorting=true'\n",
194 | "df_thruway = pd.read_csv(url)\n",
195 | "df_thruway.head()"
196 | ]
197 | },
198 | {
199 | "cell_type": "markdown",
200 | "metadata": {},
201 | "source": [
202 | "Use additional cells as needed to explore this data. Answer the\n",
203 | "following questions:\n",
204 | "\n",
205 | "- How is the data collected? Is it automatic, or is there human\n",
206 | " involvement?\n",
207 | "- What variable should be the *target variable* for this machine\n",
208 | " learning problem?\n",
209 | "- What variable(s) could potentially be used as *features* to train\n",
210 | " the model?\n",
211 | "- What are our assumptions about the features and the target variable,\n",
212 | " and the relationships between these? (For example: what times of day\n",
213 | " should be busy? What times of day will be less busy? What stretches\n",
214 | " of the Thruway might be especially congested - look at Google Maps?)\n",
215 | "- Are there variables that should *not* be used as features to train\n",
216 | " the model, because of potential for data leakage?\n",
217 | "- Are there any serious data problems that we need to correct before\n",
218 | " using the data for this purpose? Explain."
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "Satirical headline classification\n",
226 | "---------------------------------"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "### Scenario\n",
234 | "\n",
235 | "You are hired by a major social media platform to develop a machine\n",
236 | "learning model that will be used to clearly mark *satirical news\n",
237 | "articles* when they are shared on social media.\n",
238 | "\n",
239 | "You consider using this dataset of 9,000 headlines from [The\n",
240 | "Onion](https://www.theonion.com/) and 15,000 headlines from [Not The\n",
241 | "Onion on Reddit](https://www.reddit.com/r/nottheonion/). [Link to\n",
242 | "OnionOrNot data](https://github.com/lukefeilberg/onion)\n",
243 | "\n",
244 | "([This\n",
245 | "notebook](https://github.com/lukefeilberg/onion/blob/master/Onion.ipynb)\n",
246 | "shows how the data was compiled and processed.)"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "### Read in data\n",
254 | "\n",
255 | "This time, we’ll retrieve the data from Github."
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "!git clone https://github.com/lukefeilberg/onion.git"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "df_headline = pd.read_csv(\"onion/OnionOrNot.csv\")\n",
274 | "df_headline.head()"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "Use additional cells as needed to explore this data. Answer the\n",
282 | "following questions:\n",
283 | "\n",
284 | "- How is the data collected? Is it automatic, or is there human\n",
285 | " involvement?\n",
286 | "- What variable should be the *target variable* for this machine\n",
287 | " learning problem?\n",
288 | "- What variable(s) could potentially be used as *features* to train\n",
289 | " the model?\n",
290 | "- What are our assumptions about the data?\n",
291 | "- Are there variables that should *not* be used as features to train\n",
292 | " the model, because of potential for data leakage?\n",
293 | "- Are there any serious data problems that we need to correct before\n",
294 | " using the data for this purpose? Explain."
295 | ]
296 | }
297 | ],
298 | "nbformat": 4,
299 | "nbformat_minor": 5,
300 | "metadata": {}
301 | }
302 |
--------------------------------------------------------------------------------
/notebooks/1-data-detective.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Data detective challenge'
3 | author: 'Fraida Fund'
4 | ---
5 |
6 |
7 | ::: {.cell .markdown}
8 |
9 | # Data detective challenge!
10 |
11 | _Fraida Fund_
12 |
13 |
14 | :::
15 |
16 |
17 | ::: {.cell .markdown}
18 |
19 |
20 | ## Introduction
21 |
22 | In this notebook, we will consider several machine learning tasks, and candidate data sets for them. We will explore the following questions:
23 |
24 | * Do these data sets seem appropriate for the task?
25 | * Are there any important limitations of the datasets, or problems that need to be addressed before we use them to train a machine learning model?
26 |
27 | In fact, each of these datasets has a significant problem that - if not detected early on - would create a "Garbage In, Garbage Out" situation. See if you can identify the problem with each dataset!
28 |
29 | To get you started, I included some code to show you how to read in the data. You can add additional code and text cells to explore the data.
30 |
31 | Your work on this challenge won't be submitted or graded. If you think you found the problem with a dataset, share your findings with the class by posting on Ed! (In your post, show evidence from your exploratory data analysis to support your claims.)
32 |
33 |
34 | :::
35 |
36 |
37 |
38 | ::: {.cell .code}
39 | ```python
40 | import pandas as pd
41 | import matplotlib.pyplot as plt
42 | import seaborn as sns
43 | ```
44 | :::
45 |
46 |
47 | ::: {.cell .markdown}
48 |
49 | ## Taxi tip prediction
50 |
51 | :::
52 |
53 |
54 | ::: {.cell .markdown}
55 |
56 | ### Scenario
57 |
58 | You are developing an app for NYC taxi drivers that will predict what the typical tip would be for a given fare.
59 |
60 | You consider using data collected by the NYC Taxi and Limousine Commission on taxi trips. These links are for 2019 data (2020 was probably an atypical year, so we won't use that). Previous years are also available.
61 |
62 | * [Data link for yellow (Manhattan) taxi trips](https://data.cityofnewyork.us/Transportation/2019-Yellow-Taxi-Trip-Data/2upf-qytp)
63 | * [Data link for green (non-Manhattan) taxi trips](https://data.cityofnewyork.us/Transportation/2019-Green-Taxi-Trip-Data/q5mz-t52e)
64 |
65 | :::
66 |
67 | ::: {.cell .markdown}
68 |
69 | ### Read in data
70 |
 71 | We'll start by reading in the 2019 Green Taxi trip data. It's a large file and takes a long time to download, so we may interrupt the download in the middle (using the Runtime menu in Colab) and just work with the partial data.
72 |
73 |
74 | In the next couple of cells, `wget` and `wc` are not Python code - they're Linux commands. We can run some basic Linux commands inside our Colab runtime, and it's often helpful to do so. For example, we may use Linux commands to install extra software libraries that are not pre-installed in our runtime, clone a source code repository from Github, or download data from the Internet.
75 |
76 | :::
77 |
78 |
79 | ::: {.cell .code}
80 | ```python
81 | !wget "https://data.cityofnewyork.us/api/views/q5mz-t52e/rows.csv?accessType=DOWNLOAD" -O 2019-Green-Taxi-Trip-Data.csv
82 | ```
83 | :::
84 |
85 | ::: {.cell .markdown}
86 |
 87 | Is the cell above taking a long time to run? That's because this data set is very large, and the server from which it is retrieved is not very fast. Since we don't necessarily need to explore the whole dataset, we can interrupt the download partway through by using the Runtime > Interrupt Execution menu option.
88 |
89 | Then, we can read in just 10,000 rows of data.
90 | :::
91 |
92 |
93 |
94 | ::: {.cell .code}
95 | ```python
96 | df_taxi = pd.read_csv('2019-Green-Taxi-Trip-Data.csv', nrows=10000)
97 | df_taxi.head()
98 | ```
99 | :::
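
::: {.cell .markdown}

The earlier note about Linux commands also mentions `wc`. As a quick check - a sketch, assuming the partial download was saved to the filename used in the `wget` cell above - we can count how many lines (and therefore roughly how many rows) we actually have on disk:

:::

::: {.cell .code}
```python
# count the lines in the (possibly partial) CSV file
!wc -l 2019-Green-Taxi-Trip-Data.csv
```
:::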
100 |
101 | ::: {.cell .markdown}
102 |
103 | Use additional cells as needed to explore this data. Answer the following questions:
104 |
105 | * How is the data collected? Is it automatic, or is there human involvement?
106 | * What variable should be the *target variable* for this machine learning problem?
107 | * What variable(s) could potentially be used as *features* to train the model?
108 | * What are our assumptions about the features and the target variable, and the relationships between these? (For example: in NYC, what is a conventional tip amount, as a percent of the total fare? If you are not from NYC, you can find information about this online!) Are any of these assumptions violated in this data?
109 | * Are there variables that should *not* be used as features to train the model, because of potential for data leakage?
110 | * Are there any serious data problems that we need to correct before using the data for this purpose? Explain.
111 |
112 | :::
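
::: {.cell .markdown}

One way to start on the tip-percentage question above is to compare tips directly to fares. This is only a sketch: the column names `tip_amount` and `fare_amount` are my assumptions about this dataset, so check `df_taxi.columns` and adjust as needed.

```python
# sketch: tip as a percentage of the fare
# (tip_amount and fare_amount are assumed column names - verify with df_taxi.columns)
df_taxi['tip_percent'] = 100 * df_taxi['tip_amount'] / df_taxi['fare_amount']
df_taxi['tip_percent'].describe()
```

:::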
113 |
114 |
115 | ::: {.cell .markdown}
116 |
117 | ## Highway traffic prediction
118 |
119 | :::
120 |
121 |
122 |
123 | ::: {.cell .markdown}
124 |
125 | ### Scenario
126 |
127 | You are working for the state of New York to develop a traffic prediction model for the NYS Thruway. The following Thruway data is available: Number and types of vehicles that entered from each entry point on the Thruway, along with their exit points, at 15 minute intervals.
128 |
129 | The link points to the most recent week's worth of available data, but this data is available through 2014. [Link to NYS Thruway data](https://data.ny.gov/Transportation/NYS-Thruway-Origin-and-Destination-Points-for-All-/4dbf-24u2)
130 |
131 | :::
132 |
133 | ::: {.cell .markdown}
134 |
135 | ### Read in data
136 |
137 | :::
138 |
139 |
140 | ::: {.cell .code}
141 | ```python
142 | url = 'https://data.ny.gov/api/views/4dbf-24u2/rows.csv?accessType=DOWNLOAD&sorting=true'
143 | df_thruway = pd.read_csv(url)
144 | df_thruway.head()
145 | ```
146 | :::
147 |
148 | ::: {.cell .markdown}
149 |
150 | Use additional cells as needed to explore this data. Answer the following questions:
151 |
152 | * How is the data collected? Is it automatic, or is there human involvement?
153 | * What variable should be the *target variable* for this machine learning problem?
154 | * What variable(s) could potentially be used as *features* to train the model?
155 | * What are our assumptions about the features and the target variable, and the relationships between these? (For example: what times of day should be busy? What times of day will be less busy? What stretches of the Thruway might be especially congested - look at Google Maps?)
156 | * Are there variables that should *not* be used as features to train the model, because of potential for data leakage?
157 | * Are there any serious data problems that we need to correct before using the data for this purpose? Explain.
158 |
159 | :::
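
::: {.cell .markdown}

A reasonable first step for several of these questions is to see what the downloaded table actually covers, without assuming anything about its column names:

:::

::: {.cell .code}
```python
# overall shape, column types, and a summary of every column
print(df_thruway.shape)
print(df_thruway.dtypes)
df_thruway.describe(include='all')
```
:::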
160 |
161 |
162 | ::: {.cell .markdown}
163 |
164 | ## Satirical headline classification
165 |
166 |
167 | :::
168 |
169 |
170 | ::: {.cell .markdown}
171 |
172 | ### Scenario
173 |
174 | You are hired by a major social media platform to develop a machine learning model that will be used to clearly mark *satirical news articles* when they are shared on social media.
175 |
176 | You consider using this dataset of 9,000 headlines from [The Onion](https://www.theonion.com/) and 15,000 headlines from [Not The Onion on Reddit](https://www.reddit.com/r/nottheonion/). [Link to OnionOrNot data](https://github.com/lukefeilberg/onion)
177 |
178 | ([This notebook](https://github.com/lukefeilberg/onion/blob/master/Onion.ipynb) shows how the data was compiled and processed.)
179 |
180 | :::
181 |
182 |
183 | ::: {.cell .markdown}
184 |
185 | ### Read in data
186 |
187 | This time, we'll retrieve the data from Github.
188 |
189 | :::
190 |
191 |
192 | ::: {.cell .code}
193 | ```python
194 | !git clone https://github.com/lukefeilberg/onion.git
195 | ```
196 | :::
197 |
198 | ::: {.cell .code}
199 | ```python
200 | df_headline = pd.read_csv("onion/OnionOrNot.csv")
201 | df_headline.head()
202 | ```
203 | :::
204 |
205 | ::: {.cell .markdown}
206 |
207 |
208 |
209 | Use additional cells as needed to explore this data. Answer the following questions:
210 |
211 | * How is the data collected? Is it automatic, or is there human involvement?
212 | * What variable should be the *target variable* for this machine learning problem?
213 | * What variable(s) could potentially be used as *features* to train the model?
214 | * What are our assumptions about the data?
215 | * Are there variables that should *not* be used as features to train the model, because of potential for data leakage?
216 | * Are there any serious data problems that we need to correct before using the data for this purpose? Explain.
217 |
218 | :::
219 |
220 |
221 |
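
::: {.cell .markdown}

For the headline data, it is often useful to check how balanced the two classes are and whether any headlines are repeated. This is a sketch only: the column name `label` is my assumption, so verify the actual names with `df_headline.columns`.

```python
# sketch: class balance and duplicated rows
# (the column name 'label' is assumed - verify with df_headline.columns)
print(df_headline['label'].value_counts())
print(df_headline.duplicated().sum(), "duplicated rows")
```

:::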
222 |
--------------------------------------------------------------------------------
/notebooks/1-data-detective.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/1-data-detective.pdf
--------------------------------------------------------------------------------
/notebooks/1-exploratory-data-analysis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/1-exploratory-data-analysis.pdf
--------------------------------------------------------------------------------
/notebooks/1-explore-hw.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Exploratory data analysis'
3 | author: 'Fraida Fund'
4 | ---
5 |
6 |
7 | ::: {.cell .markdown}
8 |
9 | # Assignment: Exploratory data analysis
10 |
11 |
12 | **TODO**: Edit this cell to fill in your NYU Net ID and your name:
13 |
14 | * **Net ID**:
15 | * **Name**:
16 |
17 | :::
18 |
19 |
20 | ::: {.cell .markdown}
21 |
22 |
23 | ## Introduction
24 |
25 | In this assignment, we will practice using exploratory data analysis on Google's COVID-19 Community Mobility data.
26 |
27 | This data was collected from Google Maps users around the world over the last few months - including you, *if* you have Google Maps on your phone and have turned on the Location History setting. It combines location history from a large number of users to capture the overall increase or decrease in time spent in places such as: retail and recreation facilities, groceries and pharmacies, parks, transit stations, workplaces, and residences.
28 |
29 | The data shows how users' mobility patterns - what types of places they spend time in - varied over the course of the COVID-19 pandemic.
30 |
31 | As you work through this notebook, you will see that some text and code cells are marked with a "TODO" at the top. You'll have to edit these cells to fill in the code or answer the questions as indicated.
32 |
33 | When you are finished, make sure you have run all of the cells in the notebook (in order), and then create a PDF from it. Submit the PDF on Gradescope.
34 |
35 | **Important note**: You won't necessarily have learned or seen in advance how to use all the Python commands and library functions you need to complete this assignment. That's OK. Part of the learning objective here is to practice finding and applying that kind of new information as you go! Use the library documentation, search the Internet, or ask questions on Ed if you need any help.
36 |
37 | :::
38 |
39 |
40 | ::: {.cell .markdown}
41 |
42 | ## Learn about the data
43 |
44 | First, it is worthwhile to learn more about the data: how it is collected, what is included, how Google gets consent to collect this data, and how user privacy is protected. Google provides several resources for learning about the data:
45 |
46 | * [Blog post](https://www.blog.google/technology/health/covid-19-community-mobility-reports?hl=en)
47 | * [About this data](https://www.google.com/covid19/mobility/data_documentation.html?hl=en#about-this-data)
48 | * [Understand the data](https://support.google.com/covid19-mobility/answer/9825414?hl=en&ref_topic=9822927)
49 |
50 | :::
51 |
52 |
53 |
54 | ::: {.cell .markdown}
55 |
56 | ## Read in data
57 |
58 | Now you are ready to read the data into your notebook.
59 |
60 | Visit Google's web page for the [COVID-19 Community Mobility](https://www.google.com/covid19/mobility/) project to get the URL for the data.
61 |
62 | (Specific instructions will depend on your browser and operating system, but on my laptop, I can get the URL by right-clicking on the button that says "Download global CSV" and choosing "Copy Link Address".)
63 |
64 | Then, in the following cells, use that URL to read the data into a pandas Data Frame called `df`. (You can follow the example in the "Exploratory data analysis" notebook from this week's lesson.)
65 |
66 | :::
67 |
68 | ::: {.cell .code}
69 | ```python
70 | import pandas as pd
71 | import seaborn as sns
72 | import matplotlib.pyplot as plt
73 | ```
74 | :::
75 |
76 |
77 | ::: {.cell .code}
78 | ```python
79 | # TODO Q1
80 | # url = ...
81 | # df = ...
82 | ```
83 | :::
84 |
85 | ::: {.cell .markdown}
86 |
87 | Use the `info()` and `head()` functions to show some basic information about the data and to look at the first few samples.
88 |
89 |
90 | :::
91 |
92 |
93 | ::: {.cell .code}
94 | ```python
95 | # TODO Q2
96 | # use info()
97 | ```
98 | :::
99 |
100 | ::: {.cell .code}
101 | ```python
102 | # TODO Q3
103 | # use head()
104 | ```
105 | :::
106 |
107 | ::: {.cell .markdown}
108 |
109 | ## Basic data manipulations
110 |
111 | :::
112 |
113 | ::: {.cell .markdown}
114 |
115 | The data includes a date field, but it may have been read in as a string, rather than as a `datetime`. If that's the case, use `to_datetime()` to convert the field into a datetime format. (You can follow the example in the "Exploratory data analysis" notebook from this week's lesson.)
116 |
117 | Then, use `info()` again to make sure your change was applied. Note the difference in the output, relative to the cell above.
118 |
119 | :::
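
::: {.cell .markdown}

For reference, the conversion described above usually looks something like this (a sketch, assuming the column is named `date` as it is in the mobility data):

```
df['date'] = pd.to_datetime(df['date'])
df.info()
```

:::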
120 |
121 |
122 | ::: {.cell .code}
123 | ```python
124 | # TODO Q4
125 | # df['date'] = ...
126 |
127 | ```
128 | :::
129 |
130 |
131 |
132 | ::: {.cell .markdown}
133 |
134 | Next, you are going to extract the subset of data for the U.S. state of your choice. You can choose any location *except* New York.
135 |
136 |
137 | The data is reported for different regions, with different levels of granularity available. This is best explained by example:
138 |
139 | Suppose I want the overall trend from the entire U.S. I would use the subset of data where `country_region` is equal to "United States" and `sub_region_1` is null:
140 |
141 | ```
142 | df_subset = df[(df['country_region'].eq("United States")) & (df['sub_region_1'].isnull())]
143 | ```
144 |
145 | Suppose I want the overall trend from the entire state of New York: I would use the subset of data where `country_region` is equal to "United States", `sub_region_1` is equal to "New York", and `sub_region_2` is null:
146 |
147 | ```
148 | df_subset = df[(df['country_region'].eq("United States")) & (df['sub_region_1'].eq("New York")) & (df['sub_region_2'].isnull())]
149 | ```
150 |
151 | Suppose I want the overall trend from Brooklyn, New York (Kings County): I would use the subset of data where `country_region` is equal to "United States", `sub_region_1` is equal to "New York", and `sub_region_2` is equal to "Kings County":
152 |
153 | ```
154 | df_subset = df[(df['country_region'].eq("United States")) & (df['sub_region_1'].eq("New York")) & (df['sub_region_2'].eq("Kings County"))]
155 | ```
156 |
157 | In the following cell(s), fill in the code to create a data frame `df_subset` with data from a single U.S. state (but *not* New York).
158 |
159 |
160 | :::
161 |
162 |
163 |
164 | ::: {.cell .code}
165 | ```python
166 | # TODO Q5
167 | # df_subset =
168 | ```
169 | :::
170 |
171 |
172 | ::: {.cell .markdown}
173 |
174 | Is the data complete, or is some data not available for the location you have chosen? In the following cell, write code to check for missing data in the `...percent_change_from_baseline` fields.
175 |
176 | Also check whether there are any missing rows of data. What date range is represented in this data? Is every day within that range included in the data?
177 |
178 | :::
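
::: {.cell .markdown}

As a starting point, something along these lines will surface missing values and missing dates. This is a sketch - it assumes `df_subset` has the `date` column and the `...percent_change_from_baseline` columns described above:

```
# count missing values in each percent_change_from_baseline column
mobility_cols = [c for c in df_subset.columns if c.endswith('percent_change_from_baseline')]
print(df_subset[mobility_cols].isnull().sum())

# check the date range, and how many distinct dates are actually present
print(df_subset['date'].min(), df_subset['date'].max())
print(df_subset['date'].nunique(), "distinct dates")
```

:::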
179 |
180 | ::: {.cell .code}
181 | ```python
182 | # TODO Q6
183 | # df_subset
184 | ```
185 | :::
186 |
187 |
188 | ::: {.cell .markdown}
189 |
190 | **TODO** Q7: Edit this cell to answer the following question: Is the data complete, or is some relevant data missing? Why would some locations only have partial data available (missing some `...percent_change_from_baseline` fields for some dates)? (Even if, for the U.S. state you have chosen, the data is complete, explain why some data may be missing for other regions.)
191 |
192 | **Include a short quote from the material you read in the "Learn about the data" section to answer this question. Indicate that it is a quote using quotation marks or a block quote, and cite the source, including a URL.**
193 |
194 | :::
195 |
196 |
197 | ::: {.cell .markdown}
198 |
199 | To track trends in cases and vaccinations alongside mobility trends, we can also read in data from several other sources. For example,
200 |
201 | * Our World in Data distributes data about COVID-19 vaccination status over time for U.S. states in their [Github repository](https://github.com/owid/covid-19-data).
202 | * The New York Times distributes data about COVID-19 cumulative cases over time for U.S. states in their [Github repository](https://github.com/nytimes/covid-19-data).
203 |
204 |
205 | You can choose whether to look at vaccination trends or case trends for the U.S. state you have selected. Use one of the following cells to read in the data, convert the `date` field to a `datetime`, and get the subset of the data that applies to the specific U.S. state for which you are exploring mobility data.
206 |
207 | Then, use `pandas` functions to check your new data frame and look at the first few rows of data.
208 |
209 | :::
210 |
211 |
212 | ::: {.cell .code}
213 | ``` {.python}
214 | # TODO Q8 - Vaccinations option
215 |
216 | url_vax = 'https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/us_state_vaccinations.csv'
217 | # df_vax = ...
218 | # df_vax['date'] = ...
219 | # df_vax_subset = ...
220 | # check the data frame and look at a few rows
221 | ```
222 | :::
223 |
224 | ::: {.cell .code}
225 | ``` {.python}
226 | # TODO Q8 - Cases option
227 |
228 | url_cases = 'https://github.com/nytimes/covid-19-data/raw/master/us-states.csv'
229 | # df_cases = ...
230 | # df_cases['date'] = ...
231 | # df_cases_subset = ...
232 | # check the data frame and look at a few rows
233 | ```
234 | :::
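
::: {.cell .markdown}

For example, for the cases option, the steps might look like the following. This is a sketch only: the column names `date` and `state`, and the example state "Texas", are my assumptions - look at the first few rows of the file and use your own state.

```
df_cases = pd.read_csv(url_cases)
df_cases['date'] = pd.to_datetime(df_cases['date'])
df_cases_subset = df_cases[df_cases['state'].eq("Texas")]
df_cases_subset.head()
```

:::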
235 |
236 |
237 | ::: {.cell .markdown}
238 |
239 | ## Visualize data
240 |
241 | Finally, we are going to visualize the changes in human mobility over this time, for the location you have chosen, alongside either vaccination trends or case trends.
242 |
243 | In the following cell, create a figure with seven subplots, arranged vertically. (You can refer to the example in the "Python + numpy" notebook from this week's lesson.) On the horizontal axis, put the date. On the vertical axes, show (as a line):
244 |
245 | * `retail_and_recreation_percent_change_from_baseline` in the top subplot
246 | * `grocery_and_pharmacy_percent_change_from_baseline` in the next subplot
247 | * `parks_percent_change_from_baseline` in the next subplot
248 | * `transit_stations_percent_change_from_baseline` in the next subplot
249 | * `workplaces_percent_change_from_baseline` in the next subplot
250 | * `residential_percent_change_from_baseline` in the next subplot
251 | * either COVID-19 cases or vaccinations in the bottom subplot
252 |
253 | Make sure to clearly label each axis. Use `matplotlib` library documentation to adjust your figures and make your plot look nice!
254 |
255 | :::
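
::: {.cell .markdown}

One possible scaffold for this figure is sketched below. It assumes `df_subset` and your cases or vaccinations data frame are already prepared; you would repeat the middle lines for each of the mobility columns listed above, and label every axis:

```
# seven vertically stacked subplots that share the date axis
fig, axs = plt.subplots(7, 1, figsize=(8, 18), sharex=True)

axs[0].plot(df_subset['date'], df_subset['retail_and_recreation_percent_change_from_baseline'])
axs[0].set_ylabel('Retail/recreation')

# ... repeat for the other five mobility columns on axs[1] through axs[5] ...

# bottom subplot: cases or vaccinations over time
axs[6].set_xlabel('Date')

plt.tight_layout()
plt.show()
```

:::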
256 |
257 | ::: {.cell .code}
258 | ```python
259 | # TODO Q9
260 | # create visualization
261 | ```
262 | :::
263 |
264 |
265 |
266 | ::: {.cell .markdown}
267 |
268 | **TODO** Q10: Answer the following questions:
269 |
270 | * Do the results seem to satisfy "common sense"?
271 | * Make sure to explain any trends, patterns, or notable anomalies observed in your mobility data.
272 | * Which trends, patterns, or notable anomalies in the mobility data are likely related to COVID-19 cases, non-pharmaceutical interventions such as stay-at-home orders, or vaccinations?
273 | * Which trends, patterns, or notable anomalies in the mobility data are likely related to other factors?
274 | * Cite specific evidence from your plot to support your answer.
275 |
276 | **TODO** Q11: In the [Calibrate Region](https://support.google.com/covid19-mobility/checklist/9834261?hl=en&ref_topic=9822927) checklist, Google suggests a number of reasons why their mobility data might *not* be useful for understanding the effect of COVID-19-related interventions, or why the data might be misleading.
277 |
278 | * For the U.S. state you have chosen, briefly answer *all* of the questions in that checklist, and explain how your answer affects the validity of the data.
279 |
280 | * Based on your answers, do you think there are any serious problems associated with using this data for understanding user mobility changes due to COVID-19?
281 |
282 |
283 |
284 | :::
285 |
286 |
287 |
--------------------------------------------------------------------------------
/notebooks/1-explore-hw.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/1-explore-hw.pdf
--------------------------------------------------------------------------------
/notebooks/1-print-colab.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Printing from Colab\n",
8 | "\n",
9 | "*Fraida Fund*"
10 | ],
11 | "id": "ae144cd2-2b6c-4d05-9c68-7ac7d3f84c8c"
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "To submit homework assignments, you will need to generate PDF versions of your completed Colab notebooks.\n",
18 | "\n",
19 | "Printing to a PDF from Colab seems easy - there’s a File \\> Print option in the menu! However, the built-in print option won’t always work well for us, because if a plot or other output happens to come out near a page break, it can get cut off."
20 | ],
21 | "id": "737b7bf9-85fa-4974-9151-3474409e4d8c"
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "As an alternative to Colab’s built-in print, you can use this notebook to generate a PDF version of any Colab notebook that is saved in your Google Drive."
28 | ],
29 | "id": "cce1c197-6b50-44a9-b626-ec8b75ccdf90"
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Step 1: Prepare the source notebook\n",
36 | "\n",
37 | "Make sure the notebook that you want to print is ready:\n",
38 | "\n",
39 | "- you ran the cells in the notebook (in order!) and their output is visible in the notebook\n",
40 | "- it is saved in your Google Drive"
41 | ],
42 | "id": "189c2345-a348-4181-bbc0-2bdb8df05df2"
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## Step 2: Install software and libraries\n",
49 | "\n",
50 | "In *this* notebook, run the following cell:"
51 | ],
52 | "id": "105a0864-c2aa-4d56-a067-ed44f3f63d06"
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "!apt-get update\n",
61 | "!apt-get install texlive texlive-xetex texlive-latex-extra pandoc\n",
62 | "!pip install pypandoc"
63 | ],
64 | "id": "a84e9b82-7988-4c1d-99a2-08c0357a2ae5"
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "## Step 3: Mount your Google Drive\n",
71 | "\n",
72 | "In *this* notebook, mount your Google Drive:"
73 | ],
74 | "id": "3d500ba4-ac39-453c-94e2-b64927ff733b"
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from google.colab import drive\n",
83 | "drive.mount('/content/drive')"
84 | ],
85 | "id": "502e530b-1c6b-4752-8325-734411d6f487"
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "## Step 4: Select notebook and convert to PDF\n",
92 | "\n",
93 | "In *both* of the following cells, change the name “Untitled” to whatever your notebook is named. Then, run the cells."
94 | ],
95 | "id": "cdfd7027-4125-4018-a4bc-dbce309a50fd"
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "!jupyter nbconvert --output-dir='/content' --to latex '/content/drive/My Drive/Colab Notebooks/Untitled.ipynb'"
104 | ],
105 | "id": "7a810845-9125-48d3-a819-b1cdb1e15399"
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "!buf_size=1000000 xelatex --interaction=nonstopmode 'Untitled.tex'"
114 | ],
115 | "id": "20842593-5bef-4901-b073-316003a50aa2"
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "## Step 5: Download PDF\n",
122 | "\n",
123 | "Finally, open the Colab file browser, locate your new PDF, and download it. Review the PDF and make sure it looks good before you submit!"
124 | ],
125 | "id": "806fad85-70be-4034-8e6a-0d154cdf0714"
126 | }
127 | ],
128 | "nbformat": 4,
129 | "nbformat_minor": 5,
130 | "metadata": {}
131 | }
132 |
--------------------------------------------------------------------------------
/notebooks/1-print-colab.md:
--------------------------------------------------------------------------------
1 | ::: {.cell .markdown}
2 |
3 | # Printing from Colab
4 |
5 | _Fraida Fund_
6 |
7 | :::
8 |
9 |
10 | ::: {.cell .markdown}
11 |
12 | To submit homework assignments, you will need to generate PDF versions of your completed Colab notebooks.
13 |
14 | Printing to a PDF from Colab seems easy - there's a File > Print option in the menu! However, the built-in print option won't always work well for us, because if a plot or other output happens to come out near a page break, it can get cut off.
15 |
16 | :::
17 |
18 | ::: {.cell .markdown}
19 |
20 | As an alternative to Colab's built-in print, you can use this notebook to generate a PDF version of any Colab notebook that is saved in your Google Drive.
21 |
22 | :::
23 |
24 |
25 | ::: {.cell .markdown}
26 |
27 | ## Step 1: Prepare the source notebook
28 |
29 | Make sure the notebook that you want to print is ready:
30 |
31 | * you ran the cells in the notebook (in order!) and their output is visible in the notebook
32 | * it is saved in your Google Drive
33 |
34 | :::
35 |
36 | ::: {.cell .markdown}
37 |
38 | ## Step 2: Install software and libraries
39 |
40 | In *this* notebook, run the following cell:
41 | :::
42 |
43 | ::: {.cell .code}
44 | ``` {.python}
45 | !apt-get update
46 | !apt-get install texlive texlive-xetex texlive-latex-extra pandoc
47 | !pip install pypandoc
48 | ```
49 | :::
50 |
51 | ::: {.cell .markdown}
52 |
53 | ## Step 3: Mount your Google Drive
54 |
55 | In *this* notebook, mount your Google Drive:
56 |
57 | :::
58 |
59 | ::: {.cell .code}
60 | ``` {.python}
61 | from google.colab import drive
62 | drive.mount('/content/drive')
63 | ```
64 | :::
65 |
66 | ::: {.cell .markdown}
67 |
68 | ## Step 4: Select notebook and convert to PDF
69 |
70 | In *both* of the following cells, change the name "Untitled" to whatever your notebook is named. Then, run the cells.
71 |
72 | :::
73 |
74 | ::: {.cell .code}
75 | ``` {.python}
76 | !jupyter nbconvert --output-dir='/content' --to latex '/content/drive/My Drive/Colab Notebooks/Untitled.ipynb'
77 | ```
78 | :::
79 |
80 | ::: {.cell .code}
81 | ``` {.python}
82 | !buf_size=1000000 xelatex --interaction=nonstopmode 'Untitled.tex'
83 | ```
84 | :::
85 |
86 |
87 | ::: {.cell .markdown}
88 |
89 | ## Step 5: Download PDF
90 |
91 | Finally, open the Colab file browser, locate your new PDF, and download it. Review the PDF and make sure it looks good before you submit!
92 |
93 | :::
94 |
--------------------------------------------------------------------------------
/notebooks/1-print-colab.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/1-print-colab.pdf
--------------------------------------------------------------------------------
/notebooks/1-python-numpy-tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/1-python-numpy-tutorial.pdf
--------------------------------------------------------------------------------
/notebooks/2-advertising-hw.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/2-advertising-hw.pdf
--------------------------------------------------------------------------------
/notebooks/2-compute-by-hand.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Computing regression coefficients\n",
8 | "\n",
9 | "This notebook walks through the steps involved in manual (pen and paper) computation of the optimal least squares regression coefficients, using the normal equations.\n",
10 | "\n",
11 | "We’ll also show how to do each of these steps in Python, so that you can try other values for $X$ and $y$ and then check your work."
12 | ],
13 | "id": "ea562ad4-c539-4a83-a582-7276b4886e5d"
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import seaborn as sns\n",
23 | "import numpy as np"
24 | ],
25 | "id": "c4bf15f0-5011-4673-9185-ee402db01fba"
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "We will start with the labeled data. Our data includes four samples, with two features:\n",
32 | "\n",
33 | "$$ X = \n",
34 | "\\begin{bmatrix}\n",
35 | "1 & 2 \\\\\n",
36 | "2 & 3 \\\\\n",
37 | "4 & 1 \\\\\n",
38 | "5 & 5 \n",
39 | "\\end{bmatrix},\n",
40 | "y = \n",
41 | "\\begin{bmatrix}\n",
42 | "3 \\\\\n",
43 | "2 \\\\\n",
44 | "7 \\\\\n",
45 | "1 \n",
46 | "\\end{bmatrix}\n",
47 | "$$"
48 | ],
49 | "id": "24424047-cea1-4cbb-ab6d-523417b49d7c"
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "X = [[1,2], [2, 3], [4, 1], [5, 5]]\n",
58 | "y = [3, 2, 7, 1]"
59 | ],
60 | "id": "2b999e60-d5b9-4b09-8f1e-5d1c85787386"
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "We can use `seaborn` to plot each column of $X$ versus $y$:"
67 | ],
68 | "id": "b97ed4c4-801d-4946-8099-1b188c504286"
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "df = pd.DataFrame(X, columns = [ 'x1', 'x2'])\n",
77 | "df = df.assign(y=y)\n",
78 | "\n",
79 | "melted = df.melt(id_vars=['y'], value_vars=[ 'x1','x2'])\n",
80 | "g = sns.FacetGrid(melted, col='variable', col_wrap=3);\n",
81 | "g.map(sns.scatterplot, 'value', 'y');"
82 | ],
83 | "id": "9fa6e7f2-b6df-4e9c-949b-ab3460602606"
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "Of course, we will assume a linear model. For a given sample:\n",
90 | "\n",
91 | "$$\\hat{y} = w_0 + w_1 x_1 + w_2 x_2$$"
92 | ],
93 | "id": "182e426c-5510-441d-b0da-a56d248084c8"
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "Next, we will create a design matrix $A$ by horizontally “stacking” a vector of 1s and the matrix $X$:\n",
100 | "\n",
101 | "$$ A = \n",
102 | "\\begin{bmatrix}\n",
103 | "1 & 1 & 2 \\\\\n",
104 | "1 & 2 & 3 \\\\\n",
105 | "1 & 4 & 1 \\\\\n",
106 | "1 & 5 & 5 \n",
107 | "\\end{bmatrix},\n",
108 | "$$"
109 | ],
110 | "id": "1cbd3831-8f27-4cf2-90ec-738f3cc851bd"
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "A = np.hstack((np.ones(4)[:,None], X))\n",
119 | "A"
120 | ],
121 | "id": "e7e85f91-eda6-42a8-b333-eed0c1fb7267"
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "Now we can represent our linear model more easily using matrices:\n",
128 | "\n",
129 |         "$$\mathbf{\hat{y}} = A\mathbf{w}$$\n",
130 | "\n",
131 |         "where $\mathbf{\hat{y}}$ and $\mathbf{w}$ are vectors."
132 | ],
133 | "id": "cfdacf81-1ed4-4887-bc17-09bff2a60436"
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "The optimal least squares values for the vector $w$ are\n",
140 | "\n",
141 | "$$w^* = (A^T A)^{-1} A^{T} \\mathbf{y}$$\n",
142 | "\n",
143 | "Note that the least-squares solutions are the solutions of the matrix equation\n",
144 | "\n",
145 | "$$ A^T A \\mathbf{w} = A^T \\mathbf{y}$$\n",
146 | "\n",
147 | "A matrix equation is in the form $Ax=b$, where $A$ is an $m \\times n$ matrix and $b$ is a column vector with $m$ entries. It can be solved for $x$ by forming the augmented matrix $(A | b)$ and then using elementary row operations to get it in row reduced form.\n",
148 | "\n",
149 | "Thus, to get $w$ we will:\n",
150 | "\n",
151 | "- Compute the matrix $A^T A$ and the vector $A^T y$.\n",
152 | "- Form the augmented matrix for the matrix equation $A^T A w = A^T y$: $(A^T A | A^T y)$\n",
153 | "- Row reduce to find the optimal value for $w$, \\$w^\\* \\$."
154 | ],
155 | "id": "68d41ac2-14e1-411d-b9a5-a9cd3d4c0a9a"
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "The transpose of $A$, $A^T$, is the matrix whose rows are the columns of $A$:\n",
162 | "\n",
163 | "$$ A^T = \n",
164 | "\\begin{bmatrix}\n",
165 | "1 & 1 & 1 & 1 \\\\\n",
166 | "1 & 2 & 4 & 5 \\\\\n",
167 | "2 & 3 & 1 & 5 \n",
168 | "\\end{bmatrix}\n",
169 | "$$"
170 | ],
171 | "id": "f858adc1-50ac-49ce-80dd-ff69372facd3"
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "A.T"
180 | ],
181 | "id": "7aa04c05-71d3-4aae-b83f-8090c3127a43"
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "To solve\n",
188 | "\n",
189 | "$$ A^T A w = A^T y$$\n",
190 | "\n",
191 | "we’ll need $A^T A$:\n",
192 | "\n",
193 | "$$ A^T A = \n",
194 | "\\begin{bmatrix}\n",
195 | "1 & 1 & 1 & 1 \\\\\n",
196 | "1 & 2 & 4 & 5 \\\\\n",
197 | "2 & 3 & 1 & 5 \n",
198 | "\\end{bmatrix}\n",
199 | "\\begin{bmatrix}\n",
200 | "1 & 1 & 2 \\\\\n",
201 | "1 & 2 & 3 \\\\\n",
202 | "1 & 4 & 1 \\\\\n",
203 | "1 & 5 & 5 \n",
204 | "\\end{bmatrix} = \n",
205 | "\\begin{bmatrix}\n",
206 | "4 & 12 & 11 \\\\\n",
207 | "12 & 46 & 37 \\\\\n",
208 | "11 & 37 & 39 \n",
209 | "\\end{bmatrix}\n",
210 | "$$"
211 | ],
212 | "id": "86358d21-55f9-47db-ab20-517ed5295351"
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "A.T.dot(A)"
221 | ],
222 | "id": "e8ed18e9-c5c6-4086-a2c1-7395014f188a"
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "and $A^T y$:\n",
229 | "\n",
230 | "$$ A^T y = \n",
231 | "\\begin{bmatrix}\n",
232 | "1 & 1 & 1 & 1 \\\\\n",
233 | "1 & 2 & 4 & 5 \\\\\n",
234 | "2 & 3 & 1 & 5 \n",
235 | "\\end{bmatrix}\n",
236 | "\\begin{bmatrix}\n",
237 | "3 \\\\\n",
238 | "2 \\\\\n",
239 | "7 \\\\\n",
240 | "1 \n",
241 | "\\end{bmatrix} =\n",
242 | "\\begin{bmatrix}\n",
243 | "13 \\\\\n",
244 | "40 \\\\\n",
245 | "24 \n",
246 | "\\end{bmatrix}\n",
247 | "$$"
248 | ],
249 | "id": "8036c27a-0a22-487d-a1f4-e385c7980efd"
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "A.T.dot(y)"
258 | ],
259 | "id": "3b913940-2897-4cfd-b60a-5091ab94077d"
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "Next, create the augmented matrix $(A^T A | A^T y)$:\n",
266 | "\n",
267 | "$$ (A^T A | A^T y)=\n",
268 | "\\begin{bmatrix}\n",
269 | "4 & 12 & 11 & | & 13 \\\\\n",
270 | "12 & 46 & 37 & | & 40 \\\\\n",
271 | "11 & 37 & 39 & | & 24\n",
272 | "\\end{bmatrix}\n",
273 | "$$"
274 | ],
275 | "id": "3e937131-270b-48f7-8ba7-ff8bf7fa0d9a"
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "M1 = np.hstack([A.T.dot(A), A.T.dot(y)[:,None]])\n",
284 | "M1"
285 | ],
286 | "id": "b560cae7-d43a-433d-af60-df114e181bdc"
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "We will perform some elementary row operations on the augmented matrix to get $A^T A$ in row reduced form:"
293 | ],
294 | "id": "1dbfb502-4f2f-4586-bc0f-50a84ab400fd"
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "M2 = np.vstack([M1[0], M1[1]-3*M1[0], 4*M1[2]-11*M1[0]])\n",
303 | "M2"
304 | ],
305 | "id": "d1ecb9f5-0ddd-4724-a48e-273749594ad6"
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "M3 = np.vstack([M2[0], M2[1], 5*M2[2]-8*M2[1]])\n",
314 | "M3"
315 | ],
316 | "id": "5da621aa-251f-4378-a2bd-5b43decb6b06"
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "M4 = np.vstack([M3[0], 143*M3[1]-4*M3[2], M3[2]])\n",
325 | "M4"
326 | ],
327 | "id": "fd2ed094-b22c-4f0d-820c-b1349e0bafc6"
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "M5 = np.vstack([143/2*(M4[0]-12/1430*M4[1] - 11/143*M4[2]), M4[1], M4[2]])\n",
336 | "M5"
337 | ],
338 | "id": "a2188987-8b2c-4f14-a9b8-c54126f12c68"
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "M6 = np.vstack([M5[0]/286, M5[1]/1430, M5[2]/143])\n",
347 | "M6"
348 | ],
349 | "id": "81730ab5-3479-41b0-925f-b4808d8ae1b0"
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {},
354 | "source": [
355 | "When we’re finished, our augmented matrix looks like this:\n",
356 | "\n",
357 | "$$\n",
358 | "\\begin{bmatrix}\n",
359 | "1 & 0 & 0 & | & \\frac{1597}{286} \\\\\n",
360 | "0 & 1 & 0 & | & \\frac{1115}{1430} \\\\\n",
361 | "0 & 0 & 1 & | & \\frac{-243}{143}\n",
362 | "\\end{bmatrix}\n",
363 | "$$\n",
364 | "\n",
365 | "and so\n",
366 | "\n",
367 | "$$ w^* = \\left[ \\frac{1597}{286}, \\frac{1115}{1430}, \\frac{-243}{143} \\right] $$"
368 | ],
369 | "id": "1e133ab3-4e51-454d-ad74-d6ba8d6f0487"
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "metadata": {},
375 | "outputs": [],
376 | "source": [
377 | "M6[:,3]"
378 | ],
379 | "id": "f779f49f-fb24-48e0-b1d9-dfc0cb620a2b"
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {},
384 | "source": [
385 | "Compare this to the known solution:"
386 | ],
387 | "id": "ed84c0e4-246e-4694-97f7-5d56e76b53d0"
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "np.linalg.inv( A.T.dot(A) ).dot(A.T.dot(y))"
396 | ],
397 | "id": "1e19b8fe-1864-40ea-8472-5b8e4a5b1b6c"
398 | }
399 | ],
400 | "nbformat": 4,
401 | "nbformat_minor": 5,
402 | "metadata": {}
403 | }
404 |
--------------------------------------------------------------------------------
/notebooks/2-compute-by-hand.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Computing regression coefficients'
3 | author: 'Fraida Fund'
4 |
5 | ---
6 |
7 | ::: {.cell .markdown}
8 |
9 | ## Computing regression coefficients
10 |
11 | This notebook walks through the steps involved in manual (pen and paper) computation of the optimal least squares regression coefficients, using the normal equations.
12 |
13 | We'll also show how to do each of these steps in Python, so that you can try other values for $X$ and $y$ and then check your work.
14 |
15 | :::
16 |
17 | ::: {.cell .code}
18 | ```python
19 | import pandas as pd
20 | import seaborn as sns
21 | import numpy as np
22 | ```
23 | :::
24 |
25 |
26 |
27 | ::: {.cell .markdown}
28 |
29 | We will start with the labeled data. Our data includes four samples, with two features:
30 |
31 | $$ X =
32 | \begin{bmatrix}
33 | 1 & 2 \\
34 | 2 & 3 \\
35 | 4 & 1 \\
36 | 5 & 5
37 | \end{bmatrix},
38 | y =
39 | \begin{bmatrix}
40 | 3 \\
41 | 2 \\
42 | 7 \\
43 | 1
44 | \end{bmatrix}
45 | $$
46 |
47 | :::
48 |
49 |
50 | ::: {.cell .code}
51 | ```python
52 | X = [[1,2], [2, 3], [4, 1], [5, 5]]
53 | y = [3, 2, 7, 1]
54 | ```
55 | :::
56 |
57 |
58 |
59 | ::: {.cell .markdown}
60 |
61 | We can use `seaborn` to plot each column of $X$ versus $y$:
62 |
63 | :::
64 |
65 |
66 | ::: {.cell .code}
67 | ```python
68 | df = pd.DataFrame(X, columns = [ 'x1', 'x2'])
69 | df = df.assign(y=y)
70 |
71 | melted = df.melt(id_vars=['y'], value_vars=[ 'x1','x2'])
72 | g = sns.FacetGrid(melted, col='variable', col_wrap=3);
73 | g.map(sns.scatterplot, 'value', 'y');
74 | ```
75 | :::
76 |
77 | ::: {.cell .markdown}
78 |
79 | Of course, we will assume a linear model. For a given sample:
80 |
81 | $$\hat{y} = w_0 + w_1 x_1 + w_2 x_2$$
82 |
83 | :::
84 |
85 | ::: {.cell .markdown}
86 |
87 | Next, we will create a design matrix $A$ by horizontally “stacking” a vector
88 | of 1s and the matrix $X$:
89 |
90 | $$ A =
91 | \begin{bmatrix}
92 | 1 & 1 & 2 \\
93 | 1 & 2 & 3 \\
94 | 1 & 4 & 1 \\
95 | 1 & 5 & 5
96 | \end{bmatrix},
97 | $$
98 |
99 | :::
100 |
101 |
102 |
103 | ::: {.cell .code}
104 | ```python
105 | A = np.hstack((np.ones(4)[:,None], X))
106 | A
107 | ```
108 | :::
109 |
110 | ::: {.cell .markdown}
111 |
112 | Now we can represent our linear model more easily using matrices:
113 |
114 |
115 | $$\hat{\mathbf{y}} = A\mathbf{w}$$
116 |
117 | where $\hat{\mathbf{y}}$ and $\mathbf{w}$ are vectors.
118 |
119 | :::
120 |
121 |
122 |
123 | ::: {.cell .markdown}
124 |
125 | The optimal least squares values for the vector $w$ are
126 |
127 | $$w^* = (A^T A)^{-1} A^{T} \mathbf{y}$$
128 |
129 |
130 | Note that the least-squares solutions are the solutions of the matrix equation
131 |
132 | $$ A^T A \mathbf{w} = A^T \mathbf{y}$$
133 |
134 | A matrix equation is in the form $Ax=b$, where $A$ is an $m \times n$ matrix and $b$ is a column vector with $m$ entries. It can be solved for $x$ by forming the augmented matrix $(A | b)$ and then using elementary row operations to get it in row reduced form.
135 |
136 |
137 | Thus, to get $w$ we will:
138 |
139 | * Compute the matrix $A^T A$ and the vector $A^T y$.
140 | * Form the augmented matrix for the matrix equation $A^T A w = A^T y$: $(A^T A | A^T y)$
141 | * Row reduce to find the optimal value for $w$, $w^*$ (an optional `sympy` check of this step is sketched below).
142 |
143 | :::
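
::: {.cell .markdown}

Before grinding through the row reduction by hand, it can be reassuring to check the answer with a computer algebra system. The cell below is an optional sketch, assuming `sympy` is available (it is preinstalled on Colab): `Matrix.rref()` returns the reduced row echelon form of the augmented matrix with exact fractions, which you can compare against the hand computation in the rest of this notebook (possibly with the fractions reduced to lowest terms).

:::

::: {.cell .code}
```python
# optional check of the row reduction, assuming sympy is available
import sympy

A_sym = sympy.Matrix(4, 3, [1, 1, 2,
                            1, 2, 3,
                            1, 4, 1,
                            1, 5, 5])
y_sym = sympy.Matrix([3, 2, 7, 1])

# form the augmented matrix (A^T A | A^T y) and row reduce it
augmented = (A_sym.T * A_sym).row_join(A_sym.T * y_sym)
rref_matrix, pivot_columns = augmented.rref()
rref_matrix  # the last column holds w* as exact fractions
```
:::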
144 |
145 |
146 | ::: {.cell .markdown}
147 |
148 | The transpose of $A$, $A^T$, is the matrix whose rows are the columns of $A$:
149 |
150 |
151 |
152 | $$ A^T =
153 | \begin{bmatrix}
154 | 1 & 1 & 1 & 1 \\
155 | 1 & 2 & 4 & 5 \\
156 | 2 & 3 & 1 & 5
157 | \end{bmatrix}
158 | $$
159 |
160 | :::
161 |
162 |
163 | ::: {.cell .code}
164 | ```python
165 | A.T
166 | ```
167 | :::
168 |
169 |
170 | ::: {.cell .markdown}
171 |
172 | To solve
173 |
174 | $$ A^T A w = A^T y$$
175 |
176 | we'll need $A^T A$:
177 |
178 | $$ A^T A =
179 | \begin{bmatrix}
180 | 1 & 1 & 1 & 1 \\
181 | 1 & 2 & 4 & 5 \\
182 | 2 & 3 & 1 & 5
183 | \end{bmatrix}
184 | \begin{bmatrix}
185 | 1 & 1 & 2 \\
186 | 1 & 2 & 3 \\
187 | 1 & 4 & 1 \\
188 | 1 & 5 & 5
189 | \end{bmatrix} =
190 | \begin{bmatrix}
191 | 4 & 12 & 11 \\
192 | 12 & 46 & 37 \\
193 | 11 & 37 & 39
194 | \end{bmatrix}
195 | $$
196 |
197 | :::
198 |
199 |
200 | ::: {.cell .code}
201 | ```python
202 | A.T.dot(A)
203 | ```
204 | :::
205 |
206 |
207 | ::: {.cell .markdown}
208 |
209 | and $A^T y$:
210 |
211 |
212 | $$ A^T y =
213 | \begin{bmatrix}
214 | 1 & 1 & 1 & 1 \\
215 | 1 & 2 & 4 & 5 \\
216 | 2 & 3 & 1 & 5
217 | \end{bmatrix}
218 | \begin{bmatrix}
219 | 3 \\
220 | 2 \\
221 | 7 \\
222 | 1
223 | \end{bmatrix} =
224 | \begin{bmatrix}
225 | 13 \\
226 | 40 \\
227 | 24
228 | \end{bmatrix}
229 | $$
230 |
231 | :::
232 |
233 |
234 | ::: {.cell .code}
235 | ```python
236 | A.T.dot(y)
237 | ```
238 | :::
239 |
240 |
241 |
242 | ::: {.cell .markdown}
243 |
244 | Next, create the augmented matrix $(A^T A | A^T y)$:
245 |
246 | $$ (A^T A | A^T y)=
247 | \begin{bmatrix}
248 | 4 & 12 & 11 & | & 13 \\
249 | 12 & 46 & 37 & | & 40 \\
250 | 11 & 37 & 39 & | & 24
251 | \end{bmatrix}
252 | $$
253 |
254 | :::
255 |
256 |
257 | ::: {.cell .code}
258 | ```python
259 | M1 = np.hstack([A.T.dot(A), A.T.dot(y)[:,None]])
260 | M1
261 | ```
262 | :::
263 |
264 | ::: {.cell .markdown}
265 |
266 | We will perform some elementary row operations on the augmented matrix to get $A^T A$ in row reduced form:
267 |
268 | :::
269 |
270 |
271 | ::: {.cell .code}
272 | ```python
273 | M2 = np.vstack([M1[0], M1[1]-3*M1[0], 4*M1[2]-11*M1[0]])
274 | M2
275 | ```
276 | :::
277 |
278 |
279 | ::: {.cell .code}
280 | ```python
281 | M3 = np.vstack([M2[0], M2[1], 5*M2[2]-8*M2[1]])
282 | M3
283 | ```
284 | :::
285 |
286 |
287 | ::: {.cell .code}
288 | ```python
289 | M4 = np.vstack([M3[0], 143*M3[1]-4*M3[2], M3[2]])
290 | M4
291 | ```
292 | :::
293 |
294 |
295 | ::: {.cell .code}
296 | ```python
297 | M5 = np.vstack([143/2*(M4[0]-12/1430*M4[1] - 11/143*M4[2]), M4[1], M4[2]])
298 | M5
299 | ```
300 | :::
301 |
302 |
303 | ::: {.cell .code}
304 | ```python
305 | M6 = np.vstack([M5[0]/286, M5[1]/1430, M5[2]/143])
306 | M6
307 | ```
308 | :::
309 |
310 |
311 | ::: {.cell .markdown}
312 |
313 | When we're finished, our augmented matrix looks like this:
314 |
315 | $$
316 | \begin{bmatrix}
317 | 1 & 0 & 0 & | & \frac{1597}{286} \\
318 | 0 & 1 & 0 & | & \frac{1115}{1430} \\
319 | 0 & 0 & 1 & | & \frac{-243}{143}
320 | \end{bmatrix}
321 | $$
322 |
323 | and so
324 |
325 | $$ w^* = \left[ \frac{1597}{286}, \frac{1115}{1430}, \frac{-243}{143} \right] $$
326 |
327 | :::
328 |
329 |
330 | ::: {.cell .code}
331 | ```python
332 | M6[:,3]
333 | ```
334 | :::
335 |
336 | ::: {.cell .markdown}
337 |
338 | Compare this to the known solution:
339 |
340 | :::
341 |
342 |
343 | ::: {.cell .code}
344 | ```python
345 | np.linalg.inv( A.T.dot(A) ).dot(A.T.dot(y))
346 | ```
347 | :::
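
::: {.cell .markdown}

As a side note (not part of the pen-and-paper exercise): explicitly forming the inverse of $A^T A$ is fine for a tiny example like this one, but for larger problems a solver is usually preferred. The sketch below shows two standard `numpy` alternatives that should agree with the result above.

:::

::: {.cell .code}
```python
# solve the normal equations A^T A w = A^T y without forming the inverse
w_solve = np.linalg.solve(A.T.dot(A), A.T.dot(y))

# or use a least squares solver on the original system A w ~ y
w_lstsq, _, _, _ = np.linalg.lstsq(A, np.array(y, dtype=float), rcond=None)

print(w_solve)
print(w_lstsq)
```
:::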
348 |
349 |
--------------------------------------------------------------------------------
/notebooks/2-compute-by-hand.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/2-compute-by-hand.pdf
--------------------------------------------------------------------------------
/notebooks/2-linear-regression-case-study.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/2-linear-regression-case-study.pdf
--------------------------------------------------------------------------------
/notebooks/2-linear-regression-deep-dive.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/2-linear-regression-deep-dive.pdf
--------------------------------------------------------------------------------
/notebooks/2-regression-r2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Regression metrics\n",
8 | "------------------\n",
9 | "\n",
10 | "In this notebook, we will explore some metrics typically applied to\n",
11 | "linear regression models:\n",
12 | "\n",
13 | "- R2\n",
14 | "- Mean squared error (RSS divided by number of samples)\n",
15 | "- Ratio of RSS for regression model to sample variance (“RSS for\n",
16 | " prediction by mean”)\n",
17 | "\n",
18 | "using some synthetic data sets."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "from sklearn import datasets\n",
28 | "from sklearn import metrics\n",
29 | "from sklearn import preprocessing\n",
30 | "from sklearn.linear_model import LinearRegression\n",
31 | "\n",
32 | "import numpy as np\n",
33 | "import matplotlib.pyplot as plt\n",
34 | "import pandas as pd\n",
35 | "import seaborn as sns\n",
36 | "\n",
37 | "from IPython.core.interactiveshell import InteractiveShell\n",
38 | "InteractiveShell.ast_node_interactivity = \"all\""
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "Generate synthetic data\n",
46 | "-----------------------\n",
47 | "\n",
48 | "We will generate four sets of synthetic data for a simple linear\n",
49 | "regression.\n",
50 | "\n",
51 | "Each dataset will be generated using the `make_regression` function in\n",
52 | "`sklearn`’s `datasets` module. This will:\n",
53 | "\n",
54 | "- generate a random regression coefficient, $w_1$,\n",
55 | "- generate `n_samples` points on the line defined by that coefficient\n",
56 | " (i.e. generate random $x$ and then compute $y$ using the equation\n",
57 | " for the linear model),\n",
58 | "- and then add Gaussian noise with standard deviation defined by the\n",
59 | " `noise` argument to each of the `n_samples` points.\n",
60 | "\n",
61 | "We will also scale all the “features” to the $[-1, 1]$ range using\n",
62 | "`sklearn`’s `MaxAbsScaler`, so that we can make reasonable comparisons\n",
63 | "between the datasets.\n",
64 | "\n",
65 | "The sets `hivar1` and `lovar1` will be identical to one another with\n",
66 | "respect to the number of samples and regression coefficents, but the\n",
67 | "`hivar1` set will have 5x the noise of the `lovar1` set.\n",
68 | "\n",
69 | "Similarly, the sets `hivar2` and `lovar2` will be identical to one\n",
70 | "another with respect to the number of samples and regression\n",
71 | "coefficents, but the `hivar2` set will have 5 times the noise of the\n",
72 | "`lovar2` set."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "X_hivar1, y_hivar1 = datasets.make_regression(n_samples=300, n_features=1, noise=20, random_state=4)\n",
82 | "X_hivar1 = preprocessing.MaxAbsScaler().fit_transform(X_hivar1)\n",
83 | "\n",
84 | "X_lovar1, y_lovar1 = datasets.make_regression(n_samples=300, n_features=1, noise=4, random_state=4)\n",
85 | "X_lovar1 = preprocessing.MaxAbsScaler().fit_transform(X_lovar1)\n",
86 | "\n",
87 | "X_hivar2, y_hivar2 = datasets.make_regression(n_samples=150, n_features=1, noise=50, random_state=9)\n",
88 | "X_hivar2 = preprocessing.MaxAbsScaler().fit_transform(X_hivar2)\n",
89 | "\n",
90 | "X_lovar2, y_lovar2 = datasets.make_regression(n_samples=150, n_features=1, noise=10, random_state=9)\n",
91 | "X_lovar2 = preprocessing.MaxAbsScaler().fit_transform(X_lovar2)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "### Fit a linear regression\n",
99 | "\n",
100 | "Next, we will fit a linear regression to each data set:"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "regr_hivar1 = LinearRegression().fit(X_hivar1, y_hivar1)\n",
110 | "regr_lovar1 = LinearRegression().fit(X_lovar1, y_lovar1)\n",
111 | "regr_hivar2 = LinearRegression().fit(X_hivar2, y_hivar2)\n",
112 | "regr_lovar2 = LinearRegression().fit(X_lovar2, y_lovar2)"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "### Visualize data and regression line\n",
120 | "\n",
121 | "Finally, for each dataset:\n",
122 | "\n",
123 | "- we plot the data points and the fitted linear regression line\n",
124 | "- we print the coefficient $w_1$ on each plot\n",
125 | "- we print the R2 value on each plot\n",
126 | "- we compute the MSE of the regression, and print it on each plot\n",
127 | "- we compute the “MSE of prediction by mean”, and print it on each\n",
128 | " plot"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "fig = plt.figure()\n",
138 | "fig.set_size_inches(8, 8)\n",
139 | "ax1 = fig.add_subplot(221)\n",
140 | "ax2 = fig.add_subplot(222)\n",
141 | "ax3 = fig.add_subplot(223)\n",
142 | "ax4 = fig.add_subplot(224)\n",
143 | "plt.subplots_adjust(hspace=0.4)\n",
144 | "\n",
145 | "sns.scatterplot(x=X_hivar1.squeeze(), y=y_hivar1, ax=ax1);\n",
146 | "sns.lineplot(x=X_hivar1.squeeze(), y=regr_hivar1.predict(X_hivar1), color='red', ax=ax1);\n",
147 | "sns.lineplot(x=X_hivar1.squeeze(), y=np.mean(y_hivar1), color='purple', ax=ax1);\n",
148 | "ax1.title.set_text('w1: %s, R2 score: %s \\n MSE regression: %s \\n MSE mean: %s' % \n",
149 | " (\n",
150 | " '{0:.2f}'.format(regr_hivar1.coef_[0]),\n",
151 | " '{0:.4f}'.format(metrics.r2_score(y_hivar1, regr_hivar1.predict(X_hivar1))),\n",
152 | " '{0:.4f}'.format(np.mean((regr_hivar1.predict(X_hivar1)-y_hivar1)**2)),\n",
153 | " '{0:.4f}'.format(np.mean(( np.mean(y_hivar1)-y_hivar1)**2))\n",
154 | " ));\n",
155 | "ax1.text(0.75, -250, \"(1)\", size='medium', color='black', weight='semibold');\n",
156 | "ax1.set_ylim(-300, 300);\n",
157 | "ax1.set_xlim(-1, 1);\n",
158 | "\n",
159 | "sns.scatterplot(x=X_lovar1.squeeze(), y=y_lovar1, ax=ax2);\n",
160 | "sns.lineplot(x=X_lovar1.squeeze(), y=regr_lovar1.predict(X_lovar1), color='red', ax=ax2);\n",
161 | "sns.lineplot(x=X_lovar1.squeeze(), y=np.mean(y_lovar1), color='purple', ax=ax2);\n",
162 | "ax2.title.set_text('w1: %s, R2 score: %s \\n MSE regression: %s \\n MSE mean: %s' % \n",
163 | " (\n",
164 | " '{0:.2f}'.format(regr_lovar1.coef_[0]),\n",
165 | " '{0:.4f}'.format(metrics.r2_score(y_lovar1, regr_lovar1.predict(X_lovar1))),\n",
166 | " '{0:.4f}'.format(np.mean((regr_lovar1.predict(X_lovar1)-y_lovar1)**2)),\n",
167 | " '{0:.4f}'.format(np.mean(( np.mean(y_lovar1)-y_lovar1)**2))\n",
168 | " ));\n",
169 | "ax2.text(0.75, -250, \"(2)\", size='medium', color='black', weight='semibold');\n",
170 | "ax2.set_ylim(-300, 300);\n",
171 | "ax2.set_xlim(-1, 1);\n",
172 | "\n",
173 | "sns.scatterplot(x=X_hivar2.squeeze(), y=y_hivar2, ax=ax3);\n",
174 | "sns.lineplot(x=X_hivar2.squeeze(), y=regr_hivar2.predict(X_hivar2), color='red', ax=ax3);\n",
175 | "sns.lineplot(x=X_hivar2.squeeze(), y=np.mean(y_hivar2), color='purple', ax=ax3);\n",
176 | "ax3.title.set_text('w1: %s, R2 score: %s \\n MSE regression: %s \\n MSE mean: %s' % \n",
177 | " (\n",
178 | " '{0:.2f}'.format(regr_hivar2.coef_[0]),\n",
179 | " '{0:.4f}'.format(metrics.r2_score(y_hivar2, regr_hivar2.predict(X_hivar2))),\n",
180 | " '{0:.4f}'.format(np.mean((regr_hivar2.predict(X_hivar2)-y_hivar2)**2)),\n",
181 | " '{0:.4f}'.format(np.mean(( np.mean(y_hivar2)-y_hivar2)**2))\n",
182 | " ));\n",
183 | "ax3.text(0.75, -250, \"(3)\", size='medium', color='black', weight='semibold');\n",
184 | "ax3.set_ylim(-300, 300);\n",
185 | "ax3.set_xlim(-1, 1);\n",
186 | "\n",
187 | "sns.scatterplot(x=X_lovar2.squeeze(), y=y_lovar2, ax=ax4);\n",
188 | "sns.lineplot(x=X_lovar2.squeeze(), y=regr_lovar2.predict(X_lovar2), color='red', ax=ax4);\n",
189 | "sns.lineplot(x=X_lovar2.squeeze(), y=np.mean(y_lovar2), color='purple', ax=ax4);\n",
190 | "ax4.title.set_text('w1: %s, R2 score: %s \\n MSE regression: %s \\n MSE mean: %s' % \n",
191 | " (\n",
192 | " '{0:.2f}'.format(regr_lovar2.coef_[0]),\n",
193 | " '{0:.4f}'.format(metrics.r2_score(y_lovar2, regr_lovar2.predict(X_lovar2))),\n",
194 | " '{0:.4f}'.format(np.mean((regr_lovar2.predict(X_lovar2)-y_lovar2)**2)),\n",
195 | " '{0:.4f}'.format(np.mean(( np.mean(y_lovar2)-y_lovar2)**2))\n",
196 | " ));\n",
197 | "ax4.text(0.75, -250, \"(4)\", size='medium', color='black', weight='semibold');\n",
198 | "ax4.set_ylim(-300, 300);\n",
199 | "ax4.set_xlim(-1, 1);\n"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "Interpret results\n",
207 | "-----------------\n",
208 | "\n",
209 | "Based on the figures above, we can make the following statements:\n",
210 | "\n",
211 | "From $w_1$, and visually from the slope of the regression line:\n",
212 | "\n",
213 | "- For **(1)**, **(2)**: an increase in $x$ of 1 is, on average,\n",
214 | " associated with an increase in $y$ of about 100.\n",
215 | "- For **(3)**, **(4)**: an increase in $x$ of 1 is, on average,\n",
216 | " associated with an increase in $y$ of about 240.\n",
217 | "\n",
218 | "From the R2 score, and visually from the variance around the regression\n",
219 | "line:\n",
220 | "\n",
221 | "- For **(1)**, **(3)**: about 75% of the variance in $y$ is explained\n",
222 | " by the regression on $x$.\n",
223 | "- For **(2)**, **(4)**: about 99% of the variance in $y$ is explained\n",
224 | " by the regression on $x$.\n",
225 | "\n",
226 | "We also observe:\n",
227 | "\n",
228 | "- The MSE of the regression line is equivalent to the variance of the\n",
229 | " noise we added around the regression line. (Take the square of the\n",
230 | " `noise` argument we used, which was the standard deviation of the\n",
231 | " noise.)\n",
232 | "- The greater the slope of the regression line, the more error is\n",
233 | " associated with prediction by mean. Prediction by mean is the same\n",
234 | " thing as prediction by a line with intercept $w_0 = \\overline{y}$\n",
235 | " and slope $w_1 = 0$ (purple line in the figures above). The greater\n",
236 | " the true $w_1$, the more “wrong” the $w_1 = 0$ prediction is.\n",
237 | "- The ratio of MSE of the regression line to MSE of prediction by\n",
238 | " mean, is $1-R2$."
239 | ]
240 | }
241 | ],
242 | "nbformat": 4,
243 | "nbformat_minor": 5,
244 | "metadata": {}
245 | }
246 |
--------------------------------------------------------------------------------
/notebooks/2-regression-r2.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Regression metrics"
3 | author: "Fraida Fund"
4 | ---
5 |
6 | ::: {.cell .markdown}
7 |
8 | Regression metrics
9 | ------------------
10 |
11 | In this notebook, we will explore some metrics typically applied to
12 | linear regression models:
13 |
14 | - R2
15 | - Mean squared error (RSS divided by number of samples)
16 | - Ratio of RSS for regression model to sample variance ("RSS for prediction by mean")
17 |
18 | using some synthetic data sets.
19 |
20 | :::
21 |
22 | ::: {.cell .code}
23 | ``` {.python}
24 | from sklearn import datasets
25 | from sklearn import metrics
26 | from sklearn import preprocessing
27 | from sklearn.linear_model import LinearRegression
28 |
29 | import numpy as np
30 | import matplotlib.pyplot as plt
31 | import pandas as pd
32 | import seaborn as sns
33 |
34 | from IPython.core.interactiveshell import InteractiveShell
35 | InteractiveShell.ast_node_interactivity = "all"
36 | ```
37 | :::
38 |
39 |
40 | ::: {.cell .markdown}
41 |
42 | ## Generate synthetic data
43 |
44 | We will generate four sets of synthetic data for a simple linear regression.
45 |
46 | Each dataset will be generated using the `make_regression` function in
47 | `sklearn`’s `datasets` module. This will:
48 |
49 | * generate a random regression coefficient, $w_1$,
50 | * generate `n_samples` points on the line defined by that coefficient (i.e. generate random $x$ and then compute $y$ using the equation for the linear model),
51 | * and then add Gaussian noise with standard deviation defined by the `noise` argument to each of the `n_samples` points.
52 |
53 | We will also scale all the "features" to the $[-1, 1]$ range using
54 | `sklearn`’s `MaxAbsScaler`, so that we can make reasonable comparisons
55 | between the datasets.
56 |
57 | The sets `hivar1` and `lovar1` will be identical to one another with
58 | respect to the number of samples and regression coefficients, but the
59 | `hivar1` set will have 5x the noise of the `lovar1` set.
60 |
61 | Similarly, the sets `hivar2` and `lovar2` will be identical to one
62 | another with respect to the number of samples and regression
63 | coefficients, but the `hivar2` set will have 5 times the noise of the `lovar2` set.
64 |
65 |
66 | :::
67 |
68 | ::: {.cell .code}
69 | ``` {.python}
70 | X_hivar1, y_hivar1 = datasets.make_regression(n_samples=300, n_features=1, noise=20, random_state=4)
71 | X_hivar1 = preprocessing.MaxAbsScaler().fit_transform(X_hivar1)
72 |
73 | X_lovar1, y_lovar1 = datasets.make_regression(n_samples=300, n_features=1, noise=4, random_state=4)
74 | X_lovar1 = preprocessing.MaxAbsScaler().fit_transform(X_lovar1)
75 |
76 | X_hivar2, y_hivar2 = datasets.make_regression(n_samples=150, n_features=1, noise=50, random_state=9)
77 | X_hivar2 = preprocessing.MaxAbsScaler().fit_transform(X_hivar2)
78 |
79 | X_lovar2, y_lovar2 = datasets.make_regression(n_samples=150, n_features=1, noise=10, random_state=9)
80 | X_lovar2 = preprocessing.MaxAbsScaler().fit_transform(X_lovar2)
81 | ```
82 | :::
83 |
84 |
85 | ::: {.cell .markdown}
86 |
87 |
88 | ### Fit a linear regression
89 |
90 | Next, we will fit a linear regression to each data set:
91 |
92 | :::
93 |
94 | ::: {.cell .code}
95 | ``` {.python}
96 | regr_hivar1 = LinearRegression().fit(X_hivar1, y_hivar1)
97 | regr_lovar1 = LinearRegression().fit(X_lovar1, y_lovar1)
98 | regr_hivar2 = LinearRegression().fit(X_hivar2, y_hivar2)
99 | regr_lovar2 = LinearRegression().fit(X_lovar2, y_lovar2)
100 | ```
101 | :::
102 |
103 |
104 | ::: {.cell .markdown}
105 |
106 | ### Visualize data and regression line
107 |
108 | Finally, for each dataset:
109 |
110 | - we plot the data points and the fitted linear regression line
111 | - we print the coefficient $w_1$ on each plot
112 | - we print the R2 value on each plot
113 | - we compute the MSE of the regression, and print it on each plot
114 | - we compute the "MSE of prediction by mean", and print it on each plot
115 |
116 | :::
117 |
118 |
119 | ::: {.cell .code}
120 | ``` {.python}
121 | fig = plt.figure()
122 | fig.set_size_inches(8, 8)
123 | ax1 = fig.add_subplot(221)
124 | ax2 = fig.add_subplot(222)
125 | ax3 = fig.add_subplot(223)
126 | ax4 = fig.add_subplot(224)
127 | plt.subplots_adjust(hspace=0.4)
128 |
129 | sns.scatterplot(x=X_hivar1.squeeze(), y=y_hivar1, ax=ax1);
130 | sns.lineplot(x=X_hivar1.squeeze(), y=regr_hivar1.predict(X_hivar1), color='red', ax=ax1);
131 | sns.lineplot(x=X_hivar1.squeeze(), y=np.mean(y_hivar1), color='purple', ax=ax1);
132 | ax1.title.set_text('w1: %s, R2 score: %s \n MSE regression: %s \n MSE mean: %s' %
133 | (
134 | '{0:.2f}'.format(regr_hivar1.coef_[0]),
135 | '{0:.4f}'.format(metrics.r2_score(y_hivar1, regr_hivar1.predict(X_hivar1))),
136 | '{0:.4f}'.format(np.mean((regr_hivar1.predict(X_hivar1)-y_hivar1)**2)),
137 | '{0:.4f}'.format(np.mean(( np.mean(y_hivar1)-y_hivar1)**2))
138 | ));
139 | ax1.text(0.75, -250, "(1)", size='medium', color='black', weight='semibold');
140 | ax1.set_ylim(-300, 300);
141 | ax1.set_xlim(-1, 1);
142 |
143 | sns.scatterplot(x=X_lovar1.squeeze(), y=y_lovar1, ax=ax2);
144 | sns.lineplot(x=X_lovar1.squeeze(), y=regr_lovar1.predict(X_lovar1), color='red', ax=ax2);
145 | sns.lineplot(x=X_lovar1.squeeze(), y=np.mean(y_lovar1), color='purple', ax=ax2);
146 | ax2.title.set_text('w1: %s, R2 score: %s \n MSE regression: %s \n MSE mean: %s' %
147 | (
148 | '{0:.2f}'.format(regr_lovar1.coef_[0]),
149 | '{0:.4f}'.format(metrics.r2_score(y_lovar1, regr_lovar1.predict(X_lovar1))),
150 | '{0:.4f}'.format(np.mean((regr_lovar1.predict(X_lovar1)-y_lovar1)**2)),
151 | '{0:.4f}'.format(np.mean(( np.mean(y_lovar1)-y_lovar1)**2))
152 | ));
153 | ax2.text(0.75, -250, "(2)", size='medium', color='black', weight='semibold');
154 | ax2.set_ylim(-300, 300);
155 | ax2.set_xlim(-1, 1);
156 |
157 | sns.scatterplot(x=X_hivar2.squeeze(), y=y_hivar2, ax=ax3);
158 | sns.lineplot(x=X_hivar2.squeeze(), y=regr_hivar2.predict(X_hivar2), color='red', ax=ax3);
159 | sns.lineplot(x=X_hivar2.squeeze(), y=np.mean(y_hivar2), color='purple', ax=ax3);
160 | ax3.title.set_text('w1: %s, R2 score: %s \n MSE regression: %s \n MSE mean: %s' %
161 | (
162 | '{0:.2f}'.format(regr_hivar2.coef_[0]),
163 | '{0:.4f}'.format(metrics.r2_score(y_hivar2, regr_hivar2.predict(X_hivar2))),
164 | '{0:.4f}'.format(np.mean((regr_hivar2.predict(X_hivar2)-y_hivar2)**2)),
165 | '{0:.4f}'.format(np.mean(( np.mean(y_hivar2)-y_hivar2)**2))
166 | ));
167 | ax3.text(0.75, -250, "(3)", size='medium', color='black', weight='semibold');
168 | ax3.set_ylim(-300, 300);
169 | ax3.set_xlim(-1, 1);
170 |
171 | sns.scatterplot(x=X_lovar2.squeeze(), y=y_lovar2, ax=ax4);
172 | sns.lineplot(x=X_lovar2.squeeze(), y=regr_lovar2.predict(X_lovar2), color='red', ax=ax4);
173 | sns.lineplot(x=X_lovar2.squeeze(), y=np.mean(y_lovar2), color='purple', ax=ax4);
174 | ax4.title.set_text('w1: %s, R2 score: %s \n MSE regression: %s \n MSE mean: %s' %
175 | (
176 | '{0:.2f}'.format(regr_lovar2.coef_[0]),
177 | '{0:.4f}'.format(metrics.r2_score(y_lovar2, regr_lovar2.predict(X_lovar2))),
178 | '{0:.4f}'.format(np.mean((regr_lovar2.predict(X_lovar2)-y_lovar2)**2)),
179 | '{0:.4f}'.format(np.mean(( np.mean(y_lovar2)-y_lovar2)**2))
180 | ));
181 | ax4.text(0.75, -250, "(4)", size='medium', color='black', weight='semibold');
182 | ax4.set_ylim(-300, 300);
183 | ax4.set_xlim(-1, 1);
184 |
185 | ```
186 | :::
187 |
188 | ::: {.cell .markdown}
189 |
190 | ## Interpret results
191 |
192 | Based on the figures above, we can make the following statements:
193 |
194 | From $w_1$, and visually from the slope of the regression line:
195 |
196 | - For **(1)**, **(2)**: an increase in $x$ of 1 is, on average,
197 | associated with an increase in $y$ of about 100.
198 | - For **(3)**, **(4)**: an increase in $x$ of 1 is, on average,
199 | associated with an increase in $y$ of about 240.
200 |
201 | From the R2 score, and visually from the variance around the regression
202 | line:
203 |
204 | - For **(1)**, **(3)**: about 75% of the variance in $y$ is explained
205 | by the regression on $x$.
206 | - For **(2)**, **(4)**: about 99% of the variance in $y$ is explained
207 | by the regression on $x$.
208 |
209 | We also observe:
210 |
211 | - The MSE of the regression line is approximately equal to the variance of the noise we added around the regression line. (Take the square of the `noise` argument we used, which was the standard deviation of the noise.)
212 | - The greater the slope of the regression line, the more error is associated with prediction by mean. Prediction by mean is the same thing as prediction by a line with intercept $w_0 = \overline{y}$ and slope $w_1 = 0$ (purple line in the figures above). The greater the true $w_1$, the more “wrong” the $w_1 = 0$ prediction is.
213 | - The ratio of the MSE of the regression line to the MSE of prediction by mean is $1-R2$ (checked numerically in the cell below).
214 |
215 | :::
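
::: {.cell .markdown}

The last point is easy to check numerically. The short sketch below reuses the fitted model for dataset **(1)** (`regr_hivar1`) and compares the ratio of the two MSEs against $1-R2$; the other datasets can be checked the same way.

:::

::: {.cell .code}
``` {.python}
y_pred_hivar1 = regr_hivar1.predict(X_hivar1)

mse_regression = np.mean((y_pred_hivar1 - y_hivar1)**2)
mse_mean = np.mean((np.mean(y_hivar1) - y_hivar1)**2)
r2 = metrics.r2_score(y_hivar1, y_pred_hivar1)

# the two printed values should match
print("Ratio of MSEs:", mse_regression/mse_mean)
print("1 - R2:       ", 1 - r2)
```
:::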
--------------------------------------------------------------------------------
/notebooks/2-regression-r2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/2-regression-r2.pdf
--------------------------------------------------------------------------------
/notebooks/3-bias-variance-deep-dive.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/3-bias-variance-deep-dive.pdf
--------------------------------------------------------------------------------
/notebooks/3-gradient-descent-deep-dive.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/3-gradient-descent-deep-dive.pdf
--------------------------------------------------------------------------------
/notebooks/3-gradient-descent-hw.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "For this question you will change some parameters in the “Deep dive:\n",
8 | "gradient descent” notebook, re-run the notebook with the new parameters,\n",
9 | "and answer questions about the results. You do not have to write any new\n",
10 | "code, and you should not submit any code. (Copy the relevant output\n",
11 | "images to a regular document, answer the questions there, and submit\n",
12 | "that document - don’t submit a Colab notebook.)\n",
13 | "\n",
14 | "A. Re-run the “Descent path” section with three different learning\n",
15 | "rates: `lr = 0.0002`, `lr = 0.002`, and `lr = 0.02` (and leave other\n",
16 | "parameters at their default settings). For each learning rate,\n",
17 | "\n",
18 | "- Show the plot of coeffient value vs. iteration, and the plot of the\n",
19 | " descent path on the MSE contour.\n",
20 | "- What is the estimate of $w$ after 50 iterations?\n",
21 | "- Describe whether the gradient descent diverges, converges within 50\n",
22 | " iterations, or starts to converge but does not get close enough to\n",
23 | " the optimum value within 50 iterations.\n",
24 | "\n",
25 | "B. Re-run the “Stochastic gradient descent” section with `lr=0.1` and\n",
26 | "`n=1`, then with `lr=0.01` and `n=10`, and finally with `lr = 0.001` and\n",
27 | "`n = 100` (and leave the other parameters at their default settings).\n",
28 | "For each,\n",
29 | "\n",
30 | "- Show the plot of coeffient value vs. iteration, and the plot of the\n",
31 | " descent path on the MSE contour.\n",
32 | "- Comment on the descent path. Does it converge smoothly to the\n",
33 | " optimal solution?"
34 | ]
35 | }
36 | ],
37 | "nbformat": 4,
38 | "nbformat_minor": 5,
39 | "metadata": {}
40 | }
41 |
--------------------------------------------------------------------------------
/notebooks/3-gradient-descent-hw.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | For this question you will change some parameters in the "Deep dive: gradient descent" notebook, re-run the notebook with the new parameters, and answer questions about the results. You do not have to write any new code, and you should not submit any code. (Copy the relevant output images to a regular document, answer the questions there, and submit that document - don't submit a Colab notebook.)
4 |
5 | A. Re-run the "Descent path" section with three different learning rates: `lr = 0.0002`, `lr = 0.002`, and `lr = 0.02` (and leave other parameters at their default settings). For each learning rate,
6 |
7 | * Show the plot of coefficient value vs. iteration, and the plot of the descent path on the MSE contour.
8 | * What is the estimate of $w$ after 50 iterations?
9 | * Describe whether the gradient descent diverges, converges within 50 iterations, or starts to converge but does not get close enough to the optimum value within 50 iterations.
10 |
11 | B. Re-run the "Stochastic gradient descent" section with `lr=0.1` and `n=1`, then with `lr=0.01` and `n=10`, and finally with `lr = 0.001` and `n = 100` (and leave the other parameters at their default settings). For each,
12 |
13 | * Show the plot of coefficient value vs. iteration, and the plot of the descent path on the MSE contour.
14 | * Comment on the descent path. Does it converge smoothly to the optimal solution?
--------------------------------------------------------------------------------
/notebooks/3-gradient-descent-hw.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/3-gradient-descent-hw.pdf
--------------------------------------------------------------------------------
/notebooks/4-linear-regression-case-study-part-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/4-linear-regression-case-study-part-2.pdf
--------------------------------------------------------------------------------
/notebooks/4-model-selection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/4-model-selection.pdf
--------------------------------------------------------------------------------
/notebooks/4-regularization-deep-dive.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/4-regularization-deep-dive.pdf
--------------------------------------------------------------------------------
/notebooks/5-compas-case-study.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/5-compas-case-study.pdf
--------------------------------------------------------------------------------
/notebooks/5-hw-logistic-regression.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Homework: Classifying your own handwritten digit'
3 | author: 'Fraida Fund'
4 | jupyter:
5 | colab:
6 | toc_visible: true
7 | kernelspec:
8 | display_name: Python 3
9 | name: python3
10 | nbformat: 4
11 | nbformat_minor: 0
12 | ---
13 |
14 | ::: {.cell .markdown }
15 | ## Homework problem: Logistic regression for classification of handwritten digits
16 | :::
17 |
18 | ::: {.cell .markdown }
19 | For this homework problem, you will create your own test image for the
20 | logistic regression classifier that we trained in a demo notebook this
21 | week.
22 | :::
23 |
24 | ::: {.cell .markdown }
25 | #### Train your classifier
26 |
27 | First, we'll repeat the steps from the demo notebook to train a
28 | logistic regression for classification of handwritten digits. This code
29 | is provided for you.
30 |
31 | (It is copied from the demo notebook exactly, with one exception: we use
32 | a larger subset of the data for training than in the demo notebook, so
33 | this fitted model will have better accuracy.)
34 | :::
35 |
36 | ::: {.cell .code }
37 | ```python
38 | import numpy as np
39 | import pandas as pd
40 | import seaborn as sns
41 | import matplotlib.pyplot as plt
42 | from sklearn.datasets import fetch_openml
43 | from sklearn.model_selection import train_test_split
44 | from sklearn.linear_model import LogisticRegression
45 | from sklearn.metrics import accuracy_score
46 |
47 | from IPython.core.interactiveshell import InteractiveShell
48 | InteractiveShell.ast_node_interactivity = "all"
49 | ```
50 | :::
51 |
52 | ::: {.cell .code }
53 | ```python
54 | X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
55 | ```
56 | :::
57 |
58 | ::: {.cell .code }
59 | ```python
60 | classes = ['0', '1', '2','3', '4','5', '6', '7', '8', '9']
61 | nclasses = len(classes)
62 | ```
63 | :::
64 |
65 | ::: {.cell .code }
66 | ```python
67 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9,
68 | train_size=0.7, test_size=0.3)
69 | ```
70 | :::
71 |
72 | ::: {.cell .code }
73 | ```python
74 | X_train_scaled = X_train/255.0
75 | X_test_scaled = X_test/255.0
76 | ```
77 | :::
78 |
79 | ::: {.cell .code }
80 | ```python
81 | clf = LogisticRegression(penalty=None,
82 | tol=0.01, solver='saga',
83 | multi_class='multinomial').fit(X_train_scaled, y_train)
84 | ```
85 | :::
86 |
87 | ::: {.cell .code }
88 | ```python
89 | accuracy = clf.score(X_test_scaled, y_test)
90 | ```
91 | :::
92 |
93 | ::: {.cell .code }
94 | ```python
95 | print(accuracy)
96 | ```
97 | :::
98 |
99 | ::: {.cell .markdown }
100 | #### Create a test image
101 |
102 | On a plain white piece of paper, in a black or other dark-colored pen or
103 | pencil, write a digit of your choice from 0 to 9. Take a photo of your
104 | handwritten digit.
105 |
106 | Edit your photo (crop, rotate as needed), using a photo editor of your
107 | choice (I used Google Photos), so that your photo is approximately
108 | square, and includes only the digit and the white background. Leave a
109 | small margin around the edge of the writing, but not too much. Your
110 | edited photo should look similar to the MNIST images in the demo
111 | notebook.
112 |
113 | For example:
114 |
115 | *(example image omitted)*
116 | :::
117 |
118 | ::: {.cell .markdown }
119 | #### Upload your image to Colab
120 |
121 | Run the following cell. Click "Choose files", and upload the photo of
122 | your handwritten digit.
123 | :::
124 |
125 | ::: {.cell .code }
126 | ```python
127 | from google.colab import files
128 |
129 | uploaded = files.upload()
130 |
131 | for fn in uploaded.keys():
132 | print('User uploaded file "{name}" with length {length} bytes'.format(
133 | name=fn, length=len(uploaded[fn])))
134 | ```
135 | :::
136 |
137 | ::: {.cell .markdown }
138 | On the left side of the Colab window, you will see a small file folder
139 | icon, which allows you to explore the filesystem of your Colab
140 | workspace. If you click on this icon, you should see that your file has
141 | been uploaded to your Colab workspace. (You may need to use the
142 | "Refresh" button in the file browser in order to see the file.) Make a
143 | note of the file name.
144 | :::
145 |
146 | ::: {.cell .markdown }
147 | #### Visualize the image
148 |
149 | After uploading your image, run this cell, but *replace the filename*
150 | with the name of the file you have just uploaded to Colab. You should see
151 | your image appear in the cell output.
152 | :::
153 |
154 | ::: {.cell .code }
155 | ```python
156 | from PIL import Image
157 |
158 | filename = '2021-07-01_14-03.png'
159 |
160 | image = Image.open(filename)
161 | p = plt.imshow(np.asarray(image), cmap=plt.cm.gray,);
162 | p = plt.title('Shape: ' + str(np.asarray(image).shape))
163 | ```
164 | :::
165 |
166 | ::: {.cell .markdown }
167 | For example:
168 |
169 | *(example image omitted)*
170 | :::
171 |
172 | ::: {.cell .markdown }
173 | #### Pre-process the image
174 |
175 | The images in MNIST have been pre-processed - they are converted to
176 | grayscale, and centered in a 28x28 image by computing the center of mass
177 | of the pixels, and then translating and scaling the image so as to
178 | position this point at the center of the 28x28 field.
179 |
180 | You have already done some manual pre-processing, by cropping your image
181 | before uploading. But you may have noticed from the `shape` output that
182 | your image resolution is much larger than 28x28, and you probably had
183 | three color channels (red, green, and blue).
184 |
185 | Use the code in the following cells to pre-process your image into a
186 | 28x28 image with one color channel (grayscale). You may have to manually
187 | tune the contrast for best results, by changing the `pixel_filter`
188 | value. You will want the background to be as close to pure black as
189 | possible, without affecting the legibility of the handwritten digit.
190 |
191 | (We won't bother with centering the image, but that would probably
192 | improve the prediction performance quite a lot!)
193 | :::
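
::: {.cell .markdown }
If you do want to experiment with centering, one possible approach is
sketched below. This is an optional aside, not a required step, and it
assumes `scipy` is available (as it is on Colab). The helper shifts a
28x28 array so that the center of mass of the pixel intensities lands in
the middle of the frame; you could apply it to the pre-processed 28x28
array produced by the cells below, e.g.
`center_by_mass(test_sample.reshape(28, 28))`.
:::

::: {.cell .code }
```python
from scipy import ndimage

def center_by_mass(img):
    # img: 2D array of pixel intensities, with the background close to 0
    row_com, col_com = ndimage.center_of_mass(img)
    row_shift = int(round(img.shape[0]/2 - row_com))
    col_shift = int(round(img.shape[1]/2 - col_com))
    # np.roll wraps pixels around the edges; with a reasonable margin
    # around the digit, this should not matter
    return np.roll(img, (row_shift, col_shift), axis=(0, 1))
```
:::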
194 |
195 | ::: {.cell .code }
196 | ```python
197 | # convert to grayscale image - 'L' format means each pixel is
198 | # represented by a single value from 0 to 255
199 | image_bw = image.convert('L')
200 | p = plt.imshow(np.asarray(image_bw), cmap=plt.cm.gray,);
201 | p = plt.title('Shape: ' + str(np.asarray(image_bw).shape))
202 | ```
203 | :::
204 |
205 | ::: {.cell .code }
206 | ```python
207 | # resize image
208 | image_bw_resized = image_bw.resize((28,28), Image.LANCZOS)  # ANTIALIAS was removed in newer Pillow; LANCZOS is the equivalent filter
209 | p = plt.imshow(np.asarray(image_bw_resized), cmap=plt.cm.gray,);
210 | p = plt.title('Shape: ' + str(np.asarray(image_bw_resized).shape))
211 | ```
212 | :::
213 |
214 | ::: {.cell .code }
215 | ```python
216 | # invert image, to match training data
217 | import PIL.ImageOps
218 |
219 | image_bw_resized_inverted = PIL.ImageOps.invert(image_bw_resized)
220 | p = plt.imshow(np.asarray(image_bw_resized_inverted), cmap=plt.cm.gray,);
221 | p = plt.title('Shape: ' + str(np.asarray(image_bw_resized_inverted).shape))
222 | ```
223 | :::
224 |
225 | ::: {.cell .code }
226 | ```python
227 | # adjust contrast and scale
228 | pixel_filter = 20 # value from 0 to 100 - may need to adjust this manually
229 | min_pixel = np.percentile(image_bw_resized_inverted, pixel_filter)
230 | image_bw_resized_inverted_scaled = np.clip(image_bw_resized_inverted-min_pixel, 0, 255)
231 | max_pixel = np.max(image_bw_resized_inverted_scaled)
232 | image_bw_resized_inverted_scaled = np.asarray(image_bw_resized_inverted_scaled)/max_pixel
233 | p = plt.imshow(np.asarray(image_bw_resized_inverted_scaled), cmap=plt.cm.gray,);
234 | p = plt.title('Shape: ' + str(np.asarray(image_bw_resized_inverted_scaled).shape))
235 | ```
236 | :::
237 |
238 | ::: {.cell .code }
239 | ```python
240 | # finally, reshape to (1, 784) - 1 sample, 784 features
241 | test_sample = np.array(image_bw_resized_inverted_scaled).reshape(1,784)
242 | p = plt.imshow(np.reshape(test_sample, (28,28)), cmap=plt.cm.gray,);
243 | p = plt.title('Shape: ' + str(test_sample.shape))
244 | ```
245 | :::
246 |
247 | ::: {.cell .markdown }
248 | Make sure the `shape` of your test sample is `(1,784)` (one sample, 784
249 | features).
250 | :::
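
::: {.cell .markdown }
If you prefer a programmatic check, the optional cell below raises an
error if the shape is not what we expect (it assumes `test_sample` was
created by the cell above).
:::

::: {.cell .code }
```python
# optional sanity check - fails loudly if the reshape above went wrong
assert test_sample.shape == (1, 784), f"unexpected shape: {test_sample.shape}"
```
:::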
251 |
252 | ::: {.cell .markdown }
253 | #### Visualize the pre-processed image
254 |
255 | Run the following code to visualize your pre-processed image.
256 | :::
257 |
258 | ::: {.cell .code }
259 | ```python
260 | p = plt.imshow(np.reshape(test_sample, (28,28)), cmap=plt.cm.gray,);
261 | p = plt.title('Shape: ' + str(test_sample.shape))
262 | ```
263 | :::
264 |
265 | ::: {.cell .markdown }
266 | For example:
267 |
268 | *(example image omitted)*
269 | :::
270 |
271 | ::: {.cell .markdown }
272 | #### Use your fitted logistic regression
273 |
274 | Now that you have processed your test image, let us see whether it is
275 | classified correctly by the logistic regression.
276 | :::
277 |
278 | ::: {.cell .markdown }
279 | Run the following cell. This will use your fitted logistic regression to
280 | predict conditional probabilities per class for this test sample, and
281 | plot them.
282 | :::
283 |
284 | ::: {.cell .code }
285 | ```python
286 | test_probs = clf.predict_proba(test_sample)
287 |
288 | sns.barplot(x=np.arange(0,10), y=test_probs.squeeze());
289 | plt.ylabel("Probability");
290 | plt.xlabel("Class");
291 | ```
292 | :::
293 |
294 | ::: {.cell .markdown }
295 | For example:
296 |
297 | *(example image omitted)*
298 | :::
299 |
300 | ::: {.cell .markdown }
301 | Also run this cell, to show the predicted label for your test sample:
302 | :::
303 |
308 |
309 | ::: {.cell .code }
310 | ```python
311 | test_pred = clf.predict(test_sample)
312 | print("Predicted class is: ", test_pred)
313 | ```
314 | :::
315 |
316 | ::: {.cell .markdown }
317 | #### Explain the model prediction
318 |
319 | Even if the fitted model correctly labeled your handwritten digit, it
320 | may have estimated a moderately high probability for some of the other
321 | labels. To understand why, it is useful to visualize
322 |
323 | $$\langle w_k, x\rangle$$
324 |
325 | for each class $k$.
326 |
327 | Add a cell with the following code, and run it. This will plot:
328 |
329 | - on the top row, the coefficient vector for each class,
330 | - on the bottom row, each pixel in your test image, multiplied by the
331 | associated coefficient for that class.
332 | :::
333 |
334 | ::: {.cell .code }
335 | ```python
336 | scale = np.max(np.abs(clf.coef_))
337 |
338 | p = plt.figure(figsize=(25, 5));
339 |
340 | for i in range(nclasses):
341 | p = plt.subplot(2, nclasses, i + 1)
342 | p = plt.imshow(clf.coef_[i].reshape(28, 28),
343 | cmap=plt.cm.RdBu, vmin=-scale, vmax=scale);
344 | p = plt.title('Class %i' % i);
345 | p = plt.axis('off')
346 |
347 | for i in range(nclasses):
348 | p = plt.subplot(2, nclasses, nclasses + i + 1)
349 | p = plt.imshow(test_sample.reshape(28, 28)*clf.coef_[i].reshape(28, 28),
350 | cmap=plt.cm.RdBu, vmin=-scale/2, vmax=scale/2);
351 | # note: you can adjust the scaling factor if necessary,
352 | # to make the visualization easier to understand
353 | p = plt.axis('off')
354 | ```
355 | :::
356 |
357 | ::: {.cell .markdown }
358 | For example:
359 |
360 | *(example image omitted)*
361 | :::
362 |
363 | ::: {.cell .markdown }
364 | In the images in the bottom row,
365 |
366 | - a blue pixel (and especially a dark blue pixel) means that your test
367 | image had writing in the part of the image that is positively
368 | associated with belonging to the class, and
369 | - a red pixel (and especially a dark red pixel) means that your test
370 | image had writing in the part of the image that is negatively
371 | associated with belonging to the class.
372 | :::
373 |
374 | ::: {.cell .markdown }
375 | ### Exploring the model error
376 |
377 | The image above should give you an idea of why your digit was classified
378 | correctly or incorrectly, and should help you understand when and why
379 | the model misclassifies some samples.
380 |
381 | - if your image *was* classified correctly: draw a *slightly* modified
382 | version of the same digit, that you believe will be classified
383 | *incorrectly*. Run this second image through the steps above, and
384 | confirm your intuition.
385 | - if your image *was not* classified correctly: draw a *slightly*
386 | modified version of the same digit, that you believe will be
387 | classified *correctly*. Run this second image through the steps
388 | above, and confirm your intuition.
389 |
390 | (Your second image should still be approximately square, include only
391 | the digit and the white background, and have a small margin around the
392 | edge of the writing, i.e. it should also "look like" the MNIST
393 | samples.)
394 | :::
395 |
396 | ::: {.cell .markdown }
397 | ### What to submit
398 |
399 | Don't submit the entire notebook. Instead, submit only the following
400 | items (for *your two handwritten digit samples*, not my example):
401 |
402 | - The visualization of your test image before pre-processing.
403 | - The visualization of your test image after pre-processing.
404 | - The bar plot showing the conditional probabilities per class for
405 | your test image.
406 | - The predicted class label for your test image.
407 | - The figure from the "Explain the model prediction" section.
408 | - **In your own words**, list the classes for which the logistic
409 | regression predicted a high or moderately high probability. Using
410 | the figure from the "explain the model prediction" section,
411 | explain *why* the logistic regression estimates that these classes
412 | are very likely or moderately likely.
413 | - Explain: how did you know what changes to make to your original
414 | drawing to create a modified version that would get a different
415 | predicted class label?
416 | :::
417 |
--------------------------------------------------------------------------------
/notebooks/5-logistic-regression-digits.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/5-logistic-regression-digits.pdf
--------------------------------------------------------------------------------
/notebooks/5-logistic-regression-in-depth.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/5-logistic-regression-in-depth.pdf
--------------------------------------------------------------------------------
/notebooks/6-decision-trees.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Demo: Decision trees\n",
8 | "====================\n",
9 | "\n",
10 | "*Fraida Fund*"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "This is a simple demo notebook that demonstrates a decision tree classifier.\n",
18 | "\n",
19 | "**Attribution**: Parts of this notebook are slightly modified from [this tutorial from “Intro to Data Mining”](http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/tutorial6.html)."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "import matplotlib.pyplot as plt\n",
31 | "import seaborn as sns\n",
32 | "\n",
33 | "import sklearn\n",
34 | "from sklearn.tree import DecisionTreeClassifier\n",
35 | "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "df = pd.read_csv('http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/vertebrate.csv')\n",
45 | "df"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "We’l make it a binary classification problem:"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "df['Class'] = df['Class'].replace(['fishes','birds','amphibians','reptiles'],'non-mammals')\n",
62 | "df"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "Decision tree\n",
70 | "-------------"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "y = df['Class']\n",
80 | "X = df.drop(['Name','Class'],axis=1)\n",
81 | "\n",
82 | "clf_dt = DecisionTreeClassifier(criterion='entropy')\n",
83 | "clf_dt = clf_dt.fit(X, y)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "plt.figure(figsize=(10,10))\n",
93 | "sklearn.tree.plot_tree(clf_dt, \n",
94 | " feature_names = df.columns.drop(['Name', 'Class']),\n",
95 | " class_names = [\"mammals\", \"non-mammals\"],\n",
96 | " filled=True, rounded=True);"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### Feature importance"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "df_importance = pd.DataFrame({'feature': df.columns.drop(['Name', 'Class']),\n",
113 | " 'importance': clf_dt.feature_importances_})\n",
114 | "df_importance"
115 | ]
116 | }
117 | ],
118 | "nbformat": 4,
119 | "nbformat_minor": 0,
120 | "metadata": {
121 | "language_info": {
122 | "name": "python"
123 | },
124 | "kernelspec": {
125 | "name": "python3",
126 | "display_name": "Python 3"
127 | },
128 | "colab": {
129 | "toc_visible": "true"
130 | }
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/notebooks/6-decision-trees.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Demo: Decision trees'
3 | author: 'Fraida Fund'
4 | jupyter:
5 | colab:
6 | toc_visible: true
7 | kernelspec:
8 | display_name: Python 3
9 | name: python3
10 | language_info:
11 | name: python
12 | nbformat: 4
13 | nbformat_minor: 0
14 | ---
15 |
16 | ::: {.cell .markdown }
17 | # Demo: Decision trees
18 |
19 | *Fraida Fund*
20 | :::
21 |
22 | ::: {.cell .markdown }
23 | This is a simple demo notebook that demonstrates a decision tree
24 | classifier.
25 |
26 | **Attribution**: Parts of this notebook are slightly modified from [this
27 | tutorial from "Intro to Data
28 | Mining"](http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/tutorial6.html).
29 | :::
30 |
31 | ::: {.cell .code }
32 | ```python
33 | import pandas as pd
34 | import numpy as np
35 | import matplotlib.pyplot as plt
36 | import seaborn as sns
37 |
38 | import sklearn
39 | from sklearn.tree import DecisionTreeClassifier
40 | from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
41 | ```
42 | :::
43 |
44 | ::: {.cell .code }
45 | ```python
46 | df = pd.read_csv('http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/vertebrate.csv')
47 | df
48 | ```
49 | :::
50 |
51 | ::: {.cell .markdown }
52 | We'll make it a binary classification problem:
53 | :::
54 |
55 | ::: {.cell .code }
56 | ```python
57 | df['Class'] = df['Class'].replace(['fishes','birds','amphibians','reptiles'],'non-mammals')
58 | df
59 | ```
60 | :::
61 |
62 | ::: {.cell .markdown }
63 | ## Decision tree
64 | :::
65 |
66 | ::: {.cell .code }
67 | ```python
68 | y = df['Class']
69 | X = df.drop(['Name','Class'],axis=1)
70 |
71 | clf_dt = DecisionTreeClassifier(criterion='entropy')
72 | clf_dt = clf_dt.fit(X, y)
73 | ```
74 | :::
75 |
76 | ::: {.cell .code }
77 | ```python
78 | plt.figure(figsize=(10,10))
79 | sklearn.tree.plot_tree(clf_dt,
80 | feature_names = df.columns.drop(['Name', 'Class']),
81 | class_names = ["mammals", "non-mammals"],
82 | filled=True, rounded=True);
83 | ```
84 | :::
85 |
86 | ::: {.cell .markdown }
87 | ### Feature importance
88 | :::
89 |
90 | ::: {.cell .code }
91 | ```python
92 | df_importance = pd.DataFrame({'feature': df.columns.drop(['Name', 'Class']),
93 | 'importance': clf_dt.feature_importances_})
94 | df_importance
95 | ```
96 | :::
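
::: {.cell .markdown }
A quick bar chart (a small addition, using the `seaborn` import above) makes
these importances easier to compare:
:::

::: {.cell .code }
```python
# plot the feature importances computed above (added for readability)
sns.barplot(data=df_importance, x='importance', y='feature');
```
:::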
97 |
--------------------------------------------------------------------------------
/notebooks/6-decision-trees.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/6-decision-trees.pdf
--------------------------------------------------------------------------------
/notebooks/6-k-nearest-neighbors-in-depth.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/6-k-nearest-neighbors-in-depth.pdf
--------------------------------------------------------------------------------
/notebooks/6-knn-tree-bias-variance.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Bias/variance of non-parametric models\n",
8 | "======================================\n",
9 | "\n",
10 | "*Fraida Fund*"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import numpy as np\n",
20 | "import matplotlib.pyplot as plt\n",
21 | "import seaborn as sns\n",
22 | "\n",
23 | "from sklearn.tree import DecisionTreeRegressor\n",
24 | "from sklearn.neighbors import KNeighborsRegressor"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "Generate data\n",
32 | "-------------"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "We will generate data from the true function\n",
40 | "\n",
41 | "$$ t(x) = e^{-x^2} + 1.5 e^{-(x-2)^2}$$\n",
42 | "\n",
43 | "in the range $-5 < x < 5$.\n",
44 | "\n",
45 | "To this we will add Gaussian noise $\\epsilon$ so that\n",
46 | "\n",
47 | "$$ y = t(x) + \\epsilon$$\n",
48 | "\n",
49 | "We will use this data for *all* of the models trained in this notebook."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "# Utility functions to generate data\n",
59 | "def f(x):\n",
60 | " x = x.ravel()\n",
61 | " return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2)\n",
62 | "\n",
63 | "def generate(n_samples, noise, n_repeat=1):\n",
64 | " X = np.random.rand(n_samples) * 10 - 5\n",
65 | " X = np.sort(X)\n",
66 | " if n_repeat == 1:\n",
67 | " y = f(X) + np.random.normal(0.0, noise, n_samples)\n",
68 | " else:\n",
69 | " y = np.zeros((n_samples, n_repeat))\n",
70 | " for i in range(n_repeat):\n",
71 | " y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples)\n",
72 | "\n",
73 | " X = X.reshape((n_samples, 1))\n",
74 | " return X, y"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "Set up simulation\n",
82 | "-----------------"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# Simulation settings\n",
92 | "n_repeat = 500 # Number of iterations for computing expectations\n",
93 | "n_train = 500 # Size of the training set\n",
94 | "n_test = 1000 # Size of the test set\n",
95 | "noise = 0.15 # Standard deviation of the noise\n",
96 | "np.random.seed(4)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "def plot_simulation(estimators):\n",
106 | "\n",
107 | " n_estimators = len(estimators)\n",
108 | " plt.figure(figsize=(5*n_estimators, 10))\n",
109 | "\n",
110 | " # Loop over estimators to compare\n",
111 | " for n, (name, estimator) in enumerate(estimators):\n",
112 | " # Compute predictions\n",
113 | " y_predict = np.zeros((n_test, n_repeat))\n",
114 | "\n",
115 | " for i in range(n_repeat):\n",
116 | " estimator.fit(X_train[i].reshape(-1,1), y_train[i])\n",
117 | " y_predict[:, i] = estimator.predict(X_test.reshape(-1,1))\n",
118 | "\n",
119 | " # Bias^2 + Variance + Noise decomposition of the mean squared error\n",
120 | " y_error = np.zeros(n_test)\n",
121 | "\n",
122 | " for i in range(n_repeat):\n",
123 | " for j in range(n_repeat):\n",
124 | " y_error += (y_test[:, j] - y_predict[:, i]) ** 2\n",
125 | "\n",
126 | " y_error /= (n_repeat * n_repeat)\n",
127 | "\n",
128 | " y_noise = np.var(y_test, axis=1)\n",
129 | " y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2\n",
130 | " y_var = np.var(y_predict, axis=1)\n",
131 | "\n",
132 | " # Plot figures\n",
133 | " plt.subplot(2, n_estimators, n + 1)\n",
134 | " plt.plot(X_test, f(X_test), \"b\", label=\"$f(x)$\")\n",
135 | " plt.plot(X_train[0], y_train[0], \".b\", alpha=0.2, label=\"$y = f(x)+noise$\")\n",
136 | "\n",
137 | " for i in range(20):\n",
138 | " if i == 0:\n",
139 | " plt.plot(X_test, y_predict[:, i], \"r\", label=r\"$\\^y(x)$\")\n",
140 | " else:\n",
141 | " plt.plot(X_test, y_predict[:, i], \"r\", alpha=0.1)\n",
142 | "\n",
143 | " plt.plot(X_test, np.mean(y_predict, axis=1), \"c\",\n",
144 | " label=r\"$E[ \\^y(x)]$\")\n",
145 | "\n",
146 | " plt.xlim([-5, 5])\n",
147 | " plt.title(name)\n",
148 | "\n",
149 | " if n == n_estimators - 1:\n",
150 | " plt.legend(loc=(1.1, .5))\n",
151 | "\n",
152 | " plt.subplot(2, n_estimators, n_estimators + n + 1)\n",
153 | " plt.plot(X_test, y_noise, \"c\", label=\"$noise(x)$\", alpha=0.3)\n",
154 | " plt.plot(X_test, y_bias, \"b\", label=\"$bias^2(x)$\", alpha=0.6),\n",
155 | " plt.plot(X_test, y_var, \"g\", label=\"$variance(x)$\", alpha=0.6),\n",
156 | " plt.plot(X_test, y_error, \"r\", label=\"$error(x)$\", alpha=0.4)\n",
157 | " plt.title(\"{0:.4f} (error) = {1:.4f} (bias^2) \\n\"\n",
158 | " \" + {2:.4f} (var) + {3:.4f} (noise)\".format( np.mean(y_error),\n",
159 | " np.mean(y_bias),\n",
160 | " np.mean(y_var),\n",
161 | " np.mean(y_noise)))\n",
162 | "\n",
163 | " plt.xlim([-5, 5])\n",
164 | " plt.ylim([0, 0.1])\n",
165 | "\n",
166 | " if n == n_estimators - 1:\n",
167 | "\n",
168 | " plt.legend(loc=(1.1, .5))\n",
169 | "\n",
170 | " plt.subplots_adjust(right=.75)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "X_train = np.zeros(shape=(n_repeat, n_train))\n",
189 | "y_train = np.zeros(shape=(n_repeat, n_train))\n",
190 | "\n",
191 | "for i in range(n_repeat):\n",
192 | " X, y = generate(n_samples=n_train, noise=noise)\n",
193 | " X_train[i] = X.ravel()\n",
194 | " y_train[i] = y"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "plt.figure(figsize=(5,5))\n",
204 | "plt.plot(X_test, f(X_test), \"b\", label=\"$f(x)$\");\n",
205 | "plt.plot(X_train[0], y_train[0], \".b\", alpha=0.2, label=\"$y = f(x)+noise$\");\n",
206 | "plt.legend(loc=(1.1, .5));"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "K Nearest Neighbors\n",
214 | "-------------------"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "Consider the following KNN regression models. Which model will have more bias? Which model will have more variance?\n",
222 | "\n",
223 | "- **Model A**: K = 1\n",
224 | "- **Model B**: K = 75"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "estimators = [(\"1-NN\", KNeighborsRegressor(n_neighbors=1)),\n",
234 | " (\"75-NN\", KNeighborsRegressor(n_neighbors=75))]\n",
235 | "\n",
236 | "plot_simulation(estimators)"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "Decision tree by depth\n",
244 | "----------------------"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "Consider the following decision tree regression models. Which model will have more bias? Which model will have more variance?\n",
252 | "\n",
253 | "- **Model A**: Max depth = 5\n",
254 | "- **Model B**: Max depth = 100"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "estimators = [(\"DT - depth <= 5\", DecisionTreeRegressor(max_depth=5)),\n",
264 | " (\"DT - depth <= 100\", DecisionTreeRegressor(max_depth=100))]\n",
265 | "\n",
266 | "plot_simulation(estimators)"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "Decision tree by pruning parameter\n",
274 | "----------------------------------"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "Suppose we use cost complexity tuning to train the decision tree that minimizes\n",
282 | "\n",
283 | "$$\\sum_{m=1}^{|T|} \\sum_{x_i}^{R_m} (y_i - \\hat{y}_{R_m})^2 + \\alpha |T| $$\n",
284 | "\n",
285 | "Consider the following decision tree regression models. Which model will have more bias? Which model will have more variance?\n",
286 | "\n",
287 | "- **Model A**: $\\alpha = 0.00001$\n",
288 | "- **Model B**: $\\alpha = 0.001$"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "estimators = [(\"DT - α = 10e-6\", DecisionTreeRegressor(ccp_alpha=10e-6)),\n",
298 | " (\"DT - α = 10e-4\", DecisionTreeRegressor(ccp_alpha=10e-4))]\n",
299 | "\n",
300 | "plot_simulation(estimators)"
301 | ]
302 | }
303 | ],
304 | "nbformat": 4,
305 | "nbformat_minor": 0,
306 | "metadata": {
307 | "language_info": {
308 | "name": "python"
309 | },
310 | "kernelspec": {
311 | "name": "python3",
312 | "display_name": "Python 3"
313 | },
314 | "colab": {
315 | "toc_visible": "true"
316 | }
317 | }
318 | }
319 |
--------------------------------------------------------------------------------
/notebooks/6-knn-tree-bias-variance.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Bias and variance of non-parametric models'
3 | author: 'Fraida Fund'
4 | jupyter:
5 | colab:
6 | toc_visible: true
7 | kernelspec:
8 | display_name: Python 3
9 | name: python3
10 | language_info:
11 | name: python
12 | nbformat: 4
13 | nbformat_minor: 0
14 | ---
15 |
16 | ::: {.cell .markdown }
17 | # Bias/variance of non-parametric models
18 |
19 | *Fraida Fund*
20 | :::
21 |
22 |
23 | ::: {.cell .code }
24 | ```python
25 | import numpy as np
26 | import matplotlib.pyplot as plt
27 | import seaborn as sns
28 |
29 | from sklearn.tree import DecisionTreeRegressor
30 | from sklearn.neighbors import KNeighborsRegressor
31 | ```
32 | :::
33 |
34 | ::: {.cell .markdown }
35 | ## Generate data
36 | :::
37 |
38 | ::: {.cell .markdown }
39 | We will generate data from the true function
40 |
41 | $$ t(x) = e^{-x^2} + 1.5 e^{-(x-2)^2}$$
42 |
43 | in the range $-5 < x < 5$.
44 |
45 | To this we will add Gaussian noise $\epsilon$ so that
46 |
47 | $$ y = t(x) + \epsilon$$
48 |
49 | We will use this data for *all* of the models trained in this notebook.
50 | :::
51 |
52 | ::: {.cell .code }
53 | ```python
54 | # Utility functions to generate data
55 | def f(x):
56 | x = x.ravel()
57 | return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2)
58 |
59 | def generate(n_samples, noise, n_repeat=1):
60 | X = np.random.rand(n_samples) * 10 - 5
61 | X = np.sort(X)
62 | if n_repeat == 1:
63 | y = f(X) + np.random.normal(0.0, noise, n_samples)
64 | else:
65 | y = np.zeros((n_samples, n_repeat))
66 | for i in range(n_repeat):
67 | y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples)
68 |
69 | X = X.reshape((n_samples, 1))
70 | return X, y
71 | ```
72 | :::
73 |
74 | ::: {.cell .markdown }
75 | ## Set up simulation
76 | :::
77 |
78 | ::: {.cell .code }
79 | ```python
80 | # Simulation settings
81 | n_repeat = 500 # Number of iterations for computing expectations
82 | n_train = 500 # Size of the training set
83 | n_test = 1000 # Size of the test set
84 | noise = 0.15 # Standard deviation of the noise
85 | np.random.seed(4)
86 | ```
87 | :::
88 |
89 | ::: {.cell .code }
90 | ```python
91 | def plot_simulation(estimators):
92 |
93 | n_estimators = len(estimators)
94 | plt.figure(figsize=(5*n_estimators, 10))
95 |
96 | # Loop over estimators to compare
97 | for n, (name, estimator) in enumerate(estimators):
98 | # Compute predictions
99 | y_predict = np.zeros((n_test, n_repeat))
100 |
101 | for i in range(n_repeat):
102 | estimator.fit(X_train[i].reshape(-1,1), y_train[i])
103 | y_predict[:, i] = estimator.predict(X_test.reshape(-1,1))
104 |
105 | # Bias^2 + Variance + Noise decomposition of the mean squared error
106 | y_error = np.zeros(n_test)
107 |
108 | for i in range(n_repeat):
109 | for j in range(n_repeat):
110 | y_error += (y_test[:, j] - y_predict[:, i]) ** 2
111 |
112 | y_error /= (n_repeat * n_repeat)
113 |
114 | y_noise = np.var(y_test, axis=1)
115 | y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2
116 | y_var = np.var(y_predict, axis=1)
117 |
118 | # Plot figures
119 | plt.subplot(2, n_estimators, n + 1)
120 | plt.plot(X_test, f(X_test), "b", label="$f(x)$")
121 | plt.plot(X_train[0], y_train[0], ".b", alpha=0.2, label="$y = f(x)+noise$")
122 |
123 | for i in range(20):
124 | if i == 0:
125 | plt.plot(X_test, y_predict[:, i], "r", label=r"$\^y(x)$")
126 | else:
127 | plt.plot(X_test, y_predict[:, i], "r", alpha=0.1)
128 |
129 | plt.plot(X_test, np.mean(y_predict, axis=1), "c",
130 | label=r"$E[ \^y(x)]$")
131 |
132 | plt.xlim([-5, 5])
133 | plt.title(name)
134 |
135 | if n == n_estimators - 1:
136 | plt.legend(loc=(1.1, .5))
137 |
138 | plt.subplot(2, n_estimators, n_estimators + n + 1)
139 | plt.plot(X_test, y_noise, "c", label="$noise(x)$", alpha=0.3)
140 | plt.plot(X_test, y_bias, "b", label="$bias^2(x)$", alpha=0.6),
141 | plt.plot(X_test, y_var, "g", label="$variance(x)$", alpha=0.6),
142 | plt.plot(X_test, y_error, "r", label="$error(x)$", alpha=0.4)
143 | plt.title("{0:.4f} (error) = {1:.4f} (bias^2) \n"
144 | " + {2:.4f} (var) + {3:.4f} (noise)".format( np.mean(y_error),
145 | np.mean(y_bias),
146 | np.mean(y_var),
147 | np.mean(y_noise)))
148 |
149 | plt.xlim([-5, 5])
150 | plt.ylim([0, 0.1])
151 |
152 | if n == n_estimators - 1:
153 |
154 | plt.legend(loc=(1.1, .5))
155 |
156 | plt.subplots_adjust(right=.75)
157 | ```
158 | :::
159 |
160 | ::: {.cell .code }
161 | ```python
162 | X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat)
163 | ```
164 | :::
165 |
166 | ::: {.cell .code }
167 | ```python
168 | X_train = np.zeros(shape=(n_repeat, n_train))
169 | y_train = np.zeros(shape=(n_repeat, n_train))
170 |
171 | for i in range(n_repeat):
172 | X, y = generate(n_samples=n_train, noise=noise)
173 | X_train[i] = X.ravel()
174 | y_train[i] = y
175 | ```
176 | :::
177 |
178 | ::: {.cell .code }
179 | ```python
180 | plt.figure(figsize=(5,5))
181 | plt.plot(X_test, f(X_test), "b", label="$f(x)$");
182 | plt.plot(X_train[0], y_train[0], ".b", alpha=0.2, label="$y = f(x)+noise$");
183 | plt.legend(loc=(1.1, .5));
184 | ```
185 | :::
186 |
187 | ::: {.cell .markdown }
188 | ## K Nearest Neighbors
189 | :::
190 |
191 | ::: {.cell .markdown }
192 | Consider the following KNN regression models. Which model will have more
193 | bias? Which model will have more variance?
194 |
195 | - **Model A**: K = 1
196 | - **Model B**: K = 75
197 | :::
198 |
199 | ::: {.cell .code }
200 | ```python
201 | estimators = [("1-NN", KNeighborsRegressor(n_neighbors=1)),
202 | ("75-NN", KNeighborsRegressor(n_neighbors=75))]
203 |
204 | plot_simulation(estimators)
205 | ```
206 | :::
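
::: {.cell .markdown }
To see the tradeoff more directly, the following added sketch sweeps over
several values of K (the specific values are arbitrary) and computes the
average squared bias and variance for each, re-using the simulated training
and test sets from above:
:::

::: {.cell .code }
```python
# added sketch: average bias^2 and variance as a function of K
k_values = [1, 3, 7, 15, 35, 75]
bias2_k, var_k = [], []

for k in k_values:
    y_predict = np.zeros((n_test, n_repeat))
    for i in range(n_repeat):
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(X_train[i].reshape(-1,1), y_train[i])
        y_predict[:, i] = model.predict(X_test.reshape(-1,1))
    # squared bias and variance, averaged over the test points
    bias2_k.append(np.mean((f(X_test) - np.mean(y_predict, axis=1)) ** 2))
    var_k.append(np.mean(np.var(y_predict, axis=1)))

plt.plot(k_values, bias2_k, label='bias$^2$')
plt.plot(k_values, var_k, label='variance')
plt.xlabel('K')
plt.legend();
```
:::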
207 |
208 | ::: {.cell .markdown }
209 | ## Decision tree by depth
210 | :::
211 |
212 | ::: {.cell .markdown }
213 | Consider the following decision tree regression models. Which model will
214 | have more bias? Which model will have more variance?
215 |
216 | - **Model A**: Max depth = 5
217 | - **Model B**: Max depth = 100
218 | :::
219 |
220 | ::: {.cell .code }
221 | ```python
222 | estimators = [("DT - depth <= 5", DecisionTreeRegressor(max_depth=5)),
223 | ("DT - depth <= 100", DecisionTreeRegressor(max_depth=100))]
224 |
225 | plot_simulation(estimators)
226 | ```
227 | :::
228 |
229 | ::: {.cell .markdown }
230 | ## Decision tree by pruning parameter
231 | :::
232 |
233 | ::: {.cell .markdown }
234 | Suppose we use cost complexity tuning to train the decision tree that
235 | minimizes
236 |
237 | $$\sum_{m=1}^{|T|} \sum_{x_i}^{R_m} (y_i - \hat{y}_{R_m})^2 + \alpha |T| $$
238 |
239 | Consider the following decision tree regression models. Which model will
240 | have more bias? Which model will have more variance?
241 |
242 | - **Model A**: $\alpha = 0.00001$
243 | - **Model B**: $\alpha = 0.001$
244 | :::
245 |
246 | ::: {.cell .code }
247 | ```python
248 | estimators = [("DT - α = 10e-6", DecisionTreeRegressor(ccp_alpha=10e-6)),
249 | ("DT - α = 10e-4", DecisionTreeRegressor(ccp_alpha=10e-4))]
250 |
251 | plot_simulation(estimators)
252 | ```
253 | :::
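
::: {.cell .markdown }
As a side note (an added sketch, not needed for the simulation above),
scikit-learn can report the candidate $\alpha$ values at which a fitted tree
would be pruned, via `cost_complexity_pruning_path`. For example, using one of
the simulated training sets:
:::

::: {.cell .code }
```python
# added sketch: effective pruning strengths for a tree grown on one training set
path = DecisionTreeRegressor().cost_complexity_pruning_path(
    X_train[0].reshape(-1,1), y_train[0])
print(path.ccp_alphas[:10])   # smallest candidate alpha values
```
:::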
254 |
--------------------------------------------------------------------------------
/notebooks/6-knn-tree-bias-variance.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/6-knn-tree-bias-variance.pdf
--------------------------------------------------------------------------------
/notebooks/6-knn-voter-classification-hw.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/6-knn-voter-classification-hw.pdf
--------------------------------------------------------------------------------
/notebooks/7-demo-adaboost.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Demo: AdaBoost Classifier'
3 | author: 'Fraida Fund'
4 | jupyter:
5 | colab:
6 | toc_visible: true
7 | kernelspec:
8 | display_name: Python 3
9 | name: python3
10 | nbformat: 4
11 | nbformat_minor: 0
12 | ---
13 |
14 | ::: {.cell .markdown }
15 | # Demo: AdaBoost Classifier
16 |
17 | *Fraida Fund*
18 | :::
19 |
20 | ::: {.cell .markdown }
21 | In this demo, we will build and train our own AdaBoost classifier, in
22 | order to better understand how this algorithm works. (At the end, we'll
23 | look at the `sklearn` implementation of AdaBoost and note that its
24 | behavior is identical.)
25 |
26 | This demo is based on the following blog post: [AdaBoost: Implementation
27 | and intuition](https://xavierbourretsicotte.github.io/AdaBoost.html)
28 | :::
29 |
30 | ::: {.cell .markdown }
31 | Note: for non-demo purposes, you can use the `sklearn` implementation,
32 | `AdaBoostClassifier`
33 | ([reference](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)).
34 | :::
35 |
36 | ::: {.cell .code }
37 | ```python
38 | import matplotlib.pyplot as plt
39 | from matplotlib.colors import ListedColormap
40 | import numpy as np
41 | import seaborn as sns
42 |
43 | from sklearn.datasets import make_circles
44 | from sklearn.tree import DecisionTreeClassifier, plot_tree
45 | from sklearn.ensemble import AdaBoostClassifier
46 | from sklearn.metrics import accuracy_score
47 | ```
48 | :::
49 |
50 | ::: {.cell .markdown }
51 | ## Generate data
52 |
53 | First, we will generate and plot some "toy" data for a binary
54 | classification problem with class labels $-1, 1$.
55 | :::
56 |
57 | ::: {.cell .code }
58 | ```python
59 | X, y = make_circles(noise=0.1, factor=0.4, n_samples=50, random_state=3)
60 | y = y*2-1
61 | x1 = X[:,0]
62 | x2 = X[:,1]
63 | sns.scatterplot(x=x1, y=x2, hue=y, palette={-1:'red', 1:'blue'});
64 | ```
65 | :::
66 |
67 | ::: {.cell .markdown }
68 | ## Select the base estimator and the number of estimators
69 |
70 | The parameters of the base estimator and the number of base estimators
71 | are tuning parameters.
72 |
73 | - If the number of estimators (the number of rounds of boosting) is
74 | small, the ensemble may have large bias.
75 | - If the base estimator is too complex (e.g. a deep tree), the
76 | ensemble may have high variance.
77 | :::
78 |
79 | ::: {.cell .code }
80 | ```python
81 | n_estimators = 15
82 |
83 | dt = []
84 | for i in range(n_estimators):
85 | dt.append(DecisionTreeClassifier(max_depth = 1))
86 | ```
87 | :::
88 |
89 | ::: {.cell .markdown }
90 | ## Initialize weights
91 |
92 | In the first step of the algorithm, let $w_i = \frac{1}{N}$ for all $i$
93 | in the training set.
94 | :::
95 |
96 | ::: {.cell .code }
97 | ```python
98 | weights = np.repeat(1/len(y), repeats=len(y))
99 | weights
100 | ```
101 | :::
102 |
103 | ::: {.cell .markdown }
104 | ## Main loop
105 |
106 | In each iteration, we:
107 |
108 | - Fit a decision stump (tree with depth 1) using a *weighted* version
109 | of the training data set, and get the predictions of the decision
110 | stump for the training data.
111 | - Compute weighted error:
112 |
113 | $$err_m = \frac{\sum_{i=1}^N w_i 1(y_i \neq \hat{f}^m(x_i))}{\sum_{i=1}^N w_i}$$
114 |
115 | - Compute coefficient:
116 |
117 | $$\alpha_m = \log \left( \frac{1-err_m}{err_m} \right)$$
118 |
119 | - Update weights:
120 |
121 | $$w_{i} \leftarrow w_i e^{\alpha_m 1(y_i \neq \hat{f}^m(x_i))}$$
122 | :::
123 |
124 | ::: {.cell .code }
125 | ```python
126 | w = np.zeros(shape=(n_estimators+1, len(weights)))
127 | y_pred = np.zeros(shape=(n_estimators, len(y)))
128 | err = np.zeros(n_estimators)
129 | alpha = np.zeros(n_estimators)
130 | acc = np.zeros(n_estimators)
131 |
132 | w[0] = weights
133 |
134 | # loop over the number of base estimators
135 | for m in range(n_estimators):
136 | # fit decision stump and get its predictions
137 | dt[m].fit(X, y, sample_weight=w[m])
138 | y_pred[m] = dt[m].predict(X)
139 |
140 | # compute accuracy of the stump
141 | # (not really required, just out of interest)
142 | acc[m] = accuracy_score(y, y_pred[m])
143 |
144 | # compute weighted error
145 | err[m] = sum(w[m]*(y_pred[m]!=y))/sum(w[m])
146 |
147 | # compute coefficient
148 | alpha[m] = np.log((1.0-err[m])/err[m])
149 |
150 | # update weights
151 | w[m+1] = w[m]*np.exp(alpha[m]*(y_pred[m]!=y))/np.sum(w[m])
152 | ```
153 | :::
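
::: {.cell .markdown }
As a quick check on the update rule (a small added cell), we can confirm that
after an iteration, every misclassified sample's weight grew by a factor of
$e^{\alpha_m}$ while the correctly classified samples' weights did not, once we
undo the normalization by $\sum_i w_i$:
:::

::: {.cell .code }
```python
# added check: weight growth factors after the first iteration
m = 0
growth = (w[m+1] / w[m]) * np.sum(w[m])  # undo the normalization step
print(np.unique(np.round(growth, 6)))    # should show 1 and exp(alpha[m])
print(np.exp(alpha[m]))
```
:::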
154 |
155 | ::: {.cell .markdown }
156 | ## Ensemble prediction
157 |
158 | The ensemble prediction is
159 |
160 | $$\hat{f}(x) = \text{sign} \left[\sum_{m=1}^M \alpha_m \hat{f}^m(x)\right]$$
161 |
162 | The sign of the ensemble output gives the predicted class.
163 |
164 | The magnitude,
165 |
166 | $$\text{abs} \left[\sum_{m=1}^M \alpha_m \hat{f}^m(x)\right]$$
167 |
168 | indicates how confident the ensemble is in the prediction.
169 |
170 | We will store the ensemble output from each stage of training, so that
171 | we can see how it changes. In general, however, it is only necessary to
172 | compute the ensemble output once, after the last iteration.
173 | :::
174 |
175 | ::: {.cell .code }
176 | ```python
177 | y_pred_en = np.zeros(shape=(n_estimators, len(y)))
178 | acc_en = np.zeros(n_estimators)
179 |
180 | for m in range(n_estimators):
181 | # compute ensemble prediction and its accuracy
182 | for i in range(m+1):
183 | y_pred_en[m] += alpha[i]*dt[i].predict(X)
184 | acc_en[m] = np.mean(y==np.sign(y_pred_en[m]))
185 | ```
186 | :::
187 |
188 | ::: {.cell .markdown }
189 | ## Visualization
190 |
191 | We will create the following plots in each iteration:
192 |
193 | 1. Plot of decision boundaries for the decision stump learned in this
194 | iteration.
195 |
196 | - Each region is shaded red or blue according to the prediction for
197 | the region, $\hat{y}_{R_k}$.
198 | - The intensity of the color of the region indicates the weighted
199 | proportion of samples in the region that belong to the predicted
200 | class. This is the estimated probability that a sample in this
201 | region belongs to the predicted class:
202 |
203 | $$P(y=\hat{y}_{R_k} | x) = \frac{\sum_{i:x_i \in R_k} w_i 1(y_i=\hat{y}_{R_k})} {\sum_{i:x_i \in R_k} w_i} $$
204 |
205 | - Training points are plotted on top of the decision regions, with
206 | their color indicating their true class, and their size indicating
207 | their relative weight at the beginning of this iteration.
208 |
209 | 1. Tree visualization for the decision stump learned in this iteration.
210 |
211 | - Each leaf node is shaded red or blue according to the prediction for
212 | the node. The intensity of the color is again
213 |
214 | $$P(y=\hat{y}_{R_k} | x)$$
215 |
216 | - The `value=[a, b]` line in each node gives the weighted sum of
217 | samples in each class that appear at that node. (These weighted
218 | values are used to compute the GINI index and choose the feature and
219 | cutpoint to split on):
220 |
221 | $$\sum_{i:x_i \in R_k} w_i 1(y_i=-1), \sum_{i:x_i \in R_k} w_i 1(y_i=1)$$
222 |
223 | 1. Scatter plot of training points showing *change* in weight after
224 | this iteration.
225 |
226 | - The color of each point shows the ratio of its weight *after* this
227 | iteration, to its weight *before* this iteration. Any sample that is
228 | misclassified by the decision stump should have its weight increase
229 | by a factor of $e^{\alpha_m}$.
230 |
231 | 1. Ensemble prediction after this iteration.
232 |
233 | - The color of each point shows its predicted class:
234 | $$\text{sign} \left[\sum_{m=1}^M \alpha_m \hat{f}^m(x)\right]$$
235 |
236 | - The size of each point shows the confidence of the prediction:
237 | $$\text{abs} \left[\sum_{m=1}^M \alpha_m \hat{f}^m(x)\right]$$
238 | :::
239 |
240 | ::: {.cell .code }
241 | ```python
242 | # utility function for plotting decision regions and scatter plot of data
243 |
244 | def plot_decision_boundary(classifier, X, y, N = 10, scatter_weights = np.ones(len(y)) , ax = None ):
245 |
246 | x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
247 | y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
248 | xx, yy = np.meshgrid( np.linspace(x_min, x_max, N), np.linspace(y_min, y_max, N))
249 | zz = np.array( [classifier.predict_proba(np.array([xi,yi]).reshape(1,-1))[:,1] for xi, yi in zip(np.ravel(xx), np.ravel(yy)) ] )
250 |
251 | Z = zz.reshape(xx.shape)
252 | cm_bright = ListedColormap(['#FF0000', '#0000FF'])
253 |
254 | # get current axis and plot
255 | if ax is None:
256 | ax = plt.gca()
257 | ax.contourf(xx, yy, Z, 2, cmap='RdBu', alpha=.5, vmin=0, vmax=1)
258 | ax.scatter(X[:,0],X[:,1], c = y, cmap = cm_bright, s = scatter_weights * 40)
259 | ```
260 | :::
261 |
262 | ::: {.cell .code }
263 | ```python
264 | for m in range(n_estimators):
265 | # plot the decision stump and its decision regions
266 | # size of each point shows its relative weight
267 | fig = plt.figure(figsize = (20, 4*n_estimators));
268 | plt.subplot(n_estimators, 4, 1+m*4)
269 | plot_decision_boundary(dt[m], X,y,N = 50, scatter_weights =w[m]*30/sum(w[m]) )
270 | plt.title("Decision boundary for iteration %d (%0.2f)" % (m, acc[m]));
271 |
272 | # plot the tree diagram for the decision stump
273 | plt.subplot(n_estimators, 4, 2+m*4)
274 | plot_tree(dt[m], filled=True, rounded=True, feature_names=['x1', 'x2']);
275 | plt.title("Iteration %d \n Alpha = %0.2f, Err = %0.2f" % (m, alpha[m], err[m]));
276 |
277 | # plot the change in weights - show which points have increased weight
278 | # following this iteration
279 | plt.subplot(n_estimators, 4, 3+m*4)
280 | sns.scatterplot(x=x1, y=x2, hue=w[m+1]/w[m], legend=False);
281 | plt.title("Samples with > weight after iteration %d" % m);
282 |
283 | # plot ensemble prediction and its accuracy
284 | # size of point shows confidence in prediction
285 | plt.subplot(n_estimators, 4, 4+m*4)
286 | sns.scatterplot(x=x1, y=x2, hue=np.sign(y_pred_en[m]),
287 | size=10*np.abs(y_pred_en[m]), legend=False,
288 | palette={-1:'red', 0:'purple', 1:'blue'});
289 | plt.title("Ensemble prediction (%0.2f)" % acc_en[m]);
290 | ```
291 | :::
292 |
293 | ::: {.cell .markdown }
294 | ## `sklearn` implementation
295 | :::
296 |
297 | ::: {.cell .code }
298 | ```python
299 | from sklearn.ensemble import AdaBoostClassifier
300 |
301 | clf_ab = AdaBoostClassifier(n_estimators = n_estimators, algorithm='SAMME',
302 | estimator = DecisionTreeClassifier(max_depth=1))
303 | clf_ab.fit(X, y)
304 | accuracy_score(clf_ab.predict(X), y)
305 | ```
306 | :::
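
::: {.cell .markdown }
Since the same type of update is used, the estimator weights reported by
`sklearn` should closely match the $\alpha_m$ values computed by hand above (a
small added comparison; exact agreement may depend on tie-breaking when
fitting the stumps):
:::

::: {.cell .code }
```python
# added comparison: hand-computed alphas vs. sklearn's estimator weights
print(np.round(alpha, 3))
print(np.round(clf_ab.estimator_weights_, 3))
```
:::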
307 |
308 | ::: {.cell .markdown }
309 | Note: in this implementation, we don't have information about the
310 | weights at each step, so our visualization won't include that.
311 | :::
312 |
313 | ::: {.cell .code }
314 | ```python
315 | y_pred_en_sk = np.zeros(shape=(n_estimators, len(y)))
316 | acc_en_sk = np.zeros(n_estimators)
317 | conf_en_sk = np.zeros(n_estimators)
318 |
319 | for m, pred in enumerate(clf_ab.staged_predict(X)):
320 | y_pred_en_sk[m] = pred
321 | acc_en_sk[m] = np.mean(y==np.sign(y_pred_en_sk[m]))
322 | ```
323 | :::
324 |
325 | ::: {.cell .code }
326 | ```python
327 | for m in range(n_estimators):
328 | # plot the decision stump and its decision regions
329 | # size of each point shows its relative weight
330 | fig = plt.figure(figsize = (15, 4*n_estimators));
331 | plt.subplot(n_estimators, 3, 1+m*3)
332 | plot_decision_boundary(clf_ab.estimators_[m], X,y,N = 50 )
333 | plt.title("Decision boundary for iteration %d (%0.2f)" %
334 | (m, accuracy_score(clf_ab.estimators_[m].predict(X), y)));
335 |
336 | # plot the tree diagram for the decision stump
337 | plt.subplot(n_estimators, 3, 2+m*3)
338 | plot_tree(clf_ab.estimators_[m], filled=True, rounded=True, feature_names=['x1', 'x2']);
339 | plt.title("Iteration %d \n Alpha = %0.2f, Err = %0.2f" %
340 | (m, clf_ab.estimator_weights_[m], clf_ab.estimator_errors_[m]));
341 |
342 | # plot ensemble prediction and its accuracy
343 | # size of point shows confidence in prediction
344 | plt.subplot(n_estimators, 3, 3+m*3)
345 | sns.scatterplot(x=x1, y=x2, hue=np.sign(y_pred_en_sk[m]),
346 | legend=False,
347 | palette={-1:'red', 0:'purple', 1:'blue'});
348 | plt.title("Ensemble prediction (%0.2f)" % acc_en_sk[m]);
349 | ```
350 | :::
351 |
352 | ::: {.cell .markdown }
353 | The overall decision boundary looks like this:
354 | :::
355 |
356 | ::: {.cell .code }
357 | ```python
358 | plot_decision_boundary(clf_ab, X, y)
359 | ```
360 | :::
361 |
--------------------------------------------------------------------------------
/notebooks/7-demo-adaboost.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/7-demo-adaboost.pdf
--------------------------------------------------------------------------------
/notebooks/7-demo-digits-classifiers.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/7-demo-digits-classifiers.pdf
--------------------------------------------------------------------------------
/notebooks/7-knn-tree-bias-variance.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Bias and variance of non-parametric models'
3 | author: 'Fraida Fund'
4 | jupyter:
5 | colab:
6 | toc_visible: true
7 | kernelspec:
8 | display_name: Python 3
9 | name: python3
10 | language_info:
11 | name: python
12 | nbformat: 4
13 | nbformat_minor: 0
14 | ---
15 |
16 | ::: {.cell .markdown }
17 | # Bias/variance of non-parametric models
18 |
19 | *Fraida Fund*
20 | :::
21 |
22 | ::: {.cell .markdown }
23 | **Attribution**: Parts of this notebook are adopted from: [Single
24 | estimator versus bagging: bias-variance
25 | decomposition](https://scikit-learn.org/stable/auto_examples/ensemble/plot_bias_variance.html#sphx-glr-auto-examples-ensemble-plot-bias-variance-py)
26 | :::
27 |
28 | ::: {.cell .code }
29 | ```python
30 | import numpy as np
31 | import matplotlib.pyplot as plt
32 | import seaborn as sns
33 |
34 | from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor, RandomForestRegressor
35 | from sklearn.tree import DecisionTreeRegressor
36 | from sklearn.neighbors import KNeighborsRegressor
37 | ```
38 | :::
39 |
40 | ::: {.cell .markdown }
41 | ## Generate data
42 | :::
43 |
44 | ::: {.cell .markdown }
45 | We will generate data from the true function
46 |
47 | $$ t(x) = e^{-x^2} + 1.5 e^{-(x-2)^2}$$
48 |
49 | in the range $-5 < x < 5$.
50 |
51 | To this we will add Gaussian noise $\epsilon$ so that
52 |
53 | $$ y = t(x) + \epsilon$$
54 |
55 | We will use this data for *all* of the models trained in this notebook.
56 | :::
57 |
58 | ::: {.cell .code }
59 | ```python
60 | # Utility functions to generate data
61 | def f(x):
62 | x = x.ravel()
63 | return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2)
64 |
65 | def generate(n_samples, noise, n_repeat=1):
66 | X = np.random.rand(n_samples) * 10 - 5
67 | X = np.sort(X)
68 | if n_repeat == 1:
69 | y = f(X) + np.random.normal(0.0, noise, n_samples)
70 | else:
71 | y = np.zeros((n_samples, n_repeat))
72 | for i in range(n_repeat):
73 | y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples)
74 |
75 | X = X.reshape((n_samples, 1))
76 | return X, y
77 | ```
78 | :::
79 |
80 | ::: {.cell .markdown }
81 | ## Set up simulation
82 | :::
83 |
84 | ::: {.cell .code }
85 | ```python
86 | # Simulation settings
87 | n_repeat = 500 # Number of iterations for computing expectations
88 | n_train = 500 # Size of the training set
89 | n_test = 1000 # Size of the test set
90 | noise = 0.15 # Standard deviation of the noise
91 | np.random.seed(4)
92 | ```
93 | :::
94 |
95 | ::: {.cell .code }
96 | ```python
97 | def plot_simulation(estimators):
98 |
99 | n_estimators = len(estimators)
100 | plt.figure(figsize=(5*n_estimators, 10))
101 |
102 | # Loop over estimators to compare
103 | for n, (name, estimator) in enumerate(estimators):
104 | # Compute predictions
105 | y_predict = np.zeros((n_test, n_repeat))
106 |
107 | for i in range(n_repeat):
108 | estimator.fit(X_train[i].reshape(-1,1), y_train[i])
109 | y_predict[:, i] = estimator.predict(X_test.reshape(-1,1))
110 |
111 | # Bias^2 + Variance + Noise decomposition of the mean squared error
112 | y_error = np.zeros(n_test)
113 |
114 | for i in range(n_repeat):
115 | for j in range(n_repeat):
116 | y_error += (y_test[:, j] - y_predict[:, i]) ** 2
117 |
118 | y_error /= (n_repeat * n_repeat)
119 |
120 | y_noise = np.var(y_test, axis=1)
121 | y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2
122 | y_var = np.var(y_predict, axis=1)
123 |
124 | # Plot figures
125 | plt.subplot(2, n_estimators, n + 1)
126 | plt.plot(X_test, f(X_test), "b", label="$f(x)$")
127 | plt.plot(X_train[0], y_train[0], ".b", alpha=0.2, label="$y = f(x)+noise$")
128 |
129 | for i in range(20):
130 | if i == 0:
131 | plt.plot(X_test, y_predict[:, i], "r", label=r"$\^y(x)$")
132 | else:
133 | plt.plot(X_test, y_predict[:, i], "r", alpha=0.1)
134 |
135 | plt.plot(X_test, np.mean(y_predict, axis=1), "c",
136 | label=r"$E[ \^y(x)]$")
137 |
138 | plt.xlim([-5, 5])
139 | plt.title(name)
140 |
141 | if n == n_estimators - 1:
142 | plt.legend(loc=(1.1, .5))
143 |
144 | plt.subplot(2, n_estimators, n_estimators + n + 1)
145 | plt.plot(X_test, y_noise, "c", label="$noise(x)$", alpha=0.3)
146 | plt.plot(X_test, y_bias, "b", label="$bias^2(x)$", alpha=0.6),
147 | plt.plot(X_test, y_var, "g", label="$variance(x)$", alpha=0.6),
148 | plt.plot(X_test, y_error, "r", label="$error(x)$", alpha=0.4)
149 | plt.title("{0:.4f} (error) = {1:.4f} (bias^2) \n"
150 | " + {2:.4f} (var) + {3:.4f} (noise)".format( np.mean(y_error),
151 | np.mean(y_bias),
152 | np.mean(y_var),
153 | np.mean(y_noise)))
154 |
155 | plt.xlim([-5, 5])
156 | plt.ylim([0, 0.1])
157 |
158 | if n == n_estimators - 1:
159 |
160 | plt.legend(loc=(1.1, .5))
161 |
162 | plt.subplots_adjust(right=.75)
163 | ```
164 | :::
165 |
166 | ::: {.cell .code }
167 | ```python
168 | X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat)
169 | ```
170 | :::
171 |
172 | ::: {.cell .code }
173 | ```python
174 | X_train = np.zeros(shape=(n_repeat, n_train))
175 | y_train = np.zeros(shape=(n_repeat, n_train))
176 |
177 | for i in range(n_repeat):
178 | X, y = generate(n_samples=n_train, noise=noise)
179 | X_train[i] = X.ravel()
180 | y_train[i] = y
181 | ```
182 | :::
183 |
184 | ::: {.cell .code }
185 | ```python
186 | plt.figure(figsize=(5,5))
187 | plt.plot(X_test, f(X_test), "b", label="$f(x)$");
188 | plt.plot(X_train[0], y_train[0], ".b", alpha=0.2, label="$y = f(x)+noise$");
189 | plt.legend(loc=(1.1, .5));
190 | ```
191 | :::
192 |
193 | ::: {.cell .markdown }
194 | ## K Nearest Neighbors
195 | :::
196 |
197 | ::: {.cell .markdown }
198 | Consider the following KNN regression models. Which model will have more
199 | bias? Which model will have more variance?
200 |
201 | - **Model A**: K = 1
202 | - **Model B**: K = 75
203 | :::
204 |
205 | ::: {.cell .code }
206 | ```python
207 | estimators = [("1-NN", KNeighborsRegressor(n_neighbors=1)),
208 | ("75-NN", KNeighborsRegressor(n_neighbors=75))]
209 |
210 | plot_simulation(estimators)
211 | ```
212 | :::
213 |
214 | ::: {.cell .markdown }
215 | ## Decision tree by depth
216 | :::
217 |
218 | ::: {.cell .markdown }
219 | Consider the following decision tree regression models. Which model will
220 | have more bias? Which model will have more variance?
221 |
222 | - **Model A**: Max depth = 5
223 | - **Model B**: Max depth = 100
224 | :::
225 |
226 | ::: {.cell .code }
227 | ```python
228 | estimators = [("DT - depth <= 5", DecisionTreeRegressor(max_depth=5)),
229 | ("DT - depth <= 100", DecisionTreeRegressor(max_depth=100))]
230 |
231 | plot_simulation(estimators)
232 | ```
233 | :::
234 |
235 | ::: {.cell .markdown }
236 | ## Decision tree by pruning parameter
237 | :::
238 |
239 | ::: {.cell .markdown }
240 | Suppose we use cost complexity tuning to train the decision tree that
241 | minimizes
242 |
243 | $$\sum_{m=1}^{|T|} \sum_{x_i}^{R_m} (y_i - \hat{y}_{R_m})^2 + \alpha |T| $$
244 |
245 | Consider the following decision tree regression models. Which model will
246 | have more bias? Which model will have more variance?
247 |
248 | - **Model A**: $\alpha = 0.00001$
249 | - **Model B**: $\alpha = 0.001$
250 | :::
251 |
252 | ::: {.cell .code }
253 | ```python
254 | estimators = [("DT - α = 10e-6", DecisionTreeRegressor(ccp_alpha=10e-6)),
255 | ("DT - α = 10e-4", DecisionTreeRegressor(ccp_alpha=10e-4))]
256 |
257 | plot_simulation(estimators)
258 | ```
259 | :::
260 |
261 | ::: {.cell .markdown }
262 | ## Decision tree vs. bagged trees {#decision-tree-vs-bagged-trees}
263 | :::
264 |
265 | ::: {.cell .markdown }
266 | Consider the following regression models. Which model will have more
267 | bias? Which model will have more variance?
268 |
269 | - **Model A**: Decision tree
270 | - **Model B**: Ensemble of 10 trees using "bagging"
271 | :::
272 |
273 | ::: {.cell .code }
274 | ```python
275 | estimators = [("Decision Tree", DecisionTreeRegressor()),
276 | ("10 Bagged Trees", BaggingRegressor(n_estimators=10, base_estimator=DecisionTreeRegressor()))]
277 |
278 | plot_simulation(estimators)
279 | ```
280 | :::
281 |
282 | ::: {.cell .markdown }
283 | ## Bagged trees by number of estimators
284 | :::
285 |
286 | ::: {.cell .markdown }
287 | Consider the following regression models. Which model will have more
288 | bias? Which model will have more variance?
289 |
290 | - **Model A**: Ensemble of 100 trees using "bagging"
291 | - **Model B**: Ensemble of 2 trees using "bagging"
292 |
293 | Warning: this takes a long time to run!
294 | :::
295 |
296 | ::: {.cell .code }
297 | ```python
298 | estimators = [("100 Bagged Trees", BaggingRegressor(n_estimators=100, base_estimator=DecisionTreeRegressor(), n_jobs=-1)),
299 | ("2 Bagged Trees", BaggingRegressor(n_estimators=2, base_estimator=DecisionTreeRegressor(), n_jobs=-1))]
300 |
301 | plot_simulation(estimators)
302 | ```
303 | :::
304 |
305 | ::: {.cell .markdown }
306 | ## Bagged trees by depth of base estimator
307 | :::
308 |
309 | ::: {.cell .markdown }
310 | Consider the following regression models. Which model will have more
311 | bias? Which model will have more variance?
312 |
313 | - **Model A**: Ensemble of 10 trees, each with max depth 5, using
314 | "bagging"
315 | - **Model B**: Ensemble of 10 trees, each with max depth 20, using
316 | "bagging"
317 | :::
318 |
319 | ::: {.cell .code }
320 | ```python
321 | estimators = [("Bagged Trees, depth <= 5", BaggingRegressor(n_estimators=10, base_estimator=DecisionTreeRegressor(max_depth=5))),
322 | ("Bagged Trees, depth <= 20", BaggingRegressor(n_estimators=10, base_estimator=DecisionTreeRegressor(max_depth=20)))]
323 |
324 | plot_simulation(estimators)
325 | ```
326 | :::
327 |
328 | ::: {.cell .markdown }
329 | ## Bagged trees vs. random forest {#bagged-trees-vs-random-forest}
330 | :::
331 |
332 | ::: {.cell .markdown }
333 | Consider the following regression models. Which model will have more
334 | bias? Which model will have more variance?
335 |
336 | - **Model A**: Ensemble of 10 trees using "bagging"
337 | - **Model B**: Ensemble of 10 trees using random forest
338 | :::
339 |
340 | ::: {.cell .code }
341 | ```python
342 | estimators = [("Bagged Trees", BaggingRegressor(n_estimators=10, base_estimator=DecisionTreeRegressor())),
343 | ("Random Forest", RandomForestRegressor(n_estimators=10))]
344 |
345 | plot_simulation(estimators)
346 | ```
347 | :::
348 |
349 | ::: {.cell .markdown }
350 | Normally, we expect the random forest to reduce variance relative to the
351 | bagged trees! Why didn't it have this effect in this instance?
352 | :::
353 |
354 | ::: {.cell .markdown }
355 | ## Decision tree vs. AdaBoost {#decision-tree-vs-adaboost}
356 | :::
357 |
358 | ::: {.cell .markdown }
359 | Consider the following regression models. Which model will have more
360 | bias? Which model will have more variance?
361 |
362 | - **Model A**: Decision tree with max depth 5
363 | - **Model B**: Ensemble of 10 trees with max depth 5, using AdaBoost
364 | :::
365 |
366 | ::: {.cell .code }
367 | ```python
368 | estimators = [("Decision Tree", DecisionTreeRegressor(max_depth = 5)),
369 | ("AdaBoost", AdaBoostRegressor(n_estimators=10, base_estimator=DecisionTreeRegressor(max_depth=5)))]
370 |
371 | plot_simulation(estimators)
372 | ```
373 | :::
374 |
375 | ::: {.cell .markdown }
376 | ## AdaBoost by number of iterations
377 | :::
378 |
379 | ::: {.cell .markdown }
380 | Consider the following regression models. Which model will have more
381 | bias? Which model will have more variance?
382 |
383 | - **Model A**: AdaBoost ensemble after 10 iterations
384 | - **Model B**: AdaBoost ensemble after 100 iterations
385 | :::
386 |
387 | ::: {.cell .code }
388 | ```python
389 | estimators = [("AdaBoost - 10", AdaBoostRegressor(n_estimators=10)),
390 | ("AdaBoost - 100", AdaBoostRegressor(n_estimators=100))]
391 |
392 | plot_simulation(estimators)
393 | ```
394 | :::
395 |
396 | ::: {.cell .markdown }
397 | ## AdaBoost by depth of base estimator
398 | :::
399 |
400 | ::: {.cell .markdown }
401 | Consider the following regression models. Which model will have more
402 | bias? Which model will have more variance?
403 |
404 | - **Model A**: AdaBoost ensemble of trees with max depth 3
405 | - **Model B**: AdaBoost ensemble of trees with max depth 30
406 | :::
407 |
408 | ::: {.cell .code }
409 | ```python
410 | estimators = [("AdaBoost, depth <= 3", AdaBoostRegressor(n_estimators=50, base_estimator=DecisionTreeRegressor(max_depth=3))),
411 | ("AdaBoost, depth <= 30", AdaBoostRegressor(n_estimators=50, base_estimator=DecisionTreeRegressor(max_depth=30)))]
412 |
413 | plot_simulation(estimators)
414 | ```
415 | :::
416 |
--------------------------------------------------------------------------------
/notebooks/7-svm-pre-kernel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/7-svm-pre-kernel.pdf
--------------------------------------------------------------------------------
/notebooks/7-trees-ensembles-in-depth.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Demo: Decision trees and ensembles\n",
8 | "==================================\n",
9 | "\n",
10 | "*Fraida Fund*"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "This is a simple demo notebook that demonstrates a decision tree classifier or an ensemble of decision trees.\n",
18 | "\n",
19 | "**Attribution**: Parts of this notebook are slightly modified from [this tutorial from “Intro to Data Mining”](http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/tutorial6.html)."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "import matplotlib.pyplot as plt\n",
31 | "import seaborn as sns\n",
32 | "\n",
33 | "import sklearn\n",
34 | "from sklearn.tree import DecisionTreeClassifier\n",
35 | "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "df = pd.read_csv('http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/vertebrate.csv')\n",
45 | "df"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "We’l make it a binary classification problem:"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "df['Class'] = df['Class'].replace(['fishes','birds','amphibians','reptiles'],'non-mammals')\n",
62 | "df"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "Decision tree\n",
70 | "-------------"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "y = df['Class']\n",
80 | "X = df.drop(['Name','Class'],axis=1)\n",
81 | "\n",
82 | "clf_dt = DecisionTreeClassifier(criterion='entropy')\n",
83 | "clf_dt = clf_dt.fit(X, y)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "plt.figure(figsize=(10,10))\n",
93 | "sklearn.tree.plot_tree(clf_dt, \n",
94 | " feature_names = df.columns.drop(['Name', 'Class']),\n",
95 | " class_names = [\"mammals\", \"non-mammals\"],\n",
96 | " filled=True, rounded=True);"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### Feature importance"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "df_importance = pd.DataFrame({'feature': df.columns.drop(['Name', 'Class']),\n",
113 | " 'importance': clf_dt.feature_importances_})\n",
114 | "df_importance"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "Bagged tree\n",
122 | "-----------"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "n_tree = 3\n",
132 | "clf_bag = BaggingClassifier(n_estimators=n_tree)\n",
133 | "clf_bag = clf_bag.fit(X, y)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "plt.figure(figsize=(n_tree*8, 10))\n",
143 | "for idx, clf_t in enumerate(clf_bag.estimators_):\n",
144 | " plt.subplot(1, n_tree,idx+1)\n",
145 | " sklearn.tree.plot_tree(clf_t, \n",
146 | " feature_names = df.columns.drop(['Name', 'Class']),\n",
147 | " class_names = [\"mammals\", \"non-mammals\"],\n",
148 | " filled=True, rounded=True) "
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "Notice the similarities! The bagged trees are highly correlated."
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "Let’s look at the bootstrap sets each tree was trained on:"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "for samples in clf_bag.estimators_samples_:\n",
172 | " print(df.iloc[samples])"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "Random forest\n",
180 | "-------------"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "n_tree = 3\n",
190 | "clf_rf = RandomForestClassifier(n_estimators=n_tree, )\n",
191 | "clf_rf = clf_rf.fit(X, y)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "plt.figure(figsize=(n_tree*8, 10))\n",
201 | "for idx, clf_t in enumerate(clf_rf.estimators_):\n",
202 | " plt.subplot(1, n_tree,idx+1)\n",
203 | " sklearn.tree.plot_tree(clf_t, \n",
204 | " feature_names = df.columns.drop(['Name', 'Class']),\n",
205 | " class_names = [\"mammals\", \"non-mammals\"],\n",
206 | " filled=True, rounded=True) "
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "These trees are much less correlated."
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "AdaBoost\n",
221 | "--------"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "n_tree = 3\n",
231 | "clf_ab = AdaBoostClassifier(n_estimators=n_tree)\n",
232 | "clf_ab = clf_ab.fit(X, y)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "plt.figure(figsize=(n_tree*8, 10))\n",
242 | "for idx, clf_t in enumerate(clf_ab.estimators_):\n",
243 | " plt.subplot(1, n_tree,idx+1)\n",
244 | " sklearn.tree.plot_tree(clf_t, \n",
245 | " feature_names = df.columns.drop(['Name', 'Class']),\n",
246 | " class_names = [\"mammals\", \"non-mammals\"],\n",
247 | " filled=True, rounded=True) "
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "The output will be a weighted average of the predictions of all three trees."
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "As we add more trees, the ensemble accuracy increases:"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "for p in clf_ab.staged_predict(X):\n",
271 | " print(np.mean(p==y))"
272 | ]
273 | }
274 | ],
275 | "nbformat": 4,
276 | "nbformat_minor": 0,
277 | "metadata": {
278 | "language_info": {
279 | "name": "python"
280 | },
281 | "kernelspec": {
282 | "name": "python3",
283 | "display_name": "Python 3"
284 | },
285 | "colab": {
286 | "toc_visible": "true"
287 | }
288 | }
289 | }
290 |
--------------------------------------------------------------------------------
/notebooks/7-trees-ensembles-in-depth.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Demo: Decision trees and ensembles'
3 | author: 'Fraida Fund'
4 | jupyter:
5 | colab:
6 | toc_visible: true
7 | kernelspec:
8 | display_name: Python 3
9 | name: python3
10 | language_info:
11 | name: python
12 | nbformat: 4
13 | nbformat_minor: 0
14 | ---
15 |
16 | ::: {.cell .markdown }
17 | # Demo: Decision trees and ensembles
18 |
19 | *Fraida Fund*
20 | :::
21 |
22 | ::: {.cell .markdown }
23 | This is a simple demo notebook that demonstrates a decision tree
24 | classifier or an ensemble of decision trees.
25 |
26 | **Attribution**: Parts of this notebook are slightly modified from [this
27 | tutorial from "Intro to Data
28 | Mining"](http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/tutorial6.html).
29 | :::
30 |
31 | ::: {.cell .code }
32 | ```python
33 | import pandas as pd
34 | import numpy as np
35 | import matplotlib.pyplot as plt
36 | import seaborn as sns
37 |
38 | import sklearn
39 | from sklearn.tree import DecisionTreeClassifier
40 | from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
41 | ```
42 | :::
43 |
44 | ::: {.cell .code }
45 | ```python
46 | df = pd.read_csv('http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/vertebrate.csv')
47 | df
48 | ```
49 | :::
50 |
51 | ::: {.cell .markdown }
52 | We'll make it a binary classification problem:
53 | :::
54 |
55 | ::: {.cell .code }
56 | ```python
57 | df['Class'] = df['Class'].replace(['fishes','birds','amphibians','reptiles'],'non-mammals')
58 | df
59 | ```
60 | :::
61 |
62 | ::: {.cell .markdown }
63 | ## Decision tree
64 | :::
65 |
66 | ::: {.cell .code }
67 | ```python
68 | y = df['Class']
69 | X = df.drop(['Name','Class'],axis=1)
70 |
71 | clf_dt = DecisionTreeClassifier(criterion='entropy')
72 | clf_dt = clf_dt.fit(X, y)
73 | ```
74 | :::
75 |
76 | ::: {.cell .code }
77 | ```python
78 | plt.figure(figsize=(10,10))
79 | sklearn.tree.plot_tree(clf_dt,
80 | feature_names = df.columns.drop(['Name', 'Class']),
81 | class_names = ["mammals", "non-mammals"],
82 | filled=True, rounded=True);
83 | ```
84 | :::
85 |
86 | ::: {.cell .markdown }
87 | ### Feature importance
88 | :::
89 |
90 | ::: {.cell .code }
91 | ```python
92 | df_importance = pd.DataFrame({'feature': df.columns.drop(['Name', 'Class']),
93 | 'importance': clf_dt.feature_importances_})
94 | df_importance
95 | ```
96 | :::
97 |
98 | ::: {.cell .markdown }
99 | ## Bagged tree
100 | :::
101 |
102 | ::: {.cell .code }
103 | ```python
104 | n_tree = 3
105 | clf_bag = BaggingClassifier(n_estimators=n_tree)
106 | clf_bag = clf_bag.fit(X, y)
107 | ```
108 | :::
109 |
110 | ::: {.cell .code }
111 | ```python
112 | plt.figure(figsize=(n_tree*8, 10))
113 | for idx, clf_t in enumerate(clf_bag.estimators_):
114 | plt.subplot(1, n_tree,idx+1)
115 | sklearn.tree.plot_tree(clf_t,
116 | feature_names = df.columns.drop(['Name', 'Class']),
117 | class_names = ["mammals", "non-mammals"],
118 | filled=True, rounded=True)
119 | ```
120 | :::
121 |
122 | ::: {.cell .markdown }
123 | Notice the similarities! The bagged trees are highly correlated.
124 | :::
125 |
126 | ::: {.cell .markdown }
127 | Let's look at the bootstrap sets each tree was trained on:
128 | :::
129 |
130 | ::: {.cell .code }
131 | ```python
132 | for samples in clf_bag.estimators_samples_:
133 | print(df.iloc[samples])
134 | ```
135 | :::
136 |
137 | ::: {.cell .markdown }
138 | ## Random forest
139 | :::
140 |
141 | ::: {.cell .code }
142 | ```python
143 | n_tree = 3
144 | clf_rf = RandomForestClassifier(n_estimators=n_tree)
145 | clf_rf = clf_rf.fit(X, y)
146 | ```
147 | :::
148 |
149 | ::: {.cell .code }
150 | ```python
151 | plt.figure(figsize=(n_tree*8, 10))
152 | for idx, clf_t in enumerate(clf_rf.estimators_):
153 | plt.subplot(1, n_tree,idx+1)
154 | sklearn.tree.plot_tree(clf_t,
155 | feature_names = df.columns.drop(['Name', 'Class']),
156 | class_names = ["mammals", "non-mammals"],
157 | filled=True, rounded=True)
158 | ```
159 | :::
160 |
161 | ::: {.cell .markdown }
162 | These trees are much less correlated.
163 | :::
164 |
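::: {.cell .markdown }
One way to make "correlated" concrete is to compare the predictions of the
individual trees within each ensemble. This quick check is not part of the
original tutorial; it is a minimal sketch that computes, for each pair of
trees, the fraction of training samples on which the two trees agree (higher
means more correlated).
:::

::: {.cell .code }
```python
from itertools import combinations

def mean_pairwise_agreement(ensemble, X):
    # predictions of each individual tree (class encodings may differ
    # between ensembles, but agreement *within* an ensemble is unaffected)
    preds = [t.predict(np.asarray(X)) for t in ensemble.estimators_]
    pairs = combinations(range(len(preds)), 2)
    return np.mean([np.mean(preds[i] == preds[j]) for i, j in pairs])

print("Bagged trees:  %0.2f" % mean_pairwise_agreement(clf_bag, X))
print("Random forest: %0.2f" % mean_pairwise_agreement(clf_rf, X))
```
:::
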
165 | ::: {.cell .markdown }
166 | ## AdaBoost
167 | :::
168 |
169 | ::: {.cell .code }
170 | ```python
171 | n_tree = 3
172 | clf_ab = AdaBoostClassifier(n_estimators=n_tree)
173 | clf_ab = clf_ab.fit(X, y)
174 | ```
175 | :::
176 |
177 | ::: {.cell .code }
178 | ```python
179 | plt.figure(figsize=(n_tree*8, 10))
180 | for idx, clf_t in enumerate(clf_ab.estimators_):
181 | plt.subplot(1, n_tree,idx+1)
182 | sklearn.tree.plot_tree(clf_t,
183 | feature_names = df.columns.drop(['Name', 'Class']),
184 | class_names = ["mammals", "non-mammals"],
185 | filled=True, rounded=True)
186 | ```
187 | :::
188 |
189 | ::: {.cell .markdown }
190 | The output will be a weighted average of the predictions of all three
191 | trees.
192 | :::
193 |
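::: {.cell .markdown }
To see where the weights come from, we can inspect the per-tree weights that
AdaBoost learned. This extra cell is not part of the original demo:
`estimator_weights_` holds the weight assigned to each boosted tree, and
`estimator_errors_` the weighted training error it achieved. (Depending on the
scikit-learn version and the boosting algorithm used, SAMME vs. SAMME.R, the
weights may all be equal.)
:::

::: {.cell .code }
```python
# per-tree weight and weighted training error assigned by AdaBoost
for w, err in zip(clf_ab.estimator_weights_, clf_ab.estimator_errors_):
    print("weight: %0.3f, weighted error: %0.3f" % (w, err))
```
:::
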
194 | ::: {.cell .markdown }
195 | As we add more trees, the ensemble accuracy increases:
196 | :::
197 |
198 | ::: {.cell .code }
199 | ```python
200 | for p in clf_ab.staged_predict(X):
201 | print(np.mean(p==y))
202 | ```
203 | :::
204 |
--------------------------------------------------------------------------------
/notebooks/7-trees-ensembles-in-depth.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/7-trees-ensembles-in-depth.pdf
--------------------------------------------------------------------------------
/notebooks/8-demo-backprop.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/8-demo-backprop.pdf
--------------------------------------------------------------------------------
/notebooks/8-svm-bias-variance.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Bias and variance of SVMs'
3 | author: 'Fraida Fund'
4 | jupyter:
5 | colab:
6 | toc_visible: true
7 | kernelspec:
8 | display_name: Python 3
9 | name: python3
10 | nbformat: 4
11 | nbformat_minor: 0
12 | ---
13 |
14 | ::: {.cell .markdown }
15 | # Bias and variance of SVMs
16 |
17 | *Fraida Fund*
18 | :::
19 |
20 | ::: {.cell .markdown }
21 | In this notebook, we will explore the bias and variance of SVM models,
22 | and see how we can tune this tradeoff.
23 | :::
24 |
25 | ::: {.cell .code }
26 | ```python
27 | import numpy as np
28 | import matplotlib.pyplot as plt
29 | import seaborn as sns
30 | import pandas as pd
31 |
32 | from sklearn.svm import SVC
33 | from sklearn.model_selection import train_test_split
34 | from sklearn.metrics import accuracy_score
35 |
36 | from tqdm import tqdm
37 |
38 | from sklearn.datasets import make_blobs
39 | ```
40 | :::
41 |
42 | ::: {.cell .markdown }
43 | ### Regularization level
44 | :::
45 |
46 | ::: {.cell .markdown }
47 | Suppose we want to train a model to classify two "blobs" of data.
48 | :::
49 |
50 | ::: {.cell .code }
51 | ```python
52 | n_repeat = 100
53 | n_test = 500
54 | n_train = 100
55 | sigma= 0.8
56 | cluster_centers = np.array([[-1,1],[2,2]])
57 | ```
58 | :::
59 |
60 | ::: {.cell .code }
61 | ```python
62 | y_predict = np.zeros((n_test, n_repeat, 2))
63 | ```
64 | :::
65 |
66 | ::: {.cell .code }
67 | ```python
68 | x_test, y_test = make_blobs(n_samples=n_test, centers=cluster_centers,
69 | random_state=0, cluster_std=sigma)
70 | ```
71 | :::
72 |
73 | ::: {.cell .code }
74 | ```python
75 | sns.scatterplot(x=x_test[:,0], y=x_test[:,1], hue=y_test);
76 |
77 | plt.xlabel("x1");
78 | plt.ylabel("x2");
79 | plt.xlim(-5,5);
80 | plt.ylim(-2,5);
81 |
82 | # get the true decision boundary
83 | mid = [cluster_centers[:,0].mean(), cluster_centers[:,1].mean()]
84 | slp = -1.0/((cluster_centers[1,1]-cluster_centers[0,1])/(cluster_centers[1,0]-cluster_centers[0,0]))
85 | b = mid[1]-slp*mid[0]
86 | x_true = np.arange(-5,5)
87 | y_true = slp*x_true + b
88 | sns.lineplot(x=x_true, y=y_true, color='black', label="True decision boundary")
89 |
90 | plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1);
91 | ```
92 | :::
93 |
94 | ::: {.cell .markdown }
95 | Which will have greater bias, and which will have greater variance?
96 |
97 | - **Model A**: Linear SVM with $C=0.01$
98 |
99 | - **Model B**: Linear SVM with $C=100$
100 | :::
101 |
102 | ::: {.cell .markdown }
103 | Note: here is $C$ in the SVM problem:
104 |
105 | $$
106 | \begin{aligned}
107 | \operatorname*{minimize}_{\mathbf{w}, \mathbf{\epsilon}} \quad & \frac{1}{2} \sum_{j=1}^p w_j^2 + C \sum_{i=1}^n \epsilon_i \\
108 | \text{subject to} \quad & y_i(w_0 + \sum_{j=1}^p w_j x_{ij}) \geq 1-\epsilon_i, \quad \forall i \\
109 | & \epsilon_i \geq 0, \quad \forall i
110 | \end{aligned}
111 | $$
112 |
113 | The greater the value of $C$, the more heavily "margin violators"
114 | are penalized in the overall objective function. Therefore,
115 |
116 | - If $C$ is large, the margin must be narrow (with few "margin
117 | violators").
118 | - If $C$ is small, the margin may be wider (with more "margin
119 | violators").
120 | :::
121 |
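::: {.cell .markdown }
As a quick side check (not part of the simulation below), we can fit each
model once on a single training sample and count its support vectors: with a
small $C$ the margin is wide and many points violate it, so many more points
end up as support vectors.
:::

::: {.cell .code }
```python
# one illustrative training set (random_state chosen arbitrarily)
X_demo, y_demo = make_blobs(n_samples=n_train, centers=cluster_centers,
                            random_state=42, cluster_std=sigma)

for C in [0.01, 100.0]:
    clf_demo = SVC(kernel='linear', C=C).fit(X_demo, y_demo)
    print("C = %6.2f: %d support vectors" % (C, clf_demo.n_support_.sum()))
```
:::
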
122 | ::: {.cell .code }
123 | ```python
124 | Z_sim = np.zeros((40000, n_repeat, 2))
125 |
126 | fig = plt.figure(figsize=(12,4))
127 | ax_a, ax_b = fig.subplots(1, 2, sharex=True, sharey=True)
128 |
129 | # now simulate training the model many times, on different training data every time
130 | # and evaluate using the test data
131 | for i in tqdm(range(n_repeat), total=n_repeat, desc="Simulation iteration"):
132 |
133 | # train both models on newly generated training data
134 |     X, y = make_blobs(n_samples=n_train, centers=cluster_centers,
135 | cluster_std=sigma)
136 |
137 | clf_a = SVC(kernel='linear', C=0.01).fit(X, y)
138 | clf_b = SVC(kernel='linear', C=100.0).fit(X, y)
139 |
140 | y_predict[:, i, 0] = clf_a.predict(x_test)
141 | y_predict[:, i, 1] = clf_b.predict(x_test)
142 |
143 |
144 | xx, yy = np.meshgrid(np.arange(-5, 5, .05),
145 | np.arange(-5, 5, .05))
146 |
147 | Z = clf_a.decision_function(np.c_[xx.ravel(), yy.ravel()])
148 | Z_sim[:, i, 0] = Z
149 | Z = Z.reshape(xx.shape)
150 | ax_a.contour(xx, yy, Z, levels=[0.5], alpha=0.1, colors='plum');
151 |
152 | plt.xlim(-5,5);
153 | plt.ylim(-2,5);
154 |
155 | Z = clf_b.decision_function(np.c_[xx.ravel(), yy.ravel()])
156 | Z_sim[:, i, 1] = Z
157 | Z = Z.reshape(xx.shape)
158 | ax_b.contour(xx, yy, Z, levels=[0.5], alpha=0.1, colors='plum');
159 |
160 | plt.xlim(-5,5);
161 | plt.ylim(-2,5);
162 |
163 |
164 | cs_a = ax_a.contour(xx, yy, Z_sim[:,:,0].mean(axis=1).reshape(200,200), levels=[0.5], colors='magenta', linewidths=2);
165 | cs_b = ax_b.contour(xx, yy, Z_sim[:,:,1].mean(axis=1).reshape(200,200), levels=[0.5], colors='magenta', linewidths=2);
166 |
167 | # plot data
168 | sns.scatterplot(x=x_test[:,0], y=x_test[:,1], hue=y_test, ax=ax_a, legend=False);
169 | sns.scatterplot(x=x_test[:,0], y=x_test[:,1], hue=y_test, ax=ax_b, legend=False);
170 |
171 | sns.lineplot(x=x_true, y=y_true, color='black', ax=ax_a)
172 | sns.lineplot(x=x_true, y=y_true, color='black', ax=ax_b)
173 |
174 |
175 | ax_a.set_title("Model A");
176 | ax_b.set_title("Model B");
177 |
178 | ax_a.set_ylabel("x2");
179 | ax_a.set_xlabel("x1");
180 | ax_b.set_xlabel("x1");
181 | ```
182 | :::
183 |
184 | ::: {.cell .markdown }
185 | ### Kernels
186 | :::
187 |
188 | ::: {.cell .code }
189 | ```python
190 | def generate_polynomial_classifier_data(n=100, xrange=[-1,1], coefs=[1,0.5,0,2], sigma=0.5):
191 |   x = np.random.uniform(xrange[0], xrange[1], size=(n, 2))   # sample features uniformly over xrange
192 |   ysep = np.polynomial.polynomial.polyval(x[:,0],coefs)      # polynomial decision boundary
193 |   y = (x[:,1]>ysep).astype(int)                               # label = which side of the boundary
194 |   x[:,0] = x[:,0] + sigma * np.random.randn(n)                # then add Gaussian noise
195 |   x[:,1] = x[:,1] + sigma * np.random.randn(n)                # to both features
196 |   return x, y
197 | ```
198 | :::
199 |
200 | ::: {.cell .code }
201 | ```python
202 | n_repeat = 100
203 | n_test = 500
204 | n_train = 1000
205 | sigma= 0.3
206 | coefs=np.array([0.3, 1, -1.5, -2])
207 | xrange=[-1,1]
208 | ```
209 | :::
210 |
211 | ::: {.cell .code }
212 | ```python
213 | y_predict = np.zeros((n_test, n_repeat, 2))
214 | ```
215 | :::
216 |
217 | ::: {.cell .code }
218 | ```python
219 | # generate test data once
220 | x_test, y_test = generate_polynomial_classifier_data(n=n_test, xrange=xrange, coefs=coefs, sigma=sigma)
221 | ```
222 | :::
223 |
224 | ::: {.cell .code }
225 | ```python
226 | sns.scatterplot(x=x_test[:,0], y=x_test[:,1], hue=y_test);
227 |
228 | plt.xlabel("x1");
229 | plt.ylabel("x2");
230 | plt.xlim((xrange[0], xrange[1]));
231 | plt.ylim((xrange[0], xrange[1]));
232 |
233 | # Plot true function
234 | xtrue = np.arange(-1.5, 1.5, .05)
235 | ytrue = np.polynomial.polynomial.polyval(xtrue,coefs)
236 | sns.lineplot(x=xtrue, y=ytrue, color='black', label='True decision boundary');
237 |
238 |
239 | plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1);
240 | ```
241 | :::
242 |
243 | ::: {.cell .markdown }
244 | Suppose we want to train a model to classify data that is separated by a
245 | polynomial boundary.
246 | :::
247 |
248 | ::: {.cell .markdown }
249 | Which will have greater bias, and which will have greater variance?
250 |
251 | - **Model A**: SVM with linear kernel, $C = 1$
252 |
253 | - **Model B**: SVM with RBF kernel, $C = 1$
254 | :::
255 |
256 | ::: {.cell .code }
257 | ```python
258 | Z_sim = np.zeros((3600, n_repeat, 2))
259 |
260 | fig = plt.figure(figsize=(12,4))
261 | ax_a, ax_b = fig.subplots(1, 2, sharex=True, sharey=True)
262 |
263 | # now simulate training the model many times, on different training data every time
264 | # and evaluate using the test data
265 | for i in tqdm(range(n_repeat), total=n_repeat, desc="Simulation iteration"):
266 |
267 | # train both models on newly generated training data
268 | X, y = generate_polynomial_classifier_data(n=n_train, xrange=xrange, coefs=coefs, sigma=sigma)
269 |
270 | clf_a = SVC(kernel='linear', C=1).fit(X, y)
271 | clf_b = SVC(kernel='rbf', gamma=10, C=1).fit(X, y)
272 |
273 | y_predict[:, i, 0] = clf_a.predict(x_test)
274 | y_predict[:, i, 1] = clf_b.predict(x_test)
275 |
276 |
277 | xx, yy = np.meshgrid(np.arange(-1.5, 1.5, .05),
278 | np.arange(-1.5, 1.5, .05))
279 |
280 | Z = clf_a.decision_function(np.c_[xx.ravel(), yy.ravel()])
281 | Z_sim[:, i, 0] = Z
282 | Z = Z.reshape(xx.shape)
283 | ax_a.contour(xx, yy, Z, levels=[0.5], alpha=0.1, colors='plum');
284 | plt.xlim((xrange[0], xrange[1]));
285 | plt.ylim((xrange[0], xrange[1]));
286 |
287 | Z = clf_b.decision_function(np.c_[xx.ravel(), yy.ravel()])
288 | Z_sim[:, i, 1] = Z
289 | Z = Z.reshape(xx.shape)
290 | ax_b.contour(xx, yy, Z, levels=[0.5], alpha=0.1, colors='plum');
291 | plt.xlim((xrange[0], xrange[1]));
292 | plt.ylim((xrange[0], xrange[1]));
293 |
294 |
295 | cs_a = ax_a.contour(xx, yy, Z_sim[:,:,0].mean(axis=1).reshape(60,60), levels=[0.5], colors='magenta', linewidths=2);
296 | cs_b = ax_b.contour(xx, yy, Z_sim[:,:,1].mean(axis=1).reshape(60,60), levels=[0.5], colors='magenta', linewidths=2);
297 |
298 |
299 | # Plot true function
300 | xtrue = np.arange(-1.5, 1.5, .05)
301 | ytrue = np.polynomial.polynomial.polyval(xtrue,coefs)
302 | sns.lineplot(x=xtrue, y=ytrue, color='black', ax=ax_a);
303 | sns.lineplot(x=xtrue, y=ytrue, color='black', ax=ax_b);
304 |
305 | sns.scatterplot(x=x_test[:,0], y=x_test[:,1], hue=y_test, ax=ax_a, legend=False, alpha=0.1);
306 | sns.scatterplot(x=x_test[:,0], y=x_test[:,1], hue=y_test, ax=ax_b, legend=False, alpha=0.1);
307 |
308 | ax_a.set_title("Model A");
309 | ax_b.set_title("Model B");
310 |
311 | ax_a.set_ylabel("x2");
312 | ax_a.set_xlabel("x1");
313 | ax_b.set_xlabel("x1");
314 | ```
315 | :::
316 |
317 | ::: {.cell .markdown }
318 | ### RBF parameter
319 | :::
320 |
321 | ::: {.cell .markdown }
322 | Recall that the RBF kernel is defined as
323 |
324 | $$K(x,z) = \exp(-\frac{\|x-z\|^2_2}{\sigma^2})$$
325 |
326 | where $\sigma$ is the bandwidth, or equivalently, using a $\gamma$
327 | parameter,
328 |
329 | $$K(x, z) = \exp(-\gamma \| x - z \|^2_2 )$$
330 | :::
331 |
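::: {.cell .markdown }
As a quick sanity check on the $\gamma$ form of this formula (not part of the
original demo), we can compare `rbf_kernel` from scikit-learn against
$\exp(-\gamma \| x - z \|^2_2)$ computed directly:
:::

::: {.cell .code }
```python
from sklearn.metrics.pairwise import rbf_kernel

x_check = np.random.uniform(0, 1, size=(1, 2))
z_check = np.random.uniform(0, 1, size=(1, 2))
gamma_check = 0.5

k_sklearn = rbf_kernel(x_check, z_check, gamma=gamma_check)[0, 0]
k_manual = np.exp(-gamma_check * np.sum((x_check - z_check)**2))
print(k_sklearn, k_manual)  # these should match
```
:::
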
332 | ::: {.cell .markdown }
333 | For example, here is the RBF kernel centered on a single point, computed
334 | over the entire feature space, with two different values of $\gamma$:
335 | :::
336 |
337 | ::: {.cell .code }
338 | ```python
339 | from sklearn.metrics.pairwise import rbf_kernel
340 |
341 | test_point = np.random.uniform(0,1,size=2)
342 | xx, yy = np.meshgrid(np.arange(0 ,5, .05), np.arange(0, 5, .05))
343 |
344 | gamma_a=0.05
345 | gamma_b=5
346 | Z_a = rbf_kernel(np.c_[xx.ravel(), yy.ravel()], test_point.reshape(1, -1), gamma=gamma_a)
347 | Z_b = rbf_kernel(np.c_[xx.ravel(), yy.ravel()], test_point.reshape(1, -1), gamma=gamma_b)
348 | ```
349 | :::
350 |
351 | ::: {.cell .code }
352 | ```python
353 | plt.figure(figsize=(12,5))
354 | plt.subplot(1,2,1)
355 |
356 | plt.scatter(x=test_point[0], y=test_point[1])
357 | cs = plt.contourf(xx, yy, Z_a.reshape(xx.shape), vmin=0, vmax=1);
358 | plt.title("Gamma: %f" % gamma_a);
359 |
360 | plt.subplot(1,2,2)
361 |
362 |
363 | plt.scatter(x=test_point[0], y=test_point[1])
364 | cs = plt.contourf(xx, yy, Z_b.reshape(xx.shape), vmin=0, vmax=1);
365 | plt.title("Gamma: %f" % gamma_b);
366 |
367 | plt.subplots_adjust(right=0.8);
368 | cbar_ax = plt.axes([0.85, 0.15, 0.05, 0.7]);
369 | plt.colorbar(cax=cbar_ax);
370 | ```
371 | :::
372 |
373 | ::: {.cell .markdown }
374 | We can see that when the kernel bandwidth is large ($\gamma$ is small),
375 | the influence of each point extends much farther in the feature space
376 | than if the kernel bandwidth is small ($\gamma$ is large).
377 | :::
378 |
379 | ::: {.cell .markdown }
380 | Suppose we want to train a model to classify data that is separated by a
381 | polynomial boundary.
382 |
383 | Which will have greater bias, and which will have greater variance?
384 |
385 | - **Model A**: SVM with RBF kernel and $\gamma=0.05$
386 |
387 | - **Model B**: SVM with RBF kernel and $\gamma=5$
388 | :::
389 |
390 | ::: {.cell .code }
391 | ```python
392 | n_repeat = 100
393 | n_test = 500
394 | n_train = 100
395 | sigma= 0.3
396 | coefs=np.array([0.3, 1, -1.5, -2])
397 | xrange=[-1,1]
398 | ```
399 | :::
400 |
401 | ::: {.cell .code }
402 | ```python
403 | y_predict = np.zeros((n_test, n_repeat, 2))
404 | ```
405 | :::
406 |
407 | ::: {.cell .code }
408 | ```python
409 | # generate test data once
410 | x_test, y_test = generate_polynomial_classifier_data(n=n_test, xrange=xrange, coefs=coefs, sigma=sigma)
411 | ```
412 | :::
413 |
414 | ::: {.cell .code }
415 | ```python
416 | Z_sim = np.zeros((3600, n_repeat, 2))
417 |
418 | fig = plt.figure(figsize=(12,4))
419 | ax_a, ax_b = fig.subplots(1, 2, sharex=True, sharey=True)
420 |
421 | # now simulate training the model many times, on different training data every time
422 | # and evaluate using the test data
423 | for i in tqdm(range(n_repeat), total=n_repeat, desc="Simulation iteration"):
424 |
425 | # train both models on newly generated training data
426 | X, y = generate_polynomial_classifier_data(n=n_train, xrange=xrange, coefs=coefs, sigma=sigma)
427 |
428 | clf_a = SVC(kernel='rbf', gamma=0.05, C=10).fit(X, y)
429 | clf_b = SVC(kernel='rbf', gamma=5, C=10).fit(X, y)
430 |
431 | y_predict[:, i, 0] = clf_a.predict(x_test)
432 | y_predict[:, i, 1] = clf_b.predict(x_test)
433 |
434 |
435 | xx, yy = np.meshgrid(np.arange(-1.5, 1.5, .05),
436 | np.arange(-1.5, 1.5, .05))
437 |
438 | Z = clf_a.decision_function(np.c_[xx.ravel(), yy.ravel()])
439 | Z_sim[:, i, 0] = Z
440 | Z = Z.reshape(xx.shape)
441 | ax_a.contour(xx, yy, Z, levels=[0.5], alpha=0.1, colors='plum');
442 | plt.xlim((xrange[0], xrange[1]));
443 | plt.ylim((xrange[0], xrange[1]));
444 |
445 | Z = clf_b.decision_function(np.c_[xx.ravel(), yy.ravel()])
446 | Z_sim[:, i, 1] = Z
447 | Z = Z.reshape(xx.shape)
448 | ax_b.contour(xx, yy, Z, levels=[0.5], alpha=0.1, colors='plum');
449 | plt.xlim((xrange[0], xrange[1]));
450 | plt.ylim((xrange[0], xrange[1]));
451 |
452 |
453 | cs_a = ax_a.contour(xx, yy, Z_sim[:,:,0].mean(axis=1).reshape(60,60), levels=[0.5], colors='magenta', linewidths=2);
454 | cs_b = ax_b.contour(xx, yy, Z_sim[:,:,1].mean(axis=1).reshape(60,60), levels=[0.5], colors='magenta', linewidths=2);
455 |
456 |
457 | # Plot true function
458 | xtrue = np.arange(-1.5, 1.5, .05)
459 | ytrue = np.polynomial.polynomial.polyval(xtrue,coefs)
460 | sns.lineplot(x=xtrue, y=ytrue, color='black', ax=ax_a);
461 | sns.lineplot(x=xtrue, y=ytrue, color='black', ax=ax_b);
462 |
463 | #sns.scatterplot(x_test[:,0], x_test[:,1], y_test, ax=ax_a, legend=False, alpha=0.1);
464 | #sns.scatterplot(x_test[:,0], x_test[:,1], y_test, ax=ax_b, legend=False, alpha=0.1);
465 |
466 | ax_a.set_title("Model A");
467 | ax_b.set_title("Model B");
468 |
469 | ax_a.set_ylabel("x2");
470 | ax_a.set_xlabel("x1");
471 | ax_b.set_xlabel("x1");
472 | ```
473 | :::
474 |
--------------------------------------------------------------------------------
/notebooks/8-svm-bias-variance.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/8-svm-bias-variance.pdf
--------------------------------------------------------------------------------
/notebooks/8-svm-with-kernel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/8-svm-with-kernel.pdf
--------------------------------------------------------------------------------
/notebooks/9-convolutional-neural-networks.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/9-convolutional-neural-networks.pdf
--------------------------------------------------------------------------------
/notebooks/9-slash-dataset.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/9-slash-dataset.pdf
--------------------------------------------------------------------------------
/notebooks/Makefile:
--------------------------------------------------------------------------------
1 | #SOURCES := $(wildcard *.md)
2 | SOURCES := 1-python-numpy-tutorial.md
3 | SOURCES += 1-colab-tour.md
4 | SOURCES += 1-print-colab.md
5 | SOURCES += 1-exploratory-data-analysis.md
6 | SOURCES += 1-data-detective.md
7 | SOURCES += 1-explore-hw.md
8 |
9 | EXECS := 1-python-numpy-tutorial.md
10 | EXECS += 1-exploratory-data-analysis.md
11 |
12 | SOURCES += 2-compute-by-hand.md
13 | SOURCES += 2-regression-r2.md
14 | SOURCES += 2-linear-regression-deep-dive.md
15 | SOURCES += 2-linear-regression-case-study.md
16 | SOURCES += 2-advertising-hw.md
17 |
18 | EXECS += 2-compute-by-hand.md
19 | EXECS += 2-regression-r2.md
20 | EXECS += 2-linear-regression-deep-dive.md
21 | EXECS += 2-linear-regression-case-study.md
22 | EXECS += 2-advertising-hw.md
23 |
24 | SOURCES += 3-gradient-descent-deep-dive.md
25 | SOURCES += 3-bias-variance-deep-dive.md
26 |
27 | EXECS += 3-gradient-descent-deep-dive.md
28 | EXECS += 3-bias-variance-deep-dive.md
29 |
30 | SOURCES += 4-model-selection.md
31 | SOURCES += 4-linear-regression-case-study-part-2.md
32 | SOURCES += 4-regularization-deep-dive.md
33 | SOURCES += 4-neural-model-selection-hw.md
34 |
35 | EXECS += 4-model-selection.md
36 | EXECS += 4-linear-regression-case-study-part-2.md
37 | EXECS += 4-regularization-deep-dive.md
38 |
39 | SOURCES += 5-logistic-regression-in-depth.md
40 | SOURCES += 5-logistic-regression-digits.md
41 | SOURCES += 5-compas-case-study.md
42 | SOURCES += 5-hw-logistic-regression.md
43 |
44 | EXECS += 5-logistic-regression-in-depth.md
45 | EXECS += 5-logistic-regression-digits.md
46 | EXECS += 5-compas-case-study.md
47 |
48 | SOURCES += 6-k-nearest-neighbors-in-depth.md
49 | SOURCES += 6-decision-trees.md
50 | SOURCES += 6-knn-tree-bias-variance.md
51 | SOURCES += 6-knn-voter-classification-hw.md
52 |
53 | EXECS += 6-k-nearest-neighbors-in-depth.md
54 | EXECS += 6-decision-trees.md
55 | EXECS += 6-knn-tree-bias-variance.md
56 | EXECS += 6-knn-voter-classification-hw.md
57 |
58 | SOURCES += 7-trees-ensembles-in-depth.md
59 | SOURCES += 7-demo-adaboost.md
60 | SOURCES += 7-svm-pre-kernel.md
61 | SOURCES += 7-knn-tree-bias-variance.md
62 | SOURCES += 7-demo-digits-classifiers.md
63 |
64 | EXECS += 7-trees-ensembles-in-depth.md
65 | EXECS += 7-demo-adaboost.md
66 | EXECS += 7-svm-pre-kernel.md
67 |
68 | SOURCES += 8-svm-with-kernel.md
69 | SOURCES += 8-svm-bias-variance.md
70 | SOURCES += 8-hyperparameter.md
71 |
72 | SOURCES += 8-demo-backprop.md
73 | SOURCES += 8-neural-net-demo-draw.md
74 | SOURCES += 8-neural-net-demo-draw-torch.md
75 | SOURCES += 8-lab-neural-net-music-classification.md
76 |
77 | EXECS += 8-svm-with-kernel.md
78 | EXECS += 8-svm-bias-variance.md
79 | EXECS += 8-demo-backprop.md
80 |
81 | SOURCES += 9-slash-dataset.md
82 | SOURCES += 9-convolutional-neural-networks.md
83 | SOURCES += 9-fine-tune-rock-paper-scissors.md
84 |
85 | EXECS += 9-slash-dataset.md
86 | EXECS += 9-convolutional-neural-networks.md
87 |
88 |
89 |
90 | # Convert the list of source files (Markdown files)
91 | # into a list of output files
92 | NBS := $(patsubst %.md,%.ipynb,$(SOURCES))
93 |
94 |
95 | NBSEXEC := $(patsubst %.md,%.nbconvert.ipynb,$(EXECS))
96 | PDFS := $(patsubst %.md,%.pdf,$(EXECS))
97 |
98 | PANDOCFLAGS=--pdf-engine=xelatex\
99 | -V mainfont='Fira Sans' \
100 | -V geometry:margin=1in \
101 | --highlight-style pygments \
102 | --listings --variable urlcolor=Maroon \
103 | -H style/listings-setup.tex -H style/keystroke-setup.tex -H style/includes.tex
104 |
105 | %.ipynb: %.md
106 | pandoc --self-contained --wrap=none $^ -o $@
107 |
108 | %.nbconvert.ipynb: %.ipynb
109 | jupyter nbconvert --to notebook --execute --ExecutePreprocessor.store_widget_state=False --ExecutePreprocessor.allow_errors=True --ExecutePreprocessor.timeout=360 $^
110 |
111 | %.pdf: %.nbconvert.ipynb
112 | pandoc $^ $(PANDOCFLAGS) -o $@
113 |
114 | all: $(NBS)
115 |
116 | notebooks: $(NBS)
117 |
118 | executed: $(NBSEXEC)
119 |
120 | pdfs: $(PDFS)
121 |
122 | clean:
123 | rm -f *.ipynb
124 |
125 |
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-auto-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-auto-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-auto-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-auto-1.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-cell-order-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-cell-order-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-connect-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-connect-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-connect-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-connect-1.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-delete-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-delete-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-delete-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-delete-1.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-file-upload-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-file-upload-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-filexplore-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-filexplore-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-filexplore-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-filexplore-1.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-filexplore-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-filexplore-2.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-gdrive-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-gdrive-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-gdrive-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-gdrive-1.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-gdrive-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-gdrive-2.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-gdrive-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-gdrive-3.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-newcell-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-newcell-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-newcell-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-newcell-1.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-newcell-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-newcell-2.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-run-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-run-0.png
--------------------------------------------------------------------------------
/notebooks/images/colab-tour-run-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/colab-tour-run-1.png
--------------------------------------------------------------------------------
/notebooks/images/exit-poll-nan-distance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/exit-poll-nan-distance.png
--------------------------------------------------------------------------------
/notebooks/images/exit-poll-survey-versions-2020.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/exit-poll-survey-versions-2020.png
--------------------------------------------------------------------------------
/notebooks/images/exit-poll-survey-versions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/notebooks/images/exit-poll-survey-versions.png
--------------------------------------------------------------------------------
/notebooks/style/default.latex:
--------------------------------------------------------------------------------
1 | \documentclass[$if(fontsize)$$fontsize$,$endif$$if(lang)$$babel-lang$,$endif$$if(papersize)$$papersize$paper,$endif$$for(classoption)$$classoption$$sep$,$endfor$]{$documentclass$}
2 | $if(fontfamily)$
3 | \usepackage[$for(fontfamilyoptions)$$fontfamilyoptions$$sep$,$endfor$]{$fontfamily$}
4 | $else$
5 | \usepackage{lmodern}
6 | $endif$
7 | $if(linestretch)$
8 | \usepackage{setspace}
9 | \setstretch{$linestretch$}
10 | $endif$
11 | \usepackage{amssymb,amsmath}
12 | \usepackage{ifxetex,ifluatex}
13 | \usepackage{fixltx2e} % provides \textsubscript
14 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
15 | \usepackage[$if(fontenc)$$fontenc$$else$T1$endif$]{fontenc}
16 | \usepackage[utf8]{inputenc}
17 | $if(euro)$
18 | \usepackage{eurosym}
19 | $endif$
20 | \else % if luatex or xelatex
21 | \ifxetex
22 | \usepackage{mathspec}
23 | \else
24 | \usepackage{fontspec}
25 | \fi
26 | \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
27 | $if(euro)$
28 | \newcommand{\euro}{€}
29 | $endif$
30 | $if(mainfont)$
31 | \setmainfont[$for(mainfontoptions)$$mainfontoptions$$sep$,$endfor$]{$mainfont$}
32 | $endif$
33 | $if(sansfont)$
34 | \setsansfont[$for(sansfontoptions)$$sansfontoptions$$sep$,$endfor$]{$sansfont$}
35 | $endif$
36 | $if(monofont)$
37 | \setmonofont[Mapping=tex-ansi$if(monofontoptions)$,$for(monofontoptions)$$monofontoptions$$sep$,$endfor$$endif$]{$monofont$}
38 | $endif$
39 | $if(mathfont)$
40 | \setmathfont(Digits,Latin,Greek)[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$}
41 | $endif$
42 | $if(CJKmainfont)$
43 | \usepackage{xeCJK}
44 | \setCJKmainfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$}
45 | $endif$
46 | \fi
47 | % use upquote if available, for straight quotes in verbatim environments
48 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{}
49 | % use microtype if available
50 | \IfFileExists{microtype.sty}{%
51 | \usepackage{microtype}
52 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
53 | }{}
54 | $if(geometry)$
55 | \usepackage[$for(geometry)$$geometry$$sep$,$endfor$]{geometry}
56 | $endif$
57 | \usepackage{hyperref}
58 | $if(colorlinks)$
59 | \PassOptionsToPackage{usenames,dvipsnames}{color} % color is loaded by hyperref
60 | $endif$
61 | \hypersetup{unicode=true,
62 | $if(title-meta)$
63 | pdftitle={$title-meta$},
64 | $endif$
65 | $if(author-meta)$
66 | pdfauthor={$author-meta$},
67 | $endif$
68 | $if(keywords)$
69 | pdfkeywords={$for(keywords)$$keywords$$sep$; $endfor$},
70 | $endif$
71 | $if(colorlinks)$
72 | colorlinks=true,
73 | linkcolor=$if(linkcolor)$$linkcolor$$else$Maroon$endif$,
74 | citecolor=$if(citecolor)$$citecolor$$else$Blue$endif$,
75 | urlcolor=$if(urlcolor)$$urlcolor$$else$Blue$endif$,
76 | $else$
77 | pdfborder={0 0 0},
78 | $endif$
79 | breaklinks=true}
80 | \urlstyle{same} % don't use monospace font for urls
81 | $if(lang)$
82 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
83 | \usepackage[shorthands=off,$for(babel-otherlangs)$$babel-otherlangs$,$endfor$main=$babel-lang$]{babel}
84 | $if(babel-newcommands)$
85 | $babel-newcommands$
86 | $endif$
87 | \else
88 | \usepackage{polyglossia}
89 | \setmainlanguage[$polyglossia-lang.options$]{$polyglossia-lang.name$}
90 | $for(polyglossia-otherlangs)$
91 | \setotherlanguage[$polyglossia-otherlangs.options$]{$polyglossia-otherlangs.name$}
92 | $endfor$
93 | \fi
94 | $endif$
95 | $if(natbib)$
96 | \usepackage{natbib}
97 | \bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$}
98 | $endif$
99 | $if(biblatex)$
100 | \usepackage$if(biblio-style)$[style=$biblio-style$]$endif${biblatex}
101 | $if(biblatexoptions)$\ExecuteBibliographyOptions{$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$}$endif$
102 | $for(bibliography)$
103 | \addbibresource{$bibliography$}
104 | $endfor$
105 | $endif$
106 | $if(listings)$
107 | \usepackage{listings}
108 | $endif$
109 | $if(lhs)$
110 | \lstnewenvironment{code}{\lstset{language=Haskell,basicstyle=\small\ttfamily}}{}
111 | $endif$
112 | $if(highlighting-macros)$
113 | $highlighting-macros$
114 | $endif$
115 | $if(verbatim-in-note)$
116 | \usepackage{fancyvrb}
117 | \VerbatimFootnotes % allows verbatim text in footnotes
118 | $endif$
119 | $if(tables)$
120 | \usepackage{longtable,booktabs}
121 | $endif$
122 | $if(graphics)$
123 | \usepackage{graphicx,grffile}
124 | \makeatletter
125 | \def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
126 | \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
127 | \makeatother
128 | % Scale images if necessary, so that they will not overflow the page
129 | % margins by default, and it is still possible to overwrite the defaults
130 | % using explicit options in \includegraphics[width, height, ...]{}
131 | \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
132 | $endif$
133 | $if(links-as-notes)$
134 | % Make links footnotes instead of hotlinks:
135 | \renewcommand{\href}[2]{#2\footnote{\url{#1}}}
136 | $endif$
137 | $if(strikeout)$
138 | \usepackage[normalem]{ulem}
139 | % avoid problems with \sout in headers with hyperref:
140 | \pdfstringdefDisableCommands{\renewcommand{\sout}{}}
141 | $endif$
142 | $if(indent)$
143 | $else$
144 | \IfFileExists{parskip.sty}{%
145 | \usepackage{parskip}
146 | }{% else
147 | \setlength{\parindent}{0pt}
148 | \setlength{\parskip}{6pt plus 2pt minus 1pt}
149 | }
150 | $endif$
151 | \setlength{\emergencystretch}{3em} % prevent overfull lines
152 | \providecommand{\tightlist}{%
153 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
154 | $if(numbersections)$
155 | \setcounter{secnumdepth}{5}
156 | $else$
157 | \setcounter{secnumdepth}{0}
158 | $endif$
159 | $if(subparagraph)$
160 | $else$
161 | % Redefines (sub)paragraphs to behave more like sections
162 | \ifx\paragraph\undefined\else
163 | \let\oldparagraph\paragraph
164 | \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
165 | \fi
166 | \ifx\subparagraph\undefined\else
167 | \let\oldsubparagraph\subparagraph
168 | \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
169 | \fi
170 | $endif$
171 | $if(dir)$
172 | \ifxetex
173 | % load bidi as late as possible as it modifies e.g. graphicx
174 | $if(latex-dir-rtl)$
175 | \usepackage[RTLdocument]{bidi}
176 | $else$
177 | \usepackage{bidi}
178 | $endif$
179 | \fi
180 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
181 | \TeXXeTstate=1
182 | \newcommand{\RL}[1]{\beginR #1\endR}
183 | \newcommand{\LR}[1]{\beginL #1\endL}
184 | \newenvironment{RTL}{\beginR}{\endR}
185 | \newenvironment{LTR}{\beginL}{\endL}
186 | \fi
187 | $endif$
188 | $for(header-includes)$
189 | $header-includes$
190 | $endfor$
191 |
192 | $if(title)$
193 | \title{$title$$if(thanks)$\thanks{$thanks$}$endif$}
194 | $endif$
195 | $if(subtitle)$
196 | \providecommand{\subtitle}[1]{}
197 | \subtitle{$subtitle$}
198 | $endif$
199 | $if(author)$
200 | \author{$for(author)$$author$$sep$ \and $endfor$}
201 | $endif$
202 | \date{$date$}
203 |
204 | \begin{document}
205 | $if(title)$
206 | \maketitle
207 | $endif$
208 | $if(abstract)$
209 | \begin{abstract}
210 | $abstract$
211 | \end{abstract}
212 | $endif$
213 |
214 | $for(include-before)$
215 | $include-before$
216 |
217 | $endfor$
218 | $if(toc)$
219 | {
220 | $if(colorlinks)$
221 | \hypersetup{linkcolor=$if(toccolor)$$toccolor$$else$black$endif$}
222 | $endif$
223 | \setcounter{tocdepth}{$toc-depth$}
224 | \tableofcontents
225 | }
226 | $endif$
227 | $if(lot)$
228 | \listoftables
229 | $endif$
230 | $if(lof)$
231 | \listoffigures
232 | $endif$
233 | $body$
234 |
235 | $if(natbib)$
236 | $if(bibliography)$
237 | $if(biblio-title)$
238 | $if(book-class)$
239 | \renewcommand\bibname{$biblio-title$}
240 | $else$
241 | \renewcommand\refname{$biblio-title$}
242 | $endif$
243 | $endif$
244 | \bibliography{$for(bibliography)$$bibliography$$sep$,$endfor$}
245 |
246 | $endif$
247 | $endif$
248 | $if(biblatex)$
249 | \printbibliography$if(biblio-title)$[title=$biblio-title$]$endif$
250 |
251 | $endif$
252 | $for(include-after)$
253 | $include-after$
254 |
255 | $endfor$
256 | \end{document}
257 |
--------------------------------------------------------------------------------
/notebooks/style/includes.tex:
--------------------------------------------------------------------------------
1 | \usepackage[os=win]{menukeys}
2 | \usepackage{marvosym}
3 | \usepackage{unicode-math}
4 | %\setmathfont{XITS Math}
5 | \usepackage{amsmath}
6 | \DeclareMathOperator*{\argmax}{arg\,max}
7 | \DeclareMathOperator*{\argmin}{arg\,min}
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/notebooks/style/keystroke-setup.tex:
--------------------------------------------------------------------------------
1 | \usepackage{tikz}
2 | \usetikzlibrary{shadows}
3 |
4 | \newcommand*\keystroke[1]{%
5 | \tikz[baseline=(key.base)]
6 | \node[%
7 | draw,
8 | fill=white,
9 | drop shadow={shadow xshift=0.15ex,shadow yshift=-0.15ex,fill=black,opacity=0.5},
10 | rectangle,
11 | rounded corners=2pt,
12 | inner sep=1pt,
13 | line width=0.5pt,
14 | font=\scriptsize\sffamily
15 | ](key) {#1\strut}
16 | ;
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/notebooks/style/listings-setup.tex:
--------------------------------------------------------------------------------
1 | % Contents of listings-setup.tex
2 | \usepackage{xcolor}
3 |
4 | \lstset{
5 | basicstyle=\ttfamily,
6 | keywordstyle=\color[rgb]{0.13,0.29,0.53}\bfseries,
7 | stringstyle=\color[rgb]{0.31,0.60,0.02},
8 | commentstyle=\color[rgb]{0.56,0.35,0.01}\itshape,
9 | numberstyle=\footnotesize,
10 | framesep=2pt,
11 | framexleftmargin=6pt,
12 | framextopmargin=1pt,
13 | framexbottommargin=1pt,
14 | frame=tb, framerule=0pt,
15 | stepnumber=1,
16 | numbersep=5pt,
17 | backgroundcolor=\color[RGB]{240,240,240},
18 | showspaces=false,
19 | showstringspaces=false,
20 | showtabs=false,
21 | tabsize=2,
22 | captionpos=b,
23 | breaklines=true,
24 | breakatwhitespace=true,
25 | breakautoindent=true,
26 | escapeinside={\%*}{*)},
27 | linewidth=\textwidth,
28 | basewidth=0.5em,
29 | }
30 |
31 |
--------------------------------------------------------------------------------
/projects/Audio (music).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Audio (music).pdf
--------------------------------------------------------------------------------
/projects/Audio (speech).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Audio (speech).pdf
--------------------------------------------------------------------------------
/projects/Generating images.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Generating images.pdf
--------------------------------------------------------------------------------
/projects/Generating text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Generating text.pdf
--------------------------------------------------------------------------------
/projects/ML and society_ Fairness, privacy, explainability.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/ML and society_ Fairness, privacy, explainability.pdf
--------------------------------------------------------------------------------
/projects/Reinforcement learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Reinforcement learning.pdf
--------------------------------------------------------------------------------
/projects/Security and robustness.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Security and robustness.pdf
--------------------------------------------------------------------------------
/projects/Understanding images.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Understanding images.pdf
--------------------------------------------------------------------------------
/projects/Understanding text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ffund/ml-notebooks/6f263e179db2670f5182623d69fe580e8c7992bf/projects/Understanding text.pdf
--------------------------------------------------------------------------------