├── requirements.txt
├── .gitignore
├── images
├── 03_iris.png
├── youtube.png
├── 01_robot.png
├── 04_1nn_map.png
├── 04_5nn_map.png
├── 01_clustering.png
├── 01_spam_filter.png
├── 04_knn_dataset.png
├── 05_overfitting.png
├── 02_ipython_header.png
├── 02_sklearn_logo.png
├── 05_train_test_split.png
├── 01_supervised_learning.png
├── 02_sklearn_algorithms.png
├── 09_confusion_matrix_1.png
├── 09_confusion_matrix_2.png
└── 07_cross_validation_diagram.png
├── styles
└── custom.css
├── README.md
├── 01_machine_learning_intro.ipynb
├── 02_machine_learning_setup.ipynb
├── 04_model_training.ipynb
├── 03_getting_started_with_iris.ipynb
├── 05_model_evaluation.ipynb
├── 07_cross_validation.ipynb
└── 08_grid_search.ipynb
/requirements.txt:
--------------------------------------------------------------------------------
1 | seaborn
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | *.pyc
3 | v3/
4 |
--------------------------------------------------------------------------------
/images/03_iris.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/03_iris.png
--------------------------------------------------------------------------------
/images/youtube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/youtube.png
--------------------------------------------------------------------------------
/images/01_robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/01_robot.png
--------------------------------------------------------------------------------
/images/04_1nn_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/04_1nn_map.png
--------------------------------------------------------------------------------
/images/04_5nn_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/04_5nn_map.png
--------------------------------------------------------------------------------
/images/01_clustering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/01_clustering.png
--------------------------------------------------------------------------------
/images/01_spam_filter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/01_spam_filter.png
--------------------------------------------------------------------------------
/images/04_knn_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/04_knn_dataset.png
--------------------------------------------------------------------------------
/images/05_overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/05_overfitting.png
--------------------------------------------------------------------------------
/images/02_ipython_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/02_ipython_header.png
--------------------------------------------------------------------------------
/images/02_sklearn_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/02_sklearn_logo.png
--------------------------------------------------------------------------------
/images/05_train_test_split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/05_train_test_split.png
--------------------------------------------------------------------------------
/images/01_supervised_learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/01_supervised_learning.png
--------------------------------------------------------------------------------
/images/02_sklearn_algorithms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/02_sklearn_algorithms.png
--------------------------------------------------------------------------------
/images/09_confusion_matrix_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/09_confusion_matrix_1.png
--------------------------------------------------------------------------------
/images/09_confusion_matrix_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/09_confusion_matrix_2.png
--------------------------------------------------------------------------------
/images/07_cross_validation_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chapagain/scikit-learn-videos/master/images/07_cross_validation_diagram.png
--------------------------------------------------------------------------------
/styles/custom.css:
--------------------------------------------------------------------------------
1 |
53 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction to machine learning with scikit-learn
2 |
3 | This video series will teach you how to solve machine learning problems using Python's popular scikit-learn library. It was [featured on Kaggle's blog](http://blog.kaggle.com/author/kevin-markham/) in 2015.
4 |
5 | There are **9 video tutorials** totaling 4 hours, each with a corresponding **Jupyter notebook**. Each notebook contains everything you see in its video: code, output, images, and comments.
6 |
7 | You can [watch the entire series](https://www.youtube.com/playlist?list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A) on YouTube, and [view all of the notebooks](http://nbviewer.jupyter.org/github/justmarkham/scikit-learn-videos/tree/master/) using nbviewer.
8 |
9 | [![Watch the first tutorial video](images/youtube.png)](https://www.youtube.com/watch?v=elojMnjn4kk&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=1 "Watch the first tutorial video")
10 |
11 | There is also a [binder](http://mybinder.org/repo/justmarkham/scikit-learn-videos) linked to this repository, which will allow you to interact with the notebooks online (without downloading them).
12 |
13 | Once you complete this video series, I recommend enrolling in my online course, [Machine Learning with Text in Python](http://www.dataschool.io/learn/), to gain a deeper understanding of scikit-learn and Natural Language Processing.
14 |
15 | ## Table of Contents
16 |
17 | 1. What is machine learning, and how does it work? ([video](https://www.youtube.com/watch?v=elojMnjn4kk&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=1), [notebook](01_machine_learning_intro.ipynb), [blog post](http://blog.kaggle.com/2015/04/08/new-video-series-introduction-to-machine-learning-with-scikit-learn/))
18 | - What is machine learning?
19 | - What are the two main categories of machine learning?
20 | - What are some examples of machine learning?
21 | - How does machine learning "work"?
22 |
23 | 2. Setting up Python for machine learning: scikit-learn and IPython Notebook ([video](https://www.youtube.com/watch?v=IsXXlYVBt1M&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=2), [notebook](02_machine_learning_setup.ipynb), [blog post](http://blog.kaggle.com/2015/04/15/scikit-learn-video-2-setting-up-python-for-machine-learning/))
24 | - What are the benefits and drawbacks of scikit-learn?
25 | - How do I install scikit-learn?
26 | - How do I use the IPython Notebook?
27 | - What are some good resources for learning Python?
28 |
29 | 3. Getting started in scikit-learn with the famous iris dataset ([video](https://www.youtube.com/watch?v=hd1W4CyPX58&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=3), [notebook](03_getting_started_with_iris.ipynb), [blog post](http://blog.kaggle.com/2015/04/22/scikit-learn-video-3-machine-learning-first-steps-with-the-iris-dataset/))
30 | - What is the famous iris dataset, and how does it relate to machine learning?
31 | - How do we load the iris dataset into scikit-learn?
32 | - How do we describe a dataset using machine learning terminology?
33 | - What are scikit-learn's four key requirements for working with data?
34 |
35 | 4. Training a machine learning model with scikit-learn ([video](https://www.youtube.com/watch?v=RlQuVL6-qe8&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=4), [notebook](04_model_training.ipynb), [blog post](http://blog.kaggle.com/2015/04/30/scikit-learn-video-4-model-training-and-prediction-with-k-nearest-neighbors/))
36 | - What is the K-nearest neighbors classification model?
37 | - What are the four steps for model training and prediction in scikit-learn?
38 | - How can I apply this pattern to other machine learning models?
39 |
40 | 5. Comparing machine learning models in scikit-learn ([video](https://www.youtube.com/watch?v=0pP4EwWJgIU&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=5), [notebook](05_model_evaluation.ipynb), [blog post](http://blog.kaggle.com/2015/05/14/scikit-learn-video-5-choosing-a-machine-learning-model/))
41 | - How do I choose which model to use for my supervised learning task?
42 | - How do I choose the best tuning parameters for that model?
43 | - How do I estimate the likely performance of my model on out-of-sample data?
44 |
45 | 6. Data science pipeline: pandas, seaborn, scikit-learn ([video](https://www.youtube.com/watch?v=3ZWuPVWq7p4&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=6), [notebook](06_linear_regression.ipynb), [blog post](http://blog.kaggle.com/2015/05/28/scikit-learn-video-6-linear-regression-plus-pandas-seaborn/))
46 | - How do I use the pandas library to read data into Python?
47 | - How do I use the seaborn library to visualize data?
48 | - What is linear regression, and how does it work?
49 | - How do I train and interpret a linear regression model in scikit-learn?
50 | - What are some evaluation metrics for regression problems?
51 | - How do I choose which features to include in my model?
52 |
53 | 7. Cross-validation for parameter tuning, model selection, and feature selection ([video](https://www.youtube.com/watch?v=6dbrR-WymjI&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=7), [notebook](07_cross_validation.ipynb), [blog post](http://blog.kaggle.com/2015/06/29/scikit-learn-video-7-optimizing-your-model-with-cross-validation/))
54 | - What is the drawback of using the train/test split procedure for model evaluation?
55 | - How does K-fold cross-validation overcome this limitation?
56 | - How can cross-validation be used for selecting tuning parameters, choosing between models, and selecting features?
57 | - What are some possible improvements to cross-validation?
58 |
59 | 8. Efficiently searching for optimal tuning parameters ([video](https://www.youtube.com/watch?v=Gol_qOgRqfA&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=8), [notebook](08_grid_search.ipynb), [blog post](http://blog.kaggle.com/2015/07/16/scikit-learn-video-8-efficiently-searching-for-optimal-tuning-parameters/))
60 | - How can K-fold cross-validation be used to search for an optimal tuning parameter?
61 | - How can this process be made more efficient?
62 | - How do you search for multiple tuning parameters at once?
63 | - What do you do with those tuning parameters before making real predictions?
64 | - How can the computational expense of this process be reduced?
65 |
66 | 9. Evaluating a classification model ([video](https://www.youtube.com/watch?v=85dtiMz9tSo&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=9), [notebook](09_classification_metrics.ipynb), [blog post](http://blog.kaggle.com/2015/10/23/scikit-learn-video-9-better-evaluation-of-classification-models/))
67 | - What is the purpose of model evaluation, and what are some common evaluation procedures?
68 | - How is classification accuracy used, and what are its limitations?
69 | - How does a confusion matrix describe the performance of a classifier?
70 | - What metrics can be computed from a confusion matrix?
71 | - How can you adjust classifier performance by changing the classification threshold?
72 | - What is the purpose of an ROC curve?
73 | - How does Area Under the Curve (AUC) differ from classification accuracy?
74 |
75 | ## Bonus Video
76 |
77 | At the PyCon 2016 conference, I taught a **3-hour tutorial** that builds upon this video series and focuses on **text-based data**. You can watch the [tutorial video](https://www.youtube.com/watch?v=ZiKMIuYidY0&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=10) on YouTube.
78 |
79 | Here are the topics I covered:
80 |
81 | 1. Model building in scikit-learn (refresher)
82 | 2. Representing text as numerical data
83 | 3. Reading a text-based dataset into pandas
84 | 4. Vectorizing our dataset
85 | 5. Building and evaluating a model
86 | 6. Comparing models
87 | 7. Examining a model for further insight
88 | 8. Practicing this workflow on another dataset
89 | 9. Tuning the vectorizer (discussion)
90 |
91 | Visit this [GitHub repository](https://github.com/justmarkham/pycon-2016-tutorial) to access the tutorial notebooks and many other recommended resources.
92 |
--------------------------------------------------------------------------------
/01_machine_learning_intro.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# What is machine learning, and how does it work?\n",
8 | "*From the video series: [Introduction to machine learning with scikit-learn](https://github.com/justmarkham/scikit-learn-videos)*"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "![Machine learning](images/01_robot.png)"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Agenda\n",
23 | "\n",
24 | "- What is machine learning?\n",
25 | "- What are the two main categories of machine learning?\n",
26 | "- What are some examples of machine learning?\n",
27 | "- How does machine learning \"work\"?"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## What is machine learning?\n",
35 | "\n",
36 | "One definition: \"Machine learning is the semi-automated extraction of knowledge from data\"\n",
37 | "\n",
38 | "- **Knowledge from data**: Starts with a question that might be answerable using data\n",
39 | "- **Automated extraction**: A computer provides the insight\n",
40 | "- **Semi-automated**: Requires many smart decisions by a human"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## What are the two main categories of machine learning?\n",
48 | "\n",
49 | "**Supervised learning**: Making predictions using data\n",
50 | " \n",
51 | "- Example: Is a given email \"spam\" or \"ham\"?\n",
52 | "- There is an outcome we are trying to predict"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "![Spam filter](images/01_spam_filter.png)"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "**Unsupervised learning**: Extracting structure from data\n",
67 | "\n",
68 | "- Example: Segment grocery store shoppers into clusters that exhibit similar behaviors\n",
69 | "- There is no \"right answer\""
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "![Clustering](images/01_clustering.png)"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "## How does machine learning \"work\"?\n",
84 | "\n",
85 | "High-level steps of supervised learning:\n",
86 | "\n",
87 | "1. First, train a **machine learning model** using **labeled data**\n",
88 | "\n",
89 | " - \"Labeled data\" has been labeled with the outcome\n",
90 | " - \"Machine learning model\" learns the relationship between the attributes of the data and its outcome\n",
91 | "\n",
92 | "2. Then, make **predictions** on **new data** for which the label is unknown"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "![Supervised learning](images/01_supervised_learning.png)"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "The primary goal of supervised learning is to build a model that \"generalizes\": It accurately predicts the **future** rather than the **past**!"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "## Questions about machine learning\n",
114 | "\n",
115 | "- How do I choose **which attributes** of my data to include in the model?\n",
116 | "- How do I choose **which model** to use?\n",
117 | "- How do I **optimize** this model for best performance?\n",
118 | "- How do I ensure that I'm building a model that will **generalize** to unseen data?\n",
119 | "- Can I **estimate** how well my model is likely to perform on unseen data?"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "## Resources\n",
127 | "\n",
128 | "- Book: [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) (section 2.1, 14 pages)\n",
129 | "- Video: [Learning Paradigms](http://work.caltech.edu/library/014.html) (13 minutes)"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "## Comments or Questions?\n",
137 | "\n",
138 | "- Email: \n",
139 | "- Website: http://dataschool.io\n",
140 | "- Twitter: [@justmarkham](https://twitter.com/justmarkham)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 1,
146 | "metadata": {
147 | "collapsed": false
148 | },
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/html": [
153 | "\n",
205 | ""
220 | ],
221 | "text/plain": [
222 | ""
223 | ]
224 | },
225 | "execution_count": 1,
226 | "metadata": {},
227 | "output_type": "execute_result"
228 | }
229 | ],
230 | "source": [
231 | "from IPython.core.display import HTML\n",
232 | "def css_styling():\n",
233 | " styles = open(\"styles/custom.css\", \"r\").read()\n",
234 | " return HTML(styles)\n",
235 | "css_styling()"
236 | ]
237 | }
238 | ],
239 | "metadata": {
240 | "kernelspec": {
241 | "display_name": "Python 2",
242 | "language": "python",
243 | "name": "python2"
244 | },
245 | "language_info": {
246 | "codemirror_mode": {
247 | "name": "ipython",
248 | "version": 2
249 | },
250 | "file_extension": ".py",
251 | "mimetype": "text/x-python",
252 | "name": "python",
253 | "nbconvert_exporter": "python",
254 | "pygments_lexer": "ipython2",
255 | "version": "2.7.11"
256 | }
257 | },
258 | "nbformat": 4,
259 | "nbformat_minor": 0
260 | }
261 |
--------------------------------------------------------------------------------
/02_machine_learning_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Setting up Python for machine learning: scikit-learn and IPython Notebook\n",
8 | "*From the video series: [Introduction to machine learning with scikit-learn](https://github.com/justmarkham/scikit-learn-videos)*"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Agenda\n",
16 | "\n",
17 | "- What are the benefits and drawbacks of scikit-learn?\n",
18 | "- How do I install scikit-learn?\n",
19 | "- How do I use the IPython Notebook?\n",
20 | "- What are some good resources for learning Python?"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "![scikit-learn algorithm map](images/02_sklearn_algorithms.png)"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Benefits and drawbacks of scikit-learn\n",
35 | "\n",
36 | "### Benefits:\n",
37 | "\n",
38 | "- **Consistent interface** to machine learning models\n",
39 | "- Provides many **tuning parameters** but with **sensible defaults**\n",
40 | "- Exceptional **documentation**\n",
41 | "- Rich set of functionality for **companion tasks**\n",
42 | "- **Active community** for development and support\n",
43 | "\n",
44 | "### Potential drawbacks:\n",
45 | "\n",
46 | "- Harder (than R) to **get started with machine learning**\n",
47 | "- Less emphasis (than R) on **model interpretability**\n",
48 | "\n",
49 | "### Further reading:\n",
50 | "\n",
51 | "- Ben Lorica: [Six reasons why I recommend scikit-learn](http://radar.oreilly.com/2013/12/six-reasons-why-i-recommend-scikit-learn.html)\n",
52 | "- scikit-learn authors: [API design for machine learning software](http://arxiv.org/pdf/1309.0238v1.pdf)\n",
53 | "- Data School: [Should you teach Python or R for data science?](http://www.dataschool.io/python-or-r-for-data-science/)"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "![scikit-learn logo](images/02_sklearn_logo.png)"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "## Installing scikit-learn\n",
68 | "\n",
69 | "**Option 1:** [Install scikit-learn library](http://scikit-learn.org/stable/install.html) and dependencies (NumPy and SciPy)\n",
70 | "\n",
71 | "**Option 2:** [Install Anaconda distribution](https://www.continuum.io/downloads) of Python, which includes:\n",
72 | "\n",
73 | "- Hundreds of useful packages (including scikit-learn)\n",
74 | "- IPython and IPython Notebook\n",
75 | "- conda package manager\n",
76 | "- Spyder IDE"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "![IPython header](images/02_ipython_header.png)"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "## Using the IPython Notebook\n",
91 | "\n",
92 | "### Components:\n",
93 | "\n",
94 | "- **IPython interpreter:** enhanced version of the standard Python interpreter\n",
95 | "- **Browser-based notebook interface:** weave together code, formatted text, and plots\n",
96 | "\n",
97 | "### Installation:\n",
98 | "\n",
99 | "- **Option 1:** Install [IPython](http://ipython.org/install.html) and the [notebook](https://jupyter.readthedocs.io/en/latest/install.html)\n",
100 | "- **Option 2:** Included with the Anaconda distribution\n",
101 | "\n",
102 | "### Launching the Notebook:\n",
103 | "\n",
104 | "- Type **ipython notebook** (or **jupyter notebook** in newer installations) at the command line to open the dashboard\n",
105 | "- Don't close the command line window while the Notebook is running\n",
106 | "\n",
107 | "### Keyboard shortcuts:\n",
108 | "\n",
109 | "**Command mode** (gray border)\n",
110 | "\n",
111 | "- Create new cells above (**a**) or below (**b**) the current cell\n",
112 | "- Navigate using the **up arrow** and **down arrow**\n",
113 | "- Convert the cell type to Markdown (**m**) or code (**y**)\n",
114 | "- See keyboard shortcuts using **h**\n",
115 | "- Switch to Edit mode using **Enter**\n",
116 | "\n",
117 | "**Edit mode** (green border)\n",
118 | "\n",
119 | "- **Ctrl+Enter** to run a cell\n",
120 | "- Switch to Command mode using **Esc**\n",
121 | "\n",
122 | "### IPython and Markdown resources:\n",
123 | "\n",
124 | "- [nbviewer](http://nbviewer.jupyter.org/): view notebooks online as static documents\n",
125 | "- [IPython documentation](http://ipython.readthedocs.io/en/stable/): focuses on the interpreter\n",
126 | "- [IPython Notebook tutorials](http://jupyter.readthedocs.io/en/latest/content-quickstart.html): in-depth introduction\n",
127 | "- [GitHub's Mastering Markdown](https://guides.github.com/features/mastering-markdown/): short guide with lots of examples"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "## Resources for learning Python\n",
135 | "\n",
136 | "- [Codecademy's Python course](https://www.codecademy.com/learn/python): browser-based, tons of exercises\n",
137 | "- [DataQuest](https://www.dataquest.io/): browser-based, teaches Python in the context of data science\n",
138 | "- [Google's Python class](https://developers.google.com/edu/python/): slightly more advanced, includes videos and downloadable exercises (with solutions)\n",
139 | "- [Python for Informatics](http://www.pythonlearn.com/): beginner-oriented book, includes slides and videos"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "## Comments or Questions?\n",
147 | "\n",
148 | "- Email: \n",
149 | "- Website: http://dataschool.io\n",
150 | "- Twitter: [@justmarkham](https://twitter.com/justmarkham)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 1,
156 | "metadata": {
157 | "collapsed": false
158 | },
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/html": [
163 | "\n",
215 | ""
230 | ],
231 | "text/plain": [
232 | ""
233 | ]
234 | },
235 | "execution_count": 1,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "from IPython.core.display import HTML\n",
242 | "def css_styling():\n",
243 | " styles = open(\"styles/custom.css\", \"r\").read()\n",
244 | " return HTML(styles)\n",
245 | "css_styling()"
246 | ]
247 | }
248 | ],
249 | "metadata": {
250 | "kernelspec": {
251 | "display_name": "Python 2",
252 | "language": "python",
253 | "name": "python2"
254 | },
255 | "language_info": {
256 | "codemirror_mode": {
257 | "name": "ipython",
258 | "version": 2
259 | },
260 | "file_extension": ".py",
261 | "mimetype": "text/x-python",
262 | "name": "python",
263 | "nbconvert_exporter": "python",
264 | "pygments_lexer": "ipython2",
265 | "version": "2.7.11"
266 | }
267 | },
268 | "nbformat": 4,
269 | "nbformat_minor": 0
270 | }
271 |
--------------------------------------------------------------------------------
/04_model_training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Training a machine learning model with scikit-learn\n",
8 | "*From the video series: [Introduction to machine learning with scikit-learn](https://github.com/justmarkham/scikit-learn-videos)*"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Agenda\n",
16 | "\n",
17 | "- What is the **K-nearest neighbors** classification model?\n",
18 | "- What are the four steps for **model training and prediction** in scikit-learn?\n",
19 | "- How can I apply this pattern to **other machine learning models**?"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Reviewing the iris dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [
36 | {
37 | "data": {
38 | "text/html": [
39 | "\n",
40 | " \n",
47 | " "
48 | ],
49 | "text/plain": [
50 | ""
51 | ]
52 | },
53 | "execution_count": 2,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "from IPython.display import IFrame\n",
60 | "IFrame('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', width=300, height=200)"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "- 150 **observations**\n",
68 | "- 4 **features** (sepal length, sepal width, petal length, petal width)\n",
69 | "- **Response** variable is the iris species\n",
70 | "- **Classification** problem since response is categorical\n",
71 | "- More information in the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Iris)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## K-nearest neighbors (KNN) classification"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "1. Pick a value for K.\n",
86 | "2. Search for the K observations in the training data that are \"nearest\" to the measurements of the unknown iris.\n",
87 | "3. Use the most popular response value from the K nearest neighbors as the predicted response value for the unknown iris."
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "### Example training data\n",
95 | "\n",
96 | "![Training data](images/04_knn_dataset.png)"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### KNN classification map (K=1)\n",
104 | "\n",
105 | "![1NN classification map](images/04_1nn_map.png)"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "### KNN classification map (K=5)\n",
113 | "\n",
114 | "![5NN classification map](images/04_5nn_map.png)"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "*Image Credits: [Data3classes](http://commons.wikimedia.org/wiki/File:Data3classes.png#/media/File:Data3classes.png), [Map1NN](http://commons.wikimedia.org/wiki/File:Map1NN.png#/media/File:Map1NN.png), [Map5NN](http://commons.wikimedia.org/wiki/File:Map5NN.png#/media/File:Map5NN.png) by Agor153. Licensed under CC BY-SA 3.0*"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## Loading the data"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 3,
134 | "metadata": {
135 | "collapsed": false
136 | },
137 | "outputs": [],
138 | "source": [
139 | "# import load_iris function from datasets module\n",
140 | "from sklearn.datasets import load_iris\n",
141 | "\n",
142 | "# save \"bunch\" object containing iris dataset and its attributes\n",
143 | "iris = load_iris()\n",
144 | "\n",
145 | "# store feature matrix in \"X\"\n",
146 | "X = iris.data\n",
147 | "\n",
148 | "# store response vector in \"y\"\n",
149 | "y = iris.target"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 4,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | "(150L, 4L)\n",
164 | "(150L,)\n"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "# print the shapes of X and y\n",
170 | "print(X.shape)\n",
171 | "print(y.shape)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "## scikit-learn 4-step modeling pattern"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "**Step 1:** Import the class you plan to use"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 5,
191 | "metadata": {
192 | "collapsed": false
193 | },
194 | "outputs": [],
195 | "source": [
196 | "from sklearn.neighbors import KNeighborsClassifier"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "**Step 2:** \"Instantiate\" the \"estimator\"\n",
204 | "\n",
205 | "- \"Estimator\" is scikit-learn's term for model\n",
206 | "- \"Instantiate\" means \"make an instance of\""
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 6,
212 | "metadata": {
213 | "collapsed": false
214 | },
215 | "outputs": [],
216 | "source": [
217 | "knn = KNeighborsClassifier(n_neighbors=1)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "- Name of the object does not matter\n",
225 | "- Can specify tuning parameters (aka \"hyperparameters\") during this step\n",
226 | "- All parameters not specified are set to their defaults"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 7,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stdout",
238 | "output_type": "stream",
239 | "text": [
240 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
241 | " metric_params=None, n_jobs=1, n_neighbors=1, p=2,\n",
242 | " weights='uniform')\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "print(knn)"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "**Step 3:** Fit the model with data (aka \"model training\")\n",
255 | "\n",
256 | "- Model is learning the relationship between X and y\n",
257 | "- Occurs in-place"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 8,
263 | "metadata": {
264 | "collapsed": false
265 | },
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/plain": [
270 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
271 | " metric_params=None, n_jobs=1, n_neighbors=1, p=2,\n",
272 | " weights='uniform')"
273 | ]
274 | },
275 | "execution_count": 8,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "knn.fit(X, y)"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "**Step 4:** Predict the response for a new observation\n",
289 | "\n",
290 | "- New observations are called \"out-of-sample\" data\n",
291 | "- Uses the information it learned during the model training process"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 9,
297 | "metadata": {
298 | "collapsed": false
299 | },
300 | "outputs": [
301 | {
302 | "data": {
303 | "text/plain": [
304 | "array([2])"
305 | ]
306 | },
307 | "execution_count": 9,
308 | "metadata": {},
309 | "output_type": "execute_result"
310 | }
311 | ],
312 | "source": [
313 | "knn.predict([[3, 5, 4, 2]])"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "- Returns a NumPy array\n",
321 | "- Can predict for multiple observations at once"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 10,
327 | "metadata": {
328 | "collapsed": false
329 | },
330 | "outputs": [
331 | {
332 | "data": {
333 | "text/plain": [
334 | "array([2, 1])"
335 | ]
336 | },
337 | "execution_count": 10,
338 | "metadata": {},
339 | "output_type": "execute_result"
340 | }
341 | ],
342 | "source": [
343 | "X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]\n",
344 | "knn.predict(X_new)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "## Using a different value for K"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 11,
357 | "metadata": {
358 | "collapsed": false
359 | },
360 | "outputs": [
361 | {
362 | "data": {
363 | "text/plain": [
364 | "array([1, 1])"
365 | ]
366 | },
367 | "execution_count": 11,
368 | "metadata": {},
369 | "output_type": "execute_result"
370 | }
371 | ],
372 | "source": [
373 | "# instantiate the model (using the value K=5)\n",
374 | "knn = KNeighborsClassifier(n_neighbors=5)\n",
375 | "\n",
376 | "# fit the model with data\n",
377 | "knn.fit(X, y)\n",
378 | "\n",
379 | "# predict the response for new observations\n",
380 | "knn.predict(X_new)"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "## Using a different classification model"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 12,
393 | "metadata": {
394 | "collapsed": false
395 | },
396 | "outputs": [
397 | {
398 | "data": {
399 | "text/plain": [
400 | "array([2, 0])"
401 | ]
402 | },
403 | "execution_count": 12,
404 | "metadata": {},
405 | "output_type": "execute_result"
406 | }
407 | ],
408 | "source": [
409 | "# import the class\n",
410 | "from sklearn.linear_model import LogisticRegression\n",
411 | "\n",
412 | "# instantiate the model (using the default parameters)\n",
413 | "logreg = LogisticRegression()\n",
414 | "\n",
415 | "# fit the model with data\n",
416 | "logreg.fit(X, y)\n",
417 | "\n",
418 | "# predict the response for new observations\n",
419 | "logreg.predict(X_new)"
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "## Resources\n",
427 | "\n",
428 | "- [Nearest Neighbors](http://scikit-learn.org/stable/modules/neighbors.html) (user guide), [KNeighborsClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) (class documentation)\n",
429 | "- [Logistic Regression](http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression) (user guide), [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) (class documentation)\n",
430 | "- [Videos from An Introduction to Statistical Learning](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/)\n",
431 | " - Classification Problems and K-Nearest Neighbors (Chapter 2)\n",
432 | " - Introduction to Classification (Chapter 4)\n",
433 | " - Logistic Regression and Maximum Likelihood (Chapter 4)"
434 | ]
435 | },
436 | {
437 | "cell_type": "markdown",
438 | "metadata": {},
439 | "source": [
440 | "## Comments or Questions?\n",
441 | "\n",
442 | "- Email: \n",
443 | "- Website: http://dataschool.io\n",
444 | "- Twitter: [@justmarkham](https://twitter.com/justmarkham)"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 1,
450 | "metadata": {
451 | "collapsed": false
452 | },
453 | "outputs": [
454 | {
455 | "data": {
456 | "text/html": [
457 | "\n",
509 | ""
524 | ],
525 | "text/plain": [
526 | ""
527 | ]
528 | },
529 | "execution_count": 1,
530 | "metadata": {},
531 | "output_type": "execute_result"
532 | }
533 | ],
534 | "source": [
535 | "from IPython.core.display import HTML\n",
536 | "def css_styling():\n",
537 | " styles = open(\"styles/custom.css\", \"r\").read()\n",
538 | " return HTML(styles)\n",
539 | "css_styling()"
540 | ]
541 | }
542 | ],
543 | "metadata": {
544 | "kernelspec": {
545 | "display_name": "Python 2",
546 | "language": "python",
547 | "name": "python2"
548 | },
549 | "language_info": {
550 | "codemirror_mode": {
551 | "name": "ipython",
552 | "version": 2
553 | },
554 | "file_extension": ".py",
555 | "mimetype": "text/x-python",
556 | "name": "python",
557 | "nbconvert_exporter": "python",
558 | "pygments_lexer": "ipython2",
559 | "version": "2.7.11"
560 | }
561 | },
562 | "nbformat": 4,
563 | "nbformat_minor": 0
564 | }
565 |
--------------------------------------------------------------------------------
/03_getting_started_with_iris.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Getting started in scikit-learn with the famous iris dataset\n",
8 | "*From the video series: [Introduction to machine learning with scikit-learn](https://github.com/justmarkham/scikit-learn-videos)*"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Agenda\n",
16 | "\n",
17 | "- What is the famous iris dataset, and how does it relate to machine learning?\n",
18 | "- How do we load the iris dataset into scikit-learn?\n",
19 | "- How do we describe a dataset using machine learning terminology?\n",
20 | "- What are scikit-learn's four key requirements for working with data?"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Introducing the iris dataset"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "![Iris](images/03_iris.png)"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "- 50 samples of 3 different species of iris (150 samples total)\n",
42 | "- Measurements: sepal length, sepal width, petal length, petal width"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {
49 | "collapsed": false
50 | },
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | " \n",
63 | " "
64 | ],
65 | "text/plain": [
66 | ""
67 | ]
68 | },
69 | "execution_count": 2,
70 | "metadata": {},
71 | "output_type": "execute_result"
72 | }
73 | ],
74 | "source": [
75 | "from IPython.display import IFrame\n",
76 | "IFrame('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', width=300, height=200)"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "## Machine learning on the iris dataset\n",
84 | "\n",
85 | "- Framed as a **supervised learning** problem: Predict the species of an iris using the measurements\n",
86 | "- Famous dataset for machine learning because prediction is **easy**\n",
87 | "- Learn more about the iris dataset: [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Iris)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "## Loading the iris dataset into scikit-learn"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 3,
100 | "metadata": {
101 | "collapsed": false
102 | },
103 | "outputs": [],
104 | "source": [
105 | "# import load_iris function from datasets module\n",
106 | "from sklearn.datasets import load_iris"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 4,
112 | "metadata": {
113 | "collapsed": false
114 | },
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "sklearn.datasets.base.Bunch"
120 | ]
121 | },
122 | "execution_count": 4,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "# save \"bunch\" object containing iris dataset and its attributes\n",
129 | "iris = load_iris()\n",
130 | "type(iris)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 5,
136 | "metadata": {
137 | "collapsed": false
138 | },
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "[[ 5.1 3.5 1.4 0.2]\n",
145 | " [ 4.9 3. 1.4 0.2]\n",
146 | " [ 4.7 3.2 1.3 0.2]\n",
147 | " [ 4.6 3.1 1.5 0.2]\n",
148 | " [ 5. 3.6 1.4 0.2]\n",
149 | " [ 5.4 3.9 1.7 0.4]\n",
150 | " [ 4.6 3.4 1.4 0.3]\n",
151 | " [ 5. 3.4 1.5 0.2]\n",
152 | " [ 4.4 2.9 1.4 0.2]\n",
153 | " [ 4.9 3.1 1.5 0.1]\n",
154 | " [ 5.4 3.7 1.5 0.2]\n",
155 | " [ 4.8 3.4 1.6 0.2]\n",
156 | " [ 4.8 3. 1.4 0.1]\n",
157 | " [ 4.3 3. 1.1 0.1]\n",
158 | " [ 5.8 4. 1.2 0.2]\n",
159 | " [ 5.7 4.4 1.5 0.4]\n",
160 | " [ 5.4 3.9 1.3 0.4]\n",
161 | " [ 5.1 3.5 1.4 0.3]\n",
162 | " [ 5.7 3.8 1.7 0.3]\n",
163 | " [ 5.1 3.8 1.5 0.3]\n",
164 | " [ 5.4 3.4 1.7 0.2]\n",
165 | " [ 5.1 3.7 1.5 0.4]\n",
166 | " [ 4.6 3.6 1. 0.2]\n",
167 | " [ 5.1 3.3 1.7 0.5]\n",
168 | " [ 4.8 3.4 1.9 0.2]\n",
169 | " [ 5. 3. 1.6 0.2]\n",
170 | " [ 5. 3.4 1.6 0.4]\n",
171 | " [ 5.2 3.5 1.5 0.2]\n",
172 | " [ 5.2 3.4 1.4 0.2]\n",
173 | " [ 4.7 3.2 1.6 0.2]\n",
174 | " [ 4.8 3.1 1.6 0.2]\n",
175 | " [ 5.4 3.4 1.5 0.4]\n",
176 | " [ 5.2 4.1 1.5 0.1]\n",
177 | " [ 5.5 4.2 1.4 0.2]\n",
178 | " [ 4.9 3.1 1.5 0.1]\n",
179 | " [ 5. 3.2 1.2 0.2]\n",
180 | " [ 5.5 3.5 1.3 0.2]\n",
181 | " [ 4.9 3.1 1.5 0.1]\n",
182 | " [ 4.4 3. 1.3 0.2]\n",
183 | " [ 5.1 3.4 1.5 0.2]\n",
184 | " [ 5. 3.5 1.3 0.3]\n",
185 | " [ 4.5 2.3 1.3 0.3]\n",
186 | " [ 4.4 3.2 1.3 0.2]\n",
187 | " [ 5. 3.5 1.6 0.6]\n",
188 | " [ 5.1 3.8 1.9 0.4]\n",
189 | " [ 4.8 3. 1.4 0.3]\n",
190 | " [ 5.1 3.8 1.6 0.2]\n",
191 | " [ 4.6 3.2 1.4 0.2]\n",
192 | " [ 5.3 3.7 1.5 0.2]\n",
193 | " [ 5. 3.3 1.4 0.2]\n",
194 | " [ 7. 3.2 4.7 1.4]\n",
195 | " [ 6.4 3.2 4.5 1.5]\n",
196 | " [ 6.9 3.1 4.9 1.5]\n",
197 | " [ 5.5 2.3 4. 1.3]\n",
198 | " [ 6.5 2.8 4.6 1.5]\n",
199 | " [ 5.7 2.8 4.5 1.3]\n",
200 | " [ 6.3 3.3 4.7 1.6]\n",
201 | " [ 4.9 2.4 3.3 1. ]\n",
202 | " [ 6.6 2.9 4.6 1.3]\n",
203 | " [ 5.2 2.7 3.9 1.4]\n",
204 | " [ 5. 2. 3.5 1. ]\n",
205 | " [ 5.9 3. 4.2 1.5]\n",
206 | " [ 6. 2.2 4. 1. ]\n",
207 | " [ 6.1 2.9 4.7 1.4]\n",
208 | " [ 5.6 2.9 3.6 1.3]\n",
209 | " [ 6.7 3.1 4.4 1.4]\n",
210 | " [ 5.6 3. 4.5 1.5]\n",
211 | " [ 5.8 2.7 4.1 1. ]\n",
212 | " [ 6.2 2.2 4.5 1.5]\n",
213 | " [ 5.6 2.5 3.9 1.1]\n",
214 | " [ 5.9 3.2 4.8 1.8]\n",
215 | " [ 6.1 2.8 4. 1.3]\n",
216 | " [ 6.3 2.5 4.9 1.5]\n",
217 | " [ 6.1 2.8 4.7 1.2]\n",
218 | " [ 6.4 2.9 4.3 1.3]\n",
219 | " [ 6.6 3. 4.4 1.4]\n",
220 | " [ 6.8 2.8 4.8 1.4]\n",
221 | " [ 6.7 3. 5. 1.7]\n",
222 | " [ 6. 2.9 4.5 1.5]\n",
223 | " [ 5.7 2.6 3.5 1. ]\n",
224 | " [ 5.5 2.4 3.8 1.1]\n",
225 | " [ 5.5 2.4 3.7 1. ]\n",
226 | " [ 5.8 2.7 3.9 1.2]\n",
227 | " [ 6. 2.7 5.1 1.6]\n",
228 | " [ 5.4 3. 4.5 1.5]\n",
229 | " [ 6. 3.4 4.5 1.6]\n",
230 | " [ 6.7 3.1 4.7 1.5]\n",
231 | " [ 6.3 2.3 4.4 1.3]\n",
232 | " [ 5.6 3. 4.1 1.3]\n",
233 | " [ 5.5 2.5 4. 1.3]\n",
234 | " [ 5.5 2.6 4.4 1.2]\n",
235 | " [ 6.1 3. 4.6 1.4]\n",
236 | " [ 5.8 2.6 4. 1.2]\n",
237 | " [ 5. 2.3 3.3 1. ]\n",
238 | " [ 5.6 2.7 4.2 1.3]\n",
239 | " [ 5.7 3. 4.2 1.2]\n",
240 | " [ 5.7 2.9 4.2 1.3]\n",
241 | " [ 6.2 2.9 4.3 1.3]\n",
242 | " [ 5.1 2.5 3. 1.1]\n",
243 | " [ 5.7 2.8 4.1 1.3]\n",
244 | " [ 6.3 3.3 6. 2.5]\n",
245 | " [ 5.8 2.7 5.1 1.9]\n",
246 | " [ 7.1 3. 5.9 2.1]\n",
247 | " [ 6.3 2.9 5.6 1.8]\n",
248 | " [ 6.5 3. 5.8 2.2]\n",
249 | " [ 7.6 3. 6.6 2.1]\n",
250 | " [ 4.9 2.5 4.5 1.7]\n",
251 | " [ 7.3 2.9 6.3 1.8]\n",
252 | " [ 6.7 2.5 5.8 1.8]\n",
253 | " [ 7.2 3.6 6.1 2.5]\n",
254 | " [ 6.5 3.2 5.1 2. ]\n",
255 | " [ 6.4 2.7 5.3 1.9]\n",
256 | " [ 6.8 3. 5.5 2.1]\n",
257 | " [ 5.7 2.5 5. 2. ]\n",
258 | " [ 5.8 2.8 5.1 2.4]\n",
259 | " [ 6.4 3.2 5.3 2.3]\n",
260 | " [ 6.5 3. 5.5 1.8]\n",
261 | " [ 7.7 3.8 6.7 2.2]\n",
262 | " [ 7.7 2.6 6.9 2.3]\n",
263 | " [ 6. 2.2 5. 1.5]\n",
264 | " [ 6.9 3.2 5.7 2.3]\n",
265 | " [ 5.6 2.8 4.9 2. ]\n",
266 | " [ 7.7 2.8 6.7 2. ]\n",
267 | " [ 6.3 2.7 4.9 1.8]\n",
268 | " [ 6.7 3.3 5.7 2.1]\n",
269 | " [ 7.2 3.2 6. 1.8]\n",
270 | " [ 6.2 2.8 4.8 1.8]\n",
271 | " [ 6.1 3. 4.9 1.8]\n",
272 | " [ 6.4 2.8 5.6 2.1]\n",
273 | " [ 7.2 3. 5.8 1.6]\n",
274 | " [ 7.4 2.8 6.1 1.9]\n",
275 | " [ 7.9 3.8 6.4 2. ]\n",
276 | " [ 6.4 2.8 5.6 2.2]\n",
277 | " [ 6.3 2.8 5.1 1.5]\n",
278 | " [ 6.1 2.6 5.6 1.4]\n",
279 | " [ 7.7 3. 6.1 2.3]\n",
280 | " [ 6.3 3.4 5.6 2.4]\n",
281 | " [ 6.4 3.1 5.5 1.8]\n",
282 | " [ 6. 3. 4.8 1.8]\n",
283 | " [ 6.9 3.1 5.4 2.1]\n",
284 | " [ 6.7 3.1 5.6 2.4]\n",
285 | " [ 6.9 3.1 5.1 2.3]\n",
286 | " [ 5.8 2.7 5.1 1.9]\n",
287 | " [ 6.8 3.2 5.9 2.3]\n",
288 | " [ 6.7 3.3 5.7 2.5]\n",
289 | " [ 6.7 3. 5.2 2.3]\n",
290 | " [ 6.3 2.5 5. 1.9]\n",
291 | " [ 6.5 3. 5.2 2. ]\n",
292 | " [ 6.2 3.4 5.4 2.3]\n",
293 | " [ 5.9 3. 5.1 1.8]]\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "# print the iris data\n",
299 | "print(iris.data)"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "## Machine learning terminology\n",
307 | "\n",
308 | "- Each row is an **observation** (also known as: sample, example, instance, record)\n",
309 | "- Each column is a **feature** (also known as: predictor, attribute, independent variable, input, regressor, covariate)"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 6,
315 | "metadata": {
316 | "collapsed": false
317 | },
318 | "outputs": [
319 | {
320 | "name": "stdout",
321 | "output_type": "stream",
322 | "text": [
323 | "['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n"
324 | ]
325 | }
326 | ],
327 | "source": [
328 | "# print the names of the four features\n",
329 | "print(iris.feature_names)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 7,
335 | "metadata": {
336 | "collapsed": false
337 | },
338 | "outputs": [
339 | {
340 | "name": "stdout",
341 | "output_type": "stream",
342 | "text": [
343 | "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
344 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
345 | " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2\n",
346 | " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n",
347 | " 2 2]\n"
348 | ]
349 | }
350 | ],
351 | "source": [
352 | "# print integers representing the species of each observation\n",
353 | "print(iris.target)"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 8,
359 | "metadata": {
360 | "collapsed": false
361 | },
362 | "outputs": [
363 | {
364 | "name": "stdout",
365 | "output_type": "stream",
366 | "text": [
367 | "['setosa' 'versicolor' 'virginica']\n"
368 | ]
369 | }
370 | ],
371 | "source": [
372 | "# print the encoding scheme for species: 0 = setosa, 1 = versicolor, 2 = virginica\n",
373 | "print(iris.target_names)"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "metadata": {},
379 | "source": [
380 | "- Each value we are predicting is the **response** (also known as: target, outcome, label, dependent variable)\n",
381 | "- **Classification** is supervised learning in which the response is categorical\n",
382 | "- **Regression** is supervised learning in which the response is ordered and continuous"
383 | ]
384 | },
385 | {
386 | "cell_type": "markdown",
387 | "metadata": {},
388 | "source": [
389 | "## Requirements for working with data in scikit-learn\n",
390 | "\n",
391 | "1. Features and response are **separate objects**\n",
392 | "2. Features and response should be **numeric**\n",
393 | "3. Features and response should be **NumPy arrays**\n",
394 | "4. Features and response should have **specific shapes**"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 9,
400 | "metadata": {
401 | "collapsed": false
402 | },
403 | "outputs": [
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | "<type 'numpy.ndarray'>\n",
409 | "<type 'numpy.ndarray'>\n"
410 | ]
411 | }
412 | ],
413 | "source": [
414 | "# check the types of the features and response\n",
415 | "print(type(iris.data))\n",
416 | "print(type(iris.target))"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": 10,
422 | "metadata": {
423 | "collapsed": false
424 | },
425 | "outputs": [
426 | {
427 | "name": "stdout",
428 | "output_type": "stream",
429 | "text": [
430 | "(150L, 4L)\n"
431 | ]
432 | }
433 | ],
434 | "source": [
435 | "# check the shape of the features (first dimension = number of observations, second dimension = number of features)\n",
436 | "print(iris.data.shape)"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 11,
442 | "metadata": {
443 | "collapsed": false
444 | },
445 | "outputs": [
446 | {
447 | "name": "stdout",
448 | "output_type": "stream",
449 | "text": [
450 | "(150L,)\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "# check the shape of the response (single dimension matching the number of observations)\n",
456 | "print(iris.target.shape)"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 12,
462 | "metadata": {
463 | "collapsed": false
464 | },
465 | "outputs": [],
466 | "source": [
467 | "# store feature matrix in \"X\"\n",
468 | "X = iris.data\n",
469 | "\n",
470 | "# store response vector in \"y\"\n",
471 | "y = iris.target"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "## Resources\n",
479 | "\n",
480 | "- scikit-learn documentation: [Dataset loading utilities](http://scikit-learn.org/stable/datasets/)\n",
481 | "- Jake VanderPlas: Fast Numerical Computing with NumPy ([slides](https://speakerdeck.com/jakevdp/losing-your-loops-fast-numerical-computing-with-numpy-pycon-2015), [video](https://www.youtube.com/watch?v=EEUXKG97YRw))\n",
482 | "- Scott Shell: [An Introduction to NumPy](http://www.engr.ucsb.edu/~shell/che210d/numpy.pdf) (PDF)"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "## Comments or Questions?\n",
490 | "\n",
491 | "- Email: \n",
492 | "- Website: http://dataschool.io\n",
493 | "- Twitter: [@justmarkham](https://twitter.com/justmarkham)"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 1,
499 | "metadata": {
500 | "collapsed": false
501 | },
502 | "outputs": [
503 | {
504 | "data": {
505 | "text/html": [
506 | "\n",
558 | ""
573 | ],
574 | "text/plain": [
575 | ""
576 | ]
577 | },
578 | "execution_count": 1,
579 | "metadata": {},
580 | "output_type": "execute_result"
581 | }
582 | ],
583 | "source": [
584 | "from IPython.core.display import HTML\n",
585 | "def css_styling():\n",
586 | " styles = open(\"styles/custom.css\", \"r\").read()\n",
587 | " return HTML(styles)\n",
588 | "css_styling()"
589 | ]
590 | }
591 | ],
592 | "metadata": {
593 | "kernelspec": {
594 | "display_name": "Python 2",
595 | "language": "python",
596 | "name": "python2"
597 | },
598 | "language_info": {
599 | "codemirror_mode": {
600 | "name": "ipython",
601 | "version": 2
602 | },
603 | "file_extension": ".py",
604 | "mimetype": "text/x-python",
605 | "name": "python",
606 | "nbconvert_exporter": "python",
607 | "pygments_lexer": "ipython2",
608 | "version": "2.7.11"
609 | }
610 | },
611 | "nbformat": 4,
612 | "nbformat_minor": 0
613 | }
614 |
--------------------------------------------------------------------------------
/05_model_evaluation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Comparing machine learning models in scikit-learn\n",
8 | "*From the video series: [Introduction to machine learning with scikit-learn](https://github.com/justmarkham/scikit-learn-videos)*"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Agenda\n",
16 | "\n",
17 | "- How do I choose **which model to use** for my supervised learning task?\n",
18 | "- How do I choose the **best tuning parameters** for that model?\n",
19 | "- How do I estimate the **likely performance of my model** on out-of-sample data?"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Review\n",
27 | "\n",
28 | "- Classification task: Predicting the species of an unknown iris\n",
29 | "- Used three classification models: KNN (K=1), KNN (K=5), logistic regression\n",
30 | "- Need a way to choose between the models\n",
31 | "\n",
32 | "**Solution:** Model evaluation procedures"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Evaluation procedure #1: Train and test on the entire dataset"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "1. Train the model on the **entire dataset**.\n",
47 | "2. Test the model on the **same dataset**, and evaluate how well we did by comparing the **predicted** response values with the **true** response values."
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 2,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# read in the iris data\n",
59 | "from sklearn.datasets import load_iris\n",
60 | "iris = load_iris()\n",
61 | "\n",
62 | "# create X (features) and y (response)\n",
63 | "X = iris.data\n",
64 | "y = iris.target"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### Logistic regression"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "metadata": {
78 | "collapsed": false
79 | },
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/plain": [
84 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
85 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
86 | " 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,\n",
87 | " 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,\n",
88 | " 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
89 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,\n",
90 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
91 | ]
92 | },
93 | "execution_count": 3,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "# import the class\n",
100 | "from sklearn.linear_model import LogisticRegression\n",
101 | "\n",
102 | "# instantiate the model (using the default parameters)\n",
103 | "logreg = LogisticRegression()\n",
104 | "\n",
105 | "# fit the model with data\n",
106 | "logreg.fit(X, y)\n",
107 | "\n",
108 | "# predict the response values for the observations in X\n",
109 | "logreg.predict(X)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 4,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/plain": [
122 | "150"
123 | ]
124 | },
125 | "execution_count": 4,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "# store the predicted response values\n",
132 | "y_pred = logreg.predict(X)\n",
133 | "\n",
134 | "# check how many predictions were generated\n",
135 | "len(y_pred)"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "Classification accuracy:\n",
143 | "\n",
144 | "- **Proportion** of correct predictions\n",
145 | "- Common **evaluation metric** for classification problems"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 5,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "0.96\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "# compute classification accuracy for the logistic regression model\n",
165 | "from sklearn import metrics\n",
166 | "print(metrics.accuracy_score(y, y_pred))"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "- Known as **training accuracy** when you train and test the model on the same data"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "### KNN (K=5)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 6,
186 | "metadata": {
187 | "collapsed": false
188 | },
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "0.966666666667\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "from sklearn.neighbors import KNeighborsClassifier\n",
200 | "knn = KNeighborsClassifier(n_neighbors=5)\n",
201 | "knn.fit(X, y)\n",
202 | "y_pred = knn.predict(X)\n",
203 | "print(metrics.accuracy_score(y, y_pred))"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "### KNN (K=1)"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 7,
216 | "metadata": {
217 | "collapsed": false
218 | },
219 | "outputs": [
220 | {
221 | "name": "stdout",
222 | "output_type": "stream",
223 | "text": [
224 | "1.0\n"
225 | ]
226 | }
227 | ],
228 | "source": [
229 | "knn = KNeighborsClassifier(n_neighbors=1)\n",
230 | "knn.fit(X, y)\n",
231 | "y_pred = knn.predict(X)\n",
232 | "print(metrics.accuracy_score(y, y_pred))"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### Problems with training and testing on the same data\n",
240 | "\n",
241 | "- Goal is to estimate likely performance of a model on **out-of-sample data**\n",
242 | "- But, maximizing training accuracy rewards **overly complex models** that won't necessarily generalize\n",
243 | "- Unnecessarily complex models **overfit** the training data"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "![Overfitting](images/05_overfitting.png)"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "*Image Credit: [Overfitting](http://commons.wikimedia.org/wiki/File:Overfitting.svg#/media/File:Overfitting.svg) by Chabacano. Licensed under GFDL via Wikimedia Commons.*"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "## Evaluation procedure #2: Train/test split"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "1. Split the dataset into two pieces: a **training set** and a **testing set**.\n",
272 | "2. Train the model on the **training set**.\n",
273 | "3. Test the model on the **testing set**, and evaluate how well we did."
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 8,
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "outputs": [
283 | {
284 | "name": "stdout",
285 | "output_type": "stream",
286 | "text": [
287 | "(150L, 4L)\n",
288 | "(150L,)\n"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "# print the shapes of X and y\n",
294 | "print(X.shape)\n",
295 | "print(y.shape)"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 9,
301 | "metadata": {
302 | "collapsed": false
303 | },
304 | "outputs": [],
305 | "source": [
306 | "# STEP 1: split X and y into training and testing sets\n",
307 | "from sklearn.cross_validation import train_test_split\n",
308 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "metadata": {},
314 | "source": [
315 | "![Train/test split](images/05_train_test_split.png)"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": [
322 | "What did this accomplish?\n",
323 | "\n",
324 | "- Model can be trained and tested on **different data**\n",
325 | "- Response values are known for the testing set, and thus **predictions can be evaluated**\n",
326 | "- **Testing accuracy** is a better estimate of out-of-sample performance than training accuracy"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 10,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [
336 | {
337 | "name": "stdout",
338 | "output_type": "stream",
339 | "text": [
340 | "(90L, 4L)\n",
341 | "(60L, 4L)\n"
342 | ]
343 | }
344 | ],
345 | "source": [
346 | "# print the shapes of the new X objects\n",
347 | "print(X_train.shape)\n",
348 | "print(X_test.shape)"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 11,
354 | "metadata": {
355 | "collapsed": false
356 | },
357 | "outputs": [
358 | {
359 | "name": "stdout",
360 | "output_type": "stream",
361 | "text": [
362 | "(90L,)\n",
363 | "(60L,)\n"
364 | ]
365 | }
366 | ],
367 | "source": [
368 | "# print the shapes of the new y objects\n",
369 | "print(y_train.shape)\n",
370 | "print(y_test.shape)"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 12,
376 | "metadata": {
377 | "collapsed": false
378 | },
379 | "outputs": [
380 | {
381 | "data": {
382 | "text/plain": [
383 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
384 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
385 | " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
386 | " verbose=0, warm_start=False)"
387 | ]
388 | },
389 | "execution_count": 12,
390 | "metadata": {},
391 | "output_type": "execute_result"
392 | }
393 | ],
394 | "source": [
395 | "# STEP 2: train the model on the training set\n",
396 | "logreg = LogisticRegression()\n",
397 | "logreg.fit(X_train, y_train)"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 13,
403 | "metadata": {
404 | "collapsed": false
405 | },
406 | "outputs": [
407 | {
408 | "name": "stdout",
409 | "output_type": "stream",
410 | "text": [
411 | "0.95\n"
412 | ]
413 | }
414 | ],
415 | "source": [
416 | "# STEP 3: make predictions on the testing set\n",
417 | "y_pred = logreg.predict(X_test)\n",
418 | "\n",
419 | "# compare actual response values (y_test) with predicted response values (y_pred)\n",
420 | "print(metrics.accuracy_score(y_test, y_pred))"
421 | ]
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {},
426 | "source": [
427 | "Repeat for KNN with K=5:"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 14,
433 | "metadata": {
434 | "collapsed": false
435 | },
436 | "outputs": [
437 | {
438 | "name": "stdout",
439 | "output_type": "stream",
440 | "text": [
441 | "0.966666666667\n"
442 | ]
443 | }
444 | ],
445 | "source": [
446 | "knn = KNeighborsClassifier(n_neighbors=5)\n",
447 | "knn.fit(X_train, y_train)\n",
448 | "y_pred = knn.predict(X_test)\n",
449 | "print(metrics.accuracy_score(y_test, y_pred))"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "Repeat for KNN with K=1:"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 15,
462 | "metadata": {
463 | "collapsed": false
464 | },
465 | "outputs": [
466 | {
467 | "name": "stdout",
468 | "output_type": "stream",
469 | "text": [
470 | "0.95\n"
471 | ]
472 | }
473 | ],
474 | "source": [
475 | "knn = KNeighborsClassifier(n_neighbors=1)\n",
476 | "knn.fit(X_train, y_train)\n",
477 | "y_pred = knn.predict(X_test)\n",
478 | "print(metrics.accuracy_score(y_test, y_pred))"
479 | ]
480 | },
481 | {
482 | "cell_type": "markdown",
483 | "metadata": {},
484 | "source": [
485 | "Can we locate an even better value for K?"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 16,
491 | "metadata": {
492 | "collapsed": false
493 | },
494 | "outputs": [],
495 | "source": [
496 | "# try K=1 through K=25 and record testing accuracy\n",
497 | "k_range = list(range(1, 26))\n",
498 | "scores = []\n",
499 | "for k in k_range:\n",
500 | " knn = KNeighborsClassifier(n_neighbors=k)\n",
501 | " knn.fit(X_train, y_train)\n",
502 | " y_pred = knn.predict(X_test)\n",
503 | " scores.append(metrics.accuracy_score(y_test, y_pred))"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 17,
509 | "metadata": {
510 | "collapsed": false
511 | },
512 | "outputs": [
513 | {
514 | "data": {
515 | "text/plain": [
516 | ""
517 | ]
518 | },
519 | "execution_count": 17,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | },
523 | {
524 | "data": {
525 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAEPCAYAAACHuClZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xu4XHV97/H3JzcgAZIIuZDbThBFYrmqKdZSNwUlWhWf\nHKtgrXh50mjFW09PiZy2iR6rQE95ioIVKuWJHpSqRYGqCAhb0ZoSTQgXQ0CT7CTkAoEEEsIlyf6e\nP9aaZDKZvffsPWutmT3zeT3PfjKzrr81LOY7v9/39/stRQRmZmb1GNboApiZ2dDnYGJmZnVzMDEz\ns7o5mJiZWd0cTMzMrG4OJmZmVrfcg4mkuZIekfSopEuqrB8n6WZJKyUtlTS7bN2nJT0k6QFJN0oa\nlS5fJGmjpOXp39y8r8PMzHqXazCRNAy4GjgPeDVwoaRXVWx2KbAiIk4FLgK+lO47Bfg4cEZEnAKM\nAC4o2+/KiDgj/bs9z+swM7O+5V0zmQM8FhHdEbEHuAk4v2Kb2cDdABGxGpgpaUK6bjgwRtIIYDSw\nqWw/5VpyMzOrWd7BZCqwoez9xnRZuZXAPABJc4AZwLSI2AT8E7AeeBzYERF3le13saT7JX1N0ti8\nLsDMzPrXDAn4y4DxkpYDHwNWAPskjSOpxXQAU4AjJb033ecrwPERcRqwBbiy+GKbmVnJiJyP/zhJ\nTaNkWrpsv4jYCXyo9F7SGmANMBdYExFPp8tvBv4A+GZEPFl2iH8Fbqt2ckmeeMzMbBAiYkCphLxr\nJsuAEyR1pD2xLgBuLd9A0lhJI9PX84GfRcQukuatMyUdLknAOcCqdLvJZYeYBzzUWwEiwn8RLFq0\nqOFlaJY/fxb+LPxZ9P03GLnWTCJin6SLgTtIAtf1EbFK0oJkdVwHnAQskdQDPAx8ON33PknfJWn2\n2pP+e1166CsknQb0AOuABXleh5mZ9S3vZi4i6bZ7YsWya8teL61cX7bus8Bnqyx/f8bFNDOzOjRD\nAt4K0NnZ2egiNA1/Fgf4szjAn0V9NNj2saFAUrTy9ZmZ5UES0WQJeDMzawMOJmZmVjcHEzMzq1vu\nvblsaNq3D372M9i7t9ElsVZw2GFw1lmgAmbUW7MGjj8+//PYwRxMrKpf/hLe9S4444xGl8RawS9/\nCStWwCteke951qyB174Wnn463/PYoRxMrKo1a2DuXLjxxkaXxFrBuecm91TeweR3v4Pt22HHDhg3\nLt9z2cGcM7Gq1q2DmTMbXQprFTNnJvdU3krnKOJcdjAHE6tq3TqYNavRpbBWMWuWg0mrczCxqlwz\nsSwVWTMZP97BpBEcTKyqtWsdTCw7M2cm91Te1q6Fzs5izmUHczCxQ+zdC5s2wfTpjS6JtYoim7nO\nPts1k0ZwMLFDPP44TJyYjA0wy8LkyUkPq+efz+8cL7wATz0Fr3+9g0kjOJjYIZwvsawNGwYzZkB3\nd37nWL8+qU2//OXJPew5XovlYGKHcL7E8pB33qR035bGl2zfnt+57FAOJnYI10wsD3nnTUrd2aXi\ncjR2gIOJHcJjTCwPeXcPLv8RVFRXZDvAwcQO4ZqJ5cHBpLU5mNghnDOxPBSVMyniXHYoBxM7yJ49\nsHmzx5hY9orKmRRxLjuUg4kdZONGOO44GDmy0SWxVjNpEuzcCc89l/2xn38+GccyeXLy3s1cxXMw\nsYM4X2J5kaCjI58v+e7uZBzLsPQbrXQejzUpTu7BRNJcSY9IelTSJVXWj5N0s6SVkpZKml227tOS\nHpL0gKQbJY1Kl4+XdIek1ZJ+LGls3tfRLpwvsTzlVWOovG/HjYMRI5IR8VaMXIOJpGHA1cB5wKuB\nCyW9qmKzS4EVEXEqcBHwpXTfKcDHgTMi4hSSB3ldkO6zELgrIk4E7gY+k+d1tBN3C7Y85ZXLqHbf\nOm9SrLxrJnOAxyKiOyL2ADcB51dsM5
skIBARq4GZkiak64YDYySNAEYDj6fLzweWpK+XAO/M7xLa\ni5u5LE951Uyq3bfOmxQr72AyFdhQ9n5juqzcSmAegKQ5wAxgWkRsAv4JWE8SRHZExE/SfSZGxFaA\niNgCTMztCtqMg4nlKa8uuw4mjdcMz4C/DLhK0nLgQWAFsE/SOJIaSAfwDPBdSe+NiG9WOUavabbF\nixfvf93Z2UlnZ2d2JW9BzplYnorKmZTOtXp19udqRV1dXXR1ddV1DEWO3R0knQksjoi56fuFQETE\n5X3sswY4BZgLnBcR89Plfw78fkRcLGkV0BkRWyVNBu6JiJOqHCvyvL5W89JLcNRRSdfNEc3wM8Na\nzhNPwOzZsG1btsedOBEeeOBA12CA226Dr34VfvCDbM/VDiQRERrIPnk3cy0DTpDUkfbEugC4tXwD\nSWMljUxfzwd+FhG7SJq3zpR0uCQB5wCr0t1uBT6Qvr4IuCXn62gLGzbAlCkOJJafCRNg9+5kvElW\nnnsuOd6kSQcvdzNXsXINJhGxD7gYuAN4GLgpIlZJWiDpL9LNTgIeSmsb5wGfTPe9D/guSbPXSkDA\ndek+lwNvkrSaJMhclud1tAvnSyxvUvZf8t3dybgSVfyO9liTYuX+GzQibgdOrFh2bdnrpZXry9Z9\nFvhsleVPA+dmW1JzvsSKUOqye/LJ2Ryvt/v26KPh8MPhySeTZjDLl0fA234eY2JFyLpm0td967Em\nxXEwsf3czGVFyCOY9HbfOm9SHAcT28/BxIqQ9VgTB5Pm4GBi+zlnYkXIuumpr/vWzzUpjoOJAfDi\ni0nf/6mV8xOYZcw5k9bkYGIArF8P06bB8OGNLom1umOOSQbIPvNM/cfauTMZtzJhQvX1buYqjoOJ\nAc6XWHGyHGvS3Z0cq3KMSUlHR7KNx5rkz8HEAOdLrFhZNT/1d98eeSSMGQNbt9Z/Luubg4kBHmNi\nxcqqZlLLfeu8STEcTAxwM5cVK6teVrXct86bFMPBxAAHEytWljUTB5Pm4GBigHMmVqyicibgsSZF\ncTAxnn8etm9Ppp83K4JzJq3HwcRYvx6mT4dhvhusIOPHQ09P8iNmsJ55JhmvcswxfW/nZq5i+OvD\nnC+xwmUx1qS/MSYlHR3JD6aensGfy/rnYGLOl1hD1Nv8tHZtbd3ZR4+GsWNhy5bBn8v652BiHmNi\nDVFvzWQgNWo3deXPwcTczGUNUW8vKweT5uJgYm7msoaot5lroMHE3YPz5WBirplYQ9RbW6g1ZwLu\nHlwEB5M2t3s3PPssTJ7c6JJYuykFk8HO6OtmrubiYNLmurthxgyPMbHijRuX3HdPPz3wfXfsSLr6\njh9f2/YOJvnzV0ibc77EGmmwzU+lWkl/Y0xKOjpgwwbYt2/g57La5B5MJM2V9IikRyVdUmX9OEk3\nS1opaamk2enyV0paIWl5+u8zkj6RrlskaWO6brmkuXlfR6tyvsQaabA1hoHkSwAOPxxe9jLYvHng\n57LajMjz4JKGAVcD5wCbgGWSbomIR8o2uxRYERHzJJ0IXAOcGxGPAqeXHWcjcHPZfldGxJV5lr8d\neIyJNdJgg8lgfgSVzjVt2sDPZ/3Lu2YyB3gsIrojYg9wE3B+xTazgbsBImI1MFNS5ROdzwV+FxEb\ny5bVWMG1vrhmYo002C679QQTy0fewWQqsKHs/cZ0WbmVwDwASXOAGUDlb4f3AN+qWHaxpPslfU3S\n2OyK3F6cM7FGqjdnMhAea5KvXJu5anQZcJWk5cCDwApgf5pM0kjgHcDCsn2+AnwuIkLS54ErgQ9X\nO/jixYv3v+7s7KSzszPj4g9trplYIxWVM4Fk+6VLB36udtDV1UVXV1ddx1AMtpN3LQeXzgQWR8Tc\n9P1CICLi8j72WQucHBG70vfvAP6ydIwq23cAt0XEKVXWRZ7XN9Tt2gUTJ8Jzz9XeK8YsS888kzxH\nZ9
eu2u/BiGTixvXrk+7FtbrzTrjsMvjJTwZX1nYiiYgY0LdC3s1cy4ATJHVIGgVcANxavoGksWnt\nA0nzgZ+WAknqQiqauCSVD7GbBzyUR+FbXXd30mXSgcQaZexYOOww2Lat9n22b0/GpwwkkIBzJnnL\ntZkrIvZJuhi4gyRwXR8RqyQtSFbHdcBJwBJJPcDDlDVXSRpNknz/i4pDXyHpNKAHWAcsyPM6WpXz\nJdYMSnmTCZXdbnox2KbZGTNg48ZkrMnw4QPf3/qWe84kIm4HTqxYdm3Z66WV68vW7QYOucUi4v0Z\nF7MtOV9izaBUY3jd62rbfjD5EkhqQBMmwOOPJ4HFsuUR8G3MY0ysGQy0l1U9P4Lc1JUfB5M25pqJ\nNYOBfsE7mDSnfoOJpI96HEdrcs7EmsFAx5rUc996rEl+aqmZdADLJX1T0rl5F8iK45qJNYPB1EwG\n2zzr55rkp99gEhELgVcANwIfkfSYpM9Jmplz2SxHzz4LL7xQew8as7wM5LkmEcm2HR31ncuyV1PO\nJCJKXXDXkXTHPQ64RdIXcyuZ5aq7e2BTeJvl5cgjYcwYeOKJ/rd96ikYNSoZnzIYDib5qSVn8jFJ\n9wFXAb8GTomI+SQz+r4n5/JZTpwvsWZS65d8vfft9OmwaRPs3Tv4Y1h1tdRMpgAXRsS5EfGtiHgR\n9tdW3pFr6Sw3zpdYM6k1mNTbnX3UKJg0KRm8aNmqJZh8H9haeiPpKEmvBYgIT2MyRHmMiTWTWntZ\nZfEjyE1d+aglmFwH7C57/xxwbS/b2hDhmok1k1p7WTmYNK9agsmwtEkL2N+8NTK/IlkRnDOxZlJU\nzqR0Lo81yV4twWRtOnBxuKRhkj5G0qvLhjDXTKyZFJUzAY81yUstwWQByTPct6Z/bwTm51koy9eO\nHUlvlmOOaXRJzBIdHUl39Z6e3repd4xJiZu58tHvrMERsRV4VwFlsYJ4jIk1mzFj4OijYetWOO64\n6ts8+SSMHg1HHVXfuRxM8tFvMJF0GPAB4NXA4aXlEVH5jBEbIpwvsWZU+pLvLZhkdd9OmwZbtsCe\nPTDS2d/M1NLM9XVgJvA24L+BlwMv5Fgmy5nzJdaM+kuMZ9WdfeTIJGBt2FD/seyAWoLJKyPiM8Cu\niLgemAvMybdYliePMbFm1F/zU5Y/gtzUlb1agsme9N8dkk4CjgIm5lcky5trJtaM+utl5WDS3GoJ\nJtdLGg8sAn4MPAr831xLZblyzsSaUX9f8Fnetx5rkr0+E/CShgPbImI7cA/gJycPcaXulQ4m1myK\nyplAcpy77srmWJbos2YSEfuASwsqixVgx47k3/HjG1sOs0odHUlSvNpYk4ikS3u9Y0xK3MyVvVqa\nue6Q9ClJx0k6uvSXe8ksF6VaiceYWLM54ojkR87mzYeu27o1GV8yZkw253IwyV6/40yA96X//s+y\nZYGbvIYk50usmZW+5KdOPXh51vft1KnJw7hefBEOOyy747azWh7bO73KX82BRNJcSY9IelTSJVXW\nj5N0s6SVkpZKmp0uf6WkFZKWp/8+I+kT6brxku6QtFrSjyUN8rlr7cfdgq2Z9ZY3yfq+HTEiCSge\na5KdWkbAv7fa8oj4Zg37DgOuJpnbaxOwTNItEfFI2WaXAisiYp6kE4FrgHMj4lGSpzmWjrMRuDnd\nZyFwV0RckQaoz6TLrB/r1sHxxze6FGbV9dY9OI9OI6Va0AknZHvcdlVLzuSssr83AV+k9rm65gCP\nRUR3ROwBbgLOr9hmNnA3QESsBmZKmlCxzbnA7yKi9Hy084El6eslwDtrLE/bc08ua2a95TLyDCaW\njVqauT5a9vdB4DTgiBqPPxUor0huTJeVWwnMA5A0hyQXM61im/cA3yp7PzGdgJKI2IIHUdbMORNr\nZr19wedx33qsSbZqScBX2glk2VByGXCVpOXAg8AKYF9ppaSRJM+a
76sZK3pbsXjx4v2vOzs76ezs\nrK+0Q5jHmFizKypnAsnxbr8922MOVV1dXXR1ddV1DEX0+j2cbCB9jwNf1sNIZg++JSL+ut+DS2cC\niyNibvp+IRARcXkf+6wFTo6IXen7dwB/WTpGumwV0BkRWyVNBu6JiJOqHCv6u7528tRTSfvw9u2N\nLolZdS++mExFv3s3DB+eLOvpSaae37496T6clXvvhYUL4Re/yO6YrUISETGgAQS11EyuLnu9F+iO\niHU1Hn8ZcIKkDmAzcAFwYfkGaU+s3RGxR9J84KelQJK6kIObuABuJZkW/3LgIuCWGsvT1lwrsWZ3\n2GFw7LGwaRNMn54s27IFxo3LNpCAcyZZqyWYPAY8EREvAEg6QtL0iOi3U11E7JN0MXAHSa3m+ohY\nJWlBsjquA04ClkjqAR4GPlzaX9JokuR75bNTLge+LelDQDfw7hquo+05X2JDQelLvhRM8rpvp0yB\nbdvghRfg8MP73976VkswuRn4g7L3PcB/UOM09BFxO3BixbJry14vrVxftm43UNmzi4h4miTI2AB4\njIkNBaW8yVlnJe/zum+HD08C1vr18MpXZn/8dlNL1+AREfFS6U1EvAh4zOgQ5GYuGwoqx5rked+6\nqSs7tQSTpyS9tfRG0tuAp/MrkuXFwcSGgsoveAeToaGWYPJR4HOS1qY9rf4eWJBvsSwPzpnYUFDZ\nPTjP+9ZjTbLTb84kndbktZLGpe935F4qy5zHmNhQUa1mkleub9Ys+M//zOfY7abfmomk/yNpXETs\niIgd6SSLny2icJadbduSHitH++EB1uRmzEi6Bu/dC/v2JZMxzshpjnI3c2Wnlmaut5XXRtKnLr49\nvyJZHlwrsaFi1CiYOBEefzx5tskxx+TXddfBJDu1dA0eLmlUqUeXpMOBUfkWy7LmfIkNJaVcxvDh\n+d63xx2XjKx//vnsB0W2m1qCyU3AnZL+LX3/IaDf6eetuXiMiQ0lpRrD8OH53rfDhiVNaN3d8KpX\n5XeedlBLAv4Lkh7gwCDBKyLiB/kWy7K2bh3Mnt3oUpjVpjTWJO+aCRwIXA4m9aklZ0JE/GdEfCoi\nPkUy7uSqnMtlGXPOxIaS0hd8Efet8ybZqCmYSDpZ0hck/Q74R8A9s4cY50xsKCnlTIq4bz3WJBu9\nNnNJOp5kxt4LgV3AvwMjI+KsgspmGYlI2oQdTGyoKG/myjvXN2sWfP/7+Z6jHfSVM/ktcC8wLx24\niKSPF1Iqy9QTT8CYMXDkkY0uiVltpk1LugVLB2YPzoububLRVzPXu4EngbskfUXSG4EBPSzFmoPz\nJTbUjByZdNudMCF5xkmeHEyy0WswiYjvRsS7SJ6suJTksbmTJH1Z0h8XVUCrn/MlNhTNnFnMfTtp\nEjz7LDz3XP7namX9JuAjYmdEfD0i3gLMAFYBi3IvmWXGY0xsKJo1q5j7dtgw6OhI8oo2eLUMWtwv\nIrYBX0n/rA433wzvLuj5kD09cMMNxZzLLCunnprMzVWE3/s9OPnkJEfTrK6+Gj7ykUaXoneKiEaX\nITeSolmvb9Gi5Et+UUF1vBED+tlg1l4iigtcg/Ev/wK/+U3ybxEkEREDCq3+immQtWvh7LP9JW/W\nDKTm/n/x+OPhB00+70hNgxYte85jmFmtKh9l3Iz6jcWStgOVbUXPAL8C/ldErMuhXC3P3XXNrFal\nDgIRzZvXqaVidw2wmQMzBV8IzARWAjcAZ+dSshb20kuwdWsyMMvMrD9jxsBRRyXfG5MnN7o01dXS\nzPX2iLgmIranf18B3hwRNwIvy7l8LWnDhmRAVjO30ZpZc2n2OcRqCSbPS5pXepO+fjF929PfzpLm\nSnpE0qOSLqmyfpykmyWtlLRU0uyydWMlfUfSKkkPS/r9dPkiSRslLU//5tZwHU3D+RIzG6hmz5vU\nEkzeB8yX9LSkp4D5wJ9LGg18
qq8dJQ0DrgbOIxlJf6GkyqcGXAqsiIhTgYuAL5Wtuwr4YUScBJxK\nMmCy5MqIOCP9u72G62gazpeY2UA1+7QvtTwc67fAW3pZ/dN+dp8DPBYR3QCSbgLOBx4p22Y28MX0\nXKslzZQ0gaT2c1ZEfCBdtxd4tmy/Jk1D9c/BxMwGauZMWLGi0aXoXb81E0nHSvqbdLLH60p/NR5/\nKrCh7P3GdFm5lcC89FxzSKZsmQbMArZJuiFtyrpOUvlTmi+WdL+kr0kaW2N5moLnyjKzgRryNRPg\nFpKJHn8O5DFG9DLgKknLgQeBFel5RgJnAB+LiF9J+meSySYXkUzn8rmICEmfB64EPlzt4IsXL97/\nurOzk87OzhwuYWCcMzGzgcozZ9LV1UVXV1ddx+h3OhVJ90fEaYM6uHQmsDgi5qbvFwIREZf3sc9a\n4GRgDPDLiDg+Xf6HwCUR8faK7TuA2yLilCrHasrpVKZNg//6L5gxo9ElMbOh4vnnYfx42L07mZwy\nT4OZTqWWIv1I0psHWaZlwAmSOiSNAi4Abi3fIO2xNTJ9PR/4aUTsioitwAZJr0w3PQf4TbpdeU/r\necBDgyxf4V58EZ58EqZMaXRJzGwoOeIIGDcueWhYM6qlmesjwCWSdgMvkSS+IyL6HWMSEfskXQzc\nQRK4ro+IVZIWpMe4DjgJWCKpB3iYg5urPgHcmAabNcAH0+VXSDqNpGvyOmBBDdfRFNavh6lTPcbE\nzAaulDeZWpl5bgK1fKUdW88J0m67J1Ysu7bs9dLK9WXrVgKvq7L8/fWUqZGcLzGzwSrlTd7whkaX\n5FC9BhNJr4iIx0jGh1TzQD5Fam3uFmxmg9XMPbr6qpksJGlyuqbKugD+KJcStTgHEzMbrJkzYdmy\nRpeiul6DSUSUchd/HBF7yteVEuY2cGvXwlt6GwJqZtaHmTPhO99pdCmqq6U313/XuMxq4JyJmQ1W\nM8/P1VfOZCJwHHCEpJM5MH3J0cDoAsrWktzMZWaDNWNGMuv4vn0wfHijS3OwvnImfwJ8iGRqk2s4\nEEx2An+Xc7la0gsvwFNPJdPPm5kN1OGHwzHHwKZNMH16o0tzsL5yJjcAN0h6d0R8u8Aytazu7uQG\naLZfFGY2dJSaupotmNSSM5ko6WgASV+VdJ+kc3IuV0tyvsTM6tWs3YNrCSZ/ERHPplOqHEfyPJMr\n8i1Wa3K+xMzqNZSDSWmmxLcCX09Hpec8zVhrcjAxs3o16+N7awkKKyX9EHgbyaSPR3IgwNgA+Dkm\nZlavZu0eXMvcXB8EXgP8NiJ2SzqWXp4dYn1zzsTM6jVkm7kiYh9wPPDRdNERtexnh3Izl5nVa/p0\nePxx2Lu30SU5WC2P7b0aOBt4X7roOeCreRaqFe3eDTt2wOTJ/W9rZtabww6DCROSgNJMaqlh/EFE\nLABeAIiIp4FRuZaqBXV3J6NX835Cmpm1vmbMm9Ty1bZH0jDSpLukY0geSmUD4HyJmWWlGfMmvQYT\nSaXk/DXAfwATJH0W+DnQ6zPcrTrnS8wsK83YPbiv3lz3AWdExNcl/Ro4l2R+rj+NiCHzzPVm4W7B\nZpaVmTPh3nsbXYqD9RVMShM7EhEPkzyf3QZp3To4/fRGl8LMWsGsWfCNbzS6FAfrK5hMkPRXva2M\niCtzKE/Lcs7EzLLSjDmTvoLJcOBIymooNnjOmZhZVqZNS6ah37MHRjbJc2/7CiabI+JzhZWkhe3a\nBTt3wqRJjS6JmbWCUaOSMWsbNzZPi0dfXYNdI8lIdzd0dID8iZpZRpptrElfwSSTZ5ZImivpEUmP\nSrqkyvpxkm6WtFLSUkmzy9aNlfQdSaskPSzp99Pl4yXdIWm1pB9LGptFWfPifImZZa3Z8ia9BpN0\npHtd0sGOVwPnAa8GLpT0qorNLgVWRMSpwEXAl8rWXQX8MCJOAk4FVqXLFwJ3RcSJwN3AZ+ota5
6c\nLzGzrDXbWJO8J/eYAzwWEd0RsQe4CTi/YpvZJAGBiFgNzJQ0IX2641np44OJiL0R8Wy6z/nAkvT1\nEuCdOV9HXTzGxMyyNpSaubIwFdhQ9n5juqzcSmAegKQ5wAxgGjAL2CbpBknLJV0n6Yh0n4kRsRUg\nIrYAE3O8hrq5ZmJmWWu2Zq5anmeSt8uAqyQtBx4EVgD7gJHAGcDHIuJXkv6ZpHlrEYd2Duj1YV2L\nFy/e/7qzs5POzs4sy14T50zMLGtZBpOuri66urrqOoYi8ntooqQzgcURMTd9vxCIiOh1bi9Ja4GT\ngTHALyPi+HT5HwKXRMTbJa0COiNiq6TJwD1pXqXyWJHn9dXq2GPhN7+BiU1dfzKzoWTvXhgzJhl2\nMCrjedwlERED6n+adzPXMuAESR2SRgEXALeWb5D22BqZvp4P/DQidqXNWBskvTLd9BzgN+nrW4EP\npK8vAm7J9zIGb+fO5FkmEyY0uiRm1kpGjIApU2DDhv63LUKuzVwRsU/SxcAdJIHr+ohYJWlBsjqu\nA04ClkjqIZn/q/yRwJ8AbkyDzRqSRwhDMmvxtyV9COgG3p3nddSjlC/xGBMzy1qpqevlL290SQrI\nmUTE7cCJFcuuLXu9tHJ92bqVwOuqLH+aZBbjpud8iZnlpZm6B/u5fzlzTy4zy0sz9ehyMMmZx5iY\nWV6aaayJg0nOXDMxs7y4ZtJGnDMxs7w4Z9JGXDMxs7xMnQrbtsGLLza6JA4muXrmGXjpJTjmmEaX\nxMxa0fDhyYOy1q9vdEkcTHLlMSZmlrdmyZs4mOTI+RIzy1uz5E0cTHLkfImZ5c01kzbgMSZmlrdm\nGWviYJIj10zMLG+umbQB50zMLG/NkjPJ9Xkmjdbo55mMGwdr1sDLXtawIphZi+vpgdGjYft2OOKI\n/revRTM+z6Rtbd+e/EceP77RJTGzVjZsGEyf3vixJg4mOfEYEzMrSjPkTRxMcuJ8iZkVpRnyJg4m\nOXFPLjMrSjN0D3YwyYnHmJhZUdzM1cJcMzGzoriZq4U5Z2JmRWmGmonHmeQgAsaOTbrqjRtX+OnN\nrM309MCYMfDUU8mYk3p5nEmTePrppO+3A4mZFWHYMJgxA7q7G1iGxp26dbmJy8yK1ui8Se7BRNJc\nSY9IelTSJVXWj5N0s6SVkpZKml22bl26fIWk+8qWL5K0UdLy9G9u3tcxEE6+m1nRGp03GZHnwSUN\nA64GzgE2Acsk3RIRj5RtdimwIiLmSToRuAY4N13XA3RGxPYqh78yIq7MsfiD5mBiZkVr9FiTvGsm\nc4DHIqKIBZ5rAAAK3ElEQVQ7IvYANwHnV2wzG7gbICJWAzMlTUjXqY8yNu1EJR5jYmZFa3TNJO9g\nMhXYUPZ+Y7qs3EpgHoCkOcAMYFq6LoA7JS2TNL9iv4sl3S/pa5LGZl/0wXPOxMyK1uicSa7NXDW6\nDLhK0nLgQWAFsC9d94aI2JzWVO6UtCoifg58BfhcRISkzwNXAh+udvDFixfvf93Z2UlnZ2duF1Li\nZi4zK1o9NZOuri66urrqOn+u40wknQksjoi56fuFQETE5X3ssxY4OSJ2VSxfBOyszJNI6gBui4hT\nqhyr8HEmEXDUUfD448lYEzOzIkQkY02eeAKOPLK+YzXjOJNlwAmSOiSNAi4Abi3fQNJYSSPT1/OB\nn0bELkmjJR2ZLh8DvBl4KH0/uewQ80rLm8G2bTBqlAOJmRVLgo6Oxo01ybWZKyL2SboYuIMkcF0f\nEaskLUhWx3XAScASST3AwxxorpoEfE9SpOW8MSLuSNddIek0kt5e64AFeV7HQDhfYmaNUsqbvPrV\nxZ8795xJRNwOnFix7Nqy10sr16fL1wKn9XLM92dczMw4X2JmjdLI7sEeAZ8xBxMza5RGdg92MMmY\nx5iYWaM4mLQQ50zMrFEaOdbEwSRjbuYys0ZxzqRFRCT/IT
s6Gl0SM2tHxx4LL7wAzz5b/LkdTDL0\nxBPJg2mOOqrRJTGzdiQ1Lm/iYJIh50vMrNEcTFqA8yVm1miNyps4mGTIwcTMGs01kxbgMSZm1miN\n6h7sYJIh50zMrNFcM2kBbuYys0ZzzmSIi0imfvYYEzNrpJe9DPbuhR07ij2vg0lGtmxJxpeMGdPo\nkphZO2vUWBMHk4w4X2JmzcLBZAhzvsTMmkUj8iYOJhlxt2AzaxaumQxhrpmYWbNoxFgTB5OMOGdi\nZs3CzVxDmGsmZtYsSs1cEcWd08EkAz09sH69x5iYWXMYNy75t8ixJg4mGdi8OfmPd8QRjS6JmdmB\nsSZF5k1yDyaS5kp6RNKjki6psn6cpJslrZS0VNLssnXr0uUrJN1Xtny8pDskrZb0Y0lj876Ovjhf\nYmbNpui8Sa7BRNIw4GrgPODVwIWSXlWx2aXAiog4FbgI+FLZuh6gMyJOj4g5ZcsXAndFxInA3cBn\n8rqGWgyFfElXV1eji9A0/Fkc4M/igFb7LIruHpx3zWQO8FhEdEfEHuAm4PyKbWaTBAQiYjUwU9KE\ndJ16KeP5wJL09RLgnVkXfCCGwhiTVvsfpR7+LA7wZ3FAq30WrdbMNRXYUPZ+Y7qs3EpgHoCkOcAM\nYFq6LoA7JS2TNL9sn4kRsRUgIrYAE3Moe82GQs3EzNpL0TWTEcWdqleXAVdJWg48CKwA9qXr3hAR\nm9Oayp2SVkXEz6scI5MOcBs3wkc/OvD97rsP3v3uLEpgZpaN44+He++Ft7+9mPMpcuyILOlMYHFE\nzE3fLwQiIi7vY5+1wMkRsati+SJgZ0RcKWkVSS5lq6TJwD0RcVKVYxXYy9rMrHVEhAayfd41k2XA\nCZI6gM3ABcCF5RukPbF2R8SetCnrpxGxS9JoYFj6egzwZuCz6W63Ah8ALidJ2t9S7eQD/TDMzGxw\ncg0mEbFP0sXAHST5mesjYpWkBcnquA44CVgiqQd4GPhwuvsk4Htp7WIEcGNE3JGuuxz4tqQPAd2A\nG5nMzBoo12YuMzNrDy05Ar6/gZLtprfBn+1A0vWStkp6oGxZUw16LUovn8UiSRslLU//5jayjEWQ\nNE3S3ZIelvSgpE+ky9vuvqjyWXw8XT7g+6LlaibpQMlHgXOATSR5mwsi4pGGFqyBJK0BXhMR2xtd\nlqJJ+kNgF/D1iDglXXY58FREXJH+2BgfEQsbWc4i9PJZ7O/Y0tDCFSjttDM5Iu6XdCTwa5Kxax+k\nze6LPj6L9zDA+6IVaya1DJRsN70N/mx5aVfyyiDaVINei9LLZwHJ/dE2ImJLRNyfvt4FrCIZ29Z2\n90Uvn0VpLOCA7otW/IKpZaBku+lt8Ge7aqpBr03gYkn3S/paOzTtlJM0EzgNWApMauf7ouyz+O90\n0YDui1YMJnaoN0TEGcBbgY+lzR12QGu19Q7MV4DjI+I0YAvQTs1dRwLfBT6Z/iqvvA/a5r6o8lkM\n+L5oxWDyOMmULCXT0mVtKyI2p/8+CXyPpCmwnW2VNAn2txk/0eDyNExEPBkHEqf/CryukeUpiqQR\nJF+e34iI0ji1trwvqn0Wg7kvWjGY7B8oKWkUyUDJWxtcpoaRNDr91UHZ4M+HGluqwomD239Lg16h\nj0GvLeqgzyL90iyZR/vcG/8G/CYiripb1q73xSGfxWDui5brzQVJ12DgKg4MlLyswUVqGEmzSGoj\n5YM/2+bzkPRNoBM4BtgKLAK+D3wHmE466DUiCnwmXWP08lmcTdJO3gOsAxaU8gatStIbgJ+RzAUY\n6d+lwH3At2mj+6KPz+K9DPC+aMlgYmZmxWrFZi4zMyuYg4mZmdXNwcTMzOrmYGJmZnVzMDEzs7o5\nmJiZWd0cTGxIS6fPflPFsk9Kuqaf/XbmXK5jJS2V9Ou0L3/5unsknZG+npU+KuFNVY7xj+m04L0+\n5rqfMrxR0m1l7z8v6Y
eSRkrqkrSsbN1rJN1Ttl+PpD8pW3+bpD8aTDmsPTiY2FD3TSoeBU0y68E3\n+9kv7wFW5wIPRMRrIuIX1TaQNA34EfDpiLizyibzgVMioqZn8kgaXmVxpOv+Fng98M50Nu0AJkg6\nr3Lb1Ebgf9dyXjNwMLGh7z+At6bzCyGpAzguIn4haYykuyT9Kn042Dsqd67y6/3Lkt6fvj6j9Ate\n0o9K8zZV7N8h6Sfp8e9MHzZ0Ksmjpc9PHyx0WJVyTwF+DHwmIn5Q5bi3AEcCv5b0p2Xnub90nnS7\nGyT9i6Sl6TmrHEp/BZwHvD0iXipb94/A31b9VGEl8Iykc3pZb3YQBxMb0tIHft0HvCVddAHJlBgA\nL5D8En8t8MfAP/V2mMoFaXD6MvA/IuJ1wA3AF6rs+2Xghog4laQ29OWIWAn8PfDvEXFGRLxYZb8l\n6bbf6+W6zgd2p/t/p+w8p5XOU7b51Ig4MyL+usqh3gAsAN4SEbsrrvmXwIuS3litCMA/AH9XrXxm\nlRxMrBXcRBJESP/9VvpawBclrQTuAqZIqvUZFScCv0fyHJgVJE0+U6ps9/qy832D5Mu7FncC75N0\neB/blE9O2dd5vtPHMX6bHufNvRy714CRPkwrKnM+ZtU4mFgruAU4R9LpwBERsSJd/mfAscDpEXE6\nyZTilV/eezn4/4PSegEPpTWD0yPi1Ih4C4cabO7lCpIZrr+bPmq6mujldaXn+li3heQ5Nv8sqfOQ\nE0TcQ3LNZ/ay/xdImsI8iZ/1ycHEhryIeA7oIplK+1tlq8YCT0REj6SzgY6ydaVf5t3A7LSH0zig\nlCNYTZKgPhOSZi9Js6uc/r840AHgfcC9Ayj3p4Fn0nJXU14zqec8vyWZRvz/STqlyib/APxNL/ve\nCYwHqu1ntp+DibWKb5F84ZUHkxuB16XNXO8jeb51SQBExEaSHMtDJM1ly9Ple4B3AZdLuh9YQdLU\nVOkTwAfTbf4M+GQNZS3/lf8BYHIv3X/Lt+vtPDXVGCLiV8AHgVvTxxJE2bofkdTaejvWP5BMy27W\nK09Bb2ZmdXPNxMzM6uZgYmZmdXMwMTOzujmYmJlZ3RxMzMysbg4mZmZWNwcTMzOrm4OJmZnV7f8D\nw6JECwTkHloAAAAASUVORK5CYII=\n",
526 | "text/plain": [
527 | ""
528 | ]
529 | },
530 | "metadata": {},
531 | "output_type": "display_data"
532 | }
533 | ],
534 | "source": [
535 | "# import Matplotlib (scientific plotting library)\n",
536 | "import matplotlib.pyplot as plt\n",
537 | "\n",
538 | "# allow plots to appear within the notebook\n",
539 | "%matplotlib inline\n",
540 | "\n",
541 | "# plot the relationship between K and testing accuracy\n",
542 | "plt.plot(k_range, scores)\n",
543 | "plt.xlabel('Value of K for KNN')\n",
544 | "plt.ylabel('Testing Accuracy')"
545 | ]
546 | },
547 | {
548 | "cell_type": "markdown",
549 | "metadata": {},
550 | "source": [
551 | "- **Training accuracy** rises as model complexity increases\n",
552 | "- **Testing accuracy** penalizes models that are too complex or not complex enough\n",
553 | "- For KNN models, complexity is determined by the **value of K** (lower value = more complex)"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {},
559 | "source": [
560 | "## Making predictions on out-of-sample data"
561 | ]
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": 18,
566 | "metadata": {
567 | "collapsed": false
568 | },
569 | "outputs": [
570 | {
571 | "data": {
572 | "text/plain": [
573 | "array([1])"
574 | ]
575 | },
576 | "execution_count": 18,
577 | "metadata": {},
578 | "output_type": "execute_result"
579 | }
580 | ],
581 | "source": [
582 | "# instantiate the model with the best known parameters\n",
583 | "knn = KNeighborsClassifier(n_neighbors=11)\n",
584 | "\n",
585 | "# train the model with X and y (not X_train and y_train)\n",
586 | "knn.fit(X, y)\n",
587 | "\n",
588 | "# make a prediction for an out-of-sample observation\n",
589 | "knn.predict([[3, 5, 4, 2]])"
590 | ]
591 | },
592 | {
593 | "cell_type": "markdown",
594 | "metadata": {},
595 | "source": [
596 | "## Downsides of train/test split?"
597 | ]
598 | },
599 | {
600 | "cell_type": "markdown",
601 | "metadata": {},
602 | "source": [
603 | "- Provides a **high-variance estimate** of out-of-sample accuracy\n",
604 | "- **K-fold cross-validation** overcomes this limitation\n",
605 | "- But, train/test split is still useful because of its **flexibility and speed**"
606 | ]
607 | },
608 | {
609 | "cell_type": "markdown",
610 | "metadata": {},
611 | "source": [
612 | "## Resources\n",
613 | "\n",
614 | "- Quora: [What is an intuitive explanation of overfitting?](http://www.quora.com/What-is-an-intuitive-explanation-of-overfitting/answer/Jessica-Su)\n",
615 | "- Video: [Estimating prediction error](https://www.youtube.com/watch?v=_2ij6eaaSl0&t=2m34s) (12 minutes, starting at 2:34) by Hastie and Tibshirani\n",
616 | "- [Understanding the Bias-Variance Tradeoff](http://scott.fortmann-roe.com/docs/BiasVariance.html)\n",
617 | " - [Guiding questions](https://github.com/justmarkham/DAT8/blob/master/homework/09_bias_variance.md) when reading this article\n",
618 | "- Video: [Visualizing bias and variance](http://work.caltech.edu/library/081.html) (15 minutes) by Abu-Mostafa"
619 | ]
620 | },
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {},
624 | "source": [
625 | "## Comments or Questions?\n",
626 | "\n",
627 | "- Email: \n",
628 | "- Website: http://dataschool.io\n",
629 | "- Twitter: [@justmarkham](https://twitter.com/justmarkham)"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 1,
635 | "metadata": {
636 | "collapsed": false
637 | },
638 | "outputs": [
639 | {
640 | "data": {
641 | "text/html": [
642 | "\n",
694 | ""
709 | ],
710 | "text/plain": [
711 | ""
712 | ]
713 | },
714 | "execution_count": 1,
715 | "metadata": {},
716 | "output_type": "execute_result"
717 | }
718 | ],
719 | "source": [
720 | "from IPython.core.display import HTML\n",
721 | "def css_styling():\n",
722 | " styles = open(\"styles/custom.css\", \"r\").read()\n",
723 | " return HTML(styles)\n",
724 | "css_styling()"
725 | ]
726 | }
727 | ],
728 | "metadata": {
729 | "kernelspec": {
730 | "display_name": "Python 2",
731 | "language": "python",
732 | "name": "python2"
733 | },
734 | "language_info": {
735 | "codemirror_mode": {
736 | "name": "ipython",
737 | "version": 2
738 | },
739 | "file_extension": ".py",
740 | "mimetype": "text/x-python",
741 | "name": "python",
742 | "nbconvert_exporter": "python",
743 | "pygments_lexer": "ipython2",
744 | "version": "2.7.11"
745 | }
746 | },
747 | "nbformat": 4,
748 | "nbformat_minor": 0
749 | }
750 |
--------------------------------------------------------------------------------
/07_cross_validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Cross-validation for parameter tuning, model selection, and feature selection\n",
8 | "*From the video series: [Introduction to machine learning with scikit-learn](https://github.com/justmarkham/scikit-learn-videos)*"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Agenda\n",
16 | "\n",
17 | "- What is the drawback of using the **train/test split** procedure for model evaluation?\n",
18 | "- How does **K-fold cross-validation** overcome this limitation?\n",
19 | "- How can cross-validation be used for selecting **tuning parameters**, choosing between **models**, and selecting **features**?\n",
20 | "- What are some possible **improvements** to cross-validation?"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Review of model evaluation procedures"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "**Motivation:** Need a way to choose between machine learning models\n",
35 | "\n",
36 | "- Goal is to estimate likely performance of a model on **out-of-sample data**\n",
37 | "\n",
38 | "**Initial idea:** Train and test on the same data\n",
39 | "\n",
40 | "- But, maximizing **training accuracy** rewards overly complex models which **overfit** the training data\n",
41 | "\n",
42 | "**Alternative idea:** Train/test split\n",
43 | "\n",
44 | "- Split the dataset into two pieces, so that the model can be trained and tested on **different data**\n",
45 | "- **Testing accuracy** is a better estimate than training accuracy of out-of-sample performance\n",
46 | "- But, it provides a **high-variance** estimate since changing which observations happen to be in the testing set can significantly change testing accuracy"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "from sklearn.datasets import load_iris\n",
58 | "from sklearn.cross_validation import train_test_split\n",
59 | "from sklearn.neighbors import KNeighborsClassifier\n",
60 | "from sklearn import metrics"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [],
70 | "source": [
71 | "# read in the iris data\n",
72 | "iris = load_iris()\n",
73 | "\n",
74 | "# create X (features) and y (response)\n",
75 | "X = iris.data\n",
76 | "y = iris.target"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 4,
82 | "metadata": {
83 | "collapsed": false
84 | },
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "0.973684210526\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "# use train/test split with different random_state values\n",
96 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)\n",
97 | "\n",
98 | "# check classification accuracy of KNN with K=5\n",
99 | "knn = KNeighborsClassifier(n_neighbors=5)\n",
100 | "knn.fit(X_train, y_train)\n",
101 | "y_pred = knn.predict(X_test)\n",
102 | "print(metrics.accuracy_score(y_test, y_pred))"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "**Question:** What if we created a bunch of train/test splits, calculated the testing accuracy for each, and averaged the results together?\n",
110 | "\n",
111 | "**Answer:** That's the essence of cross-validation!"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "## Steps for K-fold cross-validation"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "1. Split the dataset into K **equal** partitions (or \"folds\").\n",
126 | "2. Use fold 1 as the **testing set** and the union of the other folds as the **training set**.\n",
127 | "3. Calculate **testing accuracy**.\n",
128 | "4. Repeat steps 2 and 3 K times, using a **different fold** as the testing set each time.\n",
129 | "5. Use the **average testing accuracy** as the estimate of out-of-sample accuracy."
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "Diagram of **5-fold cross-validation:**\n",
137 | "\n",
138 | "![5-fold cross-validation](images/07_cross_validation_diagram.png)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 5,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "Iteration Training set observations Testing set observations\n",
153 | " 1 [ 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] [0 1 2 3 4] \n",
154 | " 2 [ 0 1 2 3 4 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] [5 6 7 8 9] \n",
155 | " 3 [ 0 1 2 3 4 5 6 7 8 9 15 16 17 18 19 20 21 22 23 24] [10 11 12 13 14] \n",
156 | " 4 [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 20 21 22 23 24] [15 16 17 18 19] \n",
157 | " 5 [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] [20 21 22 23 24] \n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "# simulate splitting a dataset of 25 observations into 5 folds\n",
163 | "from sklearn.cross_validation import KFold\n",
164 | "kf = KFold(25, n_folds=5, shuffle=False)\n",
165 | "\n",
166 | "# print the contents of each training and testing set\n",
167 | "print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))\n",
168 | "for iteration, data in enumerate(kf, start=1):\n",
169 | " print('{:^9} {} {:^25}'.format(iteration, data[0], data[1]))"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "- Dataset contains **25 observations** (numbered 0 through 24)\n",
177 | "- 5-fold cross-validation, thus it runs for **5 iterations**\n",
178 | "- For each iteration, every observation is either in the training set or the testing set, **but not both**\n",
179 | "- Every observation is in the testing set **exactly once**"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "## Comparing cross-validation to train/test split"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "Advantages of **cross-validation:**\n",
194 | "\n",
195 | "- More accurate estimate of out-of-sample accuracy\n",
196 | "- More \"efficient\" use of data (every observation is used for both training and testing)\n",
197 | "\n",
198 | "Advantages of **train/test split:**\n",
199 | "\n",
200 | "- Runs K times faster than K-fold cross-validation\n",
201 | "- Simpler to examine the detailed results of the testing process"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "## Cross-validation recommendations"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "1. K can be any number, but **K=10** is generally recommended\n",
216 | "2. For classification problems, **stratified sampling** is recommended for creating the folds\n",
217 | " - Each response class should be represented with equal proportions in each of the K folds\n",
218 | " - scikit-learn's `cross_val_score` function does this by default"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "## Cross-validation example: parameter tuning"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "**Goal:** Select the best tuning parameters (aka \"hyperparameters\") for KNN on the iris dataset"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 6,
238 | "metadata": {
239 | "collapsed": false
240 | },
241 | "outputs": [],
242 | "source": [
243 | "from sklearn.cross_validation import cross_val_score"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 7,
249 | "metadata": {
250 | "collapsed": false
251 | },
252 | "outputs": [
253 | {
254 | "name": "stdout",
255 | "output_type": "stream",
256 | "text": [
257 | "[ 1. 0.93333333 1. 1. 0.86666667 0.93333333\n",
258 | " 0.93333333 1. 1. 1. ]\n"
259 | ]
260 | }
261 | ],
262 | "source": [
263 | "# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)\n",
264 | "knn = KNeighborsClassifier(n_neighbors=5)\n",
265 | "scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')\n",
266 | "print(scores)"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 8,
272 | "metadata": {
273 | "collapsed": false
274 | },
275 | "outputs": [
276 | {
277 | "name": "stdout",
278 | "output_type": "stream",
279 | "text": [
280 | "0.966666666667\n"
281 | ]
282 | }
283 | ],
284 | "source": [
285 | "# use average accuracy as an estimate of out-of-sample accuracy\n",
286 | "print(scores.mean())"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 9,
292 | "metadata": {
293 | "collapsed": false
294 | },
295 | "outputs": [
296 | {
297 | "name": "stdout",
298 | "output_type": "stream",
299 | "text": [
300 | "[0.95999999999999996, 0.95333333333333337, 0.96666666666666656, 0.96666666666666656, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.98000000000000009, 0.96666666666666656, 0.96666666666666656, 0.97333333333333338, 0.95999999999999996, 0.96666666666666656, 0.95999999999999996, 0.96666666666666656, 0.95333333333333337, 0.95333333333333337, 0.95333333333333337]\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "# search for an optimal value of K for KNN\n",
306 | "k_range = list(range(1, 31))\n",
307 | "k_scores = []\n",
308 | "for k in k_range:\n",
309 | " knn = KNeighborsClassifier(n_neighbors=k)\n",
310 | " scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')\n",
311 | " k_scores.append(scores.mean())\n",
312 | "print(k_scores)"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 10,
318 | "metadata": {
319 | "collapsed": false
320 | },
321 | "outputs": [
322 | {
323 | "data": {
324 | "text/plain": [
325 | ""
326 | ]
327 | },
328 | "execution_count": 10,
329 | "metadata": {},
330 | "output_type": "execute_result"
331 | },
332 | {
333 | "data": {
334 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAEPCAYAAACHuClZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xm8VOWd5/HP9yIggiyKiLKJMaKAsukVSKZzE2O7jRpN\npxM7GbfE2N0aEzvTo3E6I2bSHU26HbNMOtoxGcdOYmJiopmk3b3ZuBdBLqhsalAEBFwIgqDI8ps/\nnnOgKKruPbWcqjp1f+/X676oe5annkPB+dXz/M7zPDIznHPOuUq01LsCzjnnss+DiXPOuYp5MHHO\nOVcxDybOOecq5sHEOedcxTyYOOecq1jqwUTSGZKWS3pW0rUF9g+VdK+kxZI6JU3M2XeNpGckPSXp\nB5L6RduHSXpI0gpJD0oakvZ1OOecKy7VYCKpBfgWcDowCbhQ0nF5h10PdJnZFOBi4BvRuUcCnwGm\nm9mJwAHAx6JzrgMeMbMJwGPAF9K8Duecc91Lu2XSCjxnZqvMbAdwN3Be3jETCQEBM1sBHCXpsGhf\nH2CgpAOAg4C10fbzgDuj13cCH0rvEpxzzvUk7WAyClid8/uaaFuuxcAFAJJagbHAaDN7GfgX4CVC\nENlkZo9G54wwsw0AZrYeGJHaFTjnnOtRIyTgbwKGSVoIXAl0AbskDSW0QMYBRwKDJP1VkTJ8Thjn\nnKujA1Iufy2hpREbzd6uKgDMbAtwWfy7pJXASuAMYKWZbYy23wvMBn4IbJB0uJltkDQSeKXQm0vy\nIOOcc2UwM5VyfNotk/nAMZLGRU9ifQy4P/cASUMk9Y1eXw781szeJHRvzZR0oCQBpwLLotPuBy6J\nXl8M3FesAmbWtD833HBD3evQrNe3e7dx2GHGwIHGjh3NdW1mxksvGWDMnl39sk86yRg6tL7Xl/ZP\nvT+/tH/KkWowMbNdwFXAQ8AS4G4zWybpCkmfjg47HnhG0jLCU1+fjc59AvgpodtrMSDg9uicm4HT\nJK0gBJmb0rwO1/v88Y/Qrx+MHQtPPVXv2lTf3LnwgQ/AokWwfXv1yt22DZYsgc2bYffu6pXrGl/a\n3VyY2QPAhLxtt+W87szfn7PvRuDGAts3Ah+sbk2d26ujA2bPhiFDwuvp0+tdo+rq6IDTT4eNG6Gr\nC2bOrE65CxbAiSeGAPzKKzByZHXKdY2vERLwrkxtbW31rkKq6nl9c+fCrFnhZ+7c6pdf788ureuL\nyx01qo01a6pXbqOp9+fXiFRu/1gWSLJmvj6XnqlT4bbbQsvkrLNg5cp616h63noLhg+H116Dn/0M\n7rsP7rmnOmWfey78l/8Cd90Fl14K559fnXJdbUnCGiwB71zmbN4Mzz0H06bBscfCpk2wbl29a1U9\nCxbApEkwYEDoyps7F6rxncssdJ/NmgVjxsDq1T2f45qHBxPn8jzxRAgk/fpBS0u4OXZ01LtW1TN3\nbggiAOPHw86d8NJLlZf7/PMhQI0e7cGkN/Jg4lyeOPkemz27uYJJ7vVJ1bu+3HI9mPQ+HkycyxMn\nkWNpJeHrwSy968std8wYmjoB7/bnwcS5HLt3Q2fnvjfb1lZYvLi64zHqZeVK6N8/3Oxj1WqZ5Haf\njR7tLZPexoOJczmWL4dhw/YdHzFoELz73WE8Rtblt0oAZsyApUth69byy928OQSqKVPC76NGhYcW\ndu0qv0yXLR5MnMuR++06V/zUU9YVur4BA2Dy5PCUV7nmzQsDO/v1C7/37w+HHALr15dfpssWDybO\n5chPvseaJQmf1vUVKteT8L2LBxPnchTqBoK9Seosj4HdsiU8vjt16v77Km15Ffp78yR87+LBxLnI\nxo3h5nfCCfvvGz8+9P9n+Zt27viZfPFYmnKCZaGHFsCT8L2NBxPnIp2dcPLJcECB6U+l7D8iXKzV\nBeHGP2BAaLmUatmyMD3LiLz1Tr2bq3
fxYOJcpFjyPZb1JHxP11fuSP9i5Xow6V08mDgXKZacjmU5\nCV+sKypXucGy2N+bB5PexYOJc4T5qZ54ovt1PeLxGNu21a5e1bJ8eXhU9/DDix9TbjAp1n3mCfje\nxYOJc8Azz4S8wSGHFD/mwANDcr6S8Rj10lOrC8KAw5UrwwDEpF5/HV5+OYxTyXfEEWGBrJ07S6ur\nyyYPJs7RfXI6V1aT8Emur1+/MPBw3rzk5XZ2hulm+vTZf1/fvnDYYSHYuObnwcQ5kn1zh+wm4XtK\nvsdKzQv1VK7nTXoPDybOkfxmW8l4jHrZuBHWri3cFZWv1JZXT0HYg0nv4cHE9XobNoQb7nHH9Xxs\nJeMx6qW78TP5Zs0Kx+/e3fOxO3fC/PlwyinFj/EkfO/hwcT1eh0d4SmuloT/G7L2iHDSLjwIAw+H\nDw8DEXvy1FMwdmyYZbkYHwXfe3gwcb1e0i6uWNbyJmldX7zee3e8m6v38GDier0kN8VcWXqiK+6K\n6m78TL6kLa8kQcqDSe/hwcT1au+8AwsXhsdbk5o6tfTxGPUSj5/prisqX9JgmaT7zINJ7+HBxPVq\nXV1wzDEweHDyc/r2LX08Rr0kHT+Ta/LksEri668XP2bdOti0CY49tvuyRo4MDze8805pdXDZ48HE\n9WqlJKdzZSUJX8719ekTWmqdnd2XO2tWzw8t9OkTAsrataXVwWWPBxPXq5WanI5lJQlf7vX11NVV\nSp7Ju7p6h9SDiaQzJC2X9KykawvsHyrpXkmLJXVKmhhtP1ZSl6SF0Z9vSLo62neDpDXRvoWSzkj7\nOlxzKjX5Hps5M/l4jHrZsAH+9CeYMKH0c3tqeZUSpDyY9A6pBhNJLcC3gNOBScCFkvKHhl0PdJnZ\nFOBi4BsAZvasmU0zs+nADGArcG/OebeY2fTo54E0r8M1p9WrYft2eNe7Sj93xIgw71SS8Rj1Uur4\nmVynnBKeAis0SeP27bBoUfKHFjyY9A5pt0xagefMbJWZ7QDuBs7LO2Yi8BiAma0AjpJ0WN4xHwT+\naGa5Y2mVUp1dLxEnp1Xmv6RGf0S4nOR7bNiwMCDx6af339fVFRLvgwYlK8tHwfcOaQeTUUDud5I1\n0bZci4ELACS1AmOB0XnHfBT4Ud62qyQtkvRdSUOqV2XXW5SbfI81ehK+GtdXKFiWmofxUfC9QyMk\n4G8ChklaCFwJdAG74p2S+gLnAvfknPNt4GgzmwqsB26pXXVdsyg3OR1r5CT8O++EFkQp42fyFWt5\nlZpn8m6u3iHB1G8VWUtoacRGR9v2MLMtwGXx75JeAFbmHHIm8KSZvZpzzqs5+/8N+GWxCsyZM2fP\n67a2Ntra2kqpv2tSb70FS5bASSeVX8akSXvHYxx6aPXqVg3x+JmDDy6/jNmz4ctf3nebWQgwN9+c\nvBwPJo2vvb2d9vb2isqQpTiXtqQ+wArgVGAd8ARwoZktyzlmCLDNzHZIuhx4j5ldkrP/R8ADZnZn\nzraRZrY+en0NcLKZ/VWB97c0r89l1+9+B5//fFiqtxKnnQaf+xycfXZ16lUtt94Kzz4L3/52+WXs\n3h0mfVy6NIwVAVi1KiTn161LnmvavTvMtLxpU/jTNT5JmFlJ2cRUu7nMbBdwFfAQsAS428yWSbpC\n0qejw44HnpG0jPDU12fj8yUdREi+37tvyXxV0lOSFgHvA65J8zpc8yn3keB8jZqEryT5Hmtp2bt+\nSyz+eyvloYWWFhg1ygcuNru0u7mIHtudkLfttpzXnfn7c/ZtA/Kf7MLMLqpyNV0vM3cuXHhh5eXM\nng1f/Wrl5VRbRwd85SuVlxPnhc4/P/xebp4pTsIfc0zldXKNqRES8M7VVNzvX0nyPTZzZvHxGPWy\nenVIwB99dOVlFWqZlPP35nmT5ufBxPU6K1dCv37hBlepoUPDeIynnqq8rGqpdPxMrtbWMEBx+3bY\nuj
XkT2bMKL0cDybNz4OJ63Wq1SqJNdp4k0rHl+QaNCgMUOzqggUL4IQT4MADSy/Hg0nz82Diep1q\nJd9jjZaEr0byPVfc1VXJ35uPgm9+Hkxcr9PMLZNqjJ/JFyfhK/l781Hwzc+DietVtmyB55+HadOq\nV+axx8Ibb4SxF/W2YEFY3Kqa4znillelLRMPJs3Ng4nrVZ54Iiy7269f9cpsaQlPdTVC66TaXVwA\n48fDrl0hQI3OnzUvoeHDQ6tp69bq1s01jtTHmTgH4XHc5cvh+OPrW49qd3HFZs+GH/84LOlbT7/6\nFVx1VXXLlML1VRKApb1dXcflL0KRkuXLw1ou1XiqzfWsx2Ai6RzgV2bWwMsAuUa3aBG8972hO+iA\nOn6F6eiAyy+vfrnnnx/WhL/99uqXXYpDD4UPfKD65X7qU2EJ3krESfhaBJNdu8K0L52d9f8C01sk\n+W/9UeBWST8Dvmdmy1Ouk2tCc+fCtm1hPMb06fWpw+7dIZh873vVL3viRLj//uqX2yjOOqvyMmqZ\nhF+yBDZvhpde8mBSKz3mTMzsE8A04I/A/5HUIenTkiqYj9T1NnPnwuDB9X2EdvlyOOSQvZMWutqq\nZRI+/nfmSf/aSZSAN7PNwE8JKyUeAZwPLJT0mRTr5ppIRwdccUV9k9TVHl/iSlPLYNLRAUcc4cGk\nlnoMJpLOlfRzoB3oC7Sa2ZnAFODz6VbPNYP168P045dcUt+WSVrJd5dMrVsmH/mID5SspSQtkw8D\n/8vMTjCzr5nZK7BnRt9Pplo71xQ6OsKjs8cdV9/xGN4yqa9ajYJ/9dXwc/rp3jKppSTBZA5hUSsA\nJA2QdBSAmT2aSq1cU4lbBIXWx6iVjRvDjeWEE2r/3i6oVQK+oyM8yTVunAeTWkoSTO4Bch8L3sW+\n67E7163c7qV6TT3S2RlmwK3nY8m93bBhYar+zZvTfZ+4BRp3q/liq7WRJJgcYGbvxL9Er6s4ftg1\ns+3bwxiT1tbwe70mRfQurvqTapM3ib+8DB4cWsObNqX7fi5IEkxelXRu/Iuk84DX0quSayZdXWHu\nqkGDwu+562PUkiffG0PawWTHDnjyydDNFb+fJ+FrI0kw+WvgekkvSVoNXAtckW61XLPIbxHkro9R\nKzt3htUQZ86s3Xu6wtK+uS9eHOYSGzJk7/t53qQ2euxBNrM/AjMlDYp+fzP1WrmmMXcunHfevtvi\nKc1rdXN/5hkYNSoMWHT1lXYSPr8F6sGkdhKlIyWdDUwCDlQ0a5qZfSnFerkmEK+1fvPN+26fPRt+\n8Yva1cO7uBrHmDHpPoDR0REeCc59Pw8mtZFk0OJ3CPNzfQYQ8BFgXMr1ck1g9erQxTR+/L7b4yR8\nrZ6y8eR740j75p7/xcEX5aqdJDmT2WZ2EfAnM7sRmAUcm261XDOI/2PnTwEer4/x0ku1rYervzSD\nydq1Yb2Ud7973/fzBHxtJAkmb0d/bpN0JLCDMD+Xc90q1iKQajd4ccOGMGCxVmtouO7FN/c0WqXx\nv7fcLy/ezVU7SYLJLyUNBb4GLAReBH6YZqVcc+iuRRAn4dMWT+XS4muKNoQ0x34U+veWZvBy++r2\nv5ikFuBRM9tkZj8j5EqOM7P/UZPauczatg2WLoUZMwrvr9VIeO/iajxptRYKtYQHDoQDD4TXX6/+\n+7l9dRtMotUV/3fO79vN7I3Ua+Uyb8ECmDw5rBteyIwZIdikvSa4J98bTxrB5O23w8JrJ5+8/z5P\nwtdGksb/o5I+LPlKyi65nloEBx4YJl1csCC9OrzzDixcuHcqF9cY0ggmTz4ZVlQcOLDw+3kSPn1J\ngskVhIkdt0vaLGmLpJSnanNZN3duzy2CtJPwXV1wzDGhn941jjRu7t21QD0JXxtJlu092MxazKyf\nmQ2Ofk/831PSGZKWS3pW0rUF9g+VdK+kxZI6JU2Mth8rqUvSwujP
NyRdHe0bJukhSSskPShpSCkX\n7dJlFv5z95SrSDsJn6QOrvbS6HbqriXswaQ2kgxa/LNCP0kKjxL43wJOJ4ygv1BS/kOa1wNdZjYF\nuBj4BoCZPWtm08xsOjAD2ArcG51zHfCImU0AHgO+kKQ+rjaefz7kSkaP7v64uGWS1pM2nnxvTNW+\nucdfXrxlUl9Jurn+Pufni8AvCQtmJdEKPGdmq8xsB2EN+byZmphICAiY2QrgKEmH5R3zQeCPZhY3\njs8D7oxe3wl8KGF9XA0kTXqPHh2CzvPP17cerraqfXN/8cUwtmRckXk5PAFfG0m6uc7J+TkNmAz8\nKWH5o4Dcj3FNtC3XYuACAEmtwFgg/zvtR4Ef5fw+wsw2RPVbD4xIWB9XA6W0CNJ6RHj16jDN/bve\nVf2yXWWqPfYj7s4s9oiQJ+Bro5x159YAx1exDjcBX5e0EHga6CKs5giApL7AuYSurWKK/rOcM2fO\nntdtbW20tbVVVlvXo7lz4bLLkh0bz9N10UXVr0P+aGjXGAYODC3S11+H4cMrL6+nhz1Gjw5Treze\n7YNXi2lvb6e9vb2iMnoMJpK+yd6bdQswlTASPom1hJZGbHS0bQ8z2wLsufVIegFYmXPImcCTZvZq\nzrYNkg43sw2SRgKvFKtAbjBx6du8GVauhKlTkx0/ezbccUf16+HJ98YWdz1VK5h8/OPF9w8YAAcf\nDK++CocfXvn7NaP8L9o33nhjyWUkidMLgCejnw7gWjP7RMLy5wPHSBonqR/wMeD+3AMkDYlaH0i6\nHPhN3popF7JvFxdRGZdEry8G7ktYH5eyefNg+nTol3Bh56lTQ/Cp9rrgSR5NdvVTrbzJm2/CihXh\n31wt3s8Vl6Sb66fA22a2C0BSH0kHmdm2nk40s12SrgIeIgSuO8xsmaQrwm67ndBldqek3cAS4JPx\n+ZIOIiTfP51X9M3ATyRdBqwC/jLBdbgaKDXp3bdvuBHMmwennVadOrz1FixZAiedVJ3yXPVV6+Y+\nfz5MmQL9+3d/XNwS8n8T6UkSTB4l3NDj1sIAQnBI1IlgZg8AE/K23ZbzujN/f86+bUD+k12Y2cao\nTq7BzJ0Lf/M3pZ0TJ+GrFUwWLIBJk+Cgg6pTnqu+agWTpN2ZnoRPX5JurgNzu52i1/7f1O1n927o\n7Cy9eylOwleLd3E1vmrd3JN+1t7Nlb4kwWSrpD09kpJmAG+lVyWXVcuWhYTqiBIf1J41KwSh3bur\nUw9Pvje+aoz96GmwYi4PJulLEkw+B9wj6XeSfg/8GLgq3Wq5LCp3xPmIEXDYYSEYVSped95bJo2t\nGjf3Z58NT2kdeWRt3s91r8eciZnNj6ZAifMaK6LR7M7to5IR53FX16RJldVh5crwJNmYMZWV49JV\njbEfpXx58VHw6UsyN9eVwEAze8bMngEGSfrb9KvmsqaSubCqNRK+2LrzrrEMGBBmc36l6AixnpXS\nnTlqFKxbB7t29XysK0+S7wSXm9meRTbN7E/A5elVyWXR66/Dyy+HBbHKUa0kvHdxZUelSfhSPuv+\n/eGQQ2DDhvLfz3UvSTDpk7swlqQ+QMIhaa636OwMi1D16VPe+ZMnh2BU6fKqnnzPjkq6njZtglWr\n4MQTk5/jeZN0JQkmDwA/lnSqpFMJo9EfSLdaLmsqne69Tx845ZQQlMq1ZQs89xxMm1Z+Ga52Krm5\nz5sXln7u27c27+d6liSYXEuYIv5vop9HCdPRO7dHNaZ7r7Sr64knQiBJOpWLq69Kbu7lfHnxJHy6\nkkxBv9vMvmNmf2FmfwH8Gvh8+lVzWbFzZ5jWYubMysqpNAnvi2FlSyXBpJzuTB8Fn65ED+VJOkzS\n30r6HdAO+Nybbo+nngr/UYcNq6ycU04JQWnnzvLO9+R7tpR7c9+1K3Rzlfrlxbu50lU0mEg6WNLF\nkh4EngDeBYw3s3eZ2X+tWQ1d
w6tW0nvYMBg7Fp5+uvRzy53KxdVPud1OS5fCyJGlT1/vwSRd3bVM\nXiGsM/Jl4Ggz+zzwTk1q5TKlmt1Ls2eXlzdZvjwEo5Ejq1MPl75yx36U2wL1YJKu7oLJF4D+wLeB\nL0jyBVBdQdXsXio3Ce/5kuyJx36sX1/aeeV+1kccEQZJltuN6rpXNJiY2a1mNhM4L9r0C+BISddK\nOrYmtXMNb906eOMNmFBwEYHSlZuE9/El2VROa6Hcz7pv3zAH3Lp1pZ/repbkaa6VZvZPZnYCcBIw\nmPBEl3N0dIREaLXW1j722DAgrZxvq54vyZ5Sk/CvvhpaFxMnlv9+3tWVjpJuAdH8XP/dzI5Jq0Iu\nW6rdImhpCUGhlNbJxo3hhnTCCdWrh6uNUpPwnZ3hqb9yv7x4MElPlb5Put4qjRZBqXmTzk44+WQ4\nIMm6oa6hlHpzr/TfmweT9HgwcWXbvh0WLQpzclVTqU90efI9u8oJJpV81j4KPj3+XS5F27fDRRfB\ntm31rkk6tm4NOY6DD65uua2t0NUF55yT7Pgnn4Tvfre6dXC1MXYsPP548s96/vzQzVWuMWPgD38o\n/3yAn/88PIbe1lZZOfnWrIEf/xg+n9H5RYoGE0lPA1Zsv5mVMF9n7/TCC+Gb1Le/Xe+apOeYFLJn\ngwbBI48kn0G4Tx847bTq18Olr7UV7ror+eO6110HQ4aU/37V6Ob6znfCGJlqB5Nf/Qpuu60Jgwnw\nn6M/r4z+vCv68+PpVae5rF4dvrkn/dbl9vJuq97hgAPgzDNr936VBpN4poUjjqhenWJz54a6mWVz\ncbeiwcTMVgFIOs3Mcif1vk7SQuC6tCuXdWvWhD5a51xjGDkytHjfeae82aWXLg3TuKxbF8o59NDq\n1a2jA95+OzydWM1yayVJAl6S3pPzy+yE5/V6q1f7WuTONZI+fUJAWbu2vPPnzoX3vjc8PVjJ2jv5\n4vEzkyZl9wGBJEHhk8C3Jb0o6UXC9CqXpVqrJuHBxLnGU8lU9PHTZOXOIVdMR0d4sGDcuCYOJmb2\npJlNAaYAU8xsqpktTL9q2efBxLnGU+k6KrNmlT6wNmm5WR4H02MwkXS4pDuAu83sDUkTJX2yBnXL\nPA8mzjWecm/Yr70WpvmZNClMIVTJ2jv54hZPlsfBJOnm+j/Ag8CR0e/PAp9Lq0LNxBPwzjWecoNJ\nPJVLnz6Vrb2Tb8cOWLgwlJ3l1SCTBJPhZvYTYDeAme0EEq9AIOkMScslPSvp2gL7h0q6V9JiSZ2S\nJubsGyLpHknLJC2RdEq0/QZJayQtjH7OSFqfWtm8OazTMHRovWvinMtV7rf//Klcyl0uId/ixXDU\nUWH8TFN3cwFbJR1KNIBR0kzgjSSFS2oBvgWcDkwCLpR0XN5h1wNdUV7mYuAbOfu+DvzazI4n5GyW\n5ey7xcymRz8PJKlPLcVdXFl8Xty5Zlbut//8qVyqlYTPLbfZg8nfAfcD75L0B+D/AlcnLL8VeM7M\nVpnZDuBu9q6PEpsIPAZgZiuAo6I15wcD/8nMvh/t22lmm3POa+jbtOdLnGtM5dywd+wI0/bkTuVS\nrSR8nHyH0GpauzYMjsyaJMFkCfA+YDZwBaGFsTxh+aOA3I9tTbQt12LgAgBJrcBYYDQwHnhN0vej\nrqzbJQ3IOe8qSYskfVdSBRMspMODiXONacSI0A399tvJz3nqqfDYbm639YQJ5a29ky+3ZTJgQJhO\n6LXXKiuzHpJM9NhhZtMJQQWAaAT89CrV4Sbg61GZTwNdhJxM3+g9rjSzBZJuJYy6v4Ew1uVLZmaS\nvgzcQhgPs585c+bsed3W1kZbtSfUKcKT7841ppYWOPLI8H806dxyhWYrzl175/zzy6vL2rVhwtR3\nv3vvtrjlNGJEeWWWo729nfb29orK6G6ix5GEVsQASdPY2600GDgoYflrCS2N2Oho2x5mtoWcQZ
CS\nXgBWAgOB1Wa2INr1U+Da6JxXc4r4N+CXxSqQG0xqafVqn1/KuUYVJ+GTBpOOjsKTicZJ+HKDSdzF\nlZtbjYPJjBnllVmO/C/aN954Y8lldNfNdTrwz4QAcAvwL9HP3xGS5knMB46RNE5SP+BjhPzLHtET\nW32j15cDvzGzN81sA7A6Z735U4Gl0XEjc4q4AHgmYX1qxru5nGtcpSbhi62jUmkSvlC5WU3CdzfR\n453AnZI+bGY/K6dwM9sl6SrgIULgusPMlkm6Iuy224Hjo/fZTehKy+2uuhr4QRRsVgKXRtu/Kmkq\n4XHlFwm5nIbiwcS5xlXKDfvll2HLljADeL7W1rBA3Pbt0L9/6fXo6ICvfKX8ujWSHnMmZvYzSWcT\nEu8H5mz/UpI3iB7bnZC37bac1535+3P2LQZOLrD9oiTvXS9mHkyca2RjxsAzCfszCnVFxQYNCkGm\nqyuMii/F22+HxP7JeXe40aPD9qxJMp3Kd4CPAp8h5E0+AoxLuV6ZtmlTWKeh2isQOueqo5Rv/z0t\nFTx7dnmPCC9cCMcfDwMHll+3RpLk0eDZUUvgT2Z2IzALKNDgczFvlTjX2Eq5YeeOAymk3JHw+SPq\ny6lbI0kSTN6K/twm6UhgB5DCOmPNw4OJc41t9OhkCfjt28N0J/ldUbniJLwVXeS8sGItnlGjwuJb\nuxJPWtUYkgST/ydpKPA1YCEh4f2jNCuVdR5MnGtsw4fDtm3hpzsLF4bBiYMGFT9m/Pgwe3AprQmz\n4i2e/v3DRJIbNiQvrxEkWc/kf5rZpuiJrnHAcWb2xfSrll0eTJxrbFKyCR97ypfEZZX6iPCLL4bz\nxhXJPidtOTWSosFE0gX5P8DZwKnRa1eEj353rvElyU0kCSZQehK+oyOcU2wi2CzmTbp7NPic6M8R\nhHm5Hot+fz8wF7g3xXplmrdMnGt8Pd2wzUIw+ed/7rmsWbPgmmuSv3ex5HvSujWi7gYtXgog6SFg\nopmti34/grBglivCg4lzja+nrqRVq8KfRx3Vc1kzZsDSpSEHc1CCyabmzoWPf7z4/iwGkyQJ+DFx\nIIlsYN/5tlwOs/AP1IOJc42tpxt2d4MV8w0YAJMnw4IFPR/75puwYgVM72aq3GYNJo9KelDSJZIu\nAX4FPJJutbLrtdfCN5Mk306cc/XT0w07ab4kljQJP38+TJnS/fQrTZWAj5nZVcBthJUOpwC3m9ln\n0q5YVnny3blsqFcwSVJuFlsmSdYzwczuxRPuiXi+xLls6O6GvXUrLF/efVdUvlmz4G//NnR1d9c1\n1tEBl16eP7BeAAAWWUlEQVRafD+E9VY2bAjjVw5IdJeuv+4eDf599OcWSZtzfrZI2lzsvN7Og4lz\n2TB0aLhZby5wN5s/H048EQ48cP99xYweHXInzz9f/JjuBivm6tsXDjssjITPiqLBxMzeG/15sJkN\nzvk52MwG166K2eLBxLlskIqva5Lkhl9IT+vCP/tsmAD2yCN7LitrXV3dtUwO6e6nlpXMEg8mzmVH\nsRt2qfmSWE95k1LKzVoSvrveuCcBY+9yvbkMODqVGmWcJ+Cdy45CwSTuirrttsLndGf2bLjjjuL7\nSwkmWWuZdDdocXwtK9IsvGXiXHYUumE/91xYYyRJV1S+KVNg5cqQhxlcIBnQ0QF//dfJ6/bSS6XX\noV6SjDNB0jBJrZL+LP5Ju2JZtHs3rF3rLRPnsqJQV1K5XVwA/fqFJ8Dmzdt/36ZNYVT9iScmKytr\nLZMkKy1+Cvgt8CBwY/TnnHSrlU2vvAJDhpT2BIhzrn4K3bDLTb7HiiXh580L06707Vt+3RpZkpbJ\nZwnrsK8ys/cD04BNqdYqo7yLy7lsKXTDrqRlAsWT8KWWm7UEfJJg8raZvQ0gqb+ZLQcmpFutbPLk\nu3PZEgeTeJXEN96AF14IuY9yzZoFnZ2h2ztXqcHkiCPC9E
zvvFN+XWopSTBZE620+AvgYUn3AavS\nrVY2ecvEuWwZPBj69An5DCi9K6qQESPCSo7Llu3dtmsXPPEEzJyZvJw+fWDkSHj55fLrUktJ5uY6\nP1ppcQ7wReAO4ENpVyyLPJg4lz253UmVdnHF8ru6liwJgWH48NLKyVLepLtBi7+W9AlJe1Y/NrPf\nmNn9ZpaRhldteTBxLntyb9iVJt9j+Un4csttimBCmCn4bOAFST+RdL6kfjWqVyZ5MHEue+Ib9q5d\nIddRjWCS3zIpt8WTpSR8d3Nz3WdmFwLjgJ8BFwEvSfq+pNNqVcEs8QS8c9kTB5OlS+Hww8MEi5Wa\nPDnkOl5/PfxebjBplpYJAGa2zcx+bGbnA38OTAUeSL1mGbNrF6xfD6NG1bsmzrlSxDfsanVxQUie\nt7aGls6rr4afiRPLr1sW9DhTvqTDgb8EPgYcAfwEuCTdamXP+vVwyCFhBKxzLjvirqRqJd9jcVfX\nrl1wyinQkmi+kX1lKZh0l4C/XNJjwELg3cDfm9nRZnadmS1O+gaSzpC0XNKzkq4tsH+opHslLZbU\nKWlizr4hku6RtEzSEkmnRNuHSXpI0opoSeEhJV11Cjxf4lw2pdEygb1J+ErKbYpgAswCvgKMMbOr\nzSzBgpT7ktQCfAs4HZgEXCjpuLzDrge6zGwKcDHwjZx9Xwd+bWbHE5YMjp/cvg54xMwmAI8BXyi1\nbtXmwcS5bBozBl58MfQuTJpUvXJnzgyLbP32t+W3eEaMCAMp3367evVKS3cJ+MvM7GEz2zOOU9Kc\nEstvBZ4zs1VmtgO4Gzgv75iJhICAma0AjpJ0mKTBwH8ys+9H+3aaWbwm2nnAndHrO2mAcS+efHcu\nmwYOhEGDQldUnz7VK3fYMBg7NuRNTjmlvDJaWsLsxWvXVq9eaSm1F+/cEo8fBeQ20tZE23ItBi4A\nkNQKjAVGA+OB16KnxxZKul3SgOicEWa2AcDM1gMjSqzXPnbsCNMWVMJbJs5l15gx1e3iis2aFRLv\nQyroiM9KV1epS9UXWiirUjcBX5e0EHga6AJ2AX2B6cCVZrZA0q2E7q0bCtTDihU+Z86cPa/b2tpo\na2vb75i77oJHH4Uf/KD8i1i9uvxvH865+nr/++HMM6tf7jnnhNZJJWoRTNrb22lvb6+oDJkVvQ/v\nf7DUktvtleD4mcAcMzsj+v06wMzs5m7OeQE4ARgIdJjZ0dH29wLXmtk5kpYBbWa2QdJI4PEor5Jf\nliW5vmXL4KyzwgRv5Zo5E265pbpPgzjn3HXXhTnErr++du8pCTMrqfGQZD2Tr0oaLKkvYaLHVyV9\nImH584FjJI2LRs9/DLg/r/whUdlIuhz4jZm9GXVjrZZ0bHToqcDS6PX97H08+WLgvoT1KWjChJDk\nWreu/DK8m8s5l4asjIJPkjP58yjx/Z+BF4FjgL9PUriZ7QKuAh4ClgB3m9kySVdI+nR02PHAM1Fr\n43TC+imxq4EfSFpEeJrrn6LtNwOnSVpBCDI3JalPMS0txRe0SWLHjjAo6YgjKqmFc87tr5lyJvEx\nZwP3mNkbUvLWj5k9QN76J2Z2W87rzvz9OfsWExbmyt++Efhg4kokEAeTCy4o/dx168IjfAeUmoFy\nzrkeZCWYJGmZ/D9Jy4EZwKOSDgMy8NRzaYqtjpaEd3E559LSNMHEzK4DZgMnRWNFtrL/WJHMa22F\nRYtg+/bSz/Vg4pxLy/DhsG1b+GlkSRLwHwF2mNkuSf8A/DtwZOo1q7FBg+DYY6Grq/RzPZg459Ii\nhQlkGz0Jn6Sb64tmtiV6NPeDhJUW/zXdatVHuV1dPvrdOZemLHR1JQkmu6I/zwZuN7NfAU05N265\nT3R5y8Q5l6ZmCSZrJd0GfBT4taT+Cc/LnLhlUsI4TsCDiXMuXc0STP4SeBA43cw2AYeQcJxJ1owf\nDzt3wksvlXaeBxPnXJ
qaIpiY2Tbgj8Dpkq4iTLL4UOo1qwMptE5K6eravh02bgzLfTrnXBqyMAo+\nydNcnwV+QJiZdwTw75I+k3bF6qXUJPzLL4eR79Wcuto553JloWWSZMz2J4FTzGwrgKSbgQ7gm2lW\nrF5mzYKf/CT58d7F5ZxLWxaCSZKcidj7RBfR6zSmom8IM2bA0qXJBwh5MHHOpW3YsDAH4JYt9a5J\ncUmCyfeBeZLmRCstdhLGmjSlAQNg8mRYsCDZ8R5MnHNpkxq/dZIkAX8LcCmwMfq51MxuTbti9VRK\n3sSDiXOuFho9Cd9tzkRSH2CJmR0HLKxNlepv9uyw+mISa9bAB6s6f7Fzzu0v0y2TaD2SFZIqXHgy\nW+KR8EkGL3rLxDlXC40eTJI8zTUMWCLpCcKMwQCY2bmp1arORo8OuZPnn4d3v7v7Yz2YOOdqYcwY\n6Oysdy2KSxJMvph6LRpQ3DrpLpi89VZ4uuKww2pXL+dc7zRmDNxzT71rUVzRbi5Jx0h6j5n9JveH\n8GhwA6eBqiNJEn7NmjA1dEtTzlTmnGskjZ6A7+42eCuwucD2N6J9TS1pMPGp551ztRDnTEqdiLZW\nugsmh5vZ0/kbo21HpVajBjFlCqxcCZsLhdOI50ucc7UyZEgYb/LGG/WuSWHdBZOh3ewbUO2KNJp+\n/WD6dJg3r/gxHkycc7XUyE90dRdMFki6PH+jpE8BT6ZXpcbR02JZHkycc7XUyMGku6e5Pgf8XNLH\n2Rs8TiKssnh+2hVrBLNnw792s0Dx6tVw9tm1q49zrndr5CR80WBiZhuA2ZLeD0yONv/KzB6rSc0a\nwKxZcPHFsHt34Se2PAHvnKulrLZMADCzx4HHa1CXhjNiBAwfDsuWwaRJ++/3bi7nXC2NGQO//W29\na1GYj5DoQbFHhLduhbffhkMPrX2dnHO9UyO3TDyY9KBYEn716tDFpaZd2cU512g8mGRYsZaJd3E5\n52otTsA34sDF1IOJpDMkLZf0rKRrC+wfKuleSYsldUqamLPvxWh7VzTRZLz9BklrJC2Mfs5Iq/6T\nJ4d13l9/fd/tnnx3ztXaoEHQvz9s3Fjvmuwv1WAiqQX4FnA6MAm4UNJxeYddD3SZ2RTgYuAbOft2\nA21mNs3MWvPOu8XMpkc/D6R0CfTpA62t+8/W6S0T51w9NGpXV9otk1bgOTNbZWY7gLuB8/KOmQg8\nBmBmK4CjJMXz8KqbOtYsW1Goq8uDiXOuHnprMBkF5F72mmhbrsXABQCSWoGxQNyBZMDDkuYXGI1/\nlaRFkr4raUj1q75XoSS8BxPnXD301mCSxE3AMEkLgSuBLsI09wDvMbPpwFnAlZLeG23/NnC0mU0F\n1gO3pFnBmTNh/nzYuXPvNg8mzrl6GDOmMUfBJ1kcqxJrCS2N2Oho2x5mtgW4LP5d0gvAymjfuujP\nVyX9nNBt9nszezWniH8DflmsAnPmzNnzuq2tjba2tpIvYtgwGDsWnnoqTP4InoB3ztXH6NHwyCPV\nLbO9vZ329vaKypCl+IyZpD7ACuBUYB3wBHChmS3LOWYIsM3MdkRdWe8xs0skHQS0mNmbkgYCDwE3\nmtlDkkaa2fro/GuAk83srwq8v1Xr+j71KZg2Da68MkxLf+SRYZVFH2finKulxx6DL30JKrz3d0sS\nZlbS3S3Vbi4z2wVcRQgES4C7zWyZpCskfTo67HjgGUnLCE99fTbafjjwe0ldQCfwSzN7KNr3VUlP\nSVoEvA+4Js3rgH2T8HEXlwcS51ytNWrOJNWWSb1Vs2WybFmYIXjlSnjgAbjlFnjooZ7Pc865anrr\nrdD1vm1bekuGN1zLpJlMmACbNsG6dZ58d87Vz4ABcPDB8Npr9a7JvjyYJNTSsvcRYU++O+fqafTo\nxuvq8mBSgjiYeMvEOVdPjZg38WBSgjgJ78HEOVdPjRhM0h5n0lRaW2HRIjj8cA8mzrn6
acRg4i2T\nEgwaBMceCy+84MHEOVc/jTgK3oNJiWbPhsGDw9MUzjlXD42YgPdurhLNmgW/+U29a+Gc683GjIGl\nSyFntqi682BSonPPDQOGnHOuXsaNg+uug61b612TvXwEvHPOuX34CHjnnHN14cHEOedcxTyYOOec\nq5gHE+eccxXzYOKcc65iHkycc85VzIOJc865inkwcc45VzEPJs455yrmwcQ551zFPJg455yrmAcT\n55xzFfNg4pxzrmIeTJxzzlXMg4lzzrmKeTBxzjlXMQ8mzjnnKubBxDnnXMVSDyaSzpC0XNKzkq4t\nsH+opHslLZbUKWlizr4Xo+1dkp7I2T5M0kOSVkh6UNKQtK/DOedccakGE0ktwLeA04FJwIWSjss7\n7Hqgy8ymABcD38jZtxtoM7NpZtaas/064BEzmwA8BnwhrWtoZO3t7fWuQqqa+fqa+drAr683Srtl\n0go8Z2arzGwHcDdwXt4xEwkBATNbARwl6bBon4rU8Tzgzuj1ncCHql3xLGj2f9DNfH3NfG3g19cb\npR1MRgGrc35fE23LtRi4AEBSKzAWGB3tM+BhSfMlXZ5zzggz2wBgZuuBESnU3TnnXEIH1LsCwE3A\n1yUtBJ4GuoBd0b73mNm6qKXysKRlZvb7AmVYjerqnHOuAJmldx+WNBOYY2ZnRL9fB5iZ3dzNOS8A\nJ5jZm3nbbwC2mNktkpYRcikbJI0EHjez4wuU5UHGOefKYGYq5fi0WybzgWMkjQPWAR8DLsw9IHoS\na5uZ7Yi6sn5jZm9KOghoiV4PBP4cuDE67X7gEuBmQtL+vkJvXupfhnPOufKkGkzMbJekq4CHCPmZ\nO8xsmaQrwm67HTgeuFPSbmAJ8Mno9MOBn0etiwOAH5jZQ9G+m4GfSLoMWAX8ZZrX4ZxzrnupdnM5\n55zrHZpyBHxPAyWzrthgzqySdIekDZKeytnWNANTi1zfDZLWSFoY/ZxRzzpWQtJoSY9JWiLpaUlX\nR9sz/xkWuLbPRNub4vOT1F/SvOhe8nSUmy7rs2u6lkk0UPJZ4FTgZULe5mNmtryuFasiSSuBGWb2\np3rXpRokvRd4E/i/ZnZitO1m4HUz+2r0hWCYmV1Xz3qWq8j17XmgpK6Vq4LoIZiRZrZI0iDgScJY\nsEvJ+GfYzbV9lOb5/A4ys22S+gB/AK4GPkyJn10ztkySDJTMumKDOTMpetw7PzA2zcDUItcH4XPM\nPDNbb2aLotdvAssIY8Uy/xkWubZ4rFyzfH7bopf9Cflpo4zPrmluSDmSDJTMumKDOZtJbxiYepWk\nRZK+m8UuoEIkHQVMBTqBw5vpM8y5tnnRpqb4/CS1SOoC1gMPm9l8yvjsmjGY9AbvMbPpwFnAlVE3\nSrNrrv5Y+DZwtJlNJfwnbobukkHAT4HPRt/i8z+zzH6GBa6taT4/M9ttZtMIrclWSZMo47NrxmCy\nljAlS2x0tK1pmNm66M9XgZ8TuvaazQZJh8OefutX6lyfqjKzV21vwvLfgJPrWZ9KSTqAcLO9y8zi\ncV9N8RkWurZm+/wAzGwz0A6cQRmfXTMGkz0DJSX1IwyUvL/OdaoaSQdF35LIGcz5TH1rVRVi3z7o\neGAqdDMwNUP2ub7oP2jsArL/GX4PWGpmX8/Z1iyf4X7X1iyfn6ThcRedpAHAaYS8UMmfXdM9zQXh\n0WDg6+wdKHlTnatUNZLGE1ojuYM5M319kn4ItAGHAhuAG4BfAPcAY4gGpprZpnrVsRJFru/9hP73\n3cCLwBVxH3XWSHoP8FvC3HoW/VwPPAH8hAx/ht1c21/RBJ+fpBMICfaW6OfHZvaPkg6hxM+uKYOJ\nc8652mrGbi7nnHM15sHEOedcxTyYOOecq5gHE+eccxXzYOKcc65iHkycc85VzIOJy7RoevDT8rZ9\nVtL/7uG8LSnXa7ikTklPRmMVcvc9Lml69Hp8tFTC
aQXK+Fo0LXjRZa57qMP7JP0y5/cvS/q1pL6S\n2iXNz9k3Q9LjOeftlnR2zv5fSvqzcurhegcPJi7rfkjeUtCEWQ9+2MN5aQ+w+iDwlJnNMLM/FDpA\n0mjgP4BrzOzhAodcDpxoZonW5ImmEM9n0b5/AGYBH4pm0zbgMEmn5x8bWQP89yTv6xx4MHHZ9zPg\nrGj+JCSNA44wsz9IGijpEUkLFBYTOzf/5ALf3r8p6aLo9fT4G7yk/4jnKso7f5ykR6PyH1ZYTGkK\nYWnp86KFk/oXqPeRwIPAF8zsVwXKvQ8YBDwp6SM577Mofp/ouO9L+ldJndF7FihKfwecDpxjZu/k\n7Psa8A8F/1ZhMfCGpFOL7HduHx5MXKZFC4Q9AZwZbfoYYRoIgLcJ38RPAj4A/EuxYvI3RMHpm8CH\nzexk4PvAPxU495vA981sCqE19E0zWwz8D8LUFNPNbHuB8+6Mjv15kes6D9gWnX9PzvtMjd8n5/BR\nZjbTzP5rgaLeA1wBnJmzbkV8zR3AdknvK1QF4B+BLxaqn3P5PJi4ZnA3IYgQ/fmj6LWAr0haDDwC\nHCkp6ZoaE4DJhHVjughdPkcWOG5WzvvdRbh5J/Ew8AlJB3ZzTO7El929zz3dlPF8VM6fFym7aMCI\nFvWy/JyPc4V4MHHN4D7gVEnTgAFm1hVt/zgwHJgWrdfwCpB/897Jvv8P4v0CnolaBtPMbIqZncn+\nys29fJUww/VPFZaaLsSKvM63tZt96wnr3twqqW2/NzB7nHDNM4uc/0+ErjCfxM91y4OJyzwz20pY\nh+F77P32DjAEeMXMdkt6PzAuZ1/8zXwVMDF6wmkoEOcIVhAS1DMhdHtJmljg7eey9wGATwC/K6He\n1wBvRPUuJLdlUsn7PE+YJv3fJZ1Y4JB/BP5bkXMfBoYBhc5zbg8PJq5Z/Ihww8sNJj8ATo66uT5B\nWKchZgBmtoaQY3mG0F22MNq+A/gL4GZJi4AuQldTvquBS6NjPg58NkFdc7/lXwKMLPL4b+5xxd4n\nUYvBzBYAlwL3R8sYWM6+/yC02oqV9Y+EqcidK8qnoHfOOVcxb5k455yrmAcT55xzFfNg4pxzrmIe\nTJxzzlXMg4lzzrmKeTBxzjlXMQ8mzjnnKubBxDnnXMX+PygS5di7aan1AAAAAElFTkSuQmCC\n",
335 | "text/plain": [
336 | ""
337 | ]
338 | },
339 | "metadata": {},
340 | "output_type": "display_data"
341 | }
342 | ],
343 | "source": [
344 | "import matplotlib.pyplot as plt\n",
345 | "%matplotlib inline\n",
346 | "\n",
347 | "# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)\n",
348 | "plt.plot(k_range, k_scores)\n",
349 | "plt.xlabel('Value of K for KNN')\n",
350 | "plt.ylabel('Cross-Validated Accuracy')"
351 | ]
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "metadata": {},
356 | "source": [
357 | "## Cross-validation example: model selection"
358 | ]
359 | },
360 | {
361 | "cell_type": "markdown",
362 | "metadata": {},
363 | "source": [
364 | "**Goal:** Compare the best KNN model with logistic regression on the iris dataset"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 11,
370 | "metadata": {
371 | "collapsed": false
372 | },
373 | "outputs": [
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "0.98\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "# 10-fold cross-validation with the best KNN model\n",
384 | "knn = KNeighborsClassifier(n_neighbors=20)\n",
385 | "print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 12,
391 | "metadata": {
392 | "collapsed": false
393 | },
394 | "outputs": [
395 | {
396 | "name": "stdout",
397 | "output_type": "stream",
398 | "text": [
399 | "0.953333333333\n"
400 | ]
401 | }
402 | ],
403 | "source": [
404 | "# 10-fold cross-validation with logistic regression\n",
405 | "from sklearn.linear_model import LogisticRegression\n",
406 | "logreg = LogisticRegression()\n",
407 | "print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())"
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "## Cross-validation example: feature selection"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {},
420 | "source": [
421 | "**Goal:** Decide whether the Newspaper feature should be included in the linear regression model on the advertising dataset"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 13,
427 | "metadata": {
428 | "collapsed": false
429 | },
430 | "outputs": [],
431 | "source": [
432 | "import pandas as pd\n",
433 | "import numpy as np\n",
434 | "from sklearn.linear_model import LinearRegression"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 14,
440 | "metadata": {
441 | "collapsed": false
442 | },
443 | "outputs": [],
444 | "source": [
445 | "# read in the advertising dataset\n",
446 | "data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 15,
452 | "metadata": {
453 | "collapsed": false
454 | },
455 | "outputs": [],
456 | "source": [
457 | "# create a Python list of three feature names\n",
458 | "feature_cols = ['TV', 'Radio', 'Newspaper']\n",
459 | "\n",
460 | "# use the list to select a subset of the DataFrame (X)\n",
461 | "X = data[feature_cols]\n",
462 | "\n",
463 | "# select the Sales column as the response (y)\n",
464 | "y = data.Sales"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 16,
470 | "metadata": {
471 | "collapsed": false
472 | },
473 | "outputs": [
474 | {
475 | "name": "stdout",
476 | "output_type": "stream",
477 | "text": [
478 | "[-3.56038438 -3.29767522 -2.08943356 -2.82474283 -1.3027754 -1.74163618\n",
479 | " -8.17338214 -2.11409746 -3.04273109 -2.45281793]\n"
480 | ]
481 | }
482 | ],
483 | "source": [
484 | "# 10-fold cross-validation with all three features\n",
485 | "lm = LinearRegression()\n",
486 | "scores = cross_val_score(lm, X, y, cv=10, scoring='mean_squared_error')\n",
487 | "print(scores)"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": 17,
493 | "metadata": {
494 | "collapsed": false
495 | },
496 | "outputs": [
497 | {
498 | "name": "stdout",
499 | "output_type": "stream",
500 | "text": [
501 | "[ 3.56038438 3.29767522 2.08943356 2.82474283 1.3027754 1.74163618\n",
502 | " 8.17338214 2.11409746 3.04273109 2.45281793]\n"
503 | ]
504 | }
505 | ],
506 | "source": [
507 | "# fix the sign of MSE scores\n",
508 | "mse_scores = -scores\n",
509 | "print(mse_scores)"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 18,
515 | "metadata": {
516 | "collapsed": false
517 | },
518 | "outputs": [
519 | {
520 | "name": "stdout",
521 | "output_type": "stream",
522 | "text": [
523 | "[ 1.88689808 1.81595022 1.44548731 1.68069713 1.14139187 1.31971064\n",
524 | " 2.85891276 1.45399362 1.7443426 1.56614748]\n"
525 | ]
526 | }
527 | ],
528 | "source": [
529 | "# convert from MSE to RMSE\n",
530 | "rmse_scores = np.sqrt(mse_scores)\n",
531 | "print(rmse_scores)"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": 19,
537 | "metadata": {
538 | "collapsed": false
539 | },
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "1.69135317081\n"
546 | ]
547 | }
548 | ],
549 | "source": [
550 | "# calculate the average RMSE\n",
551 | "print(rmse_scores.mean())"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": 20,
557 | "metadata": {
558 | "collapsed": false
559 | },
560 | "outputs": [
561 | {
562 | "name": "stdout",
563 | "output_type": "stream",
564 | "text": [
565 | "1.67967484191\n"
566 | ]
567 | }
568 | ],
569 | "source": [
570 | "# 10-fold cross-validation with two features (excluding Newspaper)\n",
571 | "feature_cols = ['TV', 'Radio']\n",
572 | "X = data[feature_cols]\n",
573 | "print(np.sqrt(-cross_val_score(lm, X, y, cv=10, scoring='mean_squared_error')).mean())"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {},
579 | "source": [
580 | "## Improvements to cross-validation"
581 | ]
582 | },
583 | {
584 | "cell_type": "markdown",
585 | "metadata": {},
586 | "source": [
587 | "**Repeated cross-validation**\n",
588 | "\n",
589 | "- Repeat cross-validation multiple times (with **different random splits** of the data) and average the results\n",
590 | "- More reliable estimate of out-of-sample performance by **reducing the variance** associated with a single trial of cross-validation\n",
591 | "\n",
592 | "**Creating a hold-out set**\n",
593 | "\n",
594 | "- \"Hold out\" a portion of the data **before** beginning the model building process\n",
595 | "- Locate the best model using cross-validation on the remaining data, and test it **using the hold-out set**\n",
596 | "- More reliable estimate of out-of-sample performance since the hold-out set is **truly out-of-sample**\n",
597 | "\n",
598 | "**Feature engineering and selection within cross-validation iterations**\n",
599 | "\n",
600 | "- Normally, feature engineering and selection occur **before** cross-validation\n",
601 | "- Instead, perform all feature engineering and selection **within each cross-validation iteration**\n",
602 | "- More reliable estimate of out-of-sample performance since it **better mimics** the application of the model to out-of-sample data"
603 | ]
604 | },
605 | {
606 | "cell_type": "markdown",
607 | "metadata": {},
608 | "source": [
609 | "## Resources\n",
610 | "\n",
611 | "- scikit-learn documentation: [Cross-validation](http://scikit-learn.org/stable/modules/cross_validation.html), [Model evaluation](http://scikit-learn.org/stable/modules/model_evaluation.html)\n",
612 | "- scikit-learn issue on GitHub: [MSE is negative when returned by cross_val_score](https://github.com/scikit-learn/scikit-learn/issues/2439)\n",
613 | "- Section 5.1 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) (11 pages) and related videos: [K-fold and leave-one-out cross-validation](https://www.youtube.com/watch?v=nZAM5OXrktY) (14 minutes), [Cross-validation the right and wrong ways](https://www.youtube.com/watch?v=S06JpVoNaA0) (10 minutes)\n",
614 | "- Scott Fortmann-Roe: [Accurately Measuring Model Prediction Error](http://scott.fortmann-roe.com/docs/MeasuringError.html)\n",
615 | "- Machine Learning Mastery: [An Introduction to Feature Selection](http://machinelearningmastery.com/an-introduction-to-feature-selection/)\n",
616 | "- Harvard CS109: [Cross-Validation: The Right and Wrong Way](https://github.com/cs109/content/blob/master/lec_10_cross_val.ipynb)\n",
617 | "- Journal of Cheminformatics: [Cross-validation pitfalls when selecting and assessing regression and classification models](http://www.jcheminf.com/content/pdf/1758-2946-6-10.pdf)"
618 | ]
619 | },
620 | {
621 | "cell_type": "markdown",
622 | "metadata": {},
623 | "source": [
624 | "## Comments or Questions?\n",
625 | "\n",
626 | "- Email: \n",
627 | "- Website: http://dataschool.io\n",
628 | "- Twitter: [@justmarkham](https://twitter.com/justmarkham)"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 1,
634 | "metadata": {
635 | "collapsed": false
636 | },
637 | "outputs": [
638 | {
639 | "data": {
640 | "text/html": [
641 | "\n",
693 | ""
708 | ],
709 | "text/plain": [
710 | ""
711 | ]
712 | },
713 | "execution_count": 1,
714 | "metadata": {},
715 | "output_type": "execute_result"
716 | }
717 | ],
718 | "source": [
719 | "from IPython.core.display import HTML\n",
720 | "def css_styling():\n",
721 | " styles = open(\"styles/custom.css\", \"r\").read()\n",
722 | " return HTML(styles)\n",
723 | "css_styling()"
724 | ]
725 | }
726 | ],
727 | "metadata": {
728 | "kernelspec": {
729 | "display_name": "Python 2",
730 | "language": "python",
731 | "name": "python2"
732 | },
733 | "language_info": {
734 | "codemirror_mode": {
735 | "name": "ipython",
736 | "version": 2
737 | },
738 | "file_extension": ".py",
739 | "mimetype": "text/x-python",
740 | "name": "python",
741 | "nbconvert_exporter": "python",
742 | "pygments_lexer": "ipython2",
743 | "version": "2.7.11"
744 | }
745 | },
746 | "nbformat": 4,
747 | "nbformat_minor": 0
748 | }
749 |
--------------------------------------------------------------------------------
/08_grid_search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Efficiently searching for optimal tuning parameters\n",
8 | "*From the video series: [Introduction to machine learning with scikit-learn](https://github.com/justmarkham/scikit-learn-videos)*"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Agenda\n",
16 | "\n",
17 | "- How can K-fold cross-validation be used to search for an **optimal tuning parameter**?\n",
18 | "- How can this process be made **more efficient**?\n",
19 | "- How do you search for **multiple tuning parameters** at once?\n",
20 | "- What do you do with those tuning parameters before making **real predictions**?\n",
21 | "- How can the **computational expense** of this process be reduced?"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Review of K-fold cross-validation"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "Steps for cross-validation:\n",
36 | "\n",
37 | "- Dataset is split into K \"folds\" of **equal size**\n",
38 | "- Each fold acts as the **testing set** 1 time, and is part of the **training set** K-1 times\n",
39 | "- **Average testing performance** is used as the estimate of out-of-sample performance\n",
40 | "\n",
41 | "Benefits of cross-validation:\n",
42 | "\n",
43 | "- More **reliable** estimate of out-of-sample performance than train/test split\n",
44 | "- Can be used for selecting **tuning parameters**, choosing between **models**, and selecting **features**\n",
45 | "\n",
46 | "Drawbacks of cross-validation:\n",
47 | "\n",
48 | "- Can be computationally **expensive**"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "## Review of parameter tuning using `cross_val_score`"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "**Goal:** Select the best tuning parameters (aka \"hyperparameters\") for KNN on the iris dataset"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 2,
68 | "metadata": {
69 | "collapsed": false
70 | },
71 | "outputs": [],
72 | "source": [
73 | "from sklearn.datasets import load_iris\n",
74 | "from sklearn.neighbors import KNeighborsClassifier\n",
75 | "from sklearn.cross_validation import cross_val_score\n",
76 | "import matplotlib.pyplot as plt\n",
77 | "%matplotlib inline"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 3,
83 | "metadata": {
84 | "collapsed": false
85 | },
86 | "outputs": [],
87 | "source": [
88 | "# read in the iris data\n",
89 | "iris = load_iris()\n",
90 | "\n",
91 | "# create X (features) and y (response)\n",
92 | "X = iris.data\n",
93 | "y = iris.target"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 4,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "[ 1. 0.93333333 1. 1. 0.86666667 0.93333333\n",
108 | " 0.93333333 1. 1. 1. ]\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)\n",
114 | "knn = KNeighborsClassifier(n_neighbors=5)\n",
115 | "scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')\n",
116 | "print(scores)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 5,
122 | "metadata": {
123 | "collapsed": false
124 | },
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "0.966666666667\n"
131 | ]
132 | }
133 | ],
134 | "source": [
135 | "# use average accuracy as an estimate of out-of-sample accuracy\n",
136 | "print(scores.mean())"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 6,
142 | "metadata": {
143 | "collapsed": false
144 | },
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | "[0.95999999999999996, 0.95333333333333337, 0.96666666666666656, 0.96666666666666656, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.98000000000000009, 0.96666666666666656, 0.96666666666666656, 0.97333333333333338, 0.95999999999999996, 0.96666666666666656, 0.95999999999999996, 0.96666666666666656, 0.95333333333333337, 0.95333333333333337, 0.95333333333333337]\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "# search for an optimal value of K for KNN\n",
156 | "k_range = list(range(1, 31))\n",
157 | "k_scores = []\n",
158 | "for k in k_range:\n",
159 | " knn = KNeighborsClassifier(n_neighbors=k)\n",
160 | " scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')\n",
161 | " k_scores.append(scores.mean())\n",
162 | "print(k_scores)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 7,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | ""
176 | ]
177 | },
178 | "execution_count": 7,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | },
182 | {
183 | "data": {
184 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAEPCAYAAACHuClZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xm8VOWd5/HP9yIggiyKiLKJMaKAsukVSKZzE2O7jRpN\npxM7GbfE2N0aEzvTo3E6I2bSHU26HbNMOtoxGcdOYmJiopmk3b3ZuBdBLqhsalAEBFwIgqDI8ps/\nnnOgKKruPbWcqjp1f+/X676oe5annkPB+dXz/M7zPDIznHPOuUq01LsCzjnnss+DiXPOuYp5MHHO\nOVcxDybOOecq5sHEOedcxTyYOOecq1jqwUTSGZKWS3pW0rUF9g+VdK+kxZI6JU3M2XeNpGckPSXp\nB5L6RduHSXpI0gpJD0oakvZ1OOecKy7VYCKpBfgWcDowCbhQ0nF5h10PdJnZFOBi4BvRuUcCnwGm\nm9mJwAHAx6JzrgMeMbMJwGPAF9K8Duecc91Lu2XSCjxnZqvMbAdwN3Be3jETCQEBM1sBHCXpsGhf\nH2CgpAOAg4C10fbzgDuj13cCH0rvEpxzzvUk7WAyClid8/uaaFuuxcAFAJJagbHAaDN7GfgX4CVC\nENlkZo9G54wwsw0AZrYeGJHaFTjnnOtRIyTgbwKGSVoIXAl0AbskDSW0QMYBRwKDJP1VkTJ8Thjn\nnKujA1Iufy2hpREbzd6uKgDMbAtwWfy7pJXASuAMYKWZbYy23wvMBn4IbJB0uJltkDQSeKXQm0vy\nIOOcc2UwM5VyfNotk/nAMZLGRU9ifQy4P/cASUMk9Y1eXw781szeJHRvzZR0oCQBpwLLotPuBy6J\nXl8M3FesAmbWtD833HBD3evQrNe3e7dx2GHGwIHGjh3NdW1mxksvGWDMnl39sk86yRg6tL7Xl/ZP\nvT+/tH/KkWowMbNdwFXAQ8AS4G4zWybpCkmfjg47HnhG0jLCU1+fjc59AvgpodtrMSDg9uicm4HT\nJK0gBJmb0rwO1/v88Y/Qrx+MHQtPPVXv2lTf3LnwgQ/AokWwfXv1yt22DZYsgc2bYffu6pXrGl/a\n3VyY2QPAhLxtt+W87szfn7PvRuDGAts3Ah+sbk2d26ujA2bPhiFDwuvp0+tdo+rq6IDTT4eNG6Gr\nC2bOrE65CxbAiSeGAPzKKzByZHXKdY2vERLwrkxtbW31rkKq6nl9c+fCrFnhZ+7c6pdf788ureuL\nyx01qo01a6pXbqOp9+fXiFRu/1gWSLJmvj6XnqlT4bbbQsvkrLNg5cp616h63noLhg+H116Dn/0M\n7rsP7rmnOmWfey78l/8Cd90Fl14K559fnXJdbUnCGiwB71zmbN4Mzz0H06bBscfCpk2wbl29a1U9\nCxbApEkwYEDoyps7F6rxncssdJ/NmgVjxsDq1T2f45qHBxPn8jzxRAgk/fpBS0u4OXZ01LtW1TN3\nbggiAOPHw86d8NJLlZf7/PMhQI0e7cGkN/Jg4lyeOPkemz27uYJJ7vVJ1bu+3HI9mPQ+HkycyxMn\nkWNpJeHrwSy968std8wYmjoB7/bnwcS5HLt3Q2fnvjfb1lZYvLi64zHqZeVK6N8/3Oxj1WqZ5Haf\njR7tLZPexoOJczmWL4dhw/YdHzFoELz73WE8Rtblt0oAZsyApUth69byy928OQSqKVPC76NGhYcW\ndu0qv0yXLR5MnMuR++06V/zUU9YVur4BA2Dy5PCUV7nmzQsDO/v1C7/37w+HHALr15dfpssWDybO\n5chPvseaJQmf1vUVKteT8L2LBxPnchTqBoK9Seosj4HdsiU8vjt16v77Km15Ffp78yR87+LBxLnI\nxo3h5nfCCfvvGz8+9P9n+Zt27viZfPFYmnKCZaGHFsCT8L2NBxPnIp2dcPLJcECB6U+l7D8iXKzV\nBeHGP2BAaLmUatmyMD3LiLz1Tr2bq3
fxYOJcpFjyPZb1JHxP11fuSP9i5Xow6V08mDgXKZacjmU5\nCV+sKypXucGy2N+bB5PexYOJc4T5qZ54ovt1PeLxGNu21a5e1bJ8eXhU9/DDix9TbjAp1n3mCfje\nxYOJc8Azz4S8wSGHFD/mwANDcr6S8Rj10lOrC8KAw5UrwwDEpF5/HV5+OYxTyXfEEWGBrJ07S6ur\nyyYPJs7RfXI6V1aT8Emur1+/MPBw3rzk5XZ2hulm+vTZf1/fvnDYYSHYuObnwcQ5kn1zh+wm4XtK\nvsdKzQv1VK7nTXoPDybOkfxmW8l4jHrZuBHWri3cFZWv1JZXT0HYg0nv4cHE9XobNoQb7nHH9Xxs\nJeMx6qW78TP5Zs0Kx+/e3fOxO3fC/PlwyinFj/EkfO/hwcT1eh0d4SmuloT/G7L2iHDSLjwIAw+H\nDw8DEXvy1FMwdmyYZbkYHwXfe3gwcb1e0i6uWNbyJmldX7zee3e8m6v38GDier0kN8VcWXqiK+6K\n6m78TL6kLa8kQcqDSe/hwcT1au+8AwsXhsdbk5o6tfTxGPUSj5/prisqX9JgmaT7zINJ7+HBxPVq\nXV1wzDEweHDyc/r2LX08Rr0kHT+Ta/LksEri668XP2bdOti0CY49tvuyRo4MDze8805pdXDZ48HE\n9WqlJKdzZSUJX8719ekTWmqdnd2XO2tWzw8t9OkTAsrataXVwWWPBxPXq5WanI5lJQlf7vX11NVV\nSp7Ju7p6h9SDiaQzJC2X9KykawvsHyrpXkmLJXVKmhhtP1ZSl6SF0Z9vSLo62neDpDXRvoWSzkj7\nOlxzKjX5Hps5M/l4jHrZsAH+9CeYMKH0c3tqeZUSpDyY9A6pBhNJLcC3gNOBScCFkvKHhl0PdJnZ\nFOBi4BsAZvasmU0zs+nADGArcG/OebeY2fTo54E0r8M1p9WrYft2eNe7Sj93xIgw71SS8Rj1Uur4\nmVynnBKeAis0SeP27bBoUfKHFjyY9A5pt0xagefMbJWZ7QDuBs7LO2Yi8BiAma0AjpJ0WN4xHwT+\naGa5Y2mVUp1dLxEnp1Xmv6RGf0S4nOR7bNiwMCDx6af339fVFRLvgwYlK8tHwfcOaQeTUUDud5I1\n0bZci4ELACS1AmOB0XnHfBT4Ud62qyQtkvRdSUOqV2XXW5SbfI81ehK+GtdXKFiWmofxUfC9QyMk\n4G8ChklaCFwJdAG74p2S+gLnAvfknPNt4GgzmwqsB26pXXVdsyg3OR1r5CT8O++EFkQp42fyFWt5\nlZpn8m6u3iHB1G8VWUtoacRGR9v2MLMtwGXx75JeAFbmHHIm8KSZvZpzzqs5+/8N+GWxCsyZM2fP\n67a2Ntra2kqpv2tSb70FS5bASSeVX8akSXvHYxx6aPXqVg3x+JmDDy6/jNmz4ctf3nebWQgwN9+c\nvBwPJo2vvb2d9vb2isqQpTiXtqQ+wArgVGAd8ARwoZktyzlmCLDNzHZIuhx4j5ldkrP/R8ADZnZn\nzraRZrY+en0NcLKZ/VWB97c0r89l1+9+B5//fFiqtxKnnQaf+xycfXZ16lUtt94Kzz4L3/52+WXs\n3h0mfVy6NIwVAVi1KiTn161LnmvavTvMtLxpU/jTNT5JmFlJ2cRUu7nMbBdwFfAQsAS428yWSbpC\n0qejw44HnpG0jPDU12fj8yUdREi+37tvyXxV0lOSFgHvA65J8zpc8yn3keB8jZqEryT5Hmtp2bt+\nSyz+eyvloYWWFhg1ygcuNru0u7mIHtudkLfttpzXnfn7c/ZtA/Kf7MLMLqpyNV0vM3cuXHhh5eXM\nng1f/Wrl5VRbRwd85SuVlxPnhc4/P/xebp4pTsIfc0zldXKNqRES8M7VVNzvX0nyPTZzZvHxGPWy\nenVIwB99dOVlFWqZlPP35nmT5ufBxPU6K1dCv37hBlepoUPDeIynnqq8rGqpdPxMrtbWMEBx+3bY\nuj
XkT2bMKL0cDybNz4OJ63Wq1SqJNdp4k0rHl+QaNCgMUOzqggUL4IQT4MADSy/Hg0nz82Diep1q\nJd9jjZaEr0byPVfc1VXJ35uPgm9+Hkxcr9PMLZNqjJ/JFyfhK/l781Hwzc+DietVtmyB55+HadOq\nV+axx8Ibb4SxF/W2YEFY3Kqa4znillelLRMPJs3Ng4nrVZ54Iiy7269f9cpsaQlPdTVC66TaXVwA\n48fDrl0hQI3OnzUvoeHDQ6tp69bq1s01jtTHmTgH4XHc5cvh+OPrW49qd3HFZs+GH/84LOlbT7/6\nFVx1VXXLlML1VRKApb1dXcflL0KRkuXLw1ou1XiqzfWsx2Ai6RzgV2bWwMsAuUa3aBG8972hO+iA\nOn6F6eiAyy+vfrnnnx/WhL/99uqXXYpDD4UPfKD65X7qU2EJ3krESfhaBJNdu8K0L52d9f8C01sk\n+W/9UeBWST8Dvmdmy1Ouk2tCc+fCtm1hPMb06fWpw+7dIZh873vVL3viRLj//uqX2yjOOqvyMmqZ\nhF+yBDZvhpde8mBSKz3mTMzsE8A04I/A/5HUIenTkiqYj9T1NnPnwuDB9X2EdvlyOOSQvZMWutqq\nZRI+/nfmSf/aSZSAN7PNwE8JKyUeAZwPLJT0mRTr5ppIRwdccUV9k9TVHl/iSlPLYNLRAUcc4cGk\nlnoMJpLOlfRzoB3oC7Sa2ZnAFODz6VbPNYP168P045dcUt+WSVrJd5dMrVsmH/mID5SspSQtkw8D\n/8vMTjCzr5nZK7BnRt9Pplo71xQ6OsKjs8cdV9/xGN4yqa9ajYJ/9dXwc/rp3jKppSTBZA5hUSsA\nJA2QdBSAmT2aSq1cU4lbBIXWx6iVjRvDjeWEE2r/3i6oVQK+oyM8yTVunAeTWkoSTO4Bch8L3sW+\n67E7163c7qV6TT3S2RlmwK3nY8m93bBhYar+zZvTfZ+4BRp3q/liq7WRJJgcYGbvxL9Er6s4ftg1\ns+3bwxiT1tbwe70mRfQurvqTapM3ib+8DB4cWsObNqX7fi5IEkxelXRu/Iuk84DX0quSayZdXWHu\nqkGDwu+562PUkiffG0PawWTHDnjyydDNFb+fJ+FrI0kw+WvgekkvSVoNXAtckW61XLPIbxHkro9R\nKzt3htUQZ86s3Xu6wtK+uS9eHOYSGzJk7/t53qQ2euxBNrM/AjMlDYp+fzP1WrmmMXcunHfevtvi\nKc1rdXN/5hkYNSoMWHT1lXYSPr8F6sGkdhKlIyWdDUwCDlQ0a5qZfSnFerkmEK+1fvPN+26fPRt+\n8Yva1cO7uBrHmDHpPoDR0REeCc59Pw8mtZFk0OJ3CPNzfQYQ8BFgXMr1ck1g9erQxTR+/L7b4yR8\nrZ6y8eR740j75p7/xcEX5aqdJDmT2WZ2EfAnM7sRmAUcm261XDOI/2PnTwEer4/x0ku1rYervzSD\nydq1Yb2Ud7973/fzBHxtJAkmb0d/bpN0JLCDMD+Xc90q1iKQajd4ccOGMGCxVmtouO7FN/c0WqXx\nv7fcLy/ezVU7SYLJLyUNBb4GLAReBH6YZqVcc+iuRRAn4dMWT+XS4muKNoQ0x34U+veWZvBy++r2\nv5ikFuBRM9tkZj8j5EqOM7P/UZPauczatg2WLoUZMwrvr9VIeO/iajxptRYKtYQHDoQDD4TXX6/+\n+7l9dRtMotUV/3fO79vN7I3Ua+Uyb8ECmDw5rBteyIwZIdikvSa4J98bTxrB5O23w8JrJ5+8/z5P\nwtdGksb/o5I+LPlKyi65nloEBx4YJl1csCC9OrzzDixcuHcqF9cY0ggmTz4ZVlQcOLDw+3kSPn1J\ngskVhIkdt0vaLGmLpJSnanNZN3duzy2CtJPwXV1wzDGhn941jjRu7t21QD0JXxtJlu092MxazKyf\nmQ2Ofk/831PSGZKWS3pW0rUF9g+VdK+kxZI6JU2Mth8rqUvSwujP
NyRdHe0bJukhSSskPShpSCkX\n7dJlFv5z95SrSDsJn6QOrvbS6HbqriXswaQ2kgxa/LNCP0kKjxL43wJOJ4ygv1BS/kOa1wNdZjYF\nuBj4BoCZPWtm08xsOjAD2ArcG51zHfCImU0AHgO+kKQ+rjaefz7kSkaP7v64uGWS1pM2nnxvTNW+\nucdfXrxlUl9Jurn+Pufni8AvCQtmJdEKPGdmq8xsB2EN+byZmphICAiY2QrgKEmH5R3zQeCPZhY3\njs8D7oxe3wl8KGF9XA0kTXqPHh2CzvPP17cerraqfXN/8cUwtmRckXk5PAFfG0m6uc7J+TkNmAz8\nKWH5o4Dcj3FNtC3XYuACAEmtwFgg/zvtR4Ef5fw+wsw2RPVbD4xIWB9XA6W0CNJ6RHj16jDN/bve\nVf2yXWWqPfYj7s4s9oiQJ+Bro5x159YAx1exDjcBX5e0EHga6CKs5giApL7AuYSurWKK/rOcM2fO\nntdtbW20tbVVVlvXo7lz4bLLkh0bz9N10UXVr0P+aGjXGAYODC3S11+H4cMrL6+nhz1Gjw5Treze\n7YNXi2lvb6e9vb2iMnoMJpK+yd6bdQswlTASPom1hJZGbHS0bQ8z2wLsufVIegFYmXPImcCTZvZq\nzrYNkg43sw2SRgKvFKtAbjBx6du8GVauhKlTkx0/ezbccUf16+HJ98YWdz1VK5h8/OPF9w8YAAcf\nDK++CocfXvn7NaP8L9o33nhjyWUkidMLgCejnw7gWjP7RMLy5wPHSBonqR/wMeD+3AMkDYlaH0i6\nHPhN3popF7JvFxdRGZdEry8G7ktYH5eyefNg+nTol3Bh56lTQ/Cp9rrgSR5NdvVTrbzJm2/CihXh\n31wt3s8Vl6Sb66fA22a2C0BSH0kHmdm2nk40s12SrgIeIgSuO8xsmaQrwm67ndBldqek3cAS4JPx\n+ZIOIiTfP51X9M3ATyRdBqwC/jLBdbgaKDXp3bdvuBHMmwennVadOrz1FixZAiedVJ3yXPVV6+Y+\nfz5MmQL9+3d/XNwS8n8T6UkSTB4l3NDj1sIAQnBI1IlgZg8AE/K23ZbzujN/f86+bUD+k12Y2cao\nTq7BzJ0Lf/M3pZ0TJ+GrFUwWLIBJk+Cgg6pTnqu+agWTpN2ZnoRPX5JurgNzu52i1/7f1O1n927o\n7Cy9eylOwleLd3E1vmrd3JN+1t7Nlb4kwWSrpD09kpJmAG+lVyWXVcuWhYTqiBIf1J41KwSh3bur\nUw9Pvje+aoz96GmwYi4PJulLEkw+B9wj6XeSfg/8GLgq3Wq5LCp3xPmIEXDYYSEYVSped95bJo2t\nGjf3Z58NT2kdeWRt3s91r8eciZnNj6ZAifMaK6LR7M7to5IR53FX16RJldVh5crwJNmYMZWV49JV\njbEfpXx58VHw6UsyN9eVwEAze8bMngEGSfrb9KvmsqaSubCqNRK+2LrzrrEMGBBmc36l6AixnpXS\nnTlqFKxbB7t29XysK0+S7wSXm9meRTbN7E/A5elVyWXR66/Dyy+HBbHKUa0kvHdxZUelSfhSPuv+\n/eGQQ2DDhvLfz3UvSTDpk7swlqQ+QMIhaa636OwMi1D16VPe+ZMnh2BU6fKqnnzPjkq6njZtglWr\n4MQTk5/jeZN0JQkmDwA/lnSqpFMJo9EfSLdaLmsqne69Tx845ZQQlMq1ZQs89xxMm1Z+Ga52Krm5\nz5sXln7u27c27+d6liSYXEuYIv5vop9HCdPRO7dHNaZ7r7Sr64knQiBJOpWLq69Kbu7lfHnxJHy6\nkkxBv9vMvmNmf2FmfwH8Gvh8+lVzWbFzZ5jWYubMysqpNAnvi2FlSyXBpJzuTB8Fn65ED+VJOkzS\n30r6HdAO+Nybbo+nngr/UYcNq6ycU04JQWnnzvLO9+R7tpR7c9+1K3Rzlfrlxbu50lU0mEg6WNLF\nkh4EngDeBYw3s3eZ2X+tWQ1d
w6tW0nvYMBg7Fp5+uvRzy53KxdVPud1OS5fCyJGlT1/vwSRd3bVM\nXiGsM/Jl4Ggz+zzwTk1q5TKlmt1Ls2eXlzdZvjwEo5Ejq1MPl75yx36U2wL1YJKu7oLJF4D+wLeB\nL0jyBVBdQdXsXio3Ce/5kuyJx36sX1/aeeV+1kccEQZJltuN6rpXNJiY2a1mNhM4L9r0C+BISddK\nOrYmtXMNb906eOMNmFBwEYHSlZuE9/El2VROa6Hcz7pv3zAH3Lp1pZ/repbkaa6VZvZPZnYCcBIw\nmPBEl3N0dIREaLXW1j722DAgrZxvq54vyZ5Sk/CvvhpaFxMnlv9+3tWVjpJuAdH8XP/dzI5Jq0Iu\nW6rdImhpCUGhlNbJxo3hhnTCCdWrh6uNUpPwnZ3hqb9yv7x4MElPlb5Put4qjRZBqXmTzk44+WQ4\nIMm6oa6hlHpzr/TfmweT9HgwcWXbvh0WLQpzclVTqU90efI9u8oJJpV81j4KPj3+XS5F27fDRRfB\ntm31rkk6tm4NOY6DD65uua2t0NUF55yT7Pgnn4Tvfre6dXC1MXYsPP548s96/vzQzVWuMWPgD38o\n/3yAn/88PIbe1lZZOfnWrIEf/xg+n9H5RYoGE0lPA1Zsv5mVMF9n7/TCC+Gb1Le/Xe+apOeYFLJn\ngwbBI48kn0G4Tx847bTq18Olr7UV7ror+eO6110HQ4aU/37V6Ob6znfCGJlqB5Nf/Qpuu60Jgwnw\nn6M/r4z+vCv68+PpVae5rF4dvrkn/dbl9vJuq97hgAPgzDNr936VBpN4poUjjqhenWJz54a6mWVz\ncbeiwcTMVgFIOs3Mcif1vk7SQuC6tCuXdWvWhD5a51xjGDkytHjfeae82aWXLg3TuKxbF8o59NDq\n1a2jA95+OzydWM1yayVJAl6S3pPzy+yE5/V6q1f7WuTONZI+fUJAWbu2vPPnzoX3vjc8PVjJ2jv5\n4vEzkyZl9wGBJEHhk8C3Jb0o6UXC9CqXpVqrJuHBxLnGU8lU9PHTZOXOIVdMR0d4sGDcuCYOJmb2\npJlNAaYAU8xsqpktTL9q2efBxLnGU+k6KrNmlT6wNmm5WR4H02MwkXS4pDuAu83sDUkTJX2yBnXL\nPA8mzjWecm/Yr70WpvmZNClMIVTJ2jv54hZPlsfBJOnm+j/Ag8CR0e/PAp9Lq0LNxBPwzjWecoNJ\nPJVLnz6Vrb2Tb8cOWLgwlJ3l1SCTBJPhZvYTYDeAme0EEq9AIOkMScslPSvp2gL7h0q6V9JiSZ2S\nJubsGyLpHknLJC2RdEq0/QZJayQtjH7OSFqfWtm8OazTMHRovWvinMtV7rf//Klcyl0uId/ixXDU\nUWH8TFN3cwFbJR1KNIBR0kzgjSSFS2oBvgWcDkwCLpR0XN5h1wNdUV7mYuAbOfu+DvzazI4n5GyW\n5ey7xcymRz8PJKlPLcVdXFl8Xty5Zlbut//8qVyqlYTPLbfZg8nfAfcD75L0B+D/AlcnLL8VeM7M\nVpnZDuBu9q6PEpsIPAZgZiuAo6I15wcD/8nMvh/t22lmm3POa+jbtOdLnGtM5dywd+wI0/bkTuVS\nrSR8nHyH0GpauzYMjsyaJMFkCfA+YDZwBaGFsTxh+aOA3I9tTbQt12LgAgBJrcBYYDQwHnhN0vej\nrqzbJQ3IOe8qSYskfVdSBRMspMODiXONacSI0A399tvJz3nqqfDYbm639YQJ5a29ky+3ZTJgQJhO\n6LXXKiuzHpJM9NhhZtMJQQWAaAT89CrV4Sbg61GZTwNdhJxM3+g9rjSzBZJuJYy6v4Ew1uVLZmaS\nvgzcQhgPs585c+bsed3W1kZbtSfUKcKT7841ppYWOPLI8H806dxyhWYrzl175/zzy6vL2rVhwtR3\nv3vvtrjlNGJEeWWWo729nfb29orK6G6ix5GEVsQASdPY2600GDgoYflrCS2N2Oho2x5mtoWcQZ
CS\nXgBWAgOB1Wa2INr1U+Da6JxXc4r4N+CXxSqQG0xqafVqn1/KuUYVJ+GTBpOOjsKTicZJ+HKDSdzF\nlZtbjYPJjBnllVmO/C/aN954Y8lldNfNdTrwz4QAcAvwL9HP3xGS5knMB46RNE5SP+BjhPzLHtET\nW32j15cDvzGzN81sA7A6Z735U4Gl0XEjc4q4AHgmYX1qxru5nGtcpSbhi62jUmkSvlC5WU3CdzfR\n453AnZI+bGY/K6dwM9sl6SrgIULgusPMlkm6Iuy224Hjo/fZTehKy+2uuhr4QRRsVgKXRtu/Kmkq\n4XHlFwm5nIbiwcS5xlXKDfvll2HLljADeL7W1rBA3Pbt0L9/6fXo6ICvfKX8ujWSHnMmZvYzSWcT\nEu8H5mz/UpI3iB7bnZC37bac1535+3P2LQZOLrD9oiTvXS9mHkyca2RjxsAzCfszCnVFxQYNCkGm\nqyuMii/F22+HxP7JeXe40aPD9qxJMp3Kd4CPAp8h5E0+AoxLuV6ZtmlTWKeh2isQOueqo5Rv/z0t\nFTx7dnmPCC9cCMcfDwMHll+3RpLk0eDZUUvgT2Z2IzALKNDgczFvlTjX2Eq5YeeOAymk3JHw+SPq\ny6lbI0kSTN6K/twm6UhgB5DCOmPNw4OJc41t9OhkCfjt28N0J/ldUbniJLwVXeS8sGItnlGjwuJb\nuxJPWtUYkgST/ydpKPA1YCEh4f2jNCuVdR5MnGtsw4fDtm3hpzsLF4bBiYMGFT9m/Pgwe3AprQmz\n4i2e/v3DRJIbNiQvrxEkWc/kf5rZpuiJrnHAcWb2xfSrll0eTJxrbFKyCR97ypfEZZX6iPCLL4bz\nxhXJPidtOTWSosFE0gX5P8DZwKnRa1eEj353rvElyU0kCSZQehK+oyOcU2wi2CzmTbp7NPic6M8R\nhHm5Hot+fz8wF7g3xXplmrdMnGt8Pd2wzUIw+ed/7rmsWbPgmmuSv3ex5HvSujWi7gYtXgog6SFg\nopmti34/grBglivCg4lzja+nrqRVq8KfRx3Vc1kzZsDSpSEHc1CCyabmzoWPf7z4/iwGkyQJ+DFx\nIIlsYN/5tlwOs/AP1IOJc42tpxt2d4MV8w0YAJMnw4IFPR/75puwYgVM72aq3GYNJo9KelDSJZIu\nAX4FPJJutbLrtdfCN5Mk306cc/XT0w07ab4kljQJP38+TJnS/fQrTZWAj5nZVcBthJUOpwC3m9ln\n0q5YVnny3blsqFcwSVJuFlsmSdYzwczuxRPuiXi+xLls6O6GvXUrLF/efVdUvlmz4G//NnR1d9c1\n1tEBl16eP7BeAAAWWUlEQVRafD+E9VY2bAjjVw5IdJeuv+4eDf599OcWSZtzfrZI2lzsvN7Og4lz\n2TB0aLhZby5wN5s/H048EQ48cP99xYweHXInzz9f/JjuBivm6tsXDjssjITPiqLBxMzeG/15sJkN\nzvk52MwG166K2eLBxLlskIqva5Lkhl9IT+vCP/tsmAD2yCN7LitrXV3dtUwO6e6nlpXMEg8mzmVH\nsRt2qfmSWE95k1LKzVoSvrveuCcBY+9yvbkMODqVGmWcJ+Cdy45CwSTuirrttsLndGf2bLjjjuL7\nSwkmWWuZdDdocXwtK9IsvGXiXHYUumE/91xYYyRJV1S+KVNg5cqQhxlcIBnQ0QF//dfJ6/bSS6XX\noV6SjDNB0jBJrZL+LP5Ju2JZtHs3rF3rLRPnsqJQV1K5XVwA/fqFJ8Dmzdt/36ZNYVT9iScmKytr\nLZMkKy1+Cvgt8CBwY/TnnHSrlU2vvAJDhpT2BIhzrn4K3bDLTb7HiiXh580L06707Vt+3RpZkpbJ\nZwnrsK8ys/cD04BNqdYqo7yLy7lsKXTDrqRlAsWT8KWWm7UEfJJg8raZvQ0gqb+ZLQcmpFutbPLk\nu3PZEgeTeJXEN96AF14IuY9yzZoFnZ2h2ztXqcHkiCPC9E
zvvFN+XWopSTBZE620+AvgYUn3AavS\nrVY2ecvEuWwZPBj69An5DCi9K6qQESPCSo7Llu3dtmsXPPEEzJyZvJw+fWDkSHj55fLrUktJ5uY6\nP1ppcQ7wReAO4ENpVyyLPJg4lz253UmVdnHF8ru6liwJgWH48NLKyVLepLtBi7+W9AlJe1Y/NrPf\nmNn9ZpaRhldteTBxLntyb9iVJt9j+Un4csttimBCmCn4bOAFST+RdL6kfjWqVyZ5MHEue+Ib9q5d\nIddRjWCS3zIpt8WTpSR8d3Nz3WdmFwLjgJ8BFwEvSfq+pNNqVcEs8QS8c9kTB5OlS+Hww8MEi5Wa\nPDnkOl5/PfxebjBplpYJAGa2zcx+bGbnA38OTAUeSL1mGbNrF6xfD6NG1bsmzrlSxDfsanVxQUie\nt7aGls6rr4afiRPLr1sW9DhTvqTDgb8EPgYcAfwEuCTdamXP+vVwyCFhBKxzLjvirqRqJd9jcVfX\nrl1wyinQkmi+kX1lKZh0l4C/XNJjwELg3cDfm9nRZnadmS1O+gaSzpC0XNKzkq4tsH+opHslLZbU\nKWlizr4hku6RtEzSEkmnRNuHSXpI0opoSeEhJV11Cjxf4lw2pdEygb1J+ErKbYpgAswCvgKMMbOr\nzSzBgpT7ktQCfAs4HZgEXCjpuLzDrge6zGwKcDHwjZx9Xwd+bWbHE5YMjp/cvg54xMwmAI8BXyi1\nbtXmwcS5bBozBl58MfQuTJpUvXJnzgyLbP32t+W3eEaMCAMp3367evVKS3cJ+MvM7GEz2zOOU9Kc\nEstvBZ4zs1VmtgO4Gzgv75iJhICAma0AjpJ0mKTBwH8ys+9H+3aaWbwm2nnAndHrO2mAcS+efHcu\nmwYOhEGDQldUnz7VK3fYMBg7NuRNTjmlvDJaWsLsxWvXVq9eaSm1F+/cEo8fBeQ20tZE23ItBi4A\nkNQKjAVGA+OB16KnxxZKul3SgOicEWa2AcDM1gMjSqzXPnbsCNMWVMJbJs5l15gx1e3iis2aFRLv\nQyroiM9KV1epS9UXWiirUjcBX5e0EHga6AJ2AX2B6cCVZrZA0q2E7q0bCtTDihU+Z86cPa/b2tpo\na2vb75i77oJHH4Uf/KD8i1i9uvxvH865+nr/++HMM6tf7jnnhNZJJWoRTNrb22lvb6+oDJkVvQ/v\nf7DUktvtleD4mcAcMzsj+v06wMzs5m7OeQE4ARgIdJjZ0dH29wLXmtk5kpYBbWa2QdJI4PEor5Jf\nliW5vmXL4KyzwgRv5Zo5E265pbpPgzjn3HXXhTnErr++du8pCTMrqfGQZD2Tr0oaLKkvYaLHVyV9\nImH584FjJI2LRs9/DLg/r/whUdlIuhz4jZm9GXVjrZZ0bHToqcDS6PX97H08+WLgvoT1KWjChJDk\nWreu/DK8m8s5l4asjIJPkjP58yjx/Z+BF4FjgL9PUriZ7QKuAh4ClgB3m9kySVdI+nR02PHAM1Fr\n43TC+imxq4EfSFpEeJrrn6LtNwOnSVpBCDI3JalPMS0txRe0SWLHjjAo6YgjKqmFc87tr5lyJvEx\nZwP3mNkbUvLWj5k9QN76J2Z2W87rzvz9OfsWExbmyt++Efhg4kokEAeTCy4o/dx168IjfAeUmoFy\nzrkeZCWYJGmZ/D9Jy4EZwKOSDgMy8NRzaYqtjpaEd3E559LSNMHEzK4DZgMnRWNFtrL/WJHMa22F\nRYtg+/bSz/Vg4pxLy/DhsG1b+GlkSRLwHwF2mNkuSf8A/DtwZOo1q7FBg+DYY6Grq/RzPZg459Ii\nhQlkGz0Jn6Sb64tmtiV6NPeDhJUW/zXdatVHuV1dPvrdOZemLHR1JQkmu6I/zwZuN7NfAU05N265\nT3R5y8Q5l6ZmCSZrJd0GfBT4taT+Cc/LnLhlUsI4TsCDiXMuXc0STP4SeBA43cw2AYeQcJxJ1owf\nDzt3wksvlXaeBxPnXJ
qaIpiY2Tbgj8Dpkq4iTLL4UOo1qwMptE5K6eravh02bgzLfTrnXBqyMAo+\nydNcnwV+QJiZdwTw75I+k3bF6qXUJPzLL4eR79Wcuto553JloWWSZMz2J4FTzGwrgKSbgQ7gm2lW\nrF5mzYKf/CT58d7F5ZxLWxaCSZKcidj7RBfR6zSmom8IM2bA0qXJBwh5MHHOpW3YsDAH4JYt9a5J\ncUmCyfeBeZLmRCstdhLGmjSlAQNg8mRYsCDZ8R5MnHNpkxq/dZIkAX8LcCmwMfq51MxuTbti9VRK\n3sSDiXOuFho9Cd9tzkRSH2CJmR0HLKxNlepv9uyw+mISa9bAB6s6f7Fzzu0v0y2TaD2SFZIqXHgy\nW+KR8EkGL3rLxDlXC40eTJI8zTUMWCLpCcKMwQCY2bmp1arORo8OuZPnn4d3v7v7Yz2YOOdqYcwY\n6Oysdy2KSxJMvph6LRpQ3DrpLpi89VZ4uuKww2pXL+dc7zRmDNxzT71rUVzRbi5Jx0h6j5n9JveH\n8GhwA6eBqiNJEn7NmjA1dEtTzlTmnGskjZ6A7+42eCuwucD2N6J9TS1pMPGp551ztRDnTEqdiLZW\nugsmh5vZ0/kbo21HpVajBjFlCqxcCZsLhdOI50ucc7UyZEgYb/LGG/WuSWHdBZOh3ewbUO2KNJp+\n/WD6dJg3r/gxHkycc7XUyE90dRdMFki6PH+jpE8BT6ZXpcbR02JZHkycc7XUyMGku6e5Pgf8XNLH\n2Rs8TiKssnh+2hVrBLNnw792s0Dx6tVw9tm1q49zrndr5CR80WBiZhuA2ZLeD0yONv/KzB6rSc0a\nwKxZcPHFsHt34Se2PAHvnKulrLZMADCzx4HHa1CXhjNiBAwfDsuWwaRJ++/3bi7nXC2NGQO//W29\na1GYj5DoQbFHhLduhbffhkMPrX2dnHO9UyO3TDyY9KBYEn716tDFpaZd2cU512g8mGRYsZaJd3E5\n52otTsA34sDF1IOJpDMkLZf0rKRrC+wfKuleSYsldUqamLPvxWh7VzTRZLz9BklrJC2Mfs5Iq/6T\nJ4d13l9/fd/tnnx3ztXaoEHQvz9s3Fjvmuwv1WAiqQX4FnA6MAm4UNJxeYddD3SZ2RTgYuAbOft2\nA21mNs3MWvPOu8XMpkc/D6R0CfTpA62t+8/W6S0T51w9NGpXV9otk1bgOTNbZWY7gLuB8/KOmQg8\nBmBmK4CjJMXz8KqbOtYsW1Goq8uDiXOuHnprMBkF5F72mmhbrsXABQCSWoGxQNyBZMDDkuYXGI1/\nlaRFkr4raUj1q75XoSS8BxPnXD301mCSxE3AMEkLgSuBLsI09wDvMbPpwFnAlZLeG23/NnC0mU0F\n1gO3pFnBmTNh/nzYuXPvNg8mzrl6GDOmMUfBJ1kcqxJrCS2N2Oho2x5mtgW4LP5d0gvAymjfuujP\nVyX9nNBt9nszezWniH8DflmsAnPmzNnzuq2tjba2tpIvYtgwGDsWnnoqTP4InoB3ztXH6NHwyCPV\nLbO9vZ329vaKypCl+IyZpD7ACuBUYB3wBHChmS3LOWYIsM3MdkRdWe8xs0skHQS0mNmbkgYCDwE3\nmtlDkkaa2fro/GuAk83srwq8v1Xr+j71KZg2Da68MkxLf+SRYZVFH2finKulxx6DL30JKrz3d0sS\nZlbS3S3Vbi4z2wVcRQgES4C7zWyZpCskfTo67HjgGUnLCE99fTbafjjwe0ldQCfwSzN7KNr3VUlP\nSVoEvA+4Js3rgH2T8HEXlwcS51ytNWrOJNWWSb1Vs2WybFmYIXjlSnjgAbjlFnjooZ7Pc865anrr\nrdD1vm1bekuGN1zLpJlMmACbNsG6dZ58d87Vz4ABcPDB8Npr9a7JvjyYJNTSsvcRYU++O+fqafTo\nxuvq8mBSgjiYeMvEOVdPjZg38WBSgjgJ78HEOVdPjRhM0h5n0lRaW2HRIjj8cA8mzrn6
acRg4i2T\nEgwaBMceCy+84MHEOVc/jTgK3oNJiWbPhsGDw9MUzjlXD42YgPdurhLNmgW/+U29a+Gc683GjIGl\nSyFntqi682BSonPPDQOGnHOuXsaNg+uug61b612TvXwEvHPOuX34CHjnnHN14cHEOedcxTyYOOec\nq5gHE+eccxXzYOKcc65iHkycc85VzIOJc865inkwcc45VzEPJs455yrmwcQ551zFPJg455yrmAcT\n55xzFfNg4pxzrmIeTJxzzlXMg4lzzrmKeTBxzjlXMQ8mzjnnKubBxDnnXMVSDyaSzpC0XNKzkq4t\nsH+opHslLZbUKWlizr4Xo+1dkp7I2T5M0kOSVkh6UNKQtK/DOedccakGE0ktwLeA04FJwIWSjss7\n7Hqgy8ymABcD38jZtxtoM7NpZtaas/064BEzmwA8BnwhrWtoZO3t7fWuQqqa+fqa+drAr683Srtl\n0go8Z2arzGwHcDdwXt4xEwkBATNbARwl6bBon4rU8Tzgzuj1ncCHql3xLGj2f9DNfH3NfG3g19cb\npR1MRgGrc35fE23LtRi4AEBSKzAWGB3tM+BhSfMlXZ5zzggz2wBgZuuBESnU3TnnXEIH1LsCwE3A\n1yUtBJ4GuoBd0b73mNm6qKXysKRlZvb7AmVYjerqnHOuAJmldx+WNBOYY2ZnRL9fB5iZ3dzNOS8A\nJ5jZm3nbbwC2mNktkpYRcikbJI0EHjez4wuU5UHGOefKYGYq5fi0WybzgWMkjQPWAR8DLsw9IHoS\na5uZ7Yi6sn5jZm9KOghoiV4PBP4cuDE67X7gEuBmQtL+vkJvXupfhnPOufKkGkzMbJekq4CHCPmZ\nO8xsmaQrwm67HTgeuFPSbmAJ8Mno9MOBn0etiwOAH5jZQ9G+m4GfSLoMWAX8ZZrX4ZxzrnupdnM5\n55zrHZpyBHxPAyWzrthgzqySdIekDZKeytnWNANTi1zfDZLWSFoY/ZxRzzpWQtJoSY9JWiLpaUlX\nR9sz/xkWuLbPRNub4vOT1F/SvOhe8nSUmy7rs2u6lkk0UPJZ4FTgZULe5mNmtryuFasiSSuBGWb2\np3rXpRokvRd4E/i/ZnZitO1m4HUz+2r0hWCYmV1Xz3qWq8j17XmgpK6Vq4LoIZiRZrZI0iDgScJY\nsEvJ+GfYzbV9lOb5/A4ys22S+gB/AK4GPkyJn10ztkySDJTMumKDOTMpetw7PzA2zcDUItcH4XPM\nPDNbb2aLotdvAssIY8Uy/xkWubZ4rFyzfH7bopf9Cflpo4zPrmluSDmSDJTMumKDOZtJbxiYepWk\nRZK+m8UuoEIkHQVMBTqBw5vpM8y5tnnRpqb4/CS1SOoC1gMPm9l8yvjsmjGY9AbvMbPpwFnAlVE3\nSrNrrv5Y+DZwtJlNJfwnbobukkHAT4HPRt/i8z+zzH6GBa6taT4/M9ttZtMIrclWSZMo47NrxmCy\nljAlS2x0tK1pmNm66M9XgZ8TuvaazQZJh8OefutX6lyfqjKzV21vwvLfgJPrWZ9KSTqAcLO9y8zi\ncV9N8RkWurZm+/wAzGwz0A6cQRmfXTMGkz0DJSX1IwyUvL/OdaoaSQdF35LIGcz5TH1rVRVi3z7o\neGAqdDMwNUP2ub7oP2jsArL/GX4PWGpmX8/Z1iyf4X7X1iyfn6ThcRedpAHAaYS8UMmfXdM9zQXh\n0WDg6+wdKHlTnatUNZLGE1ojuYM5M319kn4ItAGHAhuAG4BfAPcAY4gGpprZpnrVsRJFru/9hP73\n3cCLwBVxH3XWSHoP8FvC3HoW/VwPPAH8hAx/ht1c21/RBJ+fpBMICfaW6OfHZvaPkg6hxM+uKYOJ\nc8652mrGbi7nnHM15sHEOedcxTyYOOecq5gHE+eccxXzYOKcc65iHkycc85VzIOJy7RoevDT8rZ9\nVtL/7uG8LSnXa7ikTklPRmMVcvc9Lml69Hp8tFTC
aQXK+Fo0LXjRZa57qMP7JP0y5/cvS/q1pL6S\n2iXNz9k3Q9LjOeftlnR2zv5fSvqzcurhegcPJi7rfkjeUtCEWQ9+2MN5aQ+w+iDwlJnNMLM/FDpA\n0mjgP4BrzOzhAodcDpxoZonW5ImmEM9n0b5/AGYBH4pm0zbgMEmn5x8bWQP89yTv6xx4MHHZ9zPg\nrGj+JCSNA44wsz9IGijpEUkLFBYTOzf/5ALf3r8p6aLo9fT4G7yk/4jnKso7f5ykR6PyH1ZYTGkK\nYWnp86KFk/oXqPeRwIPAF8zsVwXKvQ8YBDwp6SM577Mofp/ouO9L+ldJndF7FihKfwecDpxjZu/k\n7Psa8A8F/1ZhMfCGpFOL7HduHx5MXKZFC4Q9AZwZbfoYYRoIgLcJ38RPAj4A/EuxYvI3RMHpm8CH\nzexk4PvAPxU495vA981sCqE19E0zWwz8D8LUFNPNbHuB8+6Mjv15kes6D9gWnX9PzvtMjd8n5/BR\nZjbTzP5rgaLeA1wBnJmzbkV8zR3AdknvK1QF4B+BLxaqn3P5PJi4ZnA3IYgQ/fmj6LWAr0haDDwC\nHCkp6ZoaE4DJhHVjughdPkcWOG5WzvvdRbh5J/Ew8AlJB3ZzTO7El929zz3dlPF8VM6fFym7aMCI\nFvWy/JyPc4V4MHHN4D7gVEnTgAFm1hVt/zgwHJgWrdfwCpB/897Jvv8P4v0CnolaBtPMbIqZncn+\nys29fJUww/VPFZaaLsSKvM63tZt96wnr3twqqW2/NzB7nHDNM4uc/0+ErjCfxM91y4OJyzwz20pY\nh+F77P32DjAEeMXMdkt6PzAuZ1/8zXwVMDF6wmkoEOcIVhAS1DMhdHtJmljg7eey9wGATwC/K6He\n1wBvRPUuJLdlUsn7PE+YJv3fJZ1Y4JB/BP5bkXMfBoYBhc5zbg8PJq5Z/Ihww8sNJj8ATo66uT5B\nWKchZgBmtoaQY3mG0F22MNq+A/gL4GZJi4AuQldTvquBS6NjPg58NkFdc7/lXwKMLPL4b+5xxd4n\nUYvBzBYAlwL3R8sYWM6+/yC02oqV9Y+EqcidK8qnoHfOOVcxb5k455yrmAcT55xzFfNg4pxzrmIe\nTJxzzlXMg4lzzrmKeTBxzjlXMQ8mzjnnKubBxDnnXMX+PygS5di7aan1AAAAAElFTkSuQmCC\n",
185 | "text/plain": [
186 | ""
187 | ]
188 | },
189 | "metadata": {},
190 | "output_type": "display_data"
191 | }
192 | ],
193 | "source": [
194 | "# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)\n",
195 | "plt.plot(k_range, k_scores)\n",
196 | "plt.xlabel('Value of K for KNN')\n",
197 | "plt.ylabel('Cross-Validated Accuracy')"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "## More efficient parameter tuning using `GridSearchCV`"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "`GridSearchCV` allows you to define a **grid of parameters** that will be **searched** using K-fold cross-validation"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 8,
217 | "metadata": {
218 | "collapsed": false
219 | },
220 | "outputs": [],
221 | "source": [
222 | "from sklearn.grid_search import GridSearchCV"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 9,
228 | "metadata": {
229 | "collapsed": false
230 | },
231 | "outputs": [
232 | {
233 | "name": "stdout",
234 | "output_type": "stream",
235 | "text": [
236 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "# define the parameter values that should be searched\n",
242 | "k_range = list(range(1, 31))\n",
243 | "print(k_range)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 10,
249 | "metadata": {
250 | "collapsed": false
251 | },
252 | "outputs": [
253 | {
254 | "name": "stdout",
255 | "output_type": "stream",
256 | "text": [
257 | "{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}\n"
258 | ]
259 | }
260 | ],
261 | "source": [
262 | "# create a parameter grid: map the parameter names to the values that should be searched\n",
263 | "param_grid = dict(n_neighbors=k_range)\n",
264 | "print(param_grid)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 11,
270 | "metadata": {
271 | "collapsed": false
272 | },
273 | "outputs": [],
274 | "source": [
275 | "# instantiate the grid\n",
276 | "grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "- You can set **`n_jobs=-1`** when instantiating `GridSearchCV` to run computations in parallel on all available CPU cores (if supported by your computer and OS)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 12,
289 | "metadata": {
290 | "collapsed": false
291 | },
292 | "outputs": [
293 | {
294 | "data": {
295 | "text/plain": [
296 | "GridSearchCV(cv=10, error_score='raise',\n",
297 | " estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
298 | " metric_params=None, n_jobs=1, n_neighbors=30, p=2,\n",
299 | " weights='uniform'),\n",
300 | " fit_params={}, iid=True, n_jobs=1,\n",
301 | " param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]},\n",
302 | " pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)"
303 | ]
304 | },
305 | "execution_count": 12,
306 | "metadata": {},
307 | "output_type": "execute_result"
308 | }
309 | ],
310 | "source": [
311 | "# fit the grid with data\n",
312 | "grid.fit(X, y)"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 13,
318 | "metadata": {
319 | "collapsed": false
320 | },
321 | "outputs": [
322 | {
323 | "data": {
324 | "text/plain": [
325 | "[mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1},\n",
326 | " mean: 0.95333, std: 0.05207, params: {'n_neighbors': 2},\n",
327 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3},\n",
328 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 4},\n",
329 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5},\n",
330 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 6},\n",
331 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7},\n",
332 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8},\n",
333 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9},\n",
334 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 10},\n",
335 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 11},\n",
336 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 12},\n",
337 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 13},\n",
338 | " mean: 0.97333, std: 0.04422, params: {'n_neighbors': 14},\n",
339 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 15},\n",
340 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 16},\n",
341 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 17},\n",
342 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 18},\n",
343 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 19},\n",
344 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 20},\n",
345 | " mean: 0.96667, std: 0.03333, params: {'n_neighbors': 21},\n",
346 | " mean: 0.96667, std: 0.03333, params: {'n_neighbors': 22},\n",
347 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 23},\n",
348 | " mean: 0.96000, std: 0.04422, params: {'n_neighbors': 24},\n",
349 | " mean: 0.96667, std: 0.03333, params: {'n_neighbors': 25},\n",
350 | " mean: 0.96000, std: 0.04422, params: {'n_neighbors': 26},\n",
351 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 27},\n",
352 | " mean: 0.95333, std: 0.04269, params: {'n_neighbors': 28},\n",
353 | " mean: 0.95333, std: 0.04269, params: {'n_neighbors': 29},\n",
354 | " mean: 0.95333, std: 0.04269, params: {'n_neighbors': 30}]"
355 | ]
356 | },
357 | "execution_count": 13,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 | "# view the complete results (list of named tuples)\n",
364 | "grid.grid_scores_"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 14,
370 | "metadata": {
371 | "collapsed": false
372 | },
373 | "outputs": [
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "{'n_neighbors': 1}\n",
379 | "[ 1. 0.93333333 1. 0.93333333 0.86666667 1.\n",
380 | " 0.86666667 1. 1. 1. ]\n",
381 | "0.96\n"
382 | ]
383 | }
384 | ],
385 | "source": [
386 | "# examine the first tuple\n",
387 | "print(grid.grid_scores_[0].parameters)\n",
388 | "print(grid.grid_scores_[0].cv_validation_scores)\n",
389 | "print(grid.grid_scores_[0].mean_validation_score)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 15,
395 | "metadata": {
396 | "collapsed": false
397 | },
398 | "outputs": [
399 | {
400 | "name": "stdout",
401 | "output_type": "stream",
402 | "text": [
403 | "[0.95999999999999996, 0.95333333333333337, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.96666666666666667, 0.97333333333333338, 0.96666666666666667, 0.96666666666666667, 0.97333333333333338, 0.97999999999999998, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97999999999999998, 0.97333333333333338, 0.97999999999999998, 0.96666666666666667, 0.96666666666666667, 0.97333333333333338, 0.95999999999999996, 0.96666666666666667, 0.95999999999999996, 0.96666666666666667, 0.95333333333333337, 0.95333333333333337, 0.95333333333333337]\n"
404 | ]
405 | }
406 | ],
407 | "source": [
408 | "# create a list of the mean scores only\n",
409 | "grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]\n",
410 | "print(grid_mean_scores)"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 16,
416 | "metadata": {
417 | "collapsed": false
418 | },
419 | "outputs": [
420 | {
421 | "data": {
422 | "text/plain": [
423 | ""
424 | ]
425 | },
426 | "execution_count": 16,
427 | "metadata": {},
428 | "output_type": "execute_result"
429 | },
430 | {
431 | "data": {
432 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAEPCAYAAACHuClZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xm8VOWd5/HP9yIggiyKiLKJMaKAsukVSKZzE2O7jRpN\npxM7GbfE2N0aEzvTo3E6I2bSHU26HbNMOtoxGcdOYmJiopmk3b3ZuBdBLqhsalAEBFwIgqDI8ps/\nnnOgKKruPbWcqjp1f+/X676oe5annkPB+dXz/M7zPDIznHPOuUq01LsCzjnnss+DiXPOuYp5MHHO\nOVcxDybOOecq5sHEOedcxTyYOOecq1jqwUTSGZKWS3pW0rUF9g+VdK+kxZI6JU3M2XeNpGckPSXp\nB5L6RduHSXpI0gpJD0oakvZ1OOecKy7VYCKpBfgWcDowCbhQ0nF5h10PdJnZFOBi4BvRuUcCnwGm\nm9mJwAHAx6JzrgMeMbMJwGPAF9K8Duecc91Lu2XSCjxnZqvMbAdwN3Be3jETCQEBM1sBHCXpsGhf\nH2CgpAOAg4C10fbzgDuj13cCH0rvEpxzzvUk7WAyClid8/uaaFuuxcAFAJJagbHAaDN7GfgX4CVC\nENlkZo9G54wwsw0AZrYeGJHaFTjnnOtRIyTgbwKGSVoIXAl0AbskDSW0QMYBRwKDJP1VkTJ8Thjn\nnKujA1Iufy2hpREbzd6uKgDMbAtwWfy7pJXASuAMYKWZbYy23wvMBn4IbJB0uJltkDQSeKXQm0vy\nIOOcc2UwM5VyfNotk/nAMZLGRU9ifQy4P/cASUMk9Y1eXw781szeJHRvzZR0oCQBpwLLotPuBy6J\nXl8M3FesAmbWtD833HBD3evQrNe3e7dx2GHGwIHGjh3NdW1mxksvGWDMnl39sk86yRg6tL7Xl/ZP\nvT+/tH/KkWowMbNdwFXAQ8AS4G4zWybpCkmfjg47HnhG0jLCU1+fjc59AvgpodtrMSDg9uicm4HT\nJK0gBJmb0rwO1/v88Y/Qrx+MHQtPPVXv2lTf3LnwgQ/AokWwfXv1yt22DZYsgc2bYffu6pXrGl/a\n3VyY2QPAhLxtt+W87szfn7PvRuDGAts3Ah+sbk2d26ujA2bPhiFDwuvp0+tdo+rq6IDTT4eNG6Gr\nC2bOrE65CxbAiSeGAPzKKzByZHXKdY2vERLwrkxtbW31rkKq6nl9c+fCrFnhZ+7c6pdf788ureuL\nyx01qo01a6pXbqOp9+fXiFRu/1gWSLJmvj6XnqlT4bbbQsvkrLNg5cp616h63noLhg+H116Dn/0M\n7rsP7rmnOmWfey78l/8Cd90Fl14K559fnXJdbUnCGiwB71zmbN4Mzz0H06bBscfCpk2wbl29a1U9\nCxbApEkwYEDoyps7F6rxncssdJ/NmgVjxsDq1T2f45qHBxPn8jzxRAgk/fpBS0u4OXZ01LtW1TN3\nbggiAOPHw86d8NJLlZf7/PMhQI0e7cGkN/Jg4lyeOPkemz27uYJJ7vVJ1bu+3HI9mPQ+HkycyxMn\nkWNpJeHrwSy968std8wYmjoB7/bnwcS5HLt3Q2fnvjfb1lZYvLi64zHqZeVK6N8/3Oxj1WqZ5Haf\njR7tLZPexoOJczmWL4dhw/YdHzFoELz73WE8Rtblt0oAZsyApUth69byy928OQSqKVPC76NGhYcW\ndu0qv0yXLR5MnMuR++06V/zUU9YVur4BA2Dy5PCUV7nmzQsDO/v1C7/37w+HHALr15dfpssWDybO\n5chPvseaJQmf1vUVKteT8L2LBxPnchTqBoK9Seosj4HdsiU8vjt16v77Km15Ffp78yR87+LBxLnI\nxo3h5nfCCfvvGz8+9P9n+Zt27viZfPFYmnKCZaGHFsCT8L2NBxPnIp2dcPLJcECB6U+l7D8iXKzV\nBeHGP2BAaLmUatmyMD3LiLz1Tr2bq3
fxYOJcpFjyPZb1JHxP11fuSP9i5Xow6V08mDgXKZacjmU5\nCV+sKypXucGy2N+bB5PexYOJc4T5qZ54ovt1PeLxGNu21a5e1bJ8eXhU9/DDix9TbjAp1n3mCfje\nxYOJc8Azz4S8wSGHFD/mwANDcr6S8Rj10lOrC8KAw5UrwwDEpF5/HV5+OYxTyXfEEWGBrJ07S6ur\nyyYPJs7RfXI6V1aT8Emur1+/MPBw3rzk5XZ2hulm+vTZf1/fvnDYYSHYuObnwcQ5kn1zh+wm4XtK\nvsdKzQv1VK7nTXoPDybOkfxmW8l4jHrZuBHWri3cFZWv1JZXT0HYg0nv4cHE9XobNoQb7nHH9Xxs\nJeMx6qW78TP5Zs0Kx+/e3fOxO3fC/PlwyinFj/EkfO/hwcT1eh0d4SmuloT/G7L2iHDSLjwIAw+H\nDw8DEXvy1FMwdmyYZbkYHwXfe3gwcb1e0i6uWNbyJmldX7zee3e8m6v38GDier0kN8VcWXqiK+6K\n6m78TL6kLa8kQcqDSe/hwcT1au+8AwsXhsdbk5o6tfTxGPUSj5/prisqX9JgmaT7zINJ7+HBxPVq\nXV1wzDEweHDyc/r2LX08Rr0kHT+Ta/LksEri668XP2bdOti0CY49tvuyRo4MDze8805pdXDZ48HE\n9WqlJKdzZSUJX8719ekTWmqdnd2XO2tWzw8t9OkTAsrataXVwWWPBxPXq5WanI5lJQlf7vX11NVV\nSp7Ju7p6h9SDiaQzJC2X9KykawvsHyrpXkmLJXVKmhhtP1ZSl6SF0Z9vSLo62neDpDXRvoWSzkj7\nOlxzKjX5Hps5M/l4jHrZsAH+9CeYMKH0c3tqeZUSpDyY9A6pBhNJLcC3gNOBScCFkvKHhl0PdJnZ\nFOBi4BsAZvasmU0zs+nADGArcG/OebeY2fTo54E0r8M1p9WrYft2eNe7Sj93xIgw71SS8Rj1Uur4\nmVynnBKeAis0SeP27bBoUfKHFjyY9A5pt0xagefMbJWZ7QDuBs7LO2Yi8BiAma0AjpJ0WN4xHwT+\naGa5Y2mVUp1dLxEnp1Xmv6RGf0S4nOR7bNiwMCDx6af339fVFRLvgwYlK8tHwfcOaQeTUUDud5I1\n0bZci4ELACS1AmOB0XnHfBT4Ud62qyQtkvRdSUOqV2XXW5SbfI81ehK+GtdXKFiWmofxUfC9QyMk\n4G8ChklaCFwJdAG74p2S+gLnAvfknPNt4GgzmwqsB26pXXVdsyg3OR1r5CT8O++EFkQp42fyFWt5\nlZpn8m6u3iHB1G8VWUtoacRGR9v2MLMtwGXx75JeAFbmHHIm8KSZvZpzzqs5+/8N+GWxCsyZM2fP\n67a2Ntra2kqpv2tSb70FS5bASSeVX8akSXvHYxx6aPXqVg3x+JmDDy6/jNmz4ctf3nebWQgwN9+c\nvBwPJo2vvb2d9vb2isqQpTiXtqQ+wArgVGAd8ARwoZktyzlmCLDNzHZIuhx4j5ldkrP/R8ADZnZn\nzraRZrY+en0NcLKZ/VWB97c0r89l1+9+B5//fFiqtxKnnQaf+xycfXZ16lUtt94Kzz4L3/52+WXs\n3h0mfVy6NIwVAVi1KiTn161LnmvavTvMtLxpU/jTNT5JmFlJ2cRUu7nMbBdwFfAQsAS428yWSbpC\n0qejw44HnpG0jPDU12fj8yUdREi+37tvyXxV0lOSFgHvA65J8zpc8yn3keB8jZqEryT5Hmtp2bt+\nSyz+eyvloYWWFhg1ygcuNru0u7mIHtudkLfttpzXnfn7c/ZtA/Kf7MLMLqpyNV0vM3cuXHhh5eXM\nng1f/Wrl5VRbRwd85SuVlxPnhc4/P/xebp4pTsIfc0zldXKNqRES8M7VVNzvX0nyPTZzZvHxGPWy\nenVIwB99dOVlFWqZlPP35nmT5ufBxPU6K1dCv37hBlepoUPDeIynnqq8rGqpdPxMrtbWMEBx+3bY\nuj
XkT2bMKL0cDybNz4OJ63Wq1SqJNdp4k0rHl+QaNCgMUOzqggUL4IQT4MADSy/Hg0nz82Diep1q\nJd9jjZaEr0byPVfc1VXJ35uPgm9+Hkxcr9PMLZNqjJ/JFyfhK/l781Hwzc+DietVtmyB55+HadOq\nV+axx8Ibb4SxF/W2YEFY3Kqa4znillelLRMPJs3Ng4nrVZ54Iiy7269f9cpsaQlPdTVC66TaXVwA\n48fDrl0hQI3OnzUvoeHDQ6tp69bq1s01jtTHmTgH4XHc5cvh+OPrW49qd3HFZs+GH/84LOlbT7/6\nFVx1VXXLlML1VRKApb1dXcflL0KRkuXLw1ou1XiqzfWsx2Ai6RzgV2bWwMsAuUa3aBG8972hO+iA\nOn6F6eiAyy+vfrnnnx/WhL/99uqXXYpDD4UPfKD65X7qU2EJ3krESfhaBJNdu8K0L52d9f8C01sk\n+W/9UeBWST8Dvmdmy1Ouk2tCc+fCtm1hPMb06fWpw+7dIZh873vVL3viRLj//uqX2yjOOqvyMmqZ\nhF+yBDZvhpde8mBSKz3mTMzsE8A04I/A/5HUIenTkiqYj9T1NnPnwuDB9X2EdvlyOOSQvZMWutqq\nZRI+/nfmSf/aSZSAN7PNwE8JKyUeAZwPLJT0mRTr5ppIRwdccUV9k9TVHl/iSlPLYNLRAUcc4cGk\nlnoMJpLOlfRzoB3oC7Sa2ZnAFODz6VbPNYP168P045dcUt+WSVrJd5dMrVsmH/mID5SspSQtkw8D\n/8vMTjCzr5nZK7BnRt9Pplo71xQ6OsKjs8cdV9/xGN4yqa9ajYJ/9dXwc/rp3jKppSTBZA5hUSsA\nJA2QdBSAmT2aSq1cU4lbBIXWx6iVjRvDjeWEE2r/3i6oVQK+oyM8yTVunAeTWkoSTO4Bch8L3sW+\n67E7163c7qV6TT3S2RlmwK3nY8m93bBhYar+zZvTfZ+4BRp3q/liq7WRJJgcYGbvxL9Er6s4ftg1\ns+3bwxiT1tbwe70mRfQurvqTapM3ib+8DB4cWsObNqX7fi5IEkxelXRu/Iuk84DX0quSayZdXWHu\nqkGDwu+562PUkiffG0PawWTHDnjyydDNFb+fJ+FrI0kw+WvgekkvSVoNXAtckW61XLPIbxHkro9R\nKzt3htUQZ86s3Xu6wtK+uS9eHOYSGzJk7/t53qQ2euxBNrM/AjMlDYp+fzP1WrmmMXcunHfevtvi\nKc1rdXN/5hkYNSoMWHT1lXYSPr8F6sGkdhKlIyWdDUwCDlQ0a5qZfSnFerkmEK+1fvPN+26fPRt+\n8Yva1cO7uBrHmDHpPoDR0REeCc59Pw8mtZFk0OJ3CPNzfQYQ8BFgXMr1ck1g9erQxTR+/L7b4yR8\nrZ6y8eR740j75p7/xcEX5aqdJDmT2WZ2EfAnM7sRmAUcm261XDOI/2PnTwEer4/x0ku1rYervzSD\nydq1Yb2Ud7973/fzBHxtJAkmb0d/bpN0JLCDMD+Xc90q1iKQajd4ccOGMGCxVmtouO7FN/c0WqXx\nv7fcLy/ezVU7SYLJLyUNBb4GLAReBH6YZqVcc+iuRRAn4dMWT+XS4muKNoQ0x34U+veWZvBy++r2\nv5ikFuBRM9tkZj8j5EqOM7P/UZPauczatg2WLoUZMwrvr9VIeO/iajxptRYKtYQHDoQDD4TXX6/+\n+7l9dRtMotUV/3fO79vN7I3Ua+Uyb8ECmDw5rBteyIwZIdikvSa4J98bTxrB5O23w8JrJ5+8/z5P\nwtdGksb/o5I+LPlKyi65nloEBx4YJl1csCC9OrzzDixcuHcqF9cY0ggmTz4ZVlQcOLDw+3kSPn1J\ngskVhIkdt0vaLGmLpJSnanNZN3duzy2CtJPwXV1wzDGhn941jjRu7t21QD0JXxtJlu092MxazKyf\nmQ2Ofk/831PSGZKWS3pW0rUF9g+VdK+kxZI6JU2Mth8rqUvSwujP
NyRdHe0bJukhSSskPShpSCkX\n7dJlFv5z95SrSDsJn6QOrvbS6HbqriXswaQ2kgxa/LNCP0kKjxL43wJOJ4ygv1BS/kOa1wNdZjYF\nuBj4BoCZPWtm08xsOjAD2ArcG51zHfCImU0AHgO+kKQ+rjaefz7kSkaP7v64uGWS1pM2nnxvTNW+\nucdfXrxlUl9Jurn+Pufni8AvCQtmJdEKPGdmq8xsB2EN+byZmphICAiY2QrgKEmH5R3zQeCPZhY3\njs8D7oxe3wl8KGF9XA0kTXqPHh2CzvPP17cerraqfXN/8cUwtmRckXk5PAFfG0m6uc7J+TkNmAz8\nKWH5o4Dcj3FNtC3XYuACAEmtwFgg/zvtR4Ef5fw+wsw2RPVbD4xIWB9XA6W0CNJ6RHj16jDN/bve\nVf2yXWWqPfYj7s4s9oiQJ+Bro5x159YAx1exDjcBX5e0EHga6CKs5giApL7AuYSurWKK/rOcM2fO\nntdtbW20tbVVVlvXo7lz4bLLkh0bz9N10UXVr0P+aGjXGAYODC3S11+H4cMrL6+nhz1Gjw5Treze\n7YNXi2lvb6e9vb2iMnoMJpK+yd6bdQswlTASPom1hJZGbHS0bQ8z2wLsufVIegFYmXPImcCTZvZq\nzrYNkg43sw2SRgKvFKtAbjBx6du8GVauhKlTkx0/ezbccUf16+HJ98YWdz1VK5h8/OPF9w8YAAcf\nDK++CocfXvn7NaP8L9o33nhjyWUkidMLgCejnw7gWjP7RMLy5wPHSBonqR/wMeD+3AMkDYlaH0i6\nHPhN3popF7JvFxdRGZdEry8G7ktYH5eyefNg+nTol3Bh56lTQ/Cp9rrgSR5NdvVTrbzJm2/CihXh\n31wt3s8Vl6Sb66fA22a2C0BSH0kHmdm2nk40s12SrgIeIgSuO8xsmaQrwm67ndBldqek3cAS4JPx\n+ZIOIiTfP51X9M3ATyRdBqwC/jLBdbgaKDXp3bdvuBHMmwennVadOrz1FixZAiedVJ3yXPVV6+Y+\nfz5MmQL9+3d/XNwS8n8T6UkSTB4l3NDj1sIAQnBI1IlgZg8AE/K23ZbzujN/f86+bUD+k12Y2cao\nTq7BzJ0Lf/M3pZ0TJ+GrFUwWLIBJk+Cgg6pTnqu+agWTpN2ZnoRPX5JurgNzu52i1/7f1O1n927o\n7Cy9eylOwleLd3E1vmrd3JN+1t7Nlb4kwWSrpD09kpJmAG+lVyWXVcuWhYTqiBIf1J41KwSh3bur\nUw9Pvje+aoz96GmwYi4PJulLEkw+B9wj6XeSfg/8GLgq3Wq5LCp3xPmIEXDYYSEYVSped95bJo2t\nGjf3Z58NT2kdeWRt3s91r8eciZnNj6ZAifMaK6LR7M7to5IR53FX16RJldVh5crwJNmYMZWV49JV\njbEfpXx58VHw6UsyN9eVwEAze8bMngEGSfrb9KvmsqaSubCqNRK+2LrzrrEMGBBmc36l6AixnpXS\nnTlqFKxbB7t29XysK0+S7wSXm9meRTbN7E/A5elVyWXR66/Dyy+HBbHKUa0kvHdxZUelSfhSPuv+\n/eGQQ2DDhvLfz3UvSTDpk7swlqQ+QMIhaa636OwMi1D16VPe+ZMnh2BU6fKqnnzPjkq6njZtglWr\n4MQTk5/jeZN0JQkmDwA/lnSqpFMJo9EfSLdaLmsqne69Tx845ZQQlMq1ZQs89xxMm1Z+Ga52Krm5\nz5sXln7u27c27+d6liSYXEuYIv5vop9HCdPRO7dHNaZ7r7Sr64knQiBJOpWLq69Kbu7lfHnxJHy6\nkkxBv9vMvmNmf2FmfwH8Gvh8+lVzWbFzZ5jWYubMysqpNAnvi2FlSyXBpJzuTB8Fn65ED+VJOkzS\n30r6HdAO+Nybbo+nngr/UYcNq6ycU04JQWnnzvLO9+R7tpR7c9+1K3Rzlfrlxbu50lU0mEg6WNLF\nkh4EngDeBYw3s3eZ2X+tWQ1d
w6tW0nvYMBg7Fp5+uvRzy53KxdVPud1OS5fCyJGlT1/vwSRd3bVM\nXiGsM/Jl4Ggz+zzwTk1q5TKlmt1Ls2eXlzdZvjwEo5Ejq1MPl75yx36U2wL1YJKu7oLJF4D+wLeB\nL0jyBVBdQdXsXio3Ce/5kuyJx36sX1/aeeV+1kccEQZJltuN6rpXNJiY2a1mNhM4L9r0C+BISddK\nOrYmtXMNb906eOMNmFBwEYHSlZuE9/El2VROa6Hcz7pv3zAH3Lp1pZ/repbkaa6VZvZPZnYCcBIw\nmPBEl3N0dIREaLXW1j722DAgrZxvq54vyZ5Sk/CvvhpaFxMnlv9+3tWVjpJuAdH8XP/dzI5Jq0Iu\nW6rdImhpCUGhlNbJxo3hhnTCCdWrh6uNUpPwnZ3hqb9yv7x4MElPlb5Put4qjRZBqXmTzk44+WQ4\nIMm6oa6hlHpzr/TfmweT9HgwcWXbvh0WLQpzclVTqU90efI9u8oJJpV81j4KPj3+XS5F27fDRRfB\ntm31rkk6tm4NOY6DD65uua2t0NUF55yT7Pgnn4Tvfre6dXC1MXYsPP548s96/vzQzVWuMWPgD38o\n/3yAn/88PIbe1lZZOfnWrIEf/xg+n9H5RYoGE0lPA1Zsv5mVMF9n7/TCC+Gb1Le/Xe+apOeYFLJn\ngwbBI48kn0G4Tx847bTq18Olr7UV7ror+eO6110HQ4aU/37V6Ob6znfCGJlqB5Nf/Qpuu60Jgwnw\nn6M/r4z+vCv68+PpVae5rF4dvrkn/dbl9vJuq97hgAPgzDNr936VBpN4poUjjqhenWJz54a6mWVz\ncbeiwcTMVgFIOs3Mcif1vk7SQuC6tCuXdWvWhD5a51xjGDkytHjfeae82aWXLg3TuKxbF8o59NDq\n1a2jA95+OzydWM1yayVJAl6S3pPzy+yE5/V6q1f7WuTONZI+fUJAWbu2vPPnzoX3vjc8PVjJ2jv5\n4vEzkyZl9wGBJEHhk8C3Jb0o6UXC9CqXpVqrJuHBxLnGU8lU9PHTZOXOIVdMR0d4sGDcuCYOJmb2\npJlNAaYAU8xsqpktTL9q2efBxLnGU+k6KrNmlT6wNmm5WR4H02MwkXS4pDuAu83sDUkTJX2yBnXL\nPA8mzjWecm/Yr70WpvmZNClMIVTJ2jv54hZPlsfBJOnm+j/Ag8CR0e/PAp9Lq0LNxBPwzjWecoNJ\nPJVLnz6Vrb2Tb8cOWLgwlJ3l1SCTBJPhZvYTYDeAme0EEq9AIOkMScslPSvp2gL7h0q6V9JiSZ2S\nJubsGyLpHknLJC2RdEq0/QZJayQtjH7OSFqfWtm8OazTMHRovWvinMtV7rf//Klcyl0uId/ixXDU\nUWH8TFN3cwFbJR1KNIBR0kzgjSSFS2oBvgWcDkwCLpR0XN5h1wNdUV7mYuAbOfu+DvzazI4n5GyW\n5ey7xcymRz8PJKlPLcVdXFl8Xty5Zlbut//8qVyqlYTPLbfZg8nfAfcD75L0B+D/AlcnLL8VeM7M\nVpnZDuBu9q6PEpsIPAZgZiuAo6I15wcD/8nMvh/t22lmm3POa+jbtOdLnGtM5dywd+wI0/bkTuVS\nrSR8nHyH0GpauzYMjsyaJMFkCfA+YDZwBaGFsTxh+aOA3I9tTbQt12LgAgBJrcBYYDQwHnhN0vej\nrqzbJQ3IOe8qSYskfVdSBRMspMODiXONacSI0A399tvJz3nqqfDYbm639YQJ5a29ky+3ZTJgQJhO\n6LXXKiuzHpJM9NhhZtMJQQWAaAT89CrV4Sbg61GZTwNdhJxM3+g9rjSzBZJuJYy6v4Ew1uVLZmaS\nvgzcQhgPs585c+bsed3W1kZbtSfUKcKT7841ppYWOPLI8H806dxyhWYrzl175/zzy6vL2rVhwtR3\nv3vvtrjlNGJEeWWWo729nfb29orK6G6ix5GEVsQASdPY2600GDgoYflrCS2N2Oho2x5mtoWcQZ
CS\nXgBWAgOB1Wa2INr1U+Da6JxXc4r4N+CXxSqQG0xqafVqn1/KuUYVJ+GTBpOOjsKTicZJ+HKDSdzF\nlZtbjYPJjBnllVmO/C/aN954Y8lldNfNdTrwz4QAcAvwL9HP3xGS5knMB46RNE5SP+BjhPzLHtET\nW32j15cDvzGzN81sA7A6Z735U4Gl0XEjc4q4AHgmYX1qxru5nGtcpSbhi62jUmkSvlC5WU3CdzfR\n453AnZI+bGY/K6dwM9sl6SrgIULgusPMlkm6Iuy224Hjo/fZTehKy+2uuhr4QRRsVgKXRtu/Kmkq\n4XHlFwm5nIbiwcS5xlXKDfvll2HLljADeL7W1rBA3Pbt0L9/6fXo6ICvfKX8ujWSHnMmZvYzSWcT\nEu8H5mz/UpI3iB7bnZC37bac1535+3P2LQZOLrD9oiTvXS9mHkyca2RjxsAzCfszCnVFxQYNCkGm\nqyuMii/F22+HxP7JeXe40aPD9qxJMp3Kd4CPAp8h5E0+AoxLuV6ZtmlTWKeh2isQOueqo5Rv/z0t\nFTx7dnmPCC9cCMcfDwMHll+3RpLk0eDZUUvgT2Z2IzALKNDgczFvlTjX2Eq5YeeOAymk3JHw+SPq\ny6lbI0kSTN6K/twm6UhgB5DCOmPNw4OJc41t9OhkCfjt28N0J/ldUbniJLwVXeS8sGItnlGjwuJb\nuxJPWtUYkgST/ydpKPA1YCEh4f2jNCuVdR5MnGtsw4fDtm3hpzsLF4bBiYMGFT9m/Pgwe3AprQmz\n4i2e/v3DRJIbNiQvrxEkWc/kf5rZpuiJrnHAcWb2xfSrll0eTJxrbFKyCR97ypfEZZX6iPCLL4bz\nxhXJPidtOTWSosFE0gX5P8DZwKnRa1eEj353rvElyU0kCSZQehK+oyOcU2wi2CzmTbp7NPic6M8R\nhHm5Hot+fz8wF7g3xXplmrdMnGt8Pd2wzUIw+ed/7rmsWbPgmmuSv3ex5HvSujWi7gYtXgog6SFg\nopmti34/grBglivCg4lzja+nrqRVq8KfRx3Vc1kzZsDSpSEHc1CCyabmzoWPf7z4/iwGkyQJ+DFx\nIIlsYN/5tlwOs/AP1IOJc42tpxt2d4MV8w0YAJMnw4IFPR/75puwYgVM72aq3GYNJo9KelDSJZIu\nAX4FPJJutbLrtdfCN5Mk306cc/XT0w07ab4kljQJP38+TJnS/fQrTZWAj5nZVcBthJUOpwC3m9ln\n0q5YVnny3blsqFcwSVJuFlsmSdYzwczuxRPuiXi+xLls6O6GvXUrLF/efVdUvlmz4G//NnR1d9c1\n1tEBl16eP7BeAAAWWUlEQVRafD+E9VY2bAjjVw5IdJeuv+4eDf599OcWSZtzfrZI2lzsvN7Og4lz\n2TB0aLhZby5wN5s/H048EQ48cP99xYweHXInzz9f/JjuBivm6tsXDjssjITPiqLBxMzeG/15sJkN\nzvk52MwG166K2eLBxLlskIqva5Lkhl9IT+vCP/tsmAD2yCN7LitrXV3dtUwO6e6nlpXMEg8mzmVH\nsRt2qfmSWE95k1LKzVoSvrveuCcBY+9yvbkMODqVGmWcJ+Cdy45CwSTuirrttsLndGf2bLjjjuL7\nSwkmWWuZdDdocXwtK9IsvGXiXHYUumE/91xYYyRJV1S+KVNg5cqQhxlcIBnQ0QF//dfJ6/bSS6XX\noV6SjDNB0jBJrZL+LP5Ju2JZtHs3rF3rLRPnsqJQV1K5XVwA/fqFJ8Dmzdt/36ZNYVT9iScmKytr\nLZMkKy1+Cvgt8CBwY/TnnHSrlU2vvAJDhpT2BIhzrn4K3bDLTb7HiiXh580L06707Vt+3RpZkpbJ\nZwnrsK8ys/cD04BNqdYqo7yLy7lsKXTDrqRlAsWT8KWWm7UEfJJg8raZvQ0gqb+ZLQcmpFutbPLk\nu3PZEgeTeJXEN96AF14IuY9yzZoFnZ2h2ztXqcHkiCPC9E
zvvFN+XWopSTBZE620+AvgYUn3AavS\nrVY2ecvEuWwZPBj69An5DCi9K6qQESPCSo7Llu3dtmsXPPEEzJyZvJw+fWDkSHj55fLrUktJ5uY6\nP1ppcQ7wReAO4ENpVyyLPJg4lz253UmVdnHF8ru6liwJgWH48NLKyVLepLtBi7+W9AlJe1Y/NrPf\nmNn9ZpaRhldteTBxLntyb9iVJt9j+Un4csttimBCmCn4bOAFST+RdL6kfjWqVyZ5MHEue+Ib9q5d\nIddRjWCS3zIpt8WTpSR8d3Nz3WdmFwLjgJ8BFwEvSfq+pNNqVcEs8QS8c9kTB5OlS+Hww8MEi5Wa\nPDnkOl5/PfxebjBplpYJAGa2zcx+bGbnA38OTAUeSL1mGbNrF6xfD6NG1bsmzrlSxDfsanVxQUie\nt7aGls6rr4afiRPLr1sW9DhTvqTDgb8EPgYcAfwEuCTdamXP+vVwyCFhBKxzLjvirqRqJd9jcVfX\nrl1wyinQkmi+kX1lKZh0l4C/XNJjwELg3cDfm9nRZnadmS1O+gaSzpC0XNKzkq4tsH+opHslLZbU\nKWlizr4hku6RtEzSEkmnRNuHSXpI0opoSeEhJV11Cjxf4lw2pdEygb1J+ErKbYpgAswCvgKMMbOr\nzSzBgpT7ktQCfAs4HZgEXCjpuLzDrge6zGwKcDHwjZx9Xwd+bWbHE5YMjp/cvg54xMwmAI8BXyi1\nbtXmwcS5bBozBl58MfQuTJpUvXJnzgyLbP32t+W3eEaMCAMp3367evVKS3cJ+MvM7GEz2zOOU9Kc\nEstvBZ4zs1VmtgO4Gzgv75iJhICAma0AjpJ0mKTBwH8ys+9H+3aaWbwm2nnAndHrO2mAcS+efHcu\nmwYOhEGDQldUnz7VK3fYMBg7NuRNTjmlvDJaWsLsxWvXVq9eaSm1F+/cEo8fBeQ20tZE23ItBi4A\nkNQKjAVGA+OB16KnxxZKul3SgOicEWa2AcDM1gMjSqzXPnbsCNMWVMJbJs5l15gx1e3iis2aFRLv\nQyroiM9KV1epS9UXWiirUjcBX5e0EHga6AJ2AX2B6cCVZrZA0q2E7q0bCtTDihU+Z86cPa/b2tpo\na2vb75i77oJHH4Uf/KD8i1i9uvxvH865+nr/++HMM6tf7jnnhNZJJWoRTNrb22lvb6+oDJkVvQ/v\nf7DUktvtleD4mcAcMzsj+v06wMzs5m7OeQE4ARgIdJjZ0dH29wLXmtk5kpYBbWa2QdJI4PEor5Jf\nliW5vmXL4KyzwgRv5Zo5E265pbpPgzjn3HXXhTnErr++du8pCTMrqfGQZD2Tr0oaLKkvYaLHVyV9\nImH584FjJI2LRs9/DLg/r/whUdlIuhz4jZm9GXVjrZZ0bHToqcDS6PX97H08+WLgvoT1KWjChJDk\nWreu/DK8m8s5l4asjIJPkjP58yjx/Z+BF4FjgL9PUriZ7QKuAh4ClgB3m9kySVdI+nR02PHAM1Fr\n43TC+imxq4EfSFpEeJrrn6LtNwOnSVpBCDI3JalPMS0txRe0SWLHjjAo6YgjKqmFc87tr5lyJvEx\nZwP3mNkbUvLWj5k9QN76J2Z2W87rzvz9OfsWExbmyt++Efhg4kokEAeTCy4o/dx168IjfAeUmoFy\nzrkeZCWYJGmZ/D9Jy4EZwKOSDgMy8NRzaYqtjpaEd3E559LSNMHEzK4DZgMnRWNFtrL/WJHMa22F\nRYtg+/bSz/Vg4pxLy/DhsG1b+GlkSRLwHwF2mNkuSf8A/DtwZOo1q7FBg+DYY6Grq/RzPZg459Ii\nhQlkGz0Jn6Sb64tmtiV6NPeDhJUW/zXdatVHuV1dPvrdOZemLHR1JQkmu6I/zwZuN7NfAU05N265\nT3R5y8Q5l6ZmCSZrJd0GfBT4taT+Cc/LnLhlUsI4TsCDiXMuXc0STP4SeBA43cw2AYeQcJxJ1owf\nDzt3wksvlXaeBxPnXJ
qaIpiY2Tbgj8Dpkq4iTLL4UOo1qwMptE5K6eravh02bgzLfTrnXBqyMAo+\nydNcnwV+QJiZdwTw75I+k3bF6qXUJPzLL4eR79Wcuto553JloWWSZMz2J4FTzGwrgKSbgQ7gm2lW\nrF5mzYKf/CT58d7F5ZxLWxaCSZKcidj7RBfR6zSmom8IM2bA0qXJBwh5MHHOpW3YsDAH4JYt9a5J\ncUmCyfeBeZLmRCstdhLGmjSlAQNg8mRYsCDZ8R5MnHNpkxq/dZIkAX8LcCmwMfq51MxuTbti9VRK\n3sSDiXOuFho9Cd9tzkRSH2CJmR0HLKxNlepv9uyw+mISa9bAB6s6f7Fzzu0v0y2TaD2SFZIqXHgy\nW+KR8EkGL3rLxDlXC40eTJI8zTUMWCLpCcKMwQCY2bmp1arORo8OuZPnn4d3v7v7Yz2YOOdqYcwY\n6Oysdy2KSxJMvph6LRpQ3DrpLpi89VZ4uuKww2pXL+dc7zRmDNxzT71rUVzRbi5Jx0h6j5n9JveH\n8GhwA6eBqiNJEn7NmjA1dEtTzlTmnGskjZ6A7+42eCuwucD2N6J9TS1pMPGp551ztRDnTEqdiLZW\nugsmh5vZ0/kbo21HpVajBjFlCqxcCZsLhdOI50ucc7UyZEgYb/LGG/WuSWHdBZOh3ewbUO2KNJp+\n/WD6dJg3r/gxHkycc7XUyE90dRdMFki6PH+jpE8BT6ZXpcbR02JZHkycc7XUyMGku6e5Pgf8XNLH\n2Rs8TiKssnh+2hVrBLNnw792s0Dx6tVw9tm1q49zrndr5CR80WBiZhuA2ZLeD0yONv/KzB6rSc0a\nwKxZcPHFsHt34Se2PAHvnKulrLZMADCzx4HHa1CXhjNiBAwfDsuWwaRJ++/3bi7nXC2NGQO//W29\na1GYj5DoQbFHhLduhbffhkMPrX2dnHO9UyO3TDyY9KBYEn716tDFpaZd2cU512g8mGRYsZaJd3E5\n52otTsA34sDF1IOJpDMkLZf0rKRrC+wfKuleSYsldUqamLPvxWh7VzTRZLz9BklrJC2Mfs5Iq/6T\nJ4d13l9/fd/tnnx3ztXaoEHQvz9s3Fjvmuwv1WAiqQX4FnA6MAm4UNJxeYddD3SZ2RTgYuAbOft2\nA21mNs3MWvPOu8XMpkc/D6R0CfTpA62t+8/W6S0T51w9NGpXV9otk1bgOTNbZWY7gLuB8/KOmQg8\nBmBmK4CjJMXz8KqbOtYsW1Goq8uDiXOuHnprMBkF5F72mmhbrsXABQCSWoGxQNyBZMDDkuYXGI1/\nlaRFkr4raUj1q75XoSS8BxPnXD301mCSxE3AMEkLgSuBLsI09wDvMbPpwFnAlZLeG23/NnC0mU0F\n1gO3pFnBmTNh/nzYuXPvNg8mzrl6GDOmMUfBJ1kcqxJrCS2N2Oho2x5mtgW4LP5d0gvAymjfuujP\nVyX9nNBt9nszezWniH8DflmsAnPmzNnzuq2tjba2tpIvYtgwGDsWnnoqTP4InoB3ztXH6NHwyCPV\nLbO9vZ329vaKypCl+IyZpD7ACuBUYB3wBHChmS3LOWYIsM3MdkRdWe8xs0skHQS0mNmbkgYCDwE3\nmtlDkkaa2fro/GuAk83srwq8v1Xr+j71KZg2Da68MkxLf+SRYZVFH2finKulxx6DL30JKrz3d0sS\nZlbS3S3Vbi4z2wVcRQgES4C7zWyZpCskfTo67HjgGUnLCE99fTbafjjwe0ldQCfwSzN7KNr3VUlP\nSVoEvA+4Js3rgH2T8HEXlwcS51ytNWrOJNWWSb1Vs2WybFmYIXjlSnjgAbjlFnjooZ7Pc865anrr\nrdD1vm1bekuGN1zLpJlMmACbNsG6dZ58d87Vz4ABcPDB8Npr9a7JvjyYJNTSsvcRYU++O+fqafTo\nxuvq8mBSgjiYeMvEOVdPjZg38WBSgjgJ78HEOVdPjRhM0h5n0lRaW2HRIjj8cA8mzrn6
acRg4i2T\nEgwaBMceCy+84MHEOVc/jTgK3oNJiWbPhsGDw9MUzjlXD42YgPdurhLNmgW/+U29a+Gc683GjIGl\nSyFntqi682BSonPPDQOGnHOuXsaNg+uug61b612TvXwEvHPOuX34CHjnnHN14cHEOedcxTyYOOec\nq5gHE+eccxXzYOKcc65iHkycc85VzIOJc865inkwcc45VzEPJs455yrmwcQ551zFPJg455yrmAcT\n55xzFfNg4pxzrmIeTJxzzlXMg4lzzrmKeTBxzjlXMQ8mzjnnKubBxDnnXMVSDyaSzpC0XNKzkq4t\nsH+opHslLZbUKWlizr4Xo+1dkp7I2T5M0kOSVkh6UNKQtK/DOedccakGE0ktwLeA04FJwIWSjss7\n7Hqgy8ymABcD38jZtxtoM7NpZtaas/064BEzmwA8BnwhrWtoZO3t7fWuQqqa+fqa+drAr683Srtl\n0go8Z2arzGwHcDdwXt4xEwkBATNbARwl6bBon4rU8Tzgzuj1ncCHql3xLGj2f9DNfH3NfG3g19cb\npR1MRgGrc35fE23LtRi4AEBSKzAWGB3tM+BhSfMlXZ5zzggz2wBgZuuBESnU3TnnXEIH1LsCwE3A\n1yUtBJ4GuoBd0b73mNm6qKXysKRlZvb7AmVYjerqnHOuAJmldx+WNBOYY2ZnRL9fB5iZ3dzNOS8A\nJ5jZm3nbbwC2mNktkpYRcikbJI0EHjez4wuU5UHGOefKYGYq5fi0WybzgWMkjQPWAR8DLsw9IHoS\na5uZ7Yi6sn5jZm9KOghoiV4PBP4cuDE67X7gEuBmQtL+vkJvXupfhnPOufKkGkzMbJekq4CHCPmZ\nO8xsmaQrwm67HTgeuFPSbmAJ8Mno9MOBn0etiwOAH5jZQ9G+m4GfSLoMWAX8ZZrX4ZxzrnupdnM5\n55zrHZpyBHxPAyWzrthgzqySdIekDZKeytnWNANTi1zfDZLWSFoY/ZxRzzpWQtJoSY9JWiLpaUlX\nR9sz/xkWuLbPRNub4vOT1F/SvOhe8nSUmy7rs2u6lkk0UPJZ4FTgZULe5mNmtryuFasiSSuBGWb2\np3rXpRokvRd4E/i/ZnZitO1m4HUz+2r0hWCYmV1Xz3qWq8j17XmgpK6Vq4LoIZiRZrZI0iDgScJY\nsEvJ+GfYzbV9lOb5/A4ys22S+gB/AK4GPkyJn10ztkySDJTMumKDOTMpetw7PzA2zcDUItcH4XPM\nPDNbb2aLotdvAssIY8Uy/xkWubZ4rFyzfH7bopf9Cflpo4zPrmluSDmSDJTMumKDOZtJbxiYepWk\nRZK+m8UuoEIkHQVMBTqBw5vpM8y5tnnRpqb4/CS1SOoC1gMPm9l8yvjsmjGY9AbvMbPpwFnAlVE3\nSrNrrv5Y+DZwtJlNJfwnbobukkHAT4HPRt/i8z+zzH6GBa6taT4/M9ttZtMIrclWSZMo47NrxmCy\nljAlS2x0tK1pmNm66M9XgZ8TuvaazQZJh8OefutX6lyfqjKzV21vwvLfgJPrWZ9KSTqAcLO9y8zi\ncV9N8RkWurZm+/wAzGwz0A6cQRmfXTMGkz0DJSX1IwyUvL/OdaoaSQdF35LIGcz5TH1rVRVi3z7o\neGAqdDMwNUP2ub7oP2jsArL/GX4PWGpmX8/Z1iyf4X7X1iyfn6ThcRedpAHAaYS8UMmfXdM9zQXh\n0WDg6+wdKHlTnatUNZLGE1ojuYM5M319kn4ItAGHAhuAG4BfAPcAY4gGpprZpnrVsRJFru/9hP73\n3cCLwBVxH3XWSHoP8FvC3HoW/VwPPAH8hAx/ht1c21/RBJ+fpBMICfaW6OfHZvaPkg6hxM+uKYOJ\nc8652mrGbi7nnHM15sHEOedcxTyYOOecq5gHE+eccxXzYOKcc65iHkycc85VzIOJy7RoevDT8rZ9\nVtL/7uG8LSnXa7ikTklPRmMVcvc9Lml69Hp8tFTC
aQXK+Fo0LXjRZa57qMP7JP0y5/cvS/q1pL6S\n2iXNz9k3Q9LjOeftlnR2zv5fSvqzcurhegcPJi7rfkjeUtCEWQ9+2MN5aQ+w+iDwlJnNMLM/FDpA\n0mjgP4BrzOzhAodcDpxoZonW5ImmEM9n0b5/AGYBH4pm0zbgMEmn5x8bWQP89yTv6xx4MHHZ9zPg\nrGj+JCSNA44wsz9IGijpEUkLFBYTOzf/5ALf3r8p6aLo9fT4G7yk/4jnKso7f5ykR6PyH1ZYTGkK\nYWnp86KFk/oXqPeRwIPAF8zsVwXKvQ8YBDwp6SM577Mofp/ouO9L+ldJndF7FihKfwecDpxjZu/k\n7Psa8A8F/1ZhMfCGpFOL7HduHx5MXKZFC4Q9AZwZbfoYYRoIgLcJ38RPAj4A/EuxYvI3RMHpm8CH\nzexk4PvAPxU495vA981sCqE19E0zWwz8D8LUFNPNbHuB8+6Mjv15kes6D9gWnX9PzvtMjd8n5/BR\nZjbTzP5rgaLeA1wBnJmzbkV8zR3AdknvK1QF4B+BLxaqn3P5PJi4ZnA3IYgQ/fmj6LWAr0haDDwC\nHCkp6ZoaE4DJhHVjughdPkcWOG5WzvvdRbh5J/Ew8AlJB3ZzTO7El929zz3dlPF8VM6fFym7aMCI\nFvWy/JyPc4V4MHHN4D7gVEnTgAFm1hVt/zgwHJgWrdfwCpB/897Jvv8P4v0CnolaBtPMbIqZncn+\nys29fJUww/VPFZaaLsSKvM63tZt96wnr3twqqW2/NzB7nHDNM4uc/0+ErjCfxM91y4OJyzwz20pY\nh+F77P32DjAEeMXMdkt6PzAuZ1/8zXwVMDF6wmkoEOcIVhAS1DMhdHtJmljg7eey9wGATwC/K6He\n1wBvRPUuJLdlUsn7PE+YJv3fJZ1Y4JB/BP5bkXMfBoYBhc5zbg8PJq5Z/Ihww8sNJj8ATo66uT5B\nWKchZgBmtoaQY3mG0F22MNq+A/gL4GZJi4AuQldTvquBS6NjPg58NkFdc7/lXwKMLPL4b+5xxd4n\nUYvBzBYAlwL3R8sYWM6+/yC02oqV9Y+EqcidK8qnoHfOOVcxb5k455yrmAcT55xzFfNg4pxzrmIe\nTJxzzlXMg4lzzrmKeTBxzjlXMQ8mzjnnKubBxDnnXMX+PygS5di7aan1AAAAAElFTkSuQmCC\n",
433 | "text/plain": [
434 | ""
435 | ]
436 | },
437 | "metadata": {},
438 | "output_type": "display_data"
439 | }
440 | ],
441 | "source": [
442 | "# plot the results\n",
443 | "plt.plot(k_range, grid_mean_scores)\n",
444 | "plt.xlabel('Value of K for KNN')\n",
445 | "plt.ylabel('Cross-Validated Accuracy')"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 17,
451 | "metadata": {
452 | "collapsed": false
453 | },
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "0.98\n",
460 | "{'n_neighbors': 13}\n",
461 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
462 | " metric_params=None, n_jobs=1, n_neighbors=13, p=2,\n",
463 | " weights='uniform')\n"
464 | ]
465 | }
466 | ],
467 | "source": [
468 | "# examine the best model\n",
469 | "print(grid.best_score_)\n",
470 | "print(grid.best_params_)\n",
471 | "print(grid.best_estimator_)"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "## Searching multiple parameters simultaneously"
479 | ]
480 | },
481 | {
482 | "cell_type": "markdown",
483 | "metadata": {},
484 | "source": [
485 | "- **Example:** tuning `max_depth` and `min_samples_leaf` for a `DecisionTreeClassifier`\n",
486 | "- Could tune parameters **independently**: change `max_depth` while leaving `min_samples_leaf` at its default value, and vice versa\n",
487 | "- But, best performance might be achieved when **neither parameter** is at its default value"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": 18,
493 | "metadata": {
494 | "collapsed": false
495 | },
496 | "outputs": [],
497 | "source": [
498 | "# define the parameter values that should be searched\n",
499 | "k_range = list(range(1, 31))\n",
500 | "weight_options = ['uniform', 'distance']"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 19,
506 | "metadata": {
507 | "collapsed": false
508 | },
509 | "outputs": [
510 | {
511 | "name": "stdout",
512 | "output_type": "stream",
513 | "text": [
514 | "{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}\n"
515 | ]
516 | }
517 | ],
518 | "source": [
519 | "# create a parameter grid: map the parameter names to the values that should be searched\n",
520 | "param_grid = dict(n_neighbors=k_range, weights=weight_options)\n",
521 | "print(param_grid)"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": 20,
527 | "metadata": {
528 | "collapsed": false
529 | },
530 | "outputs": [
531 | {
532 | "data": {
533 | "text/plain": [
534 | "GridSearchCV(cv=10, error_score='raise',\n",
535 | " estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
536 | " metric_params=None, n_jobs=1, n_neighbors=30, p=2,\n",
537 | " weights='uniform'),\n",
538 | " fit_params={}, iid=True, n_jobs=1,\n",
539 | " param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']},\n",
540 | " pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)"
541 | ]
542 | },
543 | "execution_count": 20,
544 | "metadata": {},
545 | "output_type": "execute_result"
546 | }
547 | ],
548 | "source": [
549 | "# instantiate and fit the grid\n",
550 | "grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')\n",
551 | "grid.fit(X, y)"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": 21,
557 | "metadata": {
558 | "collapsed": false
559 | },
560 | "outputs": [
561 | {
562 | "data": {
563 | "text/plain": [
564 | "[mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1, 'weights': 'uniform'},\n",
565 | " mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1, 'weights': 'distance'},\n",
566 | " mean: 0.95333, std: 0.05207, params: {'n_neighbors': 2, 'weights': 'uniform'},\n",
567 | " mean: 0.96000, std: 0.05333, params: {'n_neighbors': 2, 'weights': 'distance'},\n",
568 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3, 'weights': 'uniform'},\n",
569 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3, 'weights': 'distance'},\n",
570 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 4, 'weights': 'uniform'},\n",
571 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 4, 'weights': 'distance'},\n",
572 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5, 'weights': 'uniform'},\n",
573 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5, 'weights': 'distance'},\n",
574 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 6, 'weights': 'uniform'},\n",
575 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 6, 'weights': 'distance'},\n",
576 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7, 'weights': 'uniform'},\n",
577 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7, 'weights': 'distance'},\n",
578 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8, 'weights': 'uniform'},\n",
579 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8, 'weights': 'distance'},\n",
580 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9, 'weights': 'uniform'},\n",
581 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9, 'weights': 'distance'},\n",
582 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 10, 'weights': 'uniform'},\n",
583 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 10, 'weights': 'distance'},\n",
584 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 11, 'weights': 'uniform'},\n",
585 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 11, 'weights': 'distance'},\n",
586 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 12, 'weights': 'uniform'},\n",
587 | " mean: 0.97333, std: 0.04422, params: {'n_neighbors': 12, 'weights': 'distance'},\n",
588 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 13, 'weights': 'uniform'},\n",
589 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 13, 'weights': 'distance'},\n",
590 | " mean: 0.97333, std: 0.04422, params: {'n_neighbors': 14, 'weights': 'uniform'},\n",
591 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 14, 'weights': 'distance'},\n",
592 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 15, 'weights': 'uniform'},\n",
593 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 15, 'weights': 'distance'},\n",
594 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 16, 'weights': 'uniform'},\n",
595 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 16, 'weights': 'distance'},\n",
596 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 17, 'weights': 'uniform'},\n",
597 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 17, 'weights': 'distance'},\n",
598 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 18, 'weights': 'uniform'},\n",
599 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 18, 'weights': 'distance'},\n",
600 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 19, 'weights': 'uniform'},\n",
601 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 19, 'weights': 'distance'},\n",
602 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 20, 'weights': 'uniform'},\n",
603 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 20, 'weights': 'distance'},\n",
604 | " mean: 0.96667, std: 0.03333, params: {'n_neighbors': 21, 'weights': 'uniform'},\n",
605 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 21, 'weights': 'distance'},\n",
606 | " mean: 0.96667, std: 0.03333, params: {'n_neighbors': 22, 'weights': 'uniform'},\n",
607 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 22, 'weights': 'distance'},\n",
608 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 23, 'weights': 'uniform'},\n",
609 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 23, 'weights': 'distance'},\n",
610 | " mean: 0.96000, std: 0.04422, params: {'n_neighbors': 24, 'weights': 'uniform'},\n",
611 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 24, 'weights': 'distance'},\n",
612 | " mean: 0.96667, std: 0.03333, params: {'n_neighbors': 25, 'weights': 'uniform'},\n",
613 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 25, 'weights': 'distance'},\n",
614 | " mean: 0.96000, std: 0.04422, params: {'n_neighbors': 26, 'weights': 'uniform'},\n",
615 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 26, 'weights': 'distance'},\n",
616 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 27, 'weights': 'uniform'},\n",
617 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 27, 'weights': 'distance'},\n",
618 | " mean: 0.95333, std: 0.04269, params: {'n_neighbors': 28, 'weights': 'uniform'},\n",
619 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 28, 'weights': 'distance'},\n",
620 | " mean: 0.95333, std: 0.04269, params: {'n_neighbors': 29, 'weights': 'uniform'},\n",
621 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 29, 'weights': 'distance'},\n",
622 | " mean: 0.95333, std: 0.04269, params: {'n_neighbors': 30, 'weights': 'uniform'},\n",
623 | " mean: 0.96667, std: 0.03333, params: {'n_neighbors': 30, 'weights': 'distance'}]"
624 | ]
625 | },
626 | "execution_count": 21,
627 | "metadata": {},
628 | "output_type": "execute_result"
629 | }
630 | ],
631 | "source": [
632 | "# view the complete results\n",
633 | "grid.grid_scores_"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": 22,
639 | "metadata": {
640 | "collapsed": false
641 | },
642 | "outputs": [
643 | {
644 | "name": "stdout",
645 | "output_type": "stream",
646 | "text": [
647 | "0.98\n",
648 | "{'n_neighbors': 13, 'weights': 'uniform'}\n"
649 | ]
650 | }
651 | ],
652 | "source": [
653 | "# examine the best model\n",
654 | "print(grid.best_score_)\n",
655 | "print(grid.best_params_)"
656 | ]
657 | },
658 | {
659 | "cell_type": "markdown",
660 | "metadata": {},
661 | "source": [
662 | "## Using the best parameters to make predictions"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 23,
668 | "metadata": {
669 | "collapsed": false
670 | },
671 | "outputs": [
672 | {
673 | "data": {
674 | "text/plain": [
675 | "array([1])"
676 | ]
677 | },
678 | "execution_count": 23,
679 | "metadata": {},
680 | "output_type": "execute_result"
681 | }
682 | ],
683 | "source": [
684 | "# train your model using all data and the best known parameters\n",
685 | "knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')\n",
686 | "knn.fit(X, y)\n",
687 | "\n",
688 | "# make a prediction on out-of-sample data\n",
689 | "knn.predict([[3, 5, 4, 2]])"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 24,
695 | "metadata": {
696 | "collapsed": false
697 | },
698 | "outputs": [
699 | {
700 | "data": {
701 | "text/plain": [
702 | "array([1])"
703 | ]
704 | },
705 | "execution_count": 24,
706 | "metadata": {},
707 | "output_type": "execute_result"
708 | }
709 | ],
710 | "source": [
711 | "# shortcut: GridSearchCV automatically refits the best model using all of the data\n",
712 | "grid.predict([[3, 5, 4, 2]])"
713 | ]
714 | },
715 | {
716 | "cell_type": "markdown",
717 | "metadata": {},
718 | "source": [
719 | "## Reducing computational expense using `RandomizedSearchCV`"
720 | ]
721 | },
722 | {
723 | "cell_type": "markdown",
724 | "metadata": {},
725 | "source": [
726 | "- Searching many different parameters at once may be computationally infeasible\n",
727 | "- `RandomizedSearchCV` searches a random subset of the parameter combinations, and you control the computational \"budget\""
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "execution_count": 25,
733 | "metadata": {
734 | "collapsed": false
735 | },
736 | "outputs": [],
737 | "source": [
738 | "from sklearn.grid_search import RandomizedSearchCV"
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": 26,
744 | "metadata": {
745 | "collapsed": false
746 | },
747 | "outputs": [],
748 | "source": [
749 | "# specify \"parameter distributions\" rather than a \"parameter grid\"\n",
750 | "param_dist = dict(n_neighbors=k_range, weights=weight_options)"
751 | ]
752 | },
753 | {
754 | "cell_type": "markdown",
755 | "metadata": {},
756 | "source": [
757 | "- **Important:** Specify a continuous distribution (rather than a list of values) for any continuous parameters"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": 27,
763 | "metadata": {
764 | "collapsed": false
765 | },
766 | "outputs": [
767 | {
768 | "data": {
769 | "text/plain": [
770 | "[mean: 0.97333, std: 0.03266, params: {'n_neighbors': 18, 'weights': 'distance'},\n",
771 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 8, 'weights': 'uniform'},\n",
772 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 24, 'weights': 'distance'},\n",
773 | " mean: 0.98000, std: 0.03055, params: {'n_neighbors': 20, 'weights': 'uniform'},\n",
774 | " mean: 0.95333, std: 0.04269, params: {'n_neighbors': 28, 'weights': 'uniform'},\n",
775 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 9, 'weights': 'uniform'},\n",
776 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5, 'weights': 'distance'},\n",
777 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 5, 'weights': 'uniform'},\n",
778 | " mean: 0.97333, std: 0.03266, params: {'n_neighbors': 19, 'weights': 'uniform'},\n",
779 | " mean: 0.96667, std: 0.04472, params: {'n_neighbors': 20, 'weights': 'distance'}]"
780 | ]
781 | },
782 | "execution_count": 27,
783 | "metadata": {},
784 | "output_type": "execute_result"
785 | }
786 | ],
787 | "source": [
788 | "# n_iter controls the number of searches\n",
789 | "rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)\n",
790 | "rand.fit(X, y)\n",
791 | "rand.grid_scores_"
792 | ]
793 | },
794 | {
795 | "cell_type": "code",
796 | "execution_count": 28,
797 | "metadata": {
798 | "collapsed": false
799 | },
800 | "outputs": [
801 | {
802 | "name": "stdout",
803 | "output_type": "stream",
804 | "text": [
805 | "0.98\n",
806 | "{'n_neighbors': 20, 'weights': 'uniform'}\n"
807 | ]
808 | }
809 | ],
810 | "source": [
811 | "# examine the best model\n",
812 | "print(rand.best_score_)\n",
813 | "print(rand.best_params_)"
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "execution_count": 29,
819 | "metadata": {
820 | "collapsed": false
821 | },
822 | "outputs": [
823 | {
824 | "name": "stdout",
825 | "output_type": "stream",
826 | "text": [
827 | "[0.98, 0.973, 0.98, 0.973, 0.973, 0.98, 0.98, 0.98, 0.973, 0.98, 0.98, 0.973, 0.98, 0.973, 0.973, 0.98, 0.98, 0.98, 0.98, 0.98]\n"
828 | ]
829 | }
830 | ],
831 | "source": [
832 | "# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score\n",
833 | "best_scores = []\n",
834 | "for _ in range(20):\n",
835 | " rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)\n",
836 | " rand.fit(X, y)\n",
837 | " best_scores.append(round(rand.best_score_, 3))\n",
838 | "print(best_scores)"
839 | ]
840 | },
841 | {
842 | "cell_type": "markdown",
843 | "metadata": {},
844 | "source": [
845 | "## Resources\n",
846 | "\n",
847 | "- scikit-learn documentation: [Grid search](http://scikit-learn.org/stable/modules/grid_search.html), [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html), [RandomizedSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)\n",
848 | "- Timed example: [Comparing randomized search and grid search](http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html)\n",
849 | "- scikit-learn workshop by Andreas Mueller: [Video segment on randomized search](https://youtu.be/0wUF_Ov8b0A?t=17m38s) (3 minutes), [related notebook](https://github.com/amueller/pydata-nyc-advanced-sklearn/blob/master/Chapter%203%20-%20Randomized%20Hyper%20Parameter%20Search.ipynb)\n",
850 | "- Paper by James Bergstra and Yoshua Bengio: [Random Search for Hyper-Parameter Optimization](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf)"
851 | ]
852 | },
853 | {
854 | "cell_type": "markdown",
855 | "metadata": {},
856 | "source": [
857 | "## Comments or Questions?\n",
858 | "\n",
859 | "- Email: \n",
860 | "- Website: http://dataschool.io\n",
861 | "- Twitter: [@justmarkham](https://twitter.com/justmarkham)"
862 | ]
863 | },
864 | {
865 | "cell_type": "code",
866 | "execution_count": 1,
867 | "metadata": {
868 | "collapsed": false
869 | },
870 | "outputs": [
871 | {
872 | "data": {
873 | "text/html": [
874 | "\n",
926 | ""
941 | ],
942 | "text/plain": [
943 | ""
944 | ]
945 | },
946 | "execution_count": 1,
947 | "metadata": {},
948 | "output_type": "execute_result"
949 | }
950 | ],
951 | "source": [
952 | "from IPython.core.display import HTML\n",
953 | "def css_styling():\n",
954 | " styles = open(\"styles/custom.css\", \"r\").read()\n",
955 | " return HTML(styles)\n",
956 | "css_styling()"
957 | ]
958 | }
959 | ],
960 | "metadata": {
961 | "kernelspec": {
962 | "display_name": "Python 2",
963 | "language": "python",
964 | "name": "python2"
965 | },
966 | "language_info": {
967 | "codemirror_mode": {
968 | "name": "ipython",
969 | "version": 2
970 | },
971 | "file_extension": ".py",
972 | "mimetype": "text/x-python",
973 | "name": "python",
974 | "nbconvert_exporter": "python",
975 | "pygments_lexer": "ipython2",
976 | "version": "2.7.11"
977 | }
978 | },
979 | "nbformat": 4,
980 | "nbformat_minor": 0
981 | }
982 |
--------------------------------------------------------------------------------