├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── abstract.rst
├── check_env.ipynb
├── environment.yml
├── fetch_data.py
├── images
├── check_env-1.png
├── check_env-2.png
└── download-repo.png
├── notebooks
├── 01.Introduction_to_Machine_Learning.ipynb
├── 02.Scientific_Computing_Tools_in_Python.ipynb
├── 03.Data_Representation_for_Machine_Learning.ipynb
├── 04.Training_and_Testing_Data.ipynb
├── 05.Supervised_Learning-Classification.ipynb
├── 06.Supervised_Learning-Regression.ipynb
├── 07.Unsupervised_Learning-Transformations_and_Dimensionality_Reduction.ipynb
├── 08.Unsupervised_Learning-Clustering.ipynb
├── 09.Review_of_Scikit-learn_API.ipynb
├── 10.Case_Study-Titanic_Survival.ipynb
├── 11.Text_Feature_Extraction.ipynb
├── 12.Case_Study-SMS_Spam_Detection.ipynb
├── 13.Cross_Validation.ipynb
├── 14.Model_Complexity_and_GridSearchCV.ipynb
├── 15.Pipelining_Estimators.ipynb
├── 16.Performance_metrics_and_Model_Evaluation.ipynb
├── 17.In_Depth-Linear_Models.ipynb
├── 18.In_Depth-Trees_and_Forests.ipynb
├── 19.Feature_Selection.ipynb
├── 20.Unsupervised_learning-Hierarchical_and_density-based_clustering_algorithms.ipynb
├── 21.Unsupervised_learning-Non-linear_dimensionality_reduction.ipynb
├── 22.Unsupervised_learning-anomaly_detection.ipynb
├── 23.Out-of-core_Learning_Large_Scale_Text_Classification.ipynb
├── datasets
│ ├── smsspam
│ │ ├── SMSSpamCollection
│ │ └── readme
│ └── titanic3.csv
├── figures
│ ├── ML_flow_chart.py
│ ├── __init__.py
│ ├── average-per-class.png
│ ├── bag_of_words.svg
│ ├── check_env-1.png
│ ├── cluster_comparison.png
│ ├── clustering-linkage.png
│ ├── clustering.png
│ ├── cross_validation.svg
│ ├── data_representation.svg
│ ├── dbscan.png
│ ├── feature_union.svg
│ ├── grid_search_cross_validation.svg
│ ├── hashing_vectorizer.svg
│ ├── ipython_help-1.png
│ ├── ipython_help-2.png
│ ├── ipython_run_cell.png
│ ├── iris_setosa.jpg
│ ├── iris_versicolor.jpg
│ ├── iris_virginica.jpg
│ ├── ml_taxonomy.png
│ ├── overfitting_underfitting_cartoon.svg
│ ├── petal_sepal.jpg
│ ├── pipeline.svg
│ ├── pipeline_cross_validation.svg
│ ├── plot_2d_separator.py
│ ├── plot_digits_dataset.py
│ ├── plot_helpers.py
│ ├── plot_interactive_forest.py
│ ├── plot_interactive_tree.py
│ ├── plot_kneigbors_regularization.png
│ ├── plot_kneighbors_regularization.py
│ ├── plot_linear_svc_regularization.py
│ ├── plot_pca.py
│ ├── plot_rbf_svm_parameters.py
│ ├── plot_scaling.py
│ ├── randomized_search.png
│ ├── supervised_scikit_learn.png
│ ├── supervised_workflow.svg
│ ├── train_test_split.svg
│ ├── train_test_split_matrix.svg
│ ├── train_validation_test2.svg
│ ├── tree_plotting.py
│ └── unsupervised_workflow.svg
├── helpers.py
├── images
│ ├── parallel_text_clf.png
│ └── parallel_text_clf_average.png
└── solutions
│ ├── 03A_faces_plot.py
│ ├── 04_wrong-predictions.py
│ ├── 05A_knn_with_diff_k.py
│ ├── 06A_knn_vs_linreg.py
│ ├── 06B_lin_with_sine.py
│ ├── 07A_iris-pca.py
│ ├── 08B_digits_clustering.py
│ ├── 10_titanic.py
│ ├── 11_ngrams.py
│ ├── 12A_tfidf.py
│ ├── 12B_vectorizer_params.py
│ ├── 13_cross_validation.py
│ ├── 14_grid_search.py
│ ├── 15A_ridge_grid.py
│ ├── 16A_avg_per_class_acc.py
│ ├── 17A_logreg_grid.py
│ ├── 17B_learning_curve_alpha.py
│ ├── 18_gbc_grid.py
│ ├── 19_univariate_vs_mb_selection.py
│ ├── 20_clustering_comparison.py
│ ├── 21A_isomap_digits.py
│ ├── 21B_tsne_classification.py
│ ├── 22_A-anomaly_ocsvm_gamma.py
│ ├── 22_B-anomaly_iforest_n_trees.py
│ ├── 22_C-anomaly_digits.py
│ └── 23_batchtrain.py
├── requirements.txt
└── todo.rst
/.gitignore:
--------------------------------------------------------------------------------
1 | # exclude datasets and externals
2 | notebooks/datasets
3 | notebooks/joblib/
4 |
5 | # exclude temporary files
6 | .ipynb_checkpoints
7 | .DS_Store
8 | gmon.out
9 | __pycache__
10 | *.pyc
11 | *.o
12 | *.so
13 | *.gcno
14 | *.swp
15 | *.egg-info
16 | *.egg
17 | *~
18 | build
19 | dist
20 | lib/test
21 | doc/_build
22 | *env
23 | *ENV
24 | .idea
25 | *.code-workspace
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | before_install:
3 | - wget -q http://repo.continuum.io/miniconda/Miniconda-3.6.0-Linux-x86_64.sh -O miniconda.sh
4 | - chmod +x miniconda.sh
5 | - ./miniconda.sh -b -p /home/travis/miniconda
6 | - export PATH=/home/travis/miniconda/bin:$PATH
7 | - conda update --yes --quiet conda
8 | install:
9 | - conda create -n testenv --yes pip python=3.6
10 | - source activate testenv
11 | - pip install -r requirements.txt
12 | script:
13 | - python fetch_data.py
14 | - jupyter nbconvert --execute check_env.ipynb
15 | - cd notebooks
16 | - for i in *.ipynb; do jupyter nbconvert --ExecutePreprocessor.timeout=None --execute $i; done
17 | notifications:
18 | email: true
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | CC0 1.0 Universal
2 |
3 | Statement of Purpose
4 |
5 | The laws of most jurisdictions throughout the world automatically confer
6 | exclusive Copyright and Related Rights (defined below) upon the creator and
7 | subsequent owner(s) (each and all, an "owner") of an original work of
8 | authorship and/or a database (each, a "Work").
9 |
10 | Certain owners wish to permanently relinquish those rights to a Work for the
11 | purpose of contributing to a commons of creative, cultural and scientific
12 | works ("Commons") that the public can reliably and without fear of later
13 | claims of infringement build upon, modify, incorporate in other works, reuse
14 | and redistribute as freely as possible in any form whatsoever and for any
15 | purposes, including without limitation commercial purposes. These owners may
16 | contribute to the Commons to promote the ideal of a free culture and the
17 | further production of creative, cultural and scientific works, or to gain
18 | reputation or greater distribution for their Work in part through the use and
19 | efforts of others.
20 |
21 | For these and/or other purposes and motivations, and without any expectation
22 | of additional consideration or compensation, the person associating CC0 with a
23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
25 | and publicly distribute the Work under its terms, with knowledge of his or her
26 | Copyright and Related Rights in the Work and the meaning and intended legal
27 | effect of CC0 on those rights.
28 |
29 | 1. Copyright and Related Rights. A Work made available under CC0 may be
30 | protected by copyright and related or neighboring rights ("Copyright and
31 | Related Rights"). Copyright and Related Rights include, but are not limited
32 | to, the following:
33 |
34 | i. the right to reproduce, adapt, distribute, perform, display, communicate,
35 | and translate a Work;
36 |
37 | ii. moral rights retained by the original author(s) and/or performer(s);
38 |
39 | iii. publicity and privacy rights pertaining to a person's image or likeness
40 | depicted in a Work;
41 |
42 | iv. rights protecting against unfair competition in regards to a Work,
43 | subject to the limitations in paragraph 4(a), below;
44 |
45 | v. rights protecting the extraction, dissemination, use and reuse of data in
46 | a Work;
47 |
48 | vi. database rights (such as those arising under Directive 96/9/EC of the
49 | European Parliament and of the Council of 11 March 1996 on the legal
50 | protection of databases, and under any national implementation thereof,
51 | including any amended or successor version of such directive); and
52 |
53 | vii. other similar, equivalent or corresponding rights throughout the world
54 | based on applicable law or treaty, and any national implementations thereof.
55 |
56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of,
57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
59 | and Related Rights and associated claims and causes of action, whether now
60 | known or unknown (including existing as well as future claims and causes of
61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum
62 | duration provided by applicable law or treaty (including future time
63 | extensions), (iii) in any current or future medium and for any number of
64 | copies, and (iv) for any purpose whatsoever, including without limitation
65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
66 | the Waiver for the benefit of each member of the public at large and to the
67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver
68 | shall not be subject to revocation, rescission, cancellation, termination, or
69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work
70 | by the public as contemplated by Affirmer's express Statement of Purpose.
71 |
72 | 3. Public License Fallback. Should any part of the Waiver for any reason be
73 | judged legally invalid or ineffective under applicable law, then the Waiver
74 | shall be preserved to the maximum extent permitted taking into account
75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
76 | is so judged Affirmer hereby grants to each affected person a royalty-free,
77 | non transferable, non sublicensable, non exclusive, irrevocable and
78 | unconditional license to exercise Affirmer's Copyright and Related Rights in
79 | the Work (i) in all territories worldwide, (ii) for the maximum duration
80 | provided by applicable law or treaty (including future time extensions), (iii)
81 | in any current or future medium and for any number of copies, and (iv) for any
82 | purpose whatsoever, including without limitation commercial, advertising or
83 | promotional purposes (the "License"). The License shall be deemed effective as
84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the
85 | License for any reason be judged legally invalid or ineffective under
86 | applicable law, such partial invalidity or ineffectiveness shall not
87 | invalidate the remainder of the License, and in such case Affirmer hereby
88 | affirms that he or she will not (i) exercise any of his or her remaining
89 | Copyright and Related Rights in the Work or (ii) assert any associated claims
90 | and causes of action with respect to the Work, in either case contrary to
91 | Affirmer's express Statement of Purpose.
92 |
93 | 4. Limitations and Disclaimers.
94 |
95 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
96 | surrendered, licensed or otherwise affected by this document.
97 |
98 | b. Affirmer offers the Work as-is and makes no representations or warranties
99 | of any kind concerning the Work, express, implied, statutory or otherwise,
100 | including without limitation warranties of title, merchantability, fitness
101 | for a particular purpose, non infringement, or the absence of latent or
102 | other defects, accuracy, or the present or absence of errors, whether or not
103 | discoverable, all to the greatest extent permissible under applicable law.
104 |
105 | c. Affirmer disclaims responsibility for clearing rights of other persons
106 | that may apply to the Work or any use thereof, including without limitation
107 | any person's Copyright and Related Rights in the Work. Further, Affirmer
108 | disclaims responsibility for obtaining any necessary consents, permissions
109 | or other rights required for any use of the Work.
110 |
111 | d. Affirmer understands and acknowledges that Creative Commons is not a
112 | party to this document and has no duty or obligation with respect to this
113 | CC0 or use of the Work.
114 |
115 | For more information, please see
116 |
117 | <http://creativecommons.org/publicdomain/zero/1.0/>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | SciPy 2018 Scikit-learn Tutorial
2 | ================================
3 |
4 |
5 | Instructors
6 | -----------
7 |
8 | - [Guillaume Lemaitre](https://glemaitre.github.io/) [@glemaitre](https://github.com/glemaitre) - Inria, Université Paris-Saclay
9 | - [Andreas Mueller](http://amueller.github.io) [@amuellerml](https://twitter.com/amuellerml) - Columbia University; [Book: Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do)
10 |
11 | ---
12 |
13 |
14 | This repository will contain the teaching material and other info associated with our scikit-learn tutorial
15 | at [SciPy 2018](http://scipy2018.scipy.org/) held July 9-15 in Austin, Texas.
16 |
17 | Parts 1 to 12 make up the morning session, while
18 | parts 13 to 23 will be presented in the afternoon (approximately).
19 |
20 | ### Schedule:
21 |
22 | The 2-part tutorial will be held on Tuesday, July 10, 2018.
23 |
24 |
25 |
26 | Obtaining the Tutorial Material
27 | --------------------------------
28 |
29 |
30 | If you have a GitHub account, it is probably most convenient if you clone or
31 | fork the GitHub repository. You can clone the repository by running:
32 |
33 | ```bash
34 | git clone https://github.com/amueller/scipy-2018-sklearn.git
35 | ```
36 |
37 | If you are not familiar with git or don’t have a
38 | GitHub account, you can download the repository as a .zip file by heading over
39 | to the GitHub repository (https://github.com/amueller/scipy-2018-sklearn) in
40 | your browser and clicking the green “Download” button in the upper right.
41 |
42 | 
43 |
44 | Please note that we may add and improve the material until shortly before the
45 | tutorial session, and we recommend that you update your copy of the materials one
46 | day before the tutorial. If you have a GitHub account and cloned the
47 | repository via GitHub, you can sync your existing local repository with:
48 |
49 | ```bash
50 | git pull origin master
51 | ```
52 |
53 | If you don’t have a GitHub account, you may have to re-download the .zip
54 | archive from GitHub.
55 |
56 |
57 | Installation Notes
58 | ------------------
59 |
60 | This tutorial will require recent installations of
61 |
62 | - [NumPy](http://www.numpy.org)
63 | - [SciPy](http://www.scipy.org)
64 | - [matplotlib](http://matplotlib.org)
65 | - [pandas](http://pandas.pydata.org)
66 | - [pillow](https://python-pillow.org)
67 | - [scikit-learn](http://scikit-learn.org/stable/)
68 | - [IPython](http://ipython.readthedocs.org/en/stable/)
69 | - [Jupyter Notebook](http://jupyter.org)
70 |
71 |
72 | The last one is important and you should be able to type:
73 |
74 | jupyter notebook
75 |
76 | in your terminal window and see the notebook panel load in your web browser.
77 | Try opening and running a notebook from the material to check that it works. Alternatively, you can use JupyterLab.
78 |
79 | For users who do not yet have the required packages installed, a relatively
80 | painless way to install all the requirements is to use a Python distribution
81 | such as [Anaconda](https://www.anaconda.com/download/ "Anaconda"), which includes
82 | the most relevant Python packages for science, math, engineering, and
83 | data analysis; Anaconda can be downloaded and installed for free,
84 | including for commercial use and redistribution.
85 | The code examples in this tutorial should be compatible with Python 2.7
86 | and Python 3.4-3.6.
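
If you already use conda, one convenient (and entirely optional) way to install all
of the above is to create an environment from the `environment.yml` file at the top
level of this repository, which defines an environment named `tutorial-sklearn`:

```bash
conda env create -f environment.yml
source activate tutorial-sklearn  # or: conda activate tutorial-sklearn
```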
87 |
88 | After obtaining the material, we **strongly recommend** that you open and execute
89 | the Jupyter notebook `check_env.ipynb`, which is located at the
90 | top level of this repository. You can open the notebook
91 | by executing
92 |
93 | ```bash
94 | jupyter notebook check_env.ipynb
95 | ```
96 |
97 | inside the repository. Inside the notebook, you can run the code cell by
98 | clicking on the "Run Cells" button as illustrated in the figure below:
99 |
100 | 
101 |
102 |
103 | Finally, if your environment satisfies the requirements for the tutorials, the
104 | executed code cell will produce an output message as shown below:
105 |
106 | 
107 |
108 | Although not required, we also recommend that you update scikit-learn to the latest release version to ensure the best compatibility with the teaching material. You can upgrade already installed packages by executing
109 |
110 | - `pip install --no-deps --upgrade [package-name]`
111 | - or `conda update [package-name]`
112 |
113 | depending on how you installed ``scikit-learn``.
114 |
115 |
116 | Data Downloads
117 | --------------
118 |
119 | The data for this tutorial is not included in the repository. We will be
120 | using several data sets during the tutorial: most are built into
121 | scikit-learn, which
122 | includes code that automatically downloads and caches these
123 | data.
124 |
125 | **Because the wireless network
126 | at conferences can often be spotty, it would be a good idea to download these
127 | data sets before arriving at the conference.
128 | Please run**
129 | ```bash
130 | python fetch_data.py
131 | ```
132 | **to download all necessary data beforehand.**
133 |
134 | The download size of the data files is approx. 280 MB, and after `fetch_data.py`
135 | has extracted the data on your disk, the `./notebooks/datasets` folder will take up 480 MB
136 | on your local hard drive.
137 |
138 |
139 | Outline
140 | =======
141 |
142 | Morning Session
143 | ---------------
144 |
145 | - 01 Introduction to machine learning with sample applications, Supervised and Unsupervised learning [[view](notebooks/01.Introduction_to_Machine_Learning.ipynb)]
146 | - 02 Scientific Computing Tools for Python: NumPy, SciPy, and matplotlib [[view](notebooks/02.Scientific_Computing_Tools_in_Python.ipynb)]
147 | - 03 Data formats, preparation, and representation [[view](notebooks/03.Data_Representation_for_Machine_Learning.ipynb)]
148 | - 04 Supervised learning: Training and test data [[view](notebooks/04.Training_and_Testing_Data.ipynb)]
149 | - 05 Supervised learning: Estimators for classification [[view](notebooks/05.Supervised_Learning-Classification.ipynb)]
150 | - 06 Supervised learning: Estimators for regression analysis [[view](notebooks/06.Supervised_Learning-Regression.ipynb)]
151 | - 07 Unsupervised learning: Unsupervised Transformers [[view](notebooks/07.Unsupervised_Learning-Transformations_and_Dimensionality_Reduction.ipynb)]
152 | - 08 Unsupervised learning: Clustering [[view](notebooks/08.Unsupervised_Learning-Clustering.ipynb)]
153 | - 09 The scikit-learn estimator interface [[view](notebooks/09.Review_of_Scikit-learn_API.ipynb)]
154 | - 10 Preparing a real-world dataset (titanic) [[view](notebooks/10.Case_Study-Titanic_Survival.ipynb)]
155 | - 11 Working with text data via the bag-of-words model [[view](notebooks/11.Text_Feature_Extraction.ipynb)]
156 | - 12 Application: SMS Spam Detection [[view](notebooks/12.Case_Study-SMS_Spam_Detection.ipynb)]
157 |
158 | Afternoon Session
159 | -----------------
160 |
161 | - 13 Cross-Validation [[view](notebooks/13.Cross_Validation.ipynb)]
162 | - 14 Model complexity and grid search for adjusting hyperparameters [[view](notebooks/14.Model_Complexity_and_GridSearchCV.ipynb)]
163 | - 15 Scikit-learn Pipelines [[view](notebooks/15.Pipelining_Estimators.ipynb)]
164 | - 16 Supervised learning: Performance metrics for classification [[view](notebooks/16.Performance_metrics_and_Model_Evaluation.ipynb)]
165 | - 17 Supervised learning: Linear Models [[view](notebooks/17.In_Depth-Linear_Models.ipynb)]
166 | - 18 Supervised learning: Decision trees and random forests, and ensemble methods [[view](notebooks/18.In_Depth-Trees_and_Forests.ipynb)]
167 | - 19 Supervised learning: feature selection [[view](notebooks/19.Feature_Selection.ipynb)]
168 | - 20 Unsupervised learning: Hierarchical and density-based clustering algorithms [[view](notebooks/20.Unsupervised_learning-Hierarchical_and_density-based_clustering_algorithms.ipynb)]
169 | - 21 Unsupervised learning: Non-linear dimensionality reduction [[view](notebooks/21.Unsupervised_learning-Non-linear_dimensionality_reduction.ipynb)]
170 | - 22 Unsupervised learning: Anomaly Detection [[view](notebooks/22.Unsupervised_learning-anomaly_detection.ipynb)]
171 | - 23 Supervised learning: Out-of-core learning [[view](notebooks/23.Out-of-core_Learning_Large_Scale_Text_Classification.ipynb)]
172 |
--------------------------------------------------------------------------------
/abstract.rst:
--------------------------------------------------------------------------------
1 | Machine Learning with scikit-learn
2 |
3 | Tutorial Topic
4 | --------------
5 |
6 | This tutorial aims to provide an introduction to machine learning and
7 | scikit-learn "from the ground up". We will start with core concepts of machine
8 | learning, some example uses of machine learning, and how to implement them
9 | using scikit-learn. Going in detail through the characteristics of several
10 | methods, we will discuss how to pick an algorithm for your application, how to
11 | set its hyper-parameters, and how to evaluate performance.
12 |
13 | Please provide a more detailed abstract of your tutorial (again, see last year's tutorials).
14 | ---------------------------------------------------------------------------------------------
15 |
16 | Machine learning is the task of extracting knowledge from data, often with the
17 | goal of generalizing to new and unseen data. Applications of machine learning
18 | now touch nearly every aspect of everyday life, from the face detection in our
19 | phones and the streams of social media we consume to picking restaurants,
20 | partners, and movies. Machine learning has also become indispensable to many
21 | empirical sciences, from physics, astronomy and biology to social sciences.
22 |
23 | Scikit-learn has emerged as one of the most popular toolkits for machine
24 | learning, and is now widely used in industry and academia.
25 | The goal of this tutorial is to enable participants to use the wide variety of
26 | machine learning algorithms available in scikit-learn on their own data sets,
27 | for their own domains.
28 |
29 | This tutorial will comprise an introductory morning session and an advanced
30 | afternoon session. The morning part of the tutorial will cover basic concepts
31 | of machine learning, data representation, and preprocessing. We will explain
32 | different problem settings and concepts such as supervised learning,
33 | unsupervised learning, dimensionality reduction, anomaly detection or clustering,
34 | and illustrate them with applications showing which algorithms
35 | can be used in each situation. We will cover the different families of
36 | methods (nearest-neighbors, kernel machines, tree-based techniques, linear
37 | models, neural networks) with demos of SVMs, Random Forests, K-Means, PCA, t-SNE,
38 | multi-layer perceptrons and others.
39 |
40 | In the afternoon session, we will discuss setting hyper-parameters and how to
41 | prevent overfitting. We will go in-depth into the trade-off of model complexity
42 | and dataset size, as well as discussing complexity of learning algorithms and
43 | how to cope with very large datasets using online methods that support
44 | out-of-core computations. The session will conclude by stepping
45 | through the process of building machine learning pipelines consisting of
46 | feature extraction, preprocessing and supervised learning.
47 |
48 |
49 | Outline
50 | ========
51 |
52 | Morning Session
53 | ----------------
54 |
55 | - Introduction to machine learning with sample applications
56 |
57 | - Types of machine learning: Unsupervised vs. supervised learning
58 |
59 | - Scientific Computing Tools for Python: NumPy, SciPy, and matplotlib
60 |
61 | - Data formats, preparation, and representation
62 |
63 | - Supervised learning: Training and test data
64 | - Supervised learning: The scikit-learn estimator interface
65 | - Supervised learning: Estimators for classification
66 | - Supervised learning: Estimators for regression analysis
67 |
68 | - Unsupervised learning: Unsupervised Transformers
69 | - Unsupervised learning: Preprocessing and scaling
70 | - Unsupervised learning: Feature extraction and dimensionality reduction
71 | - Unsupervised learning: Clustering
72 | - Unsupervised learning: Anomaly/Novelty Detection
73 |
74 | - Preparing a real-world dataset
75 | - Working with text data via the bag-of-words model
76 | - Application: IMDB Movie Review Sentiment Analysis
77 |
78 |
79 | Afternoon Session
80 | ------------------
81 | - Cross-Validation
82 | - Model Complexity: Overfitting and underfitting
83 | - Complexity of various model types
84 | - Grid search for adjusting hyperparameters
85 |
86 | - Scikit-learn Pipelines
87 |
88 | - Supervised learning: Performance metrics for classification
89 | - Supervised learning: Support Vector Machines
90 | - Supervised learning: Algorithm and model selection via nested cross-validation
91 | - Supervised learning: Decision trees and random forests, and ensemble methods
92 |
93 | - Unsupervised learning: Non-linear regression analysis
94 | - Unsupervised learning: Hierarchical and density-based clustering algorithms
95 | - Unsupervised learning: Non-linear dimensionality reduction
96 |
97 | - Wrapper, filter, and embedded approaches for feature selection
98 |
99 | - Supervised learning: Artificial neural networks: Multi-layer perceptrons
100 | - Supervised learning: Out-of-core learning
101 |
--------------------------------------------------------------------------------
/check_env.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from __future__ import print_function\n",
10 | "from distutils.version import LooseVersion as Version\n",
11 | "import sys\n",
12 | "\n",
13 | "\n",
14 | "try:\n",
15 | " import curses\n",
16 | " curses.setupterm()\n",
17 | " assert curses.tigetnum(\"colors\") > 2\n",
18 | " OK = \"\\x1b[1;%dm[ OK ]\\x1b[0m\" % (30 + curses.COLOR_GREEN)\n",
19 | " FAIL = \"\\x1b[1;%dm[FAIL]\\x1b[0m\" % (30 + curses.COLOR_RED)\n",
20 | "except:\n",
21 | " OK = '[ OK ]'\n",
22 | " FAIL = '[FAIL]'\n",
23 | "\n",
24 | "try:\n",
25 | " import importlib\n",
26 | "except ImportError:\n",
27 | " print(FAIL, \"Python version 3.4 (or 2.7) is required,\"\n",
28 | " \" but %s is installed.\" % sys.version)\n",
29 | "\n",
30 | " \n",
31 | "def import_version(pkg, min_ver, fail_msg=\"\"):\n",
32 | " mod = None\n",
33 | " try:\n",
34 | " mod = importlib.import_module(pkg)\n",
35 | " if pkg in {'PIL'}:\n",
36 | " ver = mod.VERSION\n",
37 | " else:\n",
38 | " ver = mod.__version__\n",
39 | " if Version(ver) < min_ver:\n",
40 | " print(FAIL, \"%s version %s or higher required, but %s installed.\"\n",
41 | " % (lib, min_ver, ver))\n",
42 | " else:\n",
43 | " print(OK, '%s version %s' % (pkg, ver))\n",
44 | " except ImportError:\n",
45 | " print(FAIL, '%s not installed. %s' % (pkg, fail_msg))\n",
46 | " return mod\n",
47 | "\n",
48 | "\n",
49 | "# first check the python version\n",
50 | "print('Using python in', sys.prefix)\n",
51 | "print(sys.version)\n",
52 | "pyversion = Version(sys.version)\n",
53 | "if pyversion >= \"3\":\n",
54 | " if pyversion < \"3.4\":\n",
55 | " print(FAIL, \"Python version 3.4 (or 2.7) is required,\"\n",
56 | " \" but %s is installed.\" % sys.version)\n",
57 | "elif pyversion >= \"2\":\n",
58 | " if pyversion < \"2.7\":\n",
59 | " print(FAIL, \"Python version 2.7 is required,\"\n",
60 | " \" but %s is installed.\" % sys.version)\n",
61 | "else:\n",
62 | " print(FAIL, \"Unknown Python version: %s\" % sys.version)\n",
63 | "\n",
64 | "print()\n",
65 | "requirements = {'numpy': \"1.7.1\", 'scipy': \"0.9\", 'matplotlib': \"2.0\",\n",
66 | " 'IPython': \"3.0\", 'sklearn': \"0.19.1\", 'pandas': \"0.19\",\n",
67 | " 'PIL': \"1.1.7\", 'ipywidgets': '6.0'}\n",
68 | "\n",
69 | "# now the dependencies\n",
70 | "for lib, required_version in list(requirements.items()):\n",
71 | " import_version(lib, required_version)"
72 | ]
73 | }
74 | ],
75 | "metadata": {
76 | "anaconda-cloud": {},
77 | "kernelspec": {
78 | "display_name": "Python 3",
79 | "language": "python",
80 | "name": "python3"
81 | },
82 | "language_info": {
83 | "codemirror_mode": {
84 | "name": "ipython",
85 | "version": 3
86 | },
87 | "file_extension": ".py",
88 | "mimetype": "text/x-python",
89 | "name": "python",
90 | "nbconvert_exporter": "python",
91 | "pygments_lexer": "ipython3",
92 | "version": "3.6.5"
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 1
97 | }
98 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: tutorial-sklearn
2 | dependencies:
3 | - python=3.6
4 | - numpy
5 | - scipy
6 | - matplotlib
7 | - pandas
8 | - pillow
9 | - scikit-learn
10 | - jupyter
11 | - ipython
12 | - pyzmq
13 |
--------------------------------------------------------------------------------
/fetch_data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 |
4 | try:
5 | from urllib.request import urlopen
6 | except ImportError:
7 | from urllib import urlopen
8 |
9 | import tarfile
10 |
11 |
12 | IMDB_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
13 | IMDB_ARCHIVE_NAME = "aclImdb_v1.tar.gz"
14 |
15 |
16 | def get_datasets_folder():
17 | here = os.path.dirname(__file__)
18 | notebooks = os.path.join(here, 'notebooks')
19 | datasets_folder = os.path.abspath(os.path.join(notebooks, 'datasets'))
20 | datasets_archive = os.path.abspath(os.path.join(notebooks, 'datasets.zip'))
21 |
22 | if not os.path.exists(datasets_folder):
23 | if os.path.exists(datasets_archive):
24 | print("Extracting " + datasets_archive)
25 | zf = zipfile.ZipFile(datasets_archive)
26 | zf.extractall('.')
27 | assert os.path.exists(datasets_folder)
28 | else:
29 | print("Creating datasets folder: " + datasets_folder)
30 | os.makedirs(datasets_folder)
31 | else:
32 | print("Using existing dataset folder:" + datasets_folder)
33 | return datasets_folder
34 |
35 |
36 | def check_imdb(datasets_folder):
37 | print("\nChecking availability of the IMDb dataset")
38 | archive_path = os.path.join(datasets_folder, IMDB_ARCHIVE_NAME)
39 | imdb_path = os.path.join(datasets_folder, 'IMDb')
40 |
41 | train_path = os.path.join(imdb_path, 'aclImdb', 'train')
42 | test_path = os.path.join(imdb_path, 'aclImdb', 'test')
43 |
44 | if not os.path.exists(imdb_path):
45 | if not os.path.exists(archive_path):
46 | print("Downloading dataset from %s (84.1MB)" % IMDB_URL)
47 | opener = urlopen(IMDB_URL)
48 | open(archive_path, 'wb').write(opener.read())
49 | else:
50 | print("Found archive: " + archive_path)
51 |
52 | print("Extracting %s to %s" % (archive_path, imdb_path))
53 |
54 | tar = tarfile.open(archive_path, "r:gz")
55 | tar.extractall(path=imdb_path)
56 | tar.close()
57 | os.remove(archive_path)
58 |
59 | print("Checking that the IMDb train & test directories exist...")
60 | assert os.path.exists(train_path)
61 | assert os.path.exists(test_path)
62 | print("=> Success!")
63 |
64 |
65 | if __name__ == "__main__":
66 | datasets_folder = get_datasets_folder()
67 | check_imdb(datasets_folder)
68 |
69 | print("\nLoading Labeled Faces Data (~200MB)")
70 | from sklearn.datasets import fetch_lfw_people
71 | fetch_lfw_people(min_faces_per_person=70, resize=0.4,
72 | data_home=datasets_folder)
73 | print("=> Success!")
74 |
--------------------------------------------------------------------------------
/images/check_env-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/images/check_env-1.png
--------------------------------------------------------------------------------
/images/check_env-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/images/check_env-2.png
--------------------------------------------------------------------------------
/images/download-repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/images/download-repo.png
--------------------------------------------------------------------------------
/notebooks/01.Introduction_to_Machine_Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# SciPy 2018 Scikit-learn Tutorial"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Introduction to Machine Learning in Python"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## What is Machine Learning?"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "Machine learning is the process of extracting knowledge from data automatically, usually with the goal of making predictions on new, unseen data. A classical example is a spam filter, for which the user keeps labeling incoming mails as either spam or not spam. A machine learning algorithm then \"learns\" a predictive model from data that distinguishes spam from normal emails, a model which can predict for new emails whether they are spam or not. \n",
29 | "\n",
30 | "Central to machine learning is the concept of **automating decision making** from data **without the user specifying explicit rules** how this decision should be made.\n",
31 | "\n",
32 | "For the case of emails, the user doesn't provide a list of words or characteristics that make an email spam. Instead, the user provides examples of spam and non-spam emails that are labeled as such.\n",
33 | "\n",
34 | "The second central concept is **generalization**. The goal of a machine learning model is to predict on new, previously unseen data. In a real-world application, we are not interested in marking an already labeled email as spam or not. Instead, we want to make the user's life easier by automatically classifying new incoming mail."
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "
"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "The data is presented to the algorithm usually as a two-dimensional array (or matrix) of numbers. Each data point (also known as a *sample* or *training instance*) that we want to either learn from or make a decision on is represented as a list of numbers, a so-called feature vector, and its containing features represent the properties of this point. \n",
49 | "\n",
50 | "Later, we will work with a popular dataset called *Iris* -- among many other datasets. Iris, a classic benchmark dataset in the field of machine learning, contains the measurements of 150 iris flowers from 3 different species: Iris-Setosa, Iris-Versicolor, and Iris-Virginica. \n",
51 | "\n",
52 | "\n",
53 | "\n",
54 | "
\n",
55 | " \n",
56 | " Species | \n",
57 | " Image | \n",
58 | "
\n",
59 | " \n",
60 | " Iris Setosa | \n",
61 | "  | \n",
62 | "
\n",
63 | " \n",
64 | " Iris Versicolor | \n",
65 | "  | \n",
66 | "
\n",
67 | " \n",
68 | " Iris Virginica | \n",
69 | "  | \n",
70 | "
\n",
71 | "
\n",
72 | "\n",
73 | "\n",
74 | "\n",
75 | "\n",
76 | "\n",
77 | "We represent each flower sample as one row in our data array, and the columns (features) represent the flower measurements in centimeters. For instance, we can represent this Iris dataset, consisting of 150 samples and 4 features, a 2-dimensional array or matrix $\\mathbb{R}^{150 \\times 4}$ in the following format:\n",
78 | "\n",
79 | "\n",
80 | "$$\\mathbf{X} = \\begin{bmatrix}\n",
81 | " x_{1}^{(1)} & x_{2}^{(1)} & x_{3}^{(1)} & \\dots & x_{4}^{(1)} \\\\\n",
82 | " x_{1}^{(2)} & x_{2}^{(2)} & x_{3}^{(2)} & \\dots & x_{4}^{(2)} \\\\\n",
83 | " \\vdots & \\vdots & \\vdots & \\ddots & \\vdots \\\\\n",
84 | " x_{1}^{(150)} & x_{2}^{(150)} & x_{3}^{(150)} & \\dots & x_{4}^{(150)}\n",
85 | "\\end{bmatrix}.\n",
86 | "$$\n",
87 | "\n",
88 | "(The superscript denotes the *i*th row, and the subscript denotes the *j*th feature, respectively."
89 | ]
90 | },
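{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a small, optional illustration (a minimal sketch, assuming scikit-learn is installed as described in the README), we can load this Iris dataset and confirm that it is indeed stored as a $150 \\times 4$ NumPy array:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Load the Iris dataset bundled with scikit-learn and inspect its shape\n",
  "from sklearn.datasets import load_iris\n",
  "\n",
  "iris = load_iris()\n",
  "print(iris.data.shape)      # (150, 4): 150 samples, 4 features\n",
  "print(iris.feature_names)   # the four flower measurements (in cm)\n",
  "print(iris.target_names)    # the three species"
 ]
},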
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "There are two kinds of machine learning we will talk about today: ***supervised learning*** and ***unsupervised learning***."
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "### Supervised Learning: Classification and regression\n",
103 | "\n",
104 | "In **Supervised Learning**, we have a dataset consisting of both input features and a desired output, such as in the spam / no-spam example.\n",
105 | "The task is to construct a model (or program) which is able to predict the desired output of an unseen object\n",
106 | "given the set of features.\n",
107 | "\n",
108 | "Some more complicated examples are:\n",
109 | "\n",
110 | "- Given a multicolor image of an object through a telescope, determine\n",
111 | " whether that object is a star, a quasar, or a galaxy.\n",
112 | "- Given a photograph of a person, identify the person in the photo.\n",
113 | "- Given a list of movies a person has watched and their personal rating\n",
114 | " of the movie, recommend a list of movies they would like.\n",
115 | "- Given a persons age, education and position, infer their salary\n",
116 | "\n",
117 | "What these tasks have in common is that there is one or more unknown\n",
118 | "quantities associated with the object which needs to be determined from other\n",
119 | "observed quantities.\n",
120 | "\n",
121 | "Supervised learning is further broken down into two categories, **classification** and **regression**:\n",
122 | "\n",
123 | "- **In classification, the label is discrete**, such as \"spam\" or \"no spam\". In other words, it provides a clear-cut distinction between categories. Furthermore, it is important to note that class labels are nominal, not ordinal variables. Nominal and ordinal variables are both subcategories of categorical variable. Ordinal variables imply an order, for example, T-shirt sizes \"XL > L > M > S\". On the contrary, nominal variables don't imply an order, for example, we (usually) can't assume \"orange > blue > green\".\n",
124 | "\n",
125 | "\n",
126 | "- **In regression, the label is continuous**, that is a float output. For example,\n",
127 | "in astronomy, the task of determining whether an object is a star, a galaxy, or a quasar is a\n",
128 | "classification problem: the label is from three distinct categories. On the other hand, we might\n",
129 | "wish to estimate the age of an object based on such observations: this would be a regression problem,\n",
130 | "because the label (age) is a continuous quantity.\n",
131 | "\n",
132 | "In supervised learning, there is always a distinction between a **training set** for which the desired outcome is given, and a **test set** for which the desired outcome needs to be inferred. The learning model fits the predictive model to the training set, and we use the test set to evaluate its generalization performance.\n"
133 | ]
134 | },
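{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a quick, optional sketch of this distinction (assuming scikit-learn is installed), we can compare the discrete class labels of the iris dataset with the continuous targets of the diabetes dataset that also ships with scikit-learn:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Classification targets are discrete labels; regression targets are continuous values\n",
  "from sklearn.datasets import load_iris, load_diabetes\n",
  "\n",
  "print(load_iris().target[:10])      # discrete class labels (0, 1, 2)\n",
  "print(load_diabetes().target[:10])  # continuous values (disease progression scores)"
 ]
},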
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "### Unsupervised Learning\n",
140 | "\n",
141 | "In **Unsupervised Learning** there is no desired output associated with the data.\n",
142 | "Instead, we are interested in extracting some form of knowledge or model from the given data.\n",
143 | "In a sense, you can think of unsupervised learning as a means of discovering labels from the data itself.\n",
144 | "Unsupervised learning is often harder to understand and to evaluate.\n",
145 | "\n",
146 | "Unsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and\n",
147 | "*density estimation*. For example, in the iris data discussed above, we can used unsupervised\n",
148 | "methods to determine combinations of the measurements which best display the structure of the\n",
149 | "data. As we’ll see below, such a projection of the data can be used to visualize the\n",
150 | "four-dimensional dataset in two dimensions. Some more involved unsupervised learning problems are:\n",
151 | "\n",
152 | "- Given detailed observations of distant galaxies, determine which features or combinations of\n",
153 | " features summarize best the information.\n",
154 | "- Given a mixture of two sound sources (for example, a person talking over some music),\n",
155 | " separate the two (this is called the [blind source separation](http://en.wikipedia.org/wiki/Blind_signal_separation) problem).\n",
156 | "- Given a video, isolate a moving object and categorize in relation to other moving objects which have been seen.\n",
157 | "- Given a large collection of news articles, find recurring topics inside these articles.\n",
158 | "- Given a collection of images, cluster similar images together (for example to group them when visualizing a collection)\n",
159 | "\n",
160 | "Sometimes the two may even be combined: e.g. unsupervised learning can be used to find useful\n",
161 | "features in heterogeneous data, and then these features can be used within a supervised\n",
162 | "framework."
163 | ]
164 | },
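{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The cell below is a minimal, optional sketch of the projection idea mentioned above (assuming scikit-learn and matplotlib are available): PCA reduces the four iris measurements to two dimensions that we can plot."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Project the 4-dimensional iris measurements onto 2 dimensions with PCA\n",
  "%matplotlib inline\n",
  "import matplotlib.pyplot as plt\n",
  "from sklearn.datasets import load_iris\n",
  "from sklearn.decomposition import PCA\n",
  "\n",
  "iris = load_iris()\n",
  "X_2d = PCA(n_components=2).fit_transform(iris.data)\n",
  "plt.scatter(X_2d[:, 0], X_2d[:, 1], c=iris.target);"
 ]
},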
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "### (simplified) Machine learning taxonomy\n",
170 | "\n",
171 | "
"
172 | ]
173 | }
174 | ],
175 | "metadata": {
176 | "anaconda-cloud": {},
177 | "kernelspec": {
178 | "display_name": "Python 3",
179 | "language": "python",
180 | "name": "python3"
181 | },
182 | "language_info": {
183 | "codemirror_mode": {
184 | "name": "ipython",
185 | "version": 3
186 | },
187 | "file_extension": ".py",
188 | "mimetype": "text/x-python",
189 | "name": "python",
190 | "nbconvert_exporter": "python",
191 | "pygments_lexer": "ipython3",
192 | "version": "3.6.6"
193 | }
194 | },
195 | "nbformat": 4,
196 | "nbformat_minor": 2
197 | }
198 |
--------------------------------------------------------------------------------
/notebooks/02.Scientific_Computing_Tools_in_Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Jupyter Notebooks\n",
8 | "==================\n",
9 | "\n",
10 | "* You can run a cell by pressing ``[shift] + [Enter]`` or by pressing the \"play\" button in the menu.\n",
11 | "\n",
12 | "\n",
13 | "\n",
14 | "* You can get help on a function or object by pressing ``[shift] + [tab]`` after the opening parenthesis ``function(``\n",
15 | "\n",
16 | "\n",
17 | "\n",
18 | "* You can also get help by executing ``function?``\n",
19 | "\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "print('test')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "print('test')"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Numpy Arrays"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "Manipulating `numpy` arrays is an important part of doing machine learning\n",
53 | "(or, really, any type of scientific computation) in python. This will likely\n",
54 | "be a short review for most. In any case, let's quickly go through some of the most important features."
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "import numpy as np\n",
64 | "\n",
65 | "# Setting a random seed for reproducibility\n",
66 | "rnd = np.random.RandomState(seed=123)\n",
67 | "\n",
68 | "# Generating a random array\n",
69 | "X = rnd.uniform(low=0.0, high=1.0, size=(3, 5)) # a 3 x 5 array\n",
70 | "\n",
71 | "print(X)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "(Note that NumPy arrays use 0-indexing just like other data structures in Python.)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "# Accessing elements\n",
88 | "\n",
89 | "# get a single element \n",
90 | "# (here: an element in the first row and column)\n",
91 | "print(X[0, 0])"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "# get a row \n",
101 | "# (here: 2nd row)\n",
102 | "print(X[1])"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "# get a column\n",
112 | "# (here: 2nd column)\n",
113 | "print(X[:, 1])"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "# Transposing an array\n",
123 | "print(X.T)"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "$$\\begin{bmatrix}\n",
131 | " 1 & 2 & 3 & 4 \\\\\n",
132 | " 5 & 6 & 7 & 8\n",
133 | "\\end{bmatrix}^T\n",
134 | "= \n",
135 | "\\begin{bmatrix}\n",
136 | " 1 & 5 \\\\\n",
137 | " 2 & 6 \\\\\n",
138 | " 3 & 7 \\\\\n",
139 | " 4 & 8\n",
140 | "\\end{bmatrix}\n",
141 | "$$\n",
142 | "\n"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "# Creating a row vector\n",
152 | "# of evenly spaced numbers over a specified interval.\n",
153 | "y = np.linspace(0, 12, 5)\n",
154 | "print(y)"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "# Turning the row vector into a column vector\n",
164 | "print(y[:, np.newaxis])"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "# Getting the shape or reshaping an array\n",
174 | "\n",
175 | "# Generating a random array\n",
176 | "rnd = np.random.RandomState(seed=123)\n",
177 | "X = rnd.uniform(low=0.0, high=1.0, size=(3, 5)) # a 3 x 5 array\n",
178 | "\n",
179 | "print(X.shape)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "# reshape X to be of size (3, 5)\n",
189 | "X_reshaped = X.reshape(5, 3)\n",
190 | "print(X_reshaped)"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "# Indexing by an array of integers (fancy indexing)\n",
200 | "indices = np.array([3, 1, 0])\n",
201 | "print(indices)\n",
202 | "X[:, indices]"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "There is much, much more to know, but these few operations are fundamental to what we'll\n",
210 | "do during this tutorial."
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "## SciPy Sparse Matrices"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "We won't make very much use of these in this tutorial, but sparse matrices are very nice\n",
225 | "in some situations. In some machine learning tasks, especially those associated\n",
226 | "with textual analysis, the data may be mostly zeros. Storing all these zeros is very\n",
227 | "inefficient, and representing in a way that only contains the \"non-zero\" values can be much more efficient. We can create and manipulate sparse matrices as follows:"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "# Create a random array with a lot of zeros\n",
237 | "rnd = np.random.RandomState(seed=123)\n",
238 | "\n",
239 | "X = rnd.uniform(low=0.0, high=1.0, size=(10, 5))\n",
240 | "print(X)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# set the majority of elements to zero\n",
250 | "X[X < 0.7] = 0\n",
251 | "print(X)"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "from scipy import sparse\n",
261 | "\n",
262 | "# turn X into a CSR (Compressed-Sparse-Row) matrix\n",
263 | "X_csr = sparse.csr_matrix(X)\n",
264 | "print(X_csr)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "# Converting the sparse matrix to a dense array\n",
274 | "print(X_csr.toarray())"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "(You may have stumbled upon an alternative method for converting sparse to dense representations: `numpy.todense`; `toarray` returns a NumPy array, whereas `todense` returns a NumPy matrix. In this tutorial, we will be working with NumPy arrays, not matrices; the latter are not supported by scikit-learn.)"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "The CSR representation can be very efficient for computations, but it is not\n",
289 | "as good for adding elements. For that, the LIL (List-In-List) representation\n",
290 | "is better:"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "# Create an empty LIL matrix and add some items\n",
300 | "X_lil = sparse.lil_matrix((5, 5))\n",
301 | "\n",
302 | "for i, j in np.random.randint(0, 5, (15, 2)):\n",
303 | " X_lil[i, j] = i + j\n",
304 | "\n",
305 | "print(X_lil)\n",
306 | "print(type(X_lil))"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "X_dense = X_lil.toarray()\n",
316 | "print(X_dense)\n",
317 | "print(type(X_dense))"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "Often, once an LIL matrix is created, it is useful to convert it to a CSR format\n",
325 | "(many scikit-learn algorithms require CSR or CSC format)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "X_csr = X_lil.tocsr()\n",
335 | "print(X_csr)\n",
336 | "print(type(X_csr))"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "The available sparse formats that can be useful for various problems are:\n",
344 | "\n",
345 | "- `CSR` (compressed sparse row)\n",
346 | "- `CSC` (compressed sparse column)\n",
347 | "- `BSR` (block sparse row)\n",
348 | "- `COO` (coordinate)\n",
349 | "- `DIA` (diagonal)\n",
350 | "- `DOK` (dictionary of keys)\n",
351 | "- `LIL` (list in list)\n",
352 | "\n",
353 | "The [``scipy.sparse``](http://docs.scipy.org/doc/scipy/reference/sparse.html) submodule also has a lot of functions for sparse matrices\n",
354 | "including linear algebra, sparse solvers, graph algorithms, and much more."
355 | ]
356 | },
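{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a small, optional sketch of one of these formats (the numbers are made up for illustration): a `COO` matrix can be built directly from row indices, column indices, and values, and then converted to `CSR` for efficient computations:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Build a sparse matrix in COO (coordinate) format and convert it to CSR\n",
  "import numpy as np\n",
  "from scipy import sparse\n",
  "\n",
  "row = np.array([0, 1, 3])\n",
  "col = np.array([2, 0, 4])\n",
  "data = np.array([7.0, 1.5, 3.0])\n",
  "\n",
  "X_coo = sparse.coo_matrix((data, (row, col)), shape=(4, 5))\n",
  "print(X_coo.toarray())\n",
  "print(type(X_coo.tocsr()))"
 ]
},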
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {},
360 | "source": [
361 | "## matplotlib"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "Another important part of machine learning is the visualization of data. The most common\n",
369 | "tool for this in Python is [`matplotlib`](http://matplotlib.org). It is an extremely flexible package, and\n",
370 | "we will go over some basics here.\n",
371 | "\n",
372 | "Since we are using Jupyter notebooks, let us use one of IPython's convenient built-in \"[magic functions](https://ipython.org/ipython-doc/3/interactive/magics.html)\", the \"matoplotlib inline\" mode, which will draw the plots directly inside the notebook."
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "%matplotlib inline"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": [
390 | "import matplotlib.pyplot as plt"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "metadata": {},
397 | "outputs": [],
398 | "source": [
399 | "# Plotting a line\n",
400 | "x = np.linspace(0, 10, 100)\n",
401 | "plt.plot(x, np.sin(x));"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "# Scatter-plot points\n",
411 | "x = np.random.normal(size=500)\n",
412 | "y = np.random.normal(size=500)\n",
413 | "plt.scatter(x, y);"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": null,
419 | "metadata": {},
420 | "outputs": [],
421 | "source": [
422 | "# Showing images using imshow\n",
423 | "# - note that origin is at the top-left by default!\n",
424 | "\n",
425 | "x = np.linspace(1, 12, 100)\n",
426 | "y = x[:, np.newaxis]\n",
427 | "\n",
428 | "im = y * np.sin(x) * np.cos(y)\n",
429 | "print(im.shape)\n",
430 | "\n",
431 | "plt.imshow(im);"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "metadata": {},
438 | "outputs": [],
439 | "source": [
440 | "# Contour plots \n",
441 | "# - note that origin here is at the bottom-left by default!\n",
442 | "plt.contour(im);"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "# 3D plotting\n",
452 | "from mpl_toolkits.mplot3d import Axes3D\n",
453 | "ax = plt.axes(projection='3d')\n",
454 | "xgrid, ygrid = np.meshgrid(x, y.ravel())\n",
455 | "ax.plot_surface(xgrid, ygrid, im, cmap=plt.cm.viridis, cstride=2, rstride=2, linewidth=0);"
456 | ]
457 | },
458 | {
459 | "cell_type": "markdown",
460 | "metadata": {},
461 | "source": [
462 | "There are many, many more plot types available. One useful way to explore these is by\n",
463 | "looking at the [matplotlib gallery](http://matplotlib.org/gallery.html).\n",
464 | "\n",
465 | "You can test these examples out easily in the notebook: simply copy the ``Source Code``\n",
466 | "link on each page, and put it in a notebook using the ``%load`` magic.\n",
467 | "For example:"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {},
474 | "outputs": [],
475 | "source": [
476 | "# %load http://matplotlib.org/mpl_examples/pylab_examples/ellipse_collection.py"
477 | ]
478 | }
479 | ],
480 | "metadata": {
481 | "anaconda-cloud": {},
482 | "kernelspec": {
483 | "display_name": "Python 3",
484 | "language": "python",
485 | "name": "python3"
486 | },
487 | "language_info": {
488 | "codemirror_mode": {
489 | "name": "ipython",
490 | "version": 3
491 | },
492 | "file_extension": ".py",
493 | "mimetype": "text/x-python",
494 | "name": "python",
495 | "nbconvert_exporter": "python",
496 | "pygments_lexer": "ipython3",
497 | "version": "3.6.2"
498 | }
499 | },
500 | "nbformat": 4,
501 | "nbformat_minor": 2
502 | }
503 |
--------------------------------------------------------------------------------
/notebooks/04.Training_and_Testing_Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Training and Testing Data\n",
21 | "=====================================\n",
22 | "\n",
23 | "To evaluate how well our supervised models generalize, we can split our data into a training and a test set:\n",
24 | "\n",
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "from sklearn.datasets import load_iris\n",
37 | "\n",
38 | "iris = load_iris()\n",
39 | "X, y = iris.data, iris.target"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "Thinking about how machine learning is normally performed, the idea of a train/test split makes sense. Real world systems train on the data they have, and as other data comes in (from customers, sensors, or other sources) the classifier that was trained must predict on fundamentally *new* data. We can simulate this during training using a train/test split - the test data is a simulation of \"future data\" which will come into the system during production. \n",
47 | "\n",
48 | "Specifically for iris, the 150 labels in iris are sorted, which means that if we split the data using a proportional split, this will result in fudamentally altered class distributions. For instance, if we'd perform a common 2/3 training data and 1/3 test data split, our training dataset will only consists of flower classes 0 and 1 (Setosa and Versicolor), and our test set will only contain samples with class label 2 (Virginica flowers).\n",
49 | "\n",
50 | "Under the assumption that all samples are independent of each other (in contrast time series data), we want to **randomly shuffle the dataset before we split the dataset** as illustrated above."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {
57 | "collapsed": true
58 | },
59 | "outputs": [],
60 | "source": [
61 | "y"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "Now we need to split the data into training and testing. Luckily, this is a common pattern in machine learning and scikit-learn has a pre-built function to split data into training and testing sets for you. Here, we use 50% of the data as training, and 50% testing. 80% and 20% is another common split, but there are no hard and fast rules. The most important thing is to fairly evaluate your system on data it *has not* seen during training!"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {
75 | "collapsed": true
76 | },
77 | "outputs": [],
78 | "source": [
79 | "from sklearn.model_selection import train_test_split\n",
80 | "\n",
81 | "train_X, test_X, train_y, test_y = train_test_split(X, y, \n",
82 | " train_size=0.5,\n",
83 | " test_size=0.5,\n",
84 | " random_state=123)\n",
85 | "print(\"Labels for training data:\")\n",
86 | "print(train_y)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "print(\"Labels for test data:\")\n",
96 | "print(test_y)"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "---"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "**Tip: Stratified Split**\n",
111 | "\n",
112 | "Especially for relatively small datasets, it's better to stratify the split. Stratification means that we maintain the original class proportion of the dataset in the test and training sets. For example, after we randomly split the dataset as shown in the previous code example, we have the following class proportions in percent:"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": true
120 | },
121 | "outputs": [],
122 | "source": [
123 | "print('All:', np.bincount(y) / float(len(y)) * 100.0)\n",
124 | "print('Training:', np.bincount(train_y) / float(len(train_y)) * 100.0)\n",
125 | "print('Test:', np.bincount(test_y) / float(len(test_y)) * 100.0)"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "So, in order to stratify the split, we can pass the label array as an additional option to the `train_test_split` function:"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "collapsed": true
140 | },
141 | "outputs": [],
142 | "source": [
143 | "train_X, test_X, train_y, test_y = train_test_split(X, y, \n",
144 | " train_size=0.5,\n",
145 | " test_size=0.5,\n",
146 | " random_state=123,\n",
147 | " stratify=y)\n",
148 | "\n",
149 | "print('All:', np.bincount(y) / float(len(y)) * 100.0)\n",
150 | "print('Training:', np.bincount(train_y) / float(len(train_y)) * 100.0)\n",
151 | "print('Test:', np.bincount(test_y) / float(len(test_y)) * 100.0)"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "---"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "By evaluating our classifier performance on data that has been seen during training, we could get false confidence in the predictive power of our model. In the worst case, it may simply memorize the training samples but completely fails classifying new, similar samples -- we really don't want to put such a system into production!\n",
166 | "\n",
167 | "Instead of using the same dataset for training and testing (this is called \"resubstitution evaluation\"), it is much much better to use a train/test split in order to estimate how well your trained model is doing on new data."
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "collapsed": true
175 | },
176 | "outputs": [],
177 | "source": [
178 | "from sklearn.neighbors import KNeighborsClassifier\n",
179 | "\n",
180 | "classifier = KNeighborsClassifier().fit(train_X, train_y)\n",
181 | "pred_y = classifier.predict(test_X)\n",
182 | "\n",
183 | "print(\"Fraction Correct [Accuracy]:\")\n",
184 | "print(np.sum(pred_y == test_y) / float(len(test_y)))"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {},
190 | "source": [
191 | "We can also visualize the correct predictions ..."
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {
198 | "collapsed": true
199 | },
200 | "outputs": [],
201 | "source": [
202 | "print('Samples correctly classified:')\n",
203 | "correct_idx = np.where(pred_y == test_y)[0]\n",
204 | "print(correct_idx)"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "... as well as the failed predictions"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "print('Samples incorrectly classified:')\n",
221 | "incorrect_idx = np.where(pred_y != test_y)[0]\n",
222 | "print(incorrect_idx)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {
229 | "collapsed": true
230 | },
231 | "outputs": [],
232 | "source": [
233 | "# Plot two dimensions\n",
234 | "\n",
235 | "for n in np.unique(test_y):\n",
236 | " idx = np.where(test_y == n)[0]\n",
237 | " plt.scatter(test_X[idx, 1], test_X[idx, 2], label=\"Class %s\" % str(iris.target_names[n]))\n",
238 | "\n",
239 | "plt.scatter(test_X[incorrect_idx, 1], test_X[incorrect_idx, 2], color=\"darkred\")\n",
240 | "\n",
241 | "plt.xlabel('sepal width [cm]')\n",
242 | "plt.ylabel('petal length [cm]')\n",
243 | "plt.legend(loc=3)\n",
244 | "plt.title(\"Iris Classification results\")\n",
245 | "plt.show()"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "We can see that the errors occur in the area where green (class 1) and gray (class 2) overlap. This gives us insight about what features to add - any feature which helps separate class 1 and class 2 should improve classifier performance."
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "\n",
260 | "
EXERCISE:\n",
261 | "
\n",
262 | " - \n",
263 | " Print the true labels of 3 wrong predictions and modify the scatterplot code, which we used above, to visualize and distinguish these three samples with different markers in the 2D scatterplot. Can you explain why our classifier made these wrong predictions?\n",
264 | "
\n",
265 | "
\n",
266 | "
"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": []
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {
280 | "collapsed": true
281 | },
282 | "outputs": [],
283 | "source": [
284 | "# %load solutions/04_wrong-predictions.py"
285 | ]
286 | }
287 | ],
288 | "metadata": {
289 | "anaconda-cloud": {},
290 | "kernelspec": {
291 | "display_name": "Python 3",
292 | "language": "python",
293 | "name": "python3"
294 | },
295 | "language_info": {
296 | "codemirror_mode": {
297 | "name": "ipython",
298 | "version": 3
299 | },
300 | "file_extension": ".py",
301 | "mimetype": "text/x-python",
302 | "name": "python",
303 | "nbconvert_exporter": "python",
304 | "pygments_lexer": "ipython3",
305 | "version": "3.6.6"
306 | }
307 | },
308 | "nbformat": 4,
309 | "nbformat_minor": 2
310 | }
311 |
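The notebook computes the accuracy by hand with `np.sum(pred_y == test_y) / float(len(test_y))`. A minimal sketch, assuming the same stratified iris split as above, showing that the estimator's `score` method and `sklearn.metrics.accuracy_score` give the same number:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score

    iris = load_iris()
    train_X, test_X, train_y, test_y = train_test_split(
        iris.data, iris.target, train_size=0.5, test_size=0.5,
        random_state=123, stratify=iris.target)

    classifier = KNeighborsClassifier().fit(train_X, train_y)
    pred_y = classifier.predict(test_X)

    print(np.mean(pred_y == test_y))        # manual fraction of correct predictions
    print(accuracy_score(test_y, pred_y))   # same value via the metrics module
    print(classifier.score(test_X, test_y)) # same value via the estimator itself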
--------------------------------------------------------------------------------
/notebooks/06.Supervised_Learning-Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Supervised Learning Part 2 -- Regression Analysis"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "In regression we are trying to predict a continuous output variable -- in contrast to the nominal variables we were predicting in the previous classification examples. \n",
28 | "\n",
29 | "Let's start with a simple toy example with one feature dimension (explanatory variable) and one target variable. We will create a dataset out of a sine curve with some noise:"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "x = np.linspace(-3, 3, 100)\n",
39 | "print(x)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "rng = np.random.RandomState(42)\n",
49 | "y = np.sin(4 * x) + x + rng.uniform(size=len(x))"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "plt.plot(x, y, 'o');"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "Linear Regression\n",
66 | "=================\n",
67 | "\n",
68 | "The first model that we will introduce is the so-called simple linear regression. Here, we want to fit a line to the data, which \n",
69 | "\n",
70 | "One of the simplest models again is a linear one, that simply tries to predict the data as lying on a line. One way to find such a line is `LinearRegression` (also known as [*Ordinary Least Squares (OLS)*](https://en.wikipedia.org/wiki/Ordinary_least_squares) regression).\n",
71 | "The interface for LinearRegression is exactly the same as for the classifiers before, only that ``y`` now contains float values, instead of classes."
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "As we remember, the scikit-learn API requires us to provide the target variable (`y`) as a 1-dimensional array; scikit-learn's API expects the samples (`X`) in form a 2-dimensional array -- even though it may only consist of 1 feature. Thus, let us convert the 1-dimensional `x` NumPy array into an `X` array with 2 axes:\n"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "print('Before: ', x.shape)\n",
88 | "X = x[:, np.newaxis]\n",
89 | "print('After: ', X.shape)"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "Again, we start by splitting our dataset into a training (75%) and a test set (25%):"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {
103 | "collapsed": true
104 | },
105 | "outputs": [],
106 | "source": [
107 | "from sklearn.model_selection import train_test_split\n",
108 | "\n",
109 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "Next, we use the learning algorithm implemented in `LinearRegression` to **fit a regression model to the training data**:"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "from sklearn.linear_model import LinearRegression\n",
126 | "\n",
127 | "regressor = LinearRegression()\n",
128 | "regressor.fit(X_train, y_train)"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "After fitting to the training data, we paramerterized a linear regression model with the following values."
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "print('Weight coefficients: ', regressor.coef_)\n",
145 | "print('y-axis intercept: ', regressor.intercept_)"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "Since our regression model is a linear one, the relationship between the target variable (y) and the feature variable (x) is defined as \n",
153 | "\n",
154 | "$$y = \\text{weight} \\times x + \\text{intercept .}$$\n",
155 | "\n",
156 | "Plugging in the min and max values into thos equation, we can plot the regression fit to our training data:"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "min_pt = X.min() * regressor.coef_[0] + regressor.intercept_\n",
166 | "max_pt = X.max() * regressor.coef_[0] + regressor.intercept_\n",
167 | "\n",
168 | "plt.plot([X.min(), X.max()], [min_pt, max_pt])\n",
169 | "plt.plot(X_train, y_train, 'o');"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "Similar to the estimators for classification in the previous notebook, we use the `predict` method to predict the target variable. And we expect these predicted values to fall onto the line that we plotted previously:"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {
183 | "collapsed": true
184 | },
185 | "outputs": [],
186 | "source": [
187 | "y_pred_train = regressor.predict(X_train)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "plt.plot(X_train, y_train, 'o', label=\"data\")\n",
197 | "plt.plot(X_train, y_pred_train, 'o', label=\"prediction\")\n",
198 | "plt.plot([X.min(), X.max()], [min_pt, max_pt], label='fit')\n",
199 | "plt.legend(loc='best')"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "As we can see in the plot above, the line is able to capture the general slope of the data, but not many details."
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "Next, let's try the test set:"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "y_pred_test = regressor.predict(X_test)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "plt.plot(X_test, y_test, 'o', label=\"data\")\n",
232 | "plt.plot(X_test, y_pred_test, 'o', label=\"prediction\")\n",
233 | "plt.plot([X.min(), X.max()], [min_pt, max_pt], label='fit')\n",
234 | "plt.legend(loc='best');"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "Again, scikit-learn provides an easy way to evaluate the prediction quantitatively using the ``score`` method. For regression tasks, this is the R2 score. Another popular way would be the Mean Squared Error (MSE). As its name implies, the MSE is simply the average squared difference over the predicted and actual target values\n",
242 | "\n",
243 | "$$MSE = \\frac{1}{n} \\sum_{i=1}^{n} (\\text{predicted}_i - \\text{true}_i)^2$$"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "regressor.score(X_test, y_test)"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "\n",
260 | "
EXERCISE:\n",
261 | "
\n",
262 | " - \n",
263 | " Add a (non-linear) feature containing `sin(4x)` to `X` and redo the fit as a new column to X_train (and X_test). Visualize the predictions with this new richer, yet linear, model.\n",
264 | "
\n",
265 | " - \n",
266 | " Hint: you can use `np.concatenate(A, B, axis=1)` to concatenate two matrices A and B horizontal (to combine the columns).\n",
267 | "
\n",
268 | "
\n",
269 | "
"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "# %load solutions/06B_lin_with_sine.py"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "KNeighborsRegression\n",
286 | "=======================\n",
287 | "As for classification, we can also use a neighbor based method for regression. We can simply take the output of the nearest point, or we could average several nearest points. This method is less popular for regression than for classification, but still a good baseline."
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "from sklearn.neighbors import KNeighborsRegressor\n",
297 | "kneighbor_regression = KNeighborsRegressor(n_neighbors=1)\n",
298 | "kneighbor_regression.fit(X_train, y_train)"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "Again, let us look at the behavior on training and test set:"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "y_pred_train = kneighbor_regression.predict(X_train)\n",
315 | "\n",
316 | "plt.plot(X_train, y_train, 'o', label=\"data\", markersize=10)\n",
317 | "plt.plot(X_train, y_pred_train, 's', label=\"prediction\", markersize=4)\n",
318 | "plt.legend(loc='best');"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "On the training set, we do a perfect job: each point is its own nearest neighbor!"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "y_pred_test = kneighbor_regression.predict(X_test)\n",
335 | "\n",
336 | "plt.plot(X_test, y_test, 'o', label=\"data\", markersize=8)\n",
337 | "plt.plot(X_test, y_pred_test, 's', label=\"prediction\", markersize=4)\n",
338 | "plt.legend(loc='best');"
339 | ]
340 | },
341 | {
342 | "cell_type": "markdown",
343 | "metadata": {},
344 | "source": [
345 | "On the test set, we also do a better job of capturing the variation, but our estimates look much messier than before.\n",
346 | "Let us look at the R2 score:"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "kneighbor_regression.score(X_test, y_test)"
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {},
361 | "source": [
362 | "Much better than before! Here, the linear model was not a good fit for our problem; it was lacking in complexity and thus under-fit our data."
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {},
368 | "source": [
369 | "\n",
370 | "
EXERCISE:\n",
371 | "
\n",
372 | " - \n",
373 | " Compare the KNeighborsRegressor and LinearRegression on the boston housing dataset. You can load the dataset using ``sklearn.datasets.load_boston``. You can learn about the dataset by reading the ``DESCR`` attribute.\n",
374 | "
\n",
375 | "
\n",
376 | "
"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": []
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": null,
389 | "metadata": {
390 | "collapsed": true
391 | },
392 | "outputs": [],
393 | "source": [
394 | "# %load solutions/06A_knn_vs_linreg.py"
395 | ]
396 | }
397 | ],
398 | "metadata": {
399 | "anaconda-cloud": {},
400 | "kernelspec": {
401 | "display_name": "Python 3",
402 | "language": "python",
403 | "name": "python3"
404 | },
405 | "language_info": {
406 | "codemirror_mode": {
407 | "name": "ipython",
408 | "version": 3
409 | },
410 | "file_extension": ".py",
411 | "mimetype": "text/x-python",
412 | "name": "python",
413 | "nbconvert_exporter": "python",
414 | "pygments_lexer": "ipython3",
415 | "version": "3.6.4"
416 | }
417 | },
418 | "nbformat": 4,
419 | "nbformat_minor": 2
420 | }
421 |
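The MSE defined above is never actually computed in the notebook (`score` reports the R2 value). A minimal sketch, assuming the same sine-curve data and split as above, that evaluates both regressors with R2 and MSE:

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error

    # Rebuild the toy dataset from the notebook.
    x = np.linspace(-3, 3, 100)
    rng = np.random.RandomState(42)
    y = np.sin(4 * x) + x + rng.uniform(size=len(x))
    X = x[:, np.newaxis]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    for model in [LinearRegression(), KNeighborsRegressor(n_neighbors=1)]:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(model.__class__.__name__,
              'R2 = %.3f' % model.score(X_test, y_test),
              'MSE = %.3f' % mean_squared_error(y_test, y_pred))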
--------------------------------------------------------------------------------
/notebooks/09.Review_of_Scikit-learn_API.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# A recap on Scikit-learn's estimator interface"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Scikit-learn strives to have a uniform interface across all methods. Given a scikit-learn *estimator*\n",
15 | "object named `model`, the following methods are available (not all for each model):\n",
16 | "\n",
17 | "- Available in **all Estimators**\n",
18 | " + `model.fit()` : fit training data. For supervised learning applications,\n",
19 | " this accepts two arguments: the data `X` and the labels `y` (e.g. `model.fit(X, y)`).\n",
20 | " For unsupervised learning applications, `fit` takes only a single argument,\n",
21 | " the data `X` (e.g. `model.fit(X)`).\n",
22 | "- Available in **supervised estimators**\n",
23 | " + `model.predict()` : given a trained model, predict the label of a new set of data.\n",
24 | " This method accepts one argument, the new data `X_new` (e.g. `model.predict(X_new)`),\n",
25 | " and returns the learned label for each object in the array.\n",
26 | " + `model.predict_proba()` : For classification problems, some estimators also provide\n",
27 | " this method, which returns the probability that a new observation has each categorical label.\n",
28 | " In this case, the label with the highest probability is returned by `model.predict()`.\n",
29 | " + `model.decision_function()` : For classification problems, some estimators provide an uncertainty estimate that is not a probability. For binary classification, a decision_function >= 0 means the positive class will be predicted, while < 0 means the negative class.\n",
30 | " + `model.score()` : for classification or regression problems, most (all?) estimators implement\n",
31 | " a score method. Scores are between 0 and 1, with a larger score indicating a better fit. For classifiers, the `score` method computes the prediction accuracy. For regressors, `score` computes the coefficient of determination (R2) of the prediction.\n",
32 | " + `model.transform()` : For feature selection algorithms, this will reduce the dataset to the selected features. For some classification and regression models such as some linear models and random forests, this method reduces the dataset to the most informative features. These classification and regression models can therefore also be used as feature selection methods.\n",
33 | " \n",
34 | "- Available in **unsupervised estimators**\n",
35 | " + `model.transform()` : given an unsupervised model, transform new data into the new basis.\n",
36 | " This also accepts one argument `X_new`, and returns the new representation of the data based\n",
37 | " on the unsupervised model.\n",
38 | " + `model.fit_transform()` : some estimators implement this method,\n",
39 | " which more efficiently performs a fit and a transform on the same input data.\n",
40 | " + `model.predict()` : for clustering algorithms, the predict method will produce cluster labels for new data points. Not all clustering methods have this functionality.\n",
41 | " + `model.predict_proba()` : Gaussian mixture models (GMMs) provide the probability for each point to be generated by a given mixture component.\n",
42 | " + `model.score()` : Density models like KDE and GMMs provide the likelihood of the data under the model."
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "Apart from ``fit``, the two most important functions are arguably ``predict`` to produce a target variable (a ``y``) ``transform``, which produces a new representation of the data (an ``X``).\n",
50 | "The following table shows for which class of models which function applies:\n",
51 | "\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "\n",
59 | "``model.predict`` | ``model.transform`` |
\n",
60 | "Classification | Preprocessing |
\n",
61 | "Regression | Dimensionality Reduction |
\n",
62 | "Clustering | Feature Extraction |
\n",
63 | " | Feature Selection |
\n",
64 | "\n",
65 | "
\n",
66 | "\n",
67 | "\n"
68 | ]
69 | }
70 | ],
71 | "metadata": {
72 | "anaconda-cloud": {},
73 | "kernelspec": {
74 | "display_name": "Python 3",
75 | "language": "python",
76 | "name": "python3"
77 | },
78 | "language_info": {
79 | "codemirror_mode": {
80 | "name": "ipython",
81 | "version": 3
82 | },
83 | "file_extension": ".py",
84 | "mimetype": "text/x-python",
85 | "name": "python",
86 | "nbconvert_exporter": "python",
87 | "pygments_lexer": "ipython3",
88 | "version": "3.6.4"
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 2
93 | }
94 |
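A minimal sketch of the interface described above, using LogisticRegression and PCA on the iris data purely as examples (the notebook itself does not prescribe any particular estimators):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.decomposition import PCA

    X, y = load_iris(return_X_y=True)

    # Supervised estimator: fit(X, y), then predict / predict_proba / score.
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, y)
    print(clf.predict(X[:3]))          # predicted class labels
    print(clf.predict_proba(X[:3]))    # per-class probabilities
    print(clf.score(X, y))             # mean accuracy for classifiers

    # Unsupervised estimator: fit(X), then transform (or fit_transform in one step).
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X)
    print(X_2d.shape)                  # new representation of the data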
--------------------------------------------------------------------------------
/notebooks/11.Text_Feature_Extraction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Methods - Text Feature Extraction with Bag-of-Words"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "In many tasks, like in the classical spam detection, your input data is text.\n",
28 | "Free text with variables length is very far from the fixed length numeric representation that we need to do machine learning with scikit-learn.\n",
29 | "However, there is an easy and effective way to go from text data to a numeric representation using the so-called bag-of-words model, which provides a data structure that is compatible with the machine learning aglorithms in scikit-learn."
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "
\n"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "Let's assume that each sample in your dataset is represented as one string, which could be just a sentence, an email, or a whole news article or book. To represent the sample, we first split the string into a list of tokens, which correspond to (somewhat normalized) words. A simple way to do this to just split by whitespace, and then lowercase the word. \n",
44 | "\n",
45 | "Then, we build a vocabulary of all tokens (lowercased words) that appear in our whole dataset. This is usually a very large vocabulary.\n",
46 | "Finally, looking at our single sample, we could show how often each word in the vocabulary appears.\n",
47 | "We represent our string by a vector, where each entry is how often a given word in the vocabulary appears in the string.\n",
48 | "\n",
49 | "As each sample will only contain very few words, most entries will be zero, leading to a very high-dimensional but sparse representation.\n",
50 | "\n",
51 | "The method is called \"bag-of-words,\" as the order of the words is lost entirely."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {
58 | "collapsed": true
59 | },
60 | "outputs": [],
61 | "source": [
62 | "X = [\"Some say the world will end in fire,\",\n",
63 | " \"Some say in ice.\"]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "len(X)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "from sklearn.feature_extraction.text import CountVectorizer\n",
82 | "\n",
83 | "vectorizer = CountVectorizer()\n",
84 | "vectorizer.fit(X)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "vectorizer.vocabulary_"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": true
101 | },
102 | "outputs": [],
103 | "source": [
104 | "X_bag_of_words = vectorizer.transform(X)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "X_bag_of_words.shape"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "X_bag_of_words"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "X_bag_of_words.toarray()"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "vectorizer.get_feature_names()"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "vectorizer.inverse_transform(X_bag_of_words)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "# tf-idf Encoding\n",
157 | "A useful transformation that is often applied to the bag-of-word encoding is the so-called term-frequency inverse-document-frequency (tf-idf) scaling, which is a non-linear transformation of the word counts.\n",
158 | "\n",
159 | "The tf-idf encoding rescales words that are common to have less weight:"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
169 | "\n",
170 | "tfidf_vectorizer = TfidfVectorizer()\n",
171 | "tfidf_vectorizer.fit(X)"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "import numpy as np\n",
181 | "np.set_printoptions(precision=2)\n",
182 | "\n",
183 | "print(tfidf_vectorizer.transform(X).toarray())"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "tf-idfs are a way to represent documents as feature vectors. tf-idfs can be understood as a modification of the raw term frequencies (`tf`); the `tf` is the count of how often a particular word occurs in a given document. The concept behind the tf-idf is to downweight terms proportionally to the number of documents in which they occur. Here, the idea is that terms that occur in many different documents are likely unimportant or don't contain any useful information for Natural Language Processing tasks such as document classification. If you are interested in the mathematical details and equations, see this [external IPython Notebook](http://nbviewer.jupyter.org/github/rasbt/pattern_classification/blob/master/machine_learning/scikit-learn/tfidf_scikit-learn.ipynb) that walks you through the computation."
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "# Bigrams and N-Grams\n",
198 | "\n",
199 | "In the example illustrated in the figure at the beginning of this notebook, we used the so-called 1-gram (unigram) tokenization: Each token represents a single element with regard to the splittling criterion. \n",
200 | "\n",
201 | "Entirely discarding word order is not always a good idea, as composite phrases often have specific meaning, and modifiers like \"not\" can invert the meaning of words.\n",
202 | "\n",
203 | "A simple way to include some word order are n-grams, which don't only look at a single token, but at all pairs of neighborhing tokens. For example, in 2-gram (bigram) tokenization, we would group words together with an overlap of one word; in 3-gram (trigram) splits we would create an overlap two words, and so forth:\n",
204 | "\n",
205 | "- original text: \"this is how you get ants\"\n",
206 | "- 1-gram: \"this\", \"is\", \"how\", \"you\", \"get\", \"ants\"\n",
207 | "- 2-gram: \"this is\", \"is how\", \"how you\", \"you get\", \"get ants\"\n",
208 | "- 3-gram: \"this is how\", \"is how you\", \"how you get\", \"you get ants\"\n",
209 | "\n",
210 | "Which \"n\" we choose for \"n-gram\" tokenization to obtain the optimal performance in our predictive model depends on the learning algorithm, dataset, and task. Or in other words, we have consider \"n\" in \"n-grams\" as a tuning parameters, and in later notebooks, we will see how we deal with these.\n",
211 | "\n",
212 | "Now, let's create a bag of words model of bigrams using scikit-learn's `CountVectorizer`:"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "# look at sequences of tokens of minimum length 2 and maximum length 2\n",
222 | "bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))\n",
223 | "bigram_vectorizer.fit(X)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "bigram_vectorizer.get_feature_names()"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "bigram_vectorizer.transform(X).toarray()"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "Often we want to include unigrams (single tokens) AND bigrams, wich we can do by passing the following tuple as an argument to the `ngram_range` parameter of the `CountVectorizer` function:"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "gram_vectorizer = CountVectorizer(ngram_range=(1, 2))\n",
258 | "gram_vectorizer.fit(X)"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "gram_vectorizer.get_feature_names()"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "gram_vectorizer.transform(X).toarray()"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "Character n-grams\n",
284 | "=================\n",
285 | "\n",
286 | "Sometimes it is also helpful not only to look at words, but to consider single characters instead. \n",
287 | "That is particularly useful if we have very noisy data and want to identify the language, or if we want to predict something about a single word.\n",
288 | "We can simply look at characters instead of words by setting ``analyzer=\"char\"``.\n",
289 | "Looking at single characters is usually not very informative, but looking at longer n-grams of characters could be:"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "X"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "char_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer=\"char\")\n",
308 | "char_vectorizer.fit(X)"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {
315 | "scrolled": true
316 | },
317 | "outputs": [],
318 | "source": [
319 | "print(char_vectorizer.get_feature_names())"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "\n",
327 | "
EXERCISE:\n",
328 | "
\n",
329 | " - \n",
330 | " Compute the bigrams from \"zen of python\" as given below (or by ``import this``), and find the most common trigram.\n",
331 | "We want to treat each line as a separate document. You can achieve this by splitting the string by newlines (``\\n``).\n",
332 | "Compute the Tf-idf encoding of the data. Which words have the highest tf-idf score? Why?\n",
333 | "What changes if you use ``TfidfVectorizer(norm=\"none\")``?\n",
334 | "
\n",
335 | "
\n",
336 | "
"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {
343 | "collapsed": true
344 | },
345 | "outputs": [],
346 | "source": [
347 | "zen = \"\"\"Beautiful is better than ugly.\n",
348 | "Explicit is better than implicit.\n",
349 | "Simple is better than complex.\n",
350 | "Complex is better than complicated.\n",
351 | "Flat is better than nested.\n",
352 | "Sparse is better than dense.\n",
353 | "Readability counts.\n",
354 | "Special cases aren't special enough to break the rules.\n",
355 | "Although practicality beats purity.\n",
356 | "Errors should never pass silently.\n",
357 | "Unless explicitly silenced.\n",
358 | "In the face of ambiguity, refuse the temptation to guess.\n",
359 | "There should be one-- and preferably only one --obvious way to do it.\n",
360 | "Although that way may not be obvious at first unless you're Dutch.\n",
361 | "Now is better than never.\n",
362 | "Although never is often better than *right* now.\n",
363 | "If the implementation is hard to explain, it's a bad idea.\n",
364 | "If the implementation is easy to explain, it may be a good idea.\n",
365 | "Namespaces are one honking great idea -- let's do more of those!\"\"\""
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {
372 | "collapsed": false,
373 | "deletable": true,
374 | "editable": true
375 | },
376 | "outputs": [],
377 | "source": [
378 | "# %load solutions/11_ngrams.py"
379 | ]
380 | }
381 | ],
382 | "metadata": {
383 | "anaconda-cloud": {},
384 | "kernelspec": {
385 | "display_name": "Python 3",
386 | "language": "python",
387 | "name": "python3"
388 | },
389 | "language_info": {
390 | "codemirror_mode": {
391 | "name": "ipython",
392 | "version": 3
393 | },
394 | "file_extension": ".py",
395 | "mimetype": "text/x-python",
396 | "name": "python",
397 | "nbconvert_exporter": "python",
398 | "pygments_lexer": "ipython3",
399 | "version": "3.6.4"
400 | }
401 | },
402 | "nbformat": 4,
403 | "nbformat_minor": 2
404 | }
405 |
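As a complement to the external notebook linked above, here is a minimal sketch that reproduces TfidfVectorizer's default weighting by hand, assuming scikit-learn's documented defaults (raw counts, smoothed idf ln((1 + n) / (1 + df)) + 1, then l2 row normalization) and the same two-sentence `X` as above:

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    X = ["Some say the world will end in fire,",
         "Some say in ice."]

    counts = CountVectorizer().fit_transform(X).toarray().astype(float)
    n_docs = counts.shape[0]
    df = (counts > 0).sum(axis=0)                 # document frequency of each term
    idf = np.log((1 + n_docs) / (1 + df)) + 1     # smoothed inverse document frequency
    tfidf_manual = counts * idf
    tfidf_manual /= np.linalg.norm(tfidf_manual, axis=1, keepdims=True)  # l2-normalize each row

    tfidf_sklearn = TfidfVectorizer().fit_transform(X).toarray()
    print(np.allclose(tfidf_manual, tfidf_sklearn))   # should print True if the defaults match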
--------------------------------------------------------------------------------
/notebooks/12.Case_Study-SMS_Spam_Detection.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Case Study - Text classification for SMS spam detection"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "We first load the text data from the `dataset` directory that should be located in your notebooks directory, which we created by running the `fetch_data.py` script from the top level of the GitHub repository.\n",
28 | "\n",
29 | "Furthermore, we perform some simple preprocessing and split the data array into two parts:\n",
30 | "\n",
31 | "1. `text`: A list of lists, where each sublists contains the contents of our emails\n",
32 | "2. `y`: our SPAM vs HAM labels stored in binary; a 1 represents a spam message, and a 0 represnts a ham (non-spam) message. "
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": true
40 | },
41 | "outputs": [],
42 | "source": [
43 | "import os\n",
44 | "\n",
45 | "with open(os.path.join(\"datasets\", \"smsspam\", \"SMSSpamCollection\")) as f:\n",
46 | " lines = [line.strip().split(\"\\t\") for line in f.readlines()]\n",
47 | "\n",
48 | "text = [x[1] for x in lines]\n",
49 | "y = [int(x[0] == \"spam\") for x in lines]"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "scrolled": true
57 | },
58 | "outputs": [],
59 | "source": [
60 | "text[:10]"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "scrolled": true
68 | },
69 | "outputs": [],
70 | "source": [
71 | "y[:10]"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "print('Number of ham and spam messages:', np.bincount(y))"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "type(text)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "type(y)"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "Next, we split our dataset into 2 parts, the test and training dataset:"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {
112 | "collapsed": true
113 | },
114 | "outputs": [],
115 | "source": [
116 | "from sklearn.model_selection import train_test_split\n",
117 | "\n",
118 | "text_train, text_test, y_train, y_test = train_test_split(text, y, \n",
119 | " random_state=42,\n",
120 | " test_size=0.25,\n",
121 | " stratify=y)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Now, we use the CountVectorizer to parse the text data into a bag-of-words model."
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "from sklearn.feature_extraction.text import CountVectorizer\n",
138 | "\n",
139 | "print('CountVectorizer defaults')\n",
140 | "CountVectorizer()"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "collapsed": true
148 | },
149 | "outputs": [],
150 | "source": [
151 | "vectorizer = CountVectorizer()\n",
152 | "vectorizer.fit(text_train)\n",
153 | "\n",
154 | "X_train = vectorizer.transform(text_train)\n",
155 | "X_test = vectorizer.transform(text_test)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "scrolled": true
163 | },
164 | "outputs": [],
165 | "source": [
166 | "print(len(vectorizer.vocabulary_))"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "X_train.shape"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "print(vectorizer.get_feature_names()[:20])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "print(vectorizer.get_feature_names()[2000:2020])"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "print(X_train.shape)\n",
203 | "print(X_test.shape)"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "### Training a Classifier on Text Features"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "We can now train a classifier, for instance a logistic regression classifier, which is a fast baseline for text classification tasks:"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "from sklearn.linear_model import LogisticRegression\n",
227 | "\n",
228 | "clf = LogisticRegression()\n",
229 | "clf"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "clf.fit(X_train, y_train)"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "We can now evaluate the classifier on the testing set. Let's first use the built-in score function, which is the rate of correct classification in the test set:"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "clf.score(X_test, y_test)"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "We can also compute the score on the training set to see how well we do there:"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "clf.score(X_train, y_train)"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "# Visualizing important features"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {
284 | "collapsed": true
285 | },
286 | "outputs": [],
287 | "source": [
288 | "def visualize_coefficients(classifier, feature_names, n_top_features=25):\n",
289 | " # get coefficients with large absolute values \n",
290 | " coef = classifier.coef_.ravel()\n",
291 | " positive_coefficients = np.argsort(coef)[-n_top_features:]\n",
292 | " negative_coefficients = np.argsort(coef)[:n_top_features]\n",
293 | " interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n",
294 | " # plot them\n",
295 | " plt.figure(figsize=(15, 5))\n",
296 | " colors = [\"tab:orange\" if c < 0 else \"tab:blue\" for c in coef[interesting_coefficients]]\n",
297 | " plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], color=colors)\n",
298 | " feature_names = np.array(feature_names)\n",
299 | " plt.xticks(np.arange(1, 2 * n_top_features + 1), feature_names[interesting_coefficients], rotation=60, ha=\"right\");"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "visualize_coefficients(clf, vectorizer.get_feature_names())"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "vectorizer = CountVectorizer(min_df=2)\n",
318 | "vectorizer.fit(text_train)\n",
319 | "\n",
320 | "X_train = vectorizer.transform(text_train)\n",
321 | "X_test = vectorizer.transform(text_test)\n",
322 | "\n",
323 | "clf = LogisticRegression()\n",
324 | "clf.fit(X_train, y_train)\n",
325 | "\n",
326 | "print(clf.score(X_train, y_train))\n",
327 | "print(clf.score(X_test, y_test))"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "len(vectorizer.get_feature_names())"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "print(vectorizer.get_feature_names()[:20])"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "visualize_coefficients(clf, vectorizer.get_feature_names())"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {},
360 | "source": [
361 | "
"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "\n",
369 | "
EXERCISE:\n",
370 | "
\n",
371 | " - \n",
372 | " Use TfidfVectorizer instead of CountVectorizer. Are the results better? How are the coefficients different?\n",
373 | "
\n",
374 | " - \n",
375 | " Change the parameters min_df and ngram_range of the TfidfVectorizer and CountVectorizer. How does that change the important features?\n",
376 | "
\n",
377 | "
\n",
378 | "
"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {
385 | "collapsed": true,
386 | "deletable": true,
387 | "editable": true
388 | },
389 | "outputs": [],
390 | "source": [
391 | "# %load solutions/12A_tfidf.py"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {
398 | "collapsed": true,
399 | "deletable": true,
400 | "editable": true
401 | },
402 | "outputs": [],
403 | "source": [
404 | "# %load solutions/12B_vectorizer_params.py"
405 | ]
406 | }
407 | ],
408 | "metadata": {
409 | "anaconda-cloud": {},
410 | "kernelspec": {
411 | "display_name": "Python 3",
412 | "language": "python",
413 | "name": "python3"
414 | },
415 | "language_info": {
416 | "codemirror_mode": {
417 | "name": "ipython",
418 | "version": 3
419 | },
420 | "file_extension": ".py",
421 | "mimetype": "text/x-python",
422 | "name": "python",
423 | "nbconvert_exporter": "python",
424 | "pygments_lexer": "ipython3",
425 | "version": "3.6.4"
426 | }
427 | },
428 | "nbformat": 4,
429 | "nbformat_minor": 2
430 | }
431 |
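The notebook fits and scores the classifier but never applies it to fresh text. A minimal sketch, assuming `vectorizer` and `clf` are the objects fitted in the cells above and using two made-up example messages:

    # The two strings below are invented for illustration only.
    new_messages = [
        "URGENT! You have won a free prize, call now to claim",
        "Are we still meeting for lunch tomorrow?",
    ]

    X_new = vectorizer.transform(new_messages)   # reuse the vocabulary learned on text_train
    print(clf.predict(X_new))                    # 1 = spam, 0 = ham, as defined when loading the data
    print(clf.predict_proba(X_new))              # class probabilities from LogisticRegression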
--------------------------------------------------------------------------------
/notebooks/13.Cross_Validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Cross-Validation and scoring methods"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "In the previous sections and notebooks, we split our dataset into two parts, a training set and a test set. We used the training set to fit our model, and we used the test set to evaluate its generalization performance -- how well it performs on new, unseen data.\n",
15 | "\n",
16 | "\n",
17 | "
\n"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "However, often (labeled) data is precious, and this approach lets us only use ~ 3/4 of our data for training. On the other hand, we will only ever try to apply our model 1/4 of our data for testing.\n",
25 | "A common way to use more of the data to build a model, but also get a more robust estimate of the generalization performance, is cross-validation.\n",
26 | "In cross-validation, the data is split repeatedly into a training and non-overlapping test-sets, with a separate model built for every pair. The test-set scores are then aggregated for a more robust estimate.\n",
27 | "\n",
28 | "The most common way to do cross-validation is k-fold cross-validation, in which the data is first split into k (often 5 or 10) equal-sized folds, and then for each iteration, one of the k folds is used as test data, and the rest as training data:"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "
\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "This way, each data point will be in the test-set exactly once, and we can use all but a k'th of the data for training.\n",
43 | "Let us apply this technique to evaluate the KNeighborsClassifier algorithm on the Iris dataset:"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "from sklearn.datasets import load_iris\n",
53 | "from sklearn.neighbors import KNeighborsClassifier\n",
54 | "\n",
55 | "iris = load_iris()\n",
56 | "X, y = iris.data, iris.target\n",
57 | "\n",
58 | "classifier = KNeighborsClassifier()"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "The labels in iris are sorted, which means that if we split the data as illustrated above, the first fold will only have the label 0 in it, while the last one will only have the label 2:"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "y"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "To avoid this problem in evaluation, we first shuffle our data:"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "import numpy as np\n",
91 | "rng = np.random.RandomState(0)\n",
92 | "\n",
93 | "permutation = rng.permutation(len(X))\n",
94 | "X, y = X[permutation], y[permutation]\n",
95 | "print(y)"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "Now implementing cross-validation is easy:"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "k = 5\n",
112 | "n_samples = len(X)\n",
113 | "fold_size = n_samples // k\n",
114 | "scores = []\n",
115 | "masks = []\n",
116 | "for fold in range(k):\n",
117 | " # generate a boolean mask for the test set in this fold\n",
118 | " test_mask = np.zeros(n_samples, dtype=bool)\n",
119 | " test_mask[fold * fold_size : (fold + 1) * fold_size] = True\n",
120 | " # store the mask for visualization\n",
121 | " masks.append(test_mask)\n",
122 | " # create training and test sets using this mask\n",
123 | " X_test, y_test = X[test_mask], y[test_mask]\n",
124 | " X_train, y_train = X[~test_mask], y[~test_mask]\n",
125 | " # fit the classifier\n",
126 | " classifier.fit(X_train, y_train)\n",
127 | " # compute the score and record it\n",
128 | " scores.append(classifier.score(X_test, y_test))"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "Let's check that our test mask does the right thing:"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "import matplotlib.pyplot as plt\n",
145 | "%matplotlib inline\n",
146 | "plt.matshow(masks, cmap='gray_r')"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "And now let's look a the scores we computed:"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "print(scores)\n",
163 | "print(np.mean(scores))"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "As you can see, there is a rather wide spectrum of scores from 90% correct to 100% correct. If we only did a single split, we might have gotten either answer."
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "As cross-validation is such a common pattern in machine learning, there are functions to do the above for you with much more flexibility and less code.\n",
178 | "The ``sklearn.model_selection`` module has all functions related to cross validation. There easiest function is ``cross_val_score`` which takes an estimator and a dataset, and will do all of the splitting for you:"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "from sklearn.model_selection import cross_val_score\n",
188 | "scores = cross_val_score(classifier, X, y)\n",
189 | "print('Scores on each CV fold: %s' % scores)\n",
190 | "print('Mean score: %0.3f' % np.mean(scores))"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "As you can see, the function uses three folds by default. You can change the number of folds using the cv argument:"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "cross_val_score(classifier, X, y, cv=5)"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "There are also helper objects in the cross-validation module that will generate indices for you for all kinds of different cross-validation methods, including k-fold:"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "By default, cross_val_score will use ``StratifiedKFold`` for classification, which ensures that the class proportions in the dataset are reflected in each fold. If you have a binary classification dataset with 90% of data point belonging to class 0, that would mean that in each fold, 90% of datapoints would belong to class 0.\n",
230 | "If you would just use KFold cross-validation, it is likely that you would generate a split that only contains class 0.\n",
231 | "It is generally a good idea to use ``StratifiedKFold`` whenever you do classification.\n",
232 | "\n",
233 | "``StratifiedKFold`` would also remove our need to shuffle ``iris``.\n",
234 | "Let's see what kinds of folds it generates on the unshuffled iris dataset.\n",
235 | "Each cross-validation class is a generator of sets of training and test indices:"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "cv = StratifiedKFold(n_splits=5)\n",
245 | "for train, test in cv.split(iris.data, iris.target):\n",
246 | " print(test)"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "As you can see, there are a couple of samples from the beginning, then from the middle, and then from the end, in each of the folds.\n",
254 | "This way, the class ratios are preserved. Let's visualize the split:"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "def plot_cv(cv, features, labels):\n",
264 | " masks = []\n",
265 | " for train, test in cv.split(features, labels):\n",
266 | " mask = np.zeros(len(labels), dtype=bool)\n",
267 | " mask[test] = 1\n",
268 | " masks.append(mask)\n",
269 | " \n",
270 | " plt.matshow(masks, cmap='gray_r')"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "plot_cv(StratifiedKFold(n_splits=5), iris.data, iris.target)"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "For comparison, again the standard KFold, that ignores the labels:"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "plot_cv(KFold(n_splits=5), iris.data, iris.target)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "Keep in mind that increasing the number of folds will give you a larger training dataset, but will lead to more repetitions, and therefore a slower evaluation:"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "plot_cv(KFold(n_splits=10), iris.data, iris.target)"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "Another helpful cross-validation generator is ``ShuffleSplit``. This generator simply splits of a random portion of the data repeatedly. This allows the user to specify the number of repetitions and the training set size independently:"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "plot_cv(ShuffleSplit(n_splits=5, test_size=.2), iris.data, iris.target)"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "If you want a more robust estimate, you can just increase the number of splits:"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": null,
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "plot_cv(ShuffleSplit(n_splits=20, test_size=.2), iris.data, iris.target)"
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "metadata": {},
349 | "source": [
350 | "You can use all of these cross-validation generators with the `cross_val_score` method:"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "cv = ShuffleSplit(n_splits=5, test_size=.2)\n",
360 | "cross_val_score(classifier, X, y, cv=cv)"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "\n",
368 | "
EXERCISE:\n",
369 | "
\n",
370 | " - \n",
371 | " Perform three-fold cross-validation using the ``KFold`` class on the iris dataset without shuffling the data. Can you explain the result?\n",
372 | "
\n",
373 | "
\n",
374 | "
"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "metadata": {},
381 | "outputs": [],
382 | "source": [
383 | "# %load solutions/13_cross_validation.py"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": null,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": []
392 | }
393 | ],
394 | "metadata": {
395 | "anaconda-cloud": {},
396 | "kernelspec": {
397 | "display_name": "Python 3",
398 | "language": "python",
399 | "name": "python3"
400 | },
401 | "language_info": {
402 | "codemirror_mode": {
403 | "name": "ipython",
404 | "version": 3
405 | },
406 | "file_extension": ".py",
407 | "mimetype": "text/x-python",
408 | "name": "python",
409 | "nbconvert_exporter": "python",
410 | "pygments_lexer": "ipython3",
411 | "version": "3.6.2"
412 | }
413 | },
414 | "nbformat": 4,
415 | "nbformat_minor": 2
416 | }
417 |
--------------------------------------------------------------------------------
/notebooks/15.Pipelining_Estimators.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true,
8 | "deletable": true,
9 | "editable": true
10 | },
11 | "outputs": [],
12 | "source": [
13 | "%matplotlib inline\n",
14 | "import numpy as np\n",
15 | "import matplotlib.pyplot as plt"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "deletable": true,
22 | "editable": true
23 | },
24 | "source": [
25 | "# Pipelining estimators"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "deletable": true,
32 | "editable": true
33 | },
34 | "source": [
35 | "In this section we study how different estimators maybe be chained."
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {
41 | "deletable": true,
42 | "editable": true
43 | },
44 | "source": [
45 | "## A simple example: feature extraction and selection before an estimator"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {
51 | "deletable": true,
52 | "editable": true
53 | },
54 | "source": [
55 | "### Feature extraction: vectorizer"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {
61 | "deletable": true,
62 | "editable": true
63 | },
64 | "source": [
65 | "For some types of data, for instance text data, a feature extraction step must be applied to convert it to numerical features.\n",
66 | "To illustrate we load the SMS spam dataset we used earlier."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "collapsed": true,
74 | "deletable": true,
75 | "editable": true
76 | },
77 | "outputs": [],
78 | "source": [
79 | "import os\n",
80 | "\n",
81 | "with open(os.path.join(\"datasets\", \"smsspam\", \"SMSSpamCollection\")) as f:\n",
82 | " lines = [line.strip().split(\"\\t\") for line in f.readlines()]\n",
83 | "text = [x[1] for x in lines]\n",
84 | "y = [x[0] == \"ham\" for x in lines]"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "collapsed": true,
92 | "deletable": true,
93 | "editable": true
94 | },
95 | "outputs": [],
96 | "source": [
97 | "from sklearn.model_selection import train_test_split\n",
98 | "\n",
99 | "text_train, text_test, y_train, y_test = train_test_split(text, y)"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {
105 | "deletable": true,
106 | "editable": true
107 | },
108 | "source": [
109 | "Previously, we applied the feature extraction manually, like so:"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {
116 | "collapsed": false,
117 | "deletable": true,
118 | "editable": true
119 | },
120 | "outputs": [],
121 | "source": [
122 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
123 | "from sklearn.linear_model import LogisticRegression\n",
124 | "\n",
125 | "vectorizer = TfidfVectorizer()\n",
126 | "vectorizer.fit(text_train)\n",
127 | "\n",
128 | "X_train = vectorizer.transform(text_train)\n",
129 | "X_test = vectorizer.transform(text_test)\n",
130 | "\n",
131 | "clf = LogisticRegression()\n",
132 | "clf.fit(X_train, y_train)\n",
133 | "\n",
134 | "clf.score(X_test, y_test)"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {
140 | "deletable": true,
141 | "editable": true
142 | },
143 | "source": [
144 | "The situation where we learn a transformation and then apply it to the test data is very common in machine learning.\n",
145 | "Therefore scikit-learn has a shortcut for this, called pipelines:"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {
152 | "collapsed": false,
153 | "deletable": true,
154 | "editable": true
155 | },
156 | "outputs": [],
157 | "source": [
158 | "from sklearn.pipeline import make_pipeline\n",
159 | "\n",
160 | "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())\n",
161 | "pipeline.fit(text_train, y_train)\n",
162 | "pipeline.score(text_test, y_test)"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {
168 | "deletable": true,
169 | "editable": true
170 | },
171 | "source": [
172 | "As you can see, this makes the code much shorter and easier to handle. Behind the scenes, exactly the same as above is happening. When calling fit on the pipeline, it will call fit on each step in turn.\n",
173 | "\n",
174 | "After the first step is fit, it will use the ``transform`` method of the first step to create a new representation.\n",
175 | "This will then be fed to the ``fit`` of the next step, and so on.\n",
176 | "Finally, on the last step, only ``fit`` is called.\n",
177 | "\n",
178 | "\n",
179 | "\n",
180 | "If we call ``score``, only ``transform`` will be called on each step - this could be the test set after all! Then, on the last step, ``score`` is called with the new representation. The same goes for ``predict``."
181 | ]
182 | },
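To make the chaining concrete, here is a rough sketch of what the two-step pipeline above does when ``fit`` and ``score`` are called (a simplified illustration, not the actual ``Pipeline`` source code):

```python
# Sketch of Pipeline behavior for the TfidfVectorizer -> LogisticRegression pipeline above.
vect = TfidfVectorizer()
clf = LogisticRegression()

# fit: fit (and transform with) every step but the last, then fit the last step
X_train_vec = vect.fit_transform(text_train)
clf.fit(X_train_vec, y_train)

# score: only transform with every step but the last, then score with the last step
X_test_vec = vect.transform(text_test)
print(clf.score(X_test_vec, y_test))
```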
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {
186 | "deletable": true,
187 | "editable": true
188 | },
189 | "source": [
190 | "Building pipelines not only simplifies the code, it is also important for model selection.\n",
191 | "Say we want to grid-search C to tune our Logistic Regression above.\n",
192 | "\n",
193 | "Let's say we do it like this:"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {
200 | "collapsed": false,
201 | "deletable": true,
202 | "editable": true
203 | },
204 | "outputs": [],
205 | "source": [
206 | "# This illustrates a common mistake. Don't use this code!\n",
207 | "from sklearn.model_selection import GridSearchCV\n",
208 | "\n",
209 | "vectorizer = TfidfVectorizer()\n",
210 | "vectorizer.fit(text_train)\n",
211 | "\n",
212 | "X_train = vectorizer.transform(text_train)\n",
213 | "X_test = vectorizer.transform(text_test)\n",
214 | "\n",
215 | "clf = LogisticRegression()\n",
216 | "grid = GridSearchCV(clf, param_grid={'C': [.1, 1, 10, 100]}, cv=5)\n",
217 | "grid.fit(X_train, y_train)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "deletable": true,
224 | "editable": true
225 | },
226 | "source": [
227 | "### What did we do wrong?"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {
233 | "deletable": true,
234 | "editable": true
235 | },
236 | "source": [
237 | "Here, we did grid-search with cross-validation on ``X_train``. However, when applying ``TfidfVectorizer``, it saw all of the ``X_train``,\n",
238 | "not only the training folds! So it could use knowledge of the frequency of the words in the test-folds. This is called \"contamination\" of the test set, and leads to too optimistic estimates of generalization performance, or badly selected parameters.\n",
239 | "We can fix this with the pipeline, though:"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {
246 | "collapsed": false,
247 | "deletable": true,
248 | "editable": true
249 | },
250 | "outputs": [],
251 | "source": [
252 | "from sklearn.model_selection import GridSearchCV\n",
253 | "\n",
254 | "pipeline = make_pipeline(TfidfVectorizer(), \n",
255 | " LogisticRegression())\n",
256 | "\n",
257 | "grid = GridSearchCV(pipeline,\n",
258 | " param_grid={'logisticregression__C': [.1, 1, 10, 100]}, cv=5)\n",
259 | "\n",
260 | "grid.fit(text_train, y_train)\n",
261 | "grid.score(text_test, y_test)"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {
267 | "deletable": true,
268 | "editable": true
269 | },
270 | "source": [
271 | "Note that we need to tell the pipeline where at which step we wanted to set the parameter ``C``.\n",
272 | "We can do this using the special ``__`` syntax. The name before the ``__`` is simply the name of the class, the part after ``__`` is the parameter we want to set with grid-search."
273 | ]
274 | },
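If you are unsure what the step names are, the pipeline itself can tell you; a small sketch using the ``pipeline`` object defined above:

```python
# make_pipeline names each step after its lowercased class name,
# so parameters are addressed as '<stepname>__<parameter>'.
print(list(pipeline.named_steps.keys()))            # e.g. ['tfidfvectorizer', 'logisticregression']
print([p for p in pipeline.get_params() if p.endswith('__C')])

# The same step__parameter syntax also works outside of GridSearchCV:
pipeline.set_params(logisticregression__C=10)
```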
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {
278 | "deletable": true,
279 | "editable": true
280 | },
281 | "source": [
282 | "
"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {
288 | "deletable": true,
289 | "editable": true
290 | },
291 | "source": [
292 | "Another benefit of using pipelines is that we can now also search over parameters of the feature extraction with ``GridSearchCV``:"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {
299 | "collapsed": false,
300 | "deletable": true,
301 | "editable": true
302 | },
303 | "outputs": [],
304 | "source": [
305 | "from sklearn.model_selection import GridSearchCV\n",
306 | "\n",
307 | "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())\n",
308 | "\n",
309 | "params = {'logisticregression__C': [.1, 1, 10, 100],\n",
310 | " \"tfidfvectorizer__ngram_range\": [(1, 1), (1, 2), (2, 2)]}\n",
311 | "grid = GridSearchCV(pipeline, param_grid=params, cv=5)\n",
312 | "grid.fit(text_train, y_train)\n",
313 | "print(grid.best_params_)\n",
314 | "grid.score(text_test, y_test)"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {
320 | "deletable": true,
321 | "editable": true
322 | },
323 | "source": [
324 | "\n",
325 | "
EXERCISE:\n",
326 | "
\n",
327 | " - \n",
328 | " Create a pipeline out of a StandardScaler and Ridge regression and apply it to the Boston housing dataset (load using ``sklearn.datasets.load_boston``). Try adding the ``sklearn.preprocessing.PolynomialFeatures`` transformer as a second preprocessing step, and grid-search the degree of the polynomials (try 1, 2 and 3).\n",
329 | "
\n",
330 | "
\n",
331 | "
"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {
338 | "collapsed": true,
339 | "deletable": true,
340 | "editable": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "# %load solutions/15A_ridge_grid.py"
345 | ]
346 | }
347 | ],
348 | "metadata": {
349 | "anaconda-cloud": {},
350 | "kernelspec": {
351 | "display_name": "Python 3",
352 | "language": "python",
353 | "name": "python3"
354 | },
355 | "language_info": {
356 | "codemirror_mode": {
357 | "name": "ipython",
358 | "version": 3
359 | },
360 | "file_extension": ".py",
361 | "mimetype": "text/x-python",
362 | "name": "python",
363 | "nbconvert_exporter": "python",
364 | "pygments_lexer": "ipython3",
365 | "version": "3.6.4"
366 | }
367 | },
368 | "nbformat": 4,
369 | "nbformat_minor": 2
370 | }
371 |
--------------------------------------------------------------------------------
/notebooks/18.In_Depth-Trees_and_Forests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# In Depth - Decision Trees and Forests"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "%matplotlib inline\n",
19 | "import numpy as np\n",
20 | "import matplotlib.pyplot as plt"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "Here we'll explore a class of algorithms based on decision trees.\n",
28 | "Decision trees at their root are extremely intuitive. They\n",
29 | "encode a series of \"if\" and \"else\" choices, similar to how a person might make a decision.\n",
30 | "However, which questions to ask, and how to proceed for each answer is entirely learned from the data.\n",
31 | "\n",
32 | "For example, if you wanted to create a guide to identifying an animal found in nature, you\n",
33 | "might ask the following series of questions:\n",
34 | "\n",
35 | "- Is the animal bigger or smaller than a meter long?\n",
36 | " + *bigger*: does the animal have horns?\n",
37 | " - *yes*: are the horns longer than ten centimeters?\n",
38 | " - *no*: is the animal wearing a collar\n",
39 | " + *smaller*: does the animal have two or four legs?\n",
40 | " - *two*: does the animal have wings?\n",
41 | " - *four*: does the animal have a bushy tail?\n",
42 | "\n",
43 | "and so on. This binary splitting of questions is the essence of a decision tree."
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "One of the main benefit of tree-based models is that they require little preprocessing of the data.\n",
51 | "They can work with variables of different types (continuous and discrete) and are invariant to scaling of the features.\n",
52 | "\n",
53 | "Another benefit is that tree-based models are what is called \"nonparametric\", which means they don't have a fix set of parameters to learn. Instead, a tree model can become more and more flexible, if given more data.\n",
54 | "In other words, the number of free parameters grows with the number of samples and is not fixed, as for example in linear models.\n"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Decision Tree Regression"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "A decision tree is a simple binary classification tree that is\n",
69 | "similar to nearest neighbor classification. It can be used as follows:"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "from figures import make_dataset\n",
79 | "x, y = make_dataset()\n",
80 | "X = x.reshape(-1, 1)\n",
81 | "\n",
82 | "plt.figure()\n",
83 | "plt.xlabel('Feature X')\n",
84 | "plt.ylabel('Target y')\n",
85 | "plt.scatter(X, y);"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "from sklearn.tree import DecisionTreeRegressor\n",
95 | "\n",
96 | "reg = DecisionTreeRegressor(max_depth=5)\n",
97 | "reg.fit(X, y)\n",
98 | "\n",
99 | "X_fit = np.linspace(-3, 3, 1000).reshape((-1, 1))\n",
100 | "y_fit_1 = reg.predict(X_fit)\n",
101 | "\n",
102 | "plt.figure()\n",
103 | "plt.plot(X_fit.ravel(), y_fit_1, color='tab:blue', label=\"prediction\")\n",
104 | "plt.plot(X.ravel(), y, 'C7.', label=\"training data\")\n",
105 | "plt.legend(loc=\"best\");"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "A single decision tree allows us to estimate the signal in a non-parametric way,\n",
113 | "but clearly has some issues. In some regions, the model shows high bias and\n",
114 | "under-fits the data.\n",
115 | "(seen in the long flat lines which don't follow the contours of the data),\n",
116 | "while in other regions the model shows high variance and over-fits the data\n",
117 | "(reflected in the narrow spikes which are influenced by noise in single points)."
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "Decision Tree Classification\n",
125 | "==================\n",
126 | "Decision tree classification work very similarly, by assigning all points within a leaf the majority class in that leaf:\n"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "from sklearn.datasets import make_blobs\n",
136 | "from sklearn.model_selection import train_test_split\n",
137 | "from sklearn.tree import DecisionTreeClassifier\n",
138 | "from figures import plot_2d_separator\n",
139 | "from figures import cm2\n",
140 | "\n",
141 | "\n",
142 | "X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=100)\n",
143 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
144 | "\n",
145 | "clf = DecisionTreeClassifier(max_depth=5)\n",
146 | "clf.fit(X_train, y_train)\n",
147 | "\n",
148 | "plt.figure()\n",
149 | "plot_2d_separator(clf, X, fill=True)\n",
150 | "plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm2, s=60, alpha=.7, edgecolor='k')\n",
151 | "plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm2, s=60, edgecolor='k');"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "There are many parameter that control the complexity of a tree, but the one that might be easiest to understand is the maximum depth. This limits how finely the tree can partition the input space, or how many \"if-else\" questions can be asked before deciding which class a sample lies in.\n",
159 | "\n",
160 | "This parameter is important to tune for trees and tree-based models. The interactive plot below shows how underfit and overfit looks like for this model. Having a ``max_depth`` of 1 is clearly an underfit model, while a depth of 7 or 8 clearly overfits. The maximum depth a tree can be grown at for this dataset is 8, at which point each leave only contains samples from a single class. This is known as all leaves being \"pure.\"\n",
161 | "\n",
162 | "In the interactive plot below, the regions are assigned blue and red colors to indicate the predicted class for that region. The shade of the color indicates the predicted probability for that class (darker = higher probability), while yellow regions indicate an equal predicted probability for either class."
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "from figures import plot_tree\n",
172 | "max_depth = 3\n",
173 | "plot_tree(max_depth=max_depth)"
174 | ]
175 | },
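For a non-interactive version of the same idea, here is a minimal sketch comparing training and test accuracy for a few depths on the blobs split from above (the particular depths tried are an arbitrary choice):

```python
from sklearn.tree import DecisionTreeClassifier

# Training accuracy keeps climbing with depth, while test accuracy
# typically flattens or drops: the usual signature of overfitting.
for depth in [1, 3, 5, 8]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0).fit(X_train, y_train)
    print("max_depth=%d  train: %.2f  test: %.2f"
          % (depth, tree.score(X_train, y_train), tree.score(X_test, y_test)))
```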
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "Decision trees are fast to train, easy to understand, and often lead to interpretable models. However, single trees often tend to overfit the training data. Playing with the slider above you might notice that the model starts to overfit even before it has a good separation between the classes.\n",
181 | "\n",
182 | "Therefore, in practice it is more common to combine multiple trees to produce models that generalize better. The most common methods for combining trees are random forests and gradient boosted trees.\n"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "## Random Forests"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "Random forests are simply many trees, built on different random subsets (drawn with replacement) of the data, and using different random subsets (drawn without replacement) of the features for each split.\n",
197 | "This makes the trees different from each other, and makes them overfit to different aspects. Then, their predictions are averaged, leading to a smoother estimate that overfits less.\n"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "from figures import plot_forest\n",
207 | "max_depth = 3\n",
208 | "plot_forest(max_depth=max_depth)"
209 | ]
210 | },
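As a small sketch of the effect of averaging, compare a single unpruned tree with a forest on the same blobs split as above (100 trees and ``random_state=0`` are arbitrary choices):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# A single fully grown tree usually fits the training set perfectly but
# generalizes worse; averaging many randomized trees smooths the decision boundary.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

print("single tree   train: %.2f  test: %.2f"
      % (tree.score(X_train, y_train), tree.score(X_test, y_test)))
print("random forest train: %.2f  test: %.2f"
      % (forest.score(X_train, y_train), forest.score(X_test, y_test)))
```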
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## Selecting the Optimal Estimator via Cross-Validation"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "from sklearn.model_selection import GridSearchCV\n",
225 | "from sklearn.datasets import load_digits\n",
226 | "from sklearn.ensemble import RandomForestClassifier\n",
227 | "\n",
228 | "digits = load_digits()\n",
229 | "X, y = digits.data, digits.target\n",
230 | "\n",
231 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
232 | "\n",
233 | "rf = RandomForestClassifier(n_estimators=200)\n",
234 | "parameters = {'max_features':['sqrt', 'log2', 10],\n",
235 | " 'max_depth':[5, 7, 9]}\n",
236 | "\n",
237 | "clf_grid = GridSearchCV(rf, parameters, n_jobs=-1)\n",
238 | "clf_grid.fit(X_train, y_train)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "clf_grid.score(X_train, y_train)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "clf_grid.score(X_test, y_test)"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "## Another option: Gradient Boosting"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "Another Ensemble method that can be useful is *Boosting*: here, rather than\n",
271 | "looking at 200 (say) parallel estimators, We construct a chain of 200 estimators\n",
272 | "which iteratively refine the results of the previous estimator.\n",
273 | "The idea is that by sequentially applying very fast, simple models, we can get a\n",
274 | "total model error which is better than any of the individual pieces."
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {
281 | "collapsed": true
282 | },
283 | "outputs": [],
284 | "source": [
285 | "from sklearn.ensemble import GradientBoostingRegressor\n",
286 | "clf = GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=.2)\n",
287 | "clf.fit(X_train, y_train)\n",
288 | "\n",
289 | "print(clf.score(X_train, y_train))\n",
290 | "print(clf.score(X_test, y_test))"
291 | ]
292 | },
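The iterative refinement can be inspected directly through ``staged_predict``, which yields the ensemble's prediction after each boosting stage; a brief sketch using the regressor fitted above (printing every 20th stage is an arbitrary choice):

```python
import numpy as np

# staged_predict yields predictions using only the first i+1 trees;
# the squared error on the test set typically shrinks as stages are added.
for i, y_pred in enumerate(clf.staged_predict(X_test)):
    if i % 20 == 0:
        print("stage %3d  test MSE: %.3f" % (i, np.mean((y_pred - y_test) ** 2)))
```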
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "\n",
298 | "
EXERCISE: Cross-validating Gradient Boosting:\n",
299 | "
\n",
300 | " - \n",
301 | " Use a grid search to optimize the `learning_rate` and `max_depth` for a Gradient Boosted\n",
302 | "Decision tree on the digits data set.\n",
303 | "
\n",
304 | "
\n",
305 | "
"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {
312 | "collapsed": true
313 | },
314 | "outputs": [],
315 | "source": [
316 | "from sklearn.datasets import load_digits\n",
317 | "from sklearn.ensemble import GradientBoostingClassifier\n",
318 | "\n",
319 | "digits = load_digits()\n",
320 | "X_digits, y_digits = digits.data, digits.target\n",
321 | "\n",
322 | "# split the dataset, apply grid-search"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {
329 | "collapsed": true
330 | },
331 | "outputs": [],
332 | "source": [
333 | "# %load solutions/18_gbc_grid.py"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "## Feature importance\n",
341 | "\n",
342 | "Both RandomForest and GradientBoosting objects expose a `feature_importances_` attribute when fitted. This attribute is one of the most powerful feature of these models. They basically quantify how much each feature contributes to gain in performance in the nodes of the different trees."
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "X, y = X_digits[y_digits < 2], y_digits[y_digits < 2]\n",
352 | "\n",
353 | "rf = RandomForestClassifier(n_estimators=300, n_jobs=1)\n",
354 | "rf.fit(X, y)\n",
355 | "print(rf.feature_importances_) # one value per feature"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "plt.figure()\n",
365 | "plt.imshow(rf.feature_importances_.reshape(8, 8), cmap=plt.cm.viridis, interpolation='nearest')"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": []
374 | }
375 | ],
376 | "metadata": {
377 | "anaconda-cloud": {},
378 | "kernelspec": {
379 | "display_name": "Python 3",
380 | "language": "python",
381 | "name": "python3"
382 | },
383 | "language_info": {
384 | "codemirror_mode": {
385 | "name": "ipython",
386 | "version": 3
387 | },
388 | "file_extension": ".py",
389 | "mimetype": "text/x-python",
390 | "name": "python",
391 | "nbconvert_exporter": "python",
392 | "pygments_lexer": "ipython3",
393 | "version": "3.6.4"
394 | }
395 | },
396 | "nbformat": 4,
397 | "nbformat_minor": 2
398 | }
399 |
--------------------------------------------------------------------------------
/notebooks/21.Unsupervised_learning-Non-linear_dimensionality_reduction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true,
8 | "deletable": true,
9 | "editable": true
10 | },
11 | "outputs": [],
12 | "source": [
13 | "%matplotlib inline\n",
14 | "import matplotlib.pyplot as plt\n",
15 | "import numpy as np"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "deletable": true,
22 | "editable": true
23 | },
24 | "source": [
25 | "## Manifold Learning\n",
26 | "\n",
27 | "One weakness of PCA is that it cannot detect non-linear features. A set\n",
28 | "of algorithms known as *Manifold Learning* have been developed to address\n",
29 | "this deficiency. A canonical dataset used in Manifold learning is the\n",
30 | "*S-curve*:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {
37 | "collapsed": false,
38 | "deletable": true,
39 | "editable": true
40 | },
41 | "outputs": [],
42 | "source": [
43 | "from sklearn.datasets import make_s_curve\n",
44 | "X, y = make_s_curve(n_samples=1000)\n",
45 | "\n",
46 | "from mpl_toolkits.mplot3d import Axes3D\n",
47 | "ax = plt.axes(projection='3d')\n",
48 | "\n",
49 | "ax.scatter3D(X[:, 0], X[:, 1], X[:, 2], c=y)\n",
50 | "ax.view_init(10, -60);"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "deletable": true,
57 | "editable": true
58 | },
59 | "source": [
60 | "This is a 2-dimensional dataset embedded in three dimensions, but it is embedded\n",
61 | "in such a way that PCA cannot discover the underlying data orientation:"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "collapsed": false,
69 | "deletable": true,
70 | "editable": true
71 | },
72 | "outputs": [],
73 | "source": [
74 | "from sklearn.decomposition import PCA\n",
75 | "X_pca = PCA(n_components=2).fit_transform(X)\n",
76 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y);"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {
82 | "deletable": true,
83 | "editable": true
84 | },
85 | "source": [
86 | "Manifold learning algorithms, however, available in the ``sklearn.manifold``\n",
87 | "submodule, are able to recover the underlying 2-dimensional manifold:"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "collapsed": false,
95 | "deletable": true,
96 | "editable": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "from sklearn.manifold import Isomap\n",
101 | "\n",
102 | "iso = Isomap(n_neighbors=15, n_components=2)\n",
103 | "X_iso = iso.fit_transform(X)\n",
104 | "plt.scatter(X_iso[:, 0], X_iso[:, 1], c=y);"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {
110 | "deletable": true,
111 | "editable": true
112 | },
113 | "source": [
114 | "## Manifold learning on the digits data"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "deletable": true,
121 | "editable": true
122 | },
123 | "source": [
124 | "We can apply manifold learning techniques to much higher dimensional datasets, for example the digits data that we saw before:"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {
131 | "collapsed": false,
132 | "deletable": true,
133 | "editable": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "from sklearn.datasets import load_digits\n",
138 | "digits = load_digits()\n",
139 | "\n",
140 | "fig, axes = plt.subplots(2, 5, figsize=(10, 5),\n",
141 | " subplot_kw={'xticks':(), 'yticks': ()})\n",
142 | "for ax, img in zip(axes.ravel(), digits.images):\n",
143 | " ax.imshow(img, interpolation=\"none\", cmap=\"gray\")"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "deletable": true,
150 | "editable": true
151 | },
152 | "source": [
153 | "We can visualize the dataset using a linear technique, such as PCA. We saw this already provides some intuition about the data:"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "collapsed": false,
161 | "deletable": true,
162 | "editable": true
163 | },
164 | "outputs": [],
165 | "source": [
166 | "# build a PCA model\n",
167 | "pca = PCA(n_components=2)\n",
168 | "pca.fit(digits.data)\n",
169 | "# transform the digits data onto the first two principal components\n",
170 | "digits_pca = pca.transform(digits.data)\n",
171 | "colors = [\"#476A2A\", \"#7851B8\", \"#BD3430\", \"#4A2D4E\", \"#875525\",\n",
172 | " \"#A83683\", \"#4E655E\", \"#853541\", \"#3A3120\",\"#535D8E\"]\n",
173 | "plt.figure(figsize=(10, 10))\n",
174 | "plt.xlim(digits_pca[:, 0].min(), digits_pca[:, 0].max() + 1)\n",
175 | "plt.ylim(digits_pca[:, 1].min(), digits_pca[:, 1].max() + 1)\n",
176 | "for i in range(len(digits.data)):\n",
177 | " # actually plot the digits as text instead of using scatter\n",
178 | " plt.text(digits_pca[i, 0], digits_pca[i, 1], str(digits.target[i]),\n",
179 | " color = colors[digits.target[i]],\n",
180 | " fontdict={'weight': 'bold', 'size': 9})\n",
181 | "plt.xlabel(\"first principal component\")\n",
182 | "plt.ylabel(\"second principal component\");"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {
188 | "deletable": true,
189 | "editable": true
190 | },
191 | "source": [
192 | "Using a more powerful, nonlinear techinque can provide much better visualizations, though.\n",
193 | "Here, we are using the t-SNE manifold learning method:"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {
200 | "collapsed": true,
201 | "deletable": true,
202 | "editable": true
203 | },
204 | "outputs": [],
205 | "source": [
206 | "from sklearn.manifold import TSNE\n",
207 | "tsne = TSNE(random_state=42)\n",
208 | "# use fit_transform instead of fit, as TSNE has no transform method:\n",
209 | "digits_tsne = tsne.fit_transform(digits.data)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "collapsed": false,
217 | "deletable": true,
218 | "editable": true
219 | },
220 | "outputs": [],
221 | "source": [
222 | "plt.figure(figsize=(10, 10))\n",
223 | "plt.xlim(digits_tsne[:, 0].min(), digits_tsne[:, 0].max() + 1)\n",
224 | "plt.ylim(digits_tsne[:, 1].min(), digits_tsne[:, 1].max() + 1)\n",
225 | "for i in range(len(digits.data)):\n",
226 | " # actually plot the digits as text instead of using scatter\n",
227 | " plt.text(digits_tsne[i, 0], digits_tsne[i, 1], str(digits.target[i]),\n",
228 | " color = colors[digits.target[i]],\n",
229 | " fontdict={'weight': 'bold', 'size': 9})"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {
235 | "deletable": true,
236 | "editable": true
237 | },
238 | "source": [
239 | "t-SNE has a somewhat longer runtime that other manifold learning algorithms, but the result is quite striking. Keep in mind that this algorithm is purely unsupervised, and does not know about the class labels. Still it is able to separate the classes very well (though the classes four, one and nine have been split into multiple groups)."
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {
245 | "deletable": true,
246 | "editable": true
247 | },
248 | "source": [
249 | "\n",
250 | "
EXERCISE:\n",
251 | "
\n",
252 | " - \n",
253 | " Compare the results of applying isomap to the digits dataset to the results of PCA and t-SNE. Which result do you think looks best?\n",
254 | "
\n",
255 | " - \n",
256 | " Given how well t-SNE separated the classes, one might be tempted to use this processing for classification. Try training a K-nearest neighbor classifier on digits data transformed with t-SNE, and compare to the accuracy on using the dataset without any transformation.\n",
257 | "
\n",
258 | "
\n",
259 | "
"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {
266 | "collapsed": true,
267 | "deletable": true,
268 | "editable": true
269 | },
270 | "outputs": [],
271 | "source": [
272 | "# %load solutions/21A_isomap_digits.py"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {
279 | "collapsed": true,
280 | "deletable": true,
281 | "editable": true
282 | },
283 | "outputs": [],
284 | "source": [
285 | "# %load solutions/21B_tsne_classification.py"
286 | ]
287 | }
288 | ],
289 | "metadata": {
290 | "anaconda-cloud": {},
291 | "kernelspec": {
292 | "display_name": "Python 3",
293 | "language": "python",
294 | "name": "python3"
295 | },
296 | "language_info": {
297 | "codemirror_mode": {
298 | "name": "ipython",
299 | "version": 3
300 | },
301 | "file_extension": ".py",
302 | "mimetype": "text/x-python",
303 | "name": "python",
304 | "nbconvert_exporter": "python",
305 | "pygments_lexer": "ipython3",
306 | "version": "3.6.4"
307 | }
308 | },
309 | "nbformat": 4,
310 | "nbformat_minor": 2
311 | }
312 |
--------------------------------------------------------------------------------
/notebooks/datasets/smsspam/readme:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/datasets/smsspam/readme
--------------------------------------------------------------------------------
/notebooks/figures/ML_flow_chart.py:
--------------------------------------------------------------------------------
1 | """
2 | Tutorial Diagrams
3 | -----------------
4 |
5 | This script plots the flow-charts used in the scikit-learn tutorials.
6 | """
7 |
8 | import matplotlib.pyplot as plt
9 | from matplotlib.patches import Circle, Rectangle, Polygon, FancyArrow
10 |
11 |
12 | def create_base(box_bg='#CCCCCC',
13 | arrow1='#88CCFF',
14 | arrow2='#88FF88',
15 | supervised=True):
16 | plt.figure(figsize=(9, 6), facecolor='w')
17 | ax = plt.axes((0, 0, 1, 1), xticks=[], yticks=[], frameon=False)
18 | ax.set_xlim(0, 9)
19 | ax.set_ylim(0, 6)
20 |
21 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg),
22 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg),
23 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg),
24 |
25 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg),
26 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg),
27 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg),
28 |
29 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg),
30 |
31 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg),
32 |
33 | Circle((5.5, 3.5), 1.0, fc=box_bg),
34 |
35 | Polygon([[5.5, 1.7],
36 | [6.1, 1.1],
37 | [5.5, 0.5],
38 | [4.9, 1.1]], fc=box_bg),
39 |
40 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1,
41 | width=0.25, head_width=0.5, head_length=0.2),
42 |
43 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1,
44 | width=0.25, head_width=0.5, head_length=0.2),
45 |
46 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1,
47 | width=0.25, head_width=0.5, head_length=0.2),
48 |
49 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2,
50 | width=0.25, head_width=0.5, head_length=0.2),
51 |
52 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2,
53 | width=0.25, head_width=0.5, head_length=0.2),
54 |
55 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2,
56 | width=0.25, head_width=0.5, head_length=0.2)]
57 |
58 | if supervised:
59 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg),
60 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg),
61 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg),
62 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1,
63 | width=0.25, head_width=0.5, head_length=0.2),
64 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)]
65 | else:
66 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)]
67 |
68 | for p in patches:
69 | ax.add_patch(p)
70 |
71 | plt.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.",
72 | ha='center', va='center', fontsize=14)
73 |
74 | plt.text(3.6, 4.9, "Feature\nVectors",
75 | ha='left', va='center', fontsize=14)
76 |
77 | plt.text(5.5, 3.5, "Machine\nLearning\nAlgorithm",
78 | ha='center', va='center', fontsize=14)
79 |
80 | plt.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.",
81 | ha='center', va='center', fontsize=14)
82 |
83 | plt.text(3.3, 1.7, "Feature\nVector",
84 | ha='left', va='center', fontsize=14)
85 |
86 | plt.text(5.5, 1.1, "Predictive\nModel",
87 | ha='center', va='center', fontsize=12)
88 |
89 | if supervised:
90 | plt.text(1.45, 3.05, "Labels",
91 | ha='center', va='center', fontsize=14)
92 |
93 | plt.text(8.05, 1.1, "Expected\nLabel",
94 | ha='center', va='center', fontsize=14)
95 | plt.text(8.8, 5.8, "Supervised Learning Model",
96 | ha='right', va='top', fontsize=18)
97 |
98 | else:
99 | plt.text(8.05, 1.1,
100 | "Likelihood\nor Cluster ID\nor Better\nRepresentation",
101 | ha='center', va='center', fontsize=12)
102 | plt.text(8.8, 5.8, "Unsupervised Learning Model",
103 | ha='right', va='top', fontsize=18)
104 |
105 |
106 | def plot_supervised_chart(annotate=False):
107 | create_base(supervised=True)
108 | if annotate:
109 | fontdict = dict(color='r', weight='bold', size=14)
110 | plt.text(1.9, 4.55, 'X = vec.fit_transform(input)',
111 | fontdict=fontdict,
112 | rotation=20, ha='left', va='bottom')
113 | plt.text(3.7, 3.2, 'clf.fit(X, y)',
114 | fontdict=fontdict,
115 | rotation=20, ha='left', va='bottom')
116 | plt.text(1.7, 1.5, 'X_new = vec.transform(input)',
117 | fontdict=fontdict,
118 | rotation=20, ha='left', va='bottom')
119 | plt.text(6.1, 1.5, 'y_new = clf.predict(X_new)',
120 | fontdict=fontdict,
121 | rotation=20, ha='left', va='bottom')
122 |
123 |
124 | def plot_unsupervised_chart():
125 | create_base(supervised=False)
126 |
127 |
128 | if __name__ == '__main__':
129 | plot_supervised_chart(False)
130 | plot_supervised_chart(True)
131 | plot_unsupervised_chart()
132 | plt.show()
133 |
--------------------------------------------------------------------------------
/notebooks/figures/__init__.py:
--------------------------------------------------------------------------------
1 | from .plot_2d_separator import plot_2d_separator
2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \
3 | plot_regression_datasets, make_dataset
4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization
5 | from .plot_interactive_tree import plot_tree_interactive, plot_tree
6 | from .plot_interactive_forest import plot_forest_interactive, plot_forest
7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters
8 | from .plot_rbf_svm_parameters import plot_svm_interactive
9 | from .plot_scaling import plot_scaling, plot_relative_scaling
10 | from .plot_digits_dataset import digits_plot
11 | from .plot_pca import plot_pca_illustration
12 | from .plot_helpers import cm2, cm3
13 |
14 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization',
15 | 'plot_linear_svc_regularization', 'plot_tree_interactive',
16 | 'plot_tree', 'plot_regression_datasets', 'make_dataset',
17 | "plot_forest_interactive", "plot_forest", "plot_rbf_svm_parameters",
18 | "plot_svm_interactive", 'plot_scaling', 'digits_plot',
19 | 'plot_relative_scaling', 'plot_pca_illustration',
20 | 'cm2', 'cm3']
21 |
--------------------------------------------------------------------------------
/notebooks/figures/average-per-class.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/average-per-class.png
--------------------------------------------------------------------------------
/notebooks/figures/check_env-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/check_env-1.png
--------------------------------------------------------------------------------
/notebooks/figures/cluster_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/cluster_comparison.png
--------------------------------------------------------------------------------
/notebooks/figures/clustering-linkage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/clustering-linkage.png
--------------------------------------------------------------------------------
/notebooks/figures/clustering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/clustering.png
--------------------------------------------------------------------------------
/notebooks/figures/dbscan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/dbscan.png
--------------------------------------------------------------------------------
/notebooks/figures/ipython_help-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/ipython_help-1.png
--------------------------------------------------------------------------------
/notebooks/figures/ipython_help-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/ipython_help-2.png
--------------------------------------------------------------------------------
/notebooks/figures/ipython_run_cell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/ipython_run_cell.png
--------------------------------------------------------------------------------
/notebooks/figures/iris_setosa.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/iris_setosa.jpg
--------------------------------------------------------------------------------
/notebooks/figures/iris_versicolor.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/iris_versicolor.jpg
--------------------------------------------------------------------------------
/notebooks/figures/iris_virginica.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/iris_virginica.jpg
--------------------------------------------------------------------------------
/notebooks/figures/ml_taxonomy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/ml_taxonomy.png
--------------------------------------------------------------------------------
/notebooks/figures/petal_sepal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/petal_sepal.jpg
--------------------------------------------------------------------------------
/notebooks/figures/plot_2d_separator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None):
6 | if eps is None:
7 | eps = X.std() / 2.
8 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
9 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
10 | xx = np.linspace(x_min, x_max, 100)
11 | yy = np.linspace(y_min, y_max, 100)
12 |
13 | X1, X2 = np.meshgrid(xx, yy)
14 | X_grid = np.c_[X1.ravel(), X2.ravel()]
15 | try:
16 | decision_values = classifier.decision_function(X_grid)
17 | levels = [0]
18 | fill_levels = [decision_values.min(), 0, decision_values.max()]
19 | except AttributeError:
20 | # no decision_function
21 | decision_values = classifier.predict_proba(X_grid)[:, 1]
22 | levels = [.5]
23 | fill_levels = [0, .5, 1]
24 |
25 | if ax is None:
26 | ax = plt.gca()
27 | if fill:
28 | ax.contourf(X1, X2, decision_values.reshape(X1.shape),
29 | levels=fill_levels, colors=['tab:blue', 'tab:orange'],
30 | alpha=0.5)
31 | else:
32 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels,
33 | colors="black")
34 | ax.set_xlim(x_min, x_max)
35 | ax.set_ylim(y_min, y_max)
36 | ax.set_xticks(())
37 | ax.set_yticks(())
38 |
39 |
40 | if __name__ == '__main__':
41 | from sklearn.datasets import make_blobs
42 | from sklearn.linear_model import LogisticRegression
43 | X, y = make_blobs(centers=2, random_state=42)
44 | clf = LogisticRegression().fit(X, y)
45 | plot_2d_separator(clf, X, fill=True)
46 | plt.scatter(X[:, 0], X[:, 1], c=y)
47 | plt.show()
48 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_digits_dataset.py:
--------------------------------------------------------------------------------
1 | # Taken from example in scikit-learn examples
2 | # Authors: Fabian Pedregosa
3 | # Olivier Grisel
4 | # Mathieu Blondel
5 | # Gael Varoquaux
6 | # License: BSD 3 clause (C) INRIA 2011
7 |
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | from matplotlib import offsetbox
11 | from sklearn import datasets, decomposition
12 |
13 |
14 | def digits_plot():
15 | digits = datasets.load_digits(n_class=6)
16 | n_digits = 500
17 | X = digits.data[:n_digits]
18 | y = digits.target[:n_digits]
19 | n_samples, n_features = X.shape
20 |
21 | def plot_embedding(X, title=None):
22 | x_min, x_max = np.min(X, 0), np.max(X, 0)
23 | X = (X - x_min) / (x_max - x_min)
24 |
25 | plt.figure()
26 | ax = plt.subplot(111)
27 | for i in range(X.shape[0]):
28 | plt.text(X[i, 0], X[i, 1], str(digits.target[i]),
29 | color=plt.cm.Set1(y[i] / 10.),
30 | fontdict={'weight': 'bold', 'size': 9})
31 |
32 | if hasattr(offsetbox, 'AnnotationBbox'):
33 | # only print thumbnails with matplotlib > 1.0
34 | shown_images = np.array([[1., 1.]]) # just something big
35 | for i in range(X.shape[0]):
36 | dist = np.sum((X[i] - shown_images) ** 2, 1)
37 | if np.min(dist) < 1e5:
38 | # don't show points that are too close
39 | # set a high threshold to basically turn this off
40 | continue
41 | shown_images = np.r_[shown_images, [X[i]]]
42 | imagebox = offsetbox.AnnotationBbox(
43 | offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
44 | X[i])
45 | ax.add_artist(imagebox)
46 | plt.xticks([]), plt.yticks([])
47 | if title is not None:
48 | plt.title(title)
49 |
50 | n_img_per_row = 10
51 | img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
52 | for i in range(n_img_per_row):
53 | ix = 10 * i + 1
54 | for j in range(n_img_per_row):
55 | iy = 10 * j + 1
56 | img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8))
57 |
58 | plt.imshow(img, cmap=plt.cm.binary)
59 | plt.xticks([])
60 | plt.yticks([])
61 | plt.title('A selection from the 64-dimensional digits dataset')
62 | print("Computing PCA projection")
63 | pca = decomposition.PCA(n_components=2).fit(X)
64 | X_pca = pca.transform(X)
65 | plot_embedding(X_pca, "Principal Components projection of the digits")
66 | plt.matshow(pca.components_[0, :].reshape(8, 8), cmap="gray")
67 | plt.title("First Principal Component")
68 | plt.axis('off')
69 | plt.matshow(pca.components_[1, :].reshape(8, 8), cmap="gray")
70 | plt.title("Second Principal Component")
71 | plt.axis('off')
72 | plt.show()
73 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_helpers.py:
--------------------------------------------------------------------------------
1 | from matplotlib.colors import ListedColormap
2 |
3 | cm3 = ListedColormap(['#1f77b4', '#ff7f0e', '#2ca02c'])
4 | cm2 = ListedColormap(['#1f77b4', '#ff7f0e'])
5 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_interactive_forest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import make_blobs
5 | from sklearn.ensemble import RandomForestClassifier
6 |
7 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
8 |
9 |
10 | def plot_forest(max_depth=1):
11 | plt.figure()
12 | ax = plt.gca()
13 | h = 0.02
14 |
15 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
16 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
17 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
18 |
19 | if max_depth != 0:
20 | forest = RandomForestClassifier(n_estimators=20, max_depth=max_depth,
21 | random_state=1).fit(X, y)
22 | Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
23 | Z = Z.reshape(xx.shape)
24 | ax.contourf(xx, yy, Z, alpha=.4, cmap='RdBu_r')
25 | ax.set_title("max_depth = %d" % max_depth)
26 | else:
27 | ax.set_title("data set")
28 | ax.scatter(X[:, 0], X[:, 1], c=np.array(['tab:blue', 'tab:red'])[y], s=60)
29 | ax.set_xlim(x_min, x_max)
30 | ax.set_ylim(y_min, y_max)
31 | ax.set_xticks(())
32 | ax.set_yticks(())
33 |
34 |
35 | def plot_forest_interactive():
36 | from ipywidgets import interactive, IntSlider
37 | slider = IntSlider(min=0, max=8, step=1, value=0)
38 | return interactive(plot_forest, max_depth=slider)
39 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_interactive_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import make_blobs
5 | from sklearn.tree import DecisionTreeClassifier
6 |
7 | from scipy import ndimage
8 |
9 | from .tree_plotting import plot_tree as plot_tree_mpl
10 |
11 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
12 |
13 |
14 | def plot_tree(max_depth=1):
15 | fig, ax = plt.subplots(1, 2, figsize=(10, 5))
16 | h = 0.02
17 |
18 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
19 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
20 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
21 | np.arange(y_min, y_max, h))
22 |
23 | if max_depth != 0:
24 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1)
25 | tree.fit(X, y)
26 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
27 | Z = Z.reshape(xx.shape)
28 | faces = tree.tree_.apply(
29 | np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
30 | faces = faces.reshape(xx.shape)
31 | border = ndimage.laplace(faces) != 0
32 | ax[0].contourf(xx, yy, Z, alpha=.4, cmap='RdBu_r')
33 | ax[0].scatter(xx[border], yy[border], marker='.', s=1)
34 | ax[0].set_title("max_depth = %d" % max_depth)
35 | plot_tree_mpl(tree, ax=ax[1], impurity=False, filled=True)
36 | # ax[1].axis("off")
37 | else:
38 | ax[0].set_title("data set")
39 | ax[1].set_visible(False)
40 | ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['tab:blue', 'tab:red'])[y],
41 | s=60)
42 | ax[0].set_xlim(x_min, x_max)
43 | ax[0].set_ylim(y_min, y_max)
44 | ax[0].set_xticks(())
45 | ax[0].set_yticks(())
46 |
47 |
48 | def plot_tree_interactive():
49 | from ipywidgets import interactive, IntSlider
50 | slider = IntSlider(min=0, max=8, step=1, value=0)
51 | return interactive(plot_tree, max_depth=slider)
52 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_kneigbors_regularization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/plot_kneigbors_regularization.png
--------------------------------------------------------------------------------
/notebooks/figures/plot_kneighbors_regularization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.neighbors import KNeighborsRegressor
5 |
6 |
7 | def make_dataset(n_samples=100):
8 | rnd = np.random.RandomState(42)
9 | x = np.linspace(-3, 3, n_samples)
10 | y_no_noise = np.sin(4 * x) + x
11 | y = y_no_noise + rnd.normal(size=len(x))
12 | return x, y
13 |
14 |
15 | def plot_regression_datasets():
16 | fig, axes = plt.subplots(1, 3, figsize=(15, 5))
17 | for n_samples, ax in zip([10, 100, 1000], axes):
18 | x, y = make_dataset(n_samples)
19 | ax.plot(x, y, 'o', alpha=.6)
20 |
21 |
22 | def plot_kneighbors_regularization():
23 | rnd = np.random.RandomState(42)
24 | x = np.linspace(-3, 3, 100)
25 | y_no_noise = np.sin(4 * x) + x
26 | y = y_no_noise + rnd.normal(size=len(x))
27 | X = x[:, np.newaxis]
28 | fig, axes = plt.subplots(1, 3, figsize=(15, 5))
29 |
30 | x_test = np.linspace(-3, 3, 1000)
31 |
32 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()):
33 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors)
34 | kneighbor_regression.fit(X, y)
35 | ax.plot(x, y_no_noise, label="true function")
36 | ax.plot(x, y, "o", label="data")
37 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]),
38 | label="prediction")
39 | ax.legend()
40 | ax.set_title("n_neighbors = %d" % n_neighbors)
41 |
42 | if __name__ == "__main__":
43 | plot_kneighbors_regularization()
44 | plt.show()
45 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_linear_svc_regularization.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from sklearn.svm import SVC
3 | from sklearn.datasets import make_blobs
4 | from .plot_2d_separator import plot_2d_separator
5 | from .plot_helpers import cm2
6 |
7 |
8 | def plot_linear_svc_regularization():
9 | X, y = make_blobs(centers=2, random_state=4, n_samples=30)
10 | # a carefully hand-designed dataset lol
11 | y[7] = 0
12 | y[27] = 0
13 |
14 | fig, axes = plt.subplots(1, 3, figsize=(12, 4))
15 |
16 | for ax, C in zip(axes, [1e-2, 1, 1e2]):
17 | ax.scatter(X[:, 0], X[:, 1], s=150, c=y, cmap=cm2)
18 |
19 | svm = SVC(kernel='linear', C=C).fit(X, y)
20 | plot_2d_separator(svm, X, ax=ax, eps=.5)
21 | ax.set_title("C = %f" % C)
22 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_pca.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import PCA
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 |
6 | def plot_pca_illustration():
7 | rnd = np.random.RandomState(5)
8 | X_ = rnd.normal(size=(300, 2))
9 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2)
10 |
11 | pca = PCA()
12 | pca.fit(X_blob)
13 | X_pca = pca.transform(X_blob)
14 |
15 | S = X_pca.std(axis=0)
16 |
17 | fig, axes = plt.subplots(2, 2, figsize=(10, 10))
18 | axes = axes.ravel()
19 |
20 | axes[0].set_title("Original data")
21 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0,
22 | s=60, cmap='viridis')
23 | axes[0].set_xlabel("feature 1")
24 | axes[0].set_ylabel("feature 2")
25 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[0] * pca.components_[0, 0],
26 | S[0] * pca.components_[0, 1], width=.1, head_width=.3,
27 | color='k')
28 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[1] * pca.components_[1, 0],
29 | S[1] * pca.components_[1, 1], width=.1, head_width=.3,
30 | color='k')
31 | axes[0].text(-1.5, -.5, "Component 2", size=14)
32 | axes[0].text(-4, -4, "Component 1", size=14)
33 | axes[0].set_aspect('equal')
34 |
35 | axes[1].set_title("Transformed data")
36 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0,
37 | s=60, cmap='viridis')
38 | axes[1].set_xlabel("First principal component")
39 | axes[1].set_ylabel("Second principal component")
40 | axes[1].set_aspect('equal')
41 | axes[1].set_ylim(-8, 8)
42 |
43 | pca = PCA(n_components=1)
44 | pca.fit(X_blob)
45 | X_inverse = pca.inverse_transform(pca.transform(X_blob))
46 |
47 | axes[2].set_title("Transformed data w/ second component dropped")
48 | axes[2].scatter(X_pca[:, 0], np.zeros(X_pca.shape[0]), c=X_pca[:, 0],
49 | linewidths=0, s=60, cmap='viridis')
50 | axes[2].set_xlabel("First principal component")
51 | axes[2].set_aspect('equal')
52 | axes[2].set_ylim(-8, 8)
53 |
54 | axes[3].set_title("Back-rotation using only first component")
55 | axes[3].scatter(X_inverse[:, 0], X_inverse[:, 1], c=X_pca[:, 0],
56 | linewidths=0, s=60, cmap='viridis')
57 | axes[3].set_xlabel("feature 1")
58 | axes[3].set_ylabel("feature 2")
59 | axes[3].set_aspect('equal')
60 | axes[3].set_xlim(-8, 4)
61 | axes[3].set_ylim(-8, 4)
62 |
63 |
64 | def plot_pca_whitening():
65 | rnd = np.random.RandomState(5)
66 | X_ = rnd.normal(size=(300, 2))
67 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2)
68 |
69 | pca = PCA(whiten=True)
70 | pca.fit(X_blob)
71 | X_pca = pca.transform(X_blob)
72 |
73 | fig, axes = plt.subplots(1, 2, figsize=(10, 10))
74 | axes = axes.ravel()
75 |
76 | axes[0].set_title("Original data")
77 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis')
78 | axes[0].set_xlabel("feature 1")
79 | axes[0].set_ylabel("feature 2")
80 | axes[0].set_aspect('equal')
81 |
82 | axes[1].set_title("Whitened data")
83 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis')
84 | axes[1].set_xlabel("First principal component")
85 | axes[1].set_ylabel("Second principal component")
86 | axes[1].set_aspect('equal')
87 | axes[1].set_xlim(-3, 4)
88 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_rbf_svm_parameters.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.svm import SVC
4 | from sklearn.datasets import make_blobs
5 | from .plot_2d_separator import plot_2d_separator
6 |
7 |
8 | def make_handcrafted_dataset():
9 | # a carefully hand-designed dataset lol
10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30)
11 | y[np.array([7, 27])] = 0
12 | mask = np.ones(len(X), dtype=bool)
13 | mask[np.array([0, 1, 5, 26])] = 0
14 | X, y = X[mask], y[mask]
15 | return X, y
16 |
17 |
18 | def plot_rbf_svm_parameters():
19 | X, y = make_handcrafted_dataset()
20 |
21 | fig, axes = plt.subplots(1, 4, figsize=(15, 3))
22 | for ax, C in zip(axes, [1e0, 5, 10, 100]):
23 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
24 |
25 | svm = SVC(kernel='rbf', C=C).fit(X, y)
26 | plot_2d_separator(svm, X, ax=ax, eps=.5)
27 | ax.set_title("C = %f" % C)
28 |
29 | fig, axes = plt.subplots(1, 4, figsize=(15, 3))
30 | for ax, gamma in zip(axes, [0.1, .5, 1, 10]):
31 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
32 | svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y)
33 | plot_2d_separator(svm, X, ax=ax, eps=.5)
34 | ax.set_title("gamma = %f" % gamma)
35 |
36 |
37 | def plot_svm(log_C, log_gamma):
38 | X, y = make_handcrafted_dataset()
39 | C = 10. ** log_C
40 | gamma = 10. ** log_gamma
41 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y)
42 | plt.figure()
43 | ax = plt.gca()
44 | plot_2d_separator(svm, X, ax=ax, eps=.5)
45 | # plot data
46 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
47 | # plot support vectors
48 | sv = svm.support_vectors_
49 | ax.scatter(sv[:, 0], sv[:, 1], s=230, facecolors='none', zorder=10, linewidth=3)
50 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma))
51 |
52 |
53 | def plot_svm_interactive():
54 | from ipywidgets import interactive, FloatSlider
55 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False)
56 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False)
57 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider)
58 |
--------------------------------------------------------------------------------
/notebooks/figures/plot_scaling.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.datasets import make_blobs
4 | from sklearn.preprocessing import (StandardScaler, MinMaxScaler, Normalizer,
5 | RobustScaler, QuantileTransformer)
6 | from sklearn.model_selection import train_test_split
7 | from .plot_helpers import cm2
8 |
9 |
10 | def plot_scaling():
11 | X, y = make_blobs(n_samples=50, centers=2, random_state=4, cluster_std=1)
12 | X += 3
13 |
14 | plt.figure(figsize=(15, 8))
15 | main_ax = plt.subplot2grid((2, 5), (0, 0), rowspan=2, colspan=2)
16 |
17 | main_ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm2, s=60)
18 | maxx = np.abs(X[:, 0]).max()
19 | maxy = np.abs(X[:, 1]).max()
20 |
21 | main_ax.set_xlim(-maxx + 1, maxx + 1)
22 | main_ax.set_ylim(-maxy + 1, maxy + 1)
23 | main_ax.set_title("Original Data")
24 | other_axes = [plt.subplot2grid((2, 5), (i, j))
25 | for j in range(2, 5)
26 | for i in range(2)]
27 |
28 | for ax, scaler in zip(other_axes, [StandardScaler(), RobustScaler(),
29 | MinMaxScaler(), QuantileTransformer(),
30 | Normalizer(norm='l2'),
31 | Normalizer(norm='l1')]):
32 | X_ = scaler.fit_transform(X)
33 | ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=cm2, s=60)
34 | ax.set_xlim(-2, 2)
35 | ax.set_ylim(-2, 2)
36 | ax.set_title(type(scaler).__name__)
37 |
38 | other_axes.append(main_ax)
39 |
40 | for ax in other_axes:
41 | ax.spines['left'].set_position('center')
42 | ax.spines['right'].set_color('none')
43 | ax.spines['bottom'].set_position('center')
44 | ax.spines['top'].set_color('none')
45 | ax.xaxis.set_ticks_position('bottom')
46 | ax.yaxis.set_ticks_position('left')
47 |
48 |
49 | def plot_relative_scaling():
50 | # make synthetic data
51 | X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
52 | # split it into training and test set
53 | X_train, X_test = train_test_split(X, random_state=5, test_size=.1)
54 | # plot the training and test set
55 | fig, axes = plt.subplots(1, 3, figsize=(13, 4))
56 | axes[0].scatter(X_train[:, 0], X_train[:, 1], label="training set", s=60)
57 | axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', label="test set",
58 | s=60)
59 | axes[0].legend(loc='upper left')
60 | axes[0].set_title("original data")
61 |
62 | # scale the data using MinMaxScaler
63 | scaler = MinMaxScaler()
64 | scaler.fit(X_train)
65 | X_train_scaled = scaler.transform(X_train)
66 | X_test_scaled = scaler.transform(X_test)
67 |
68 | # visualize the properly scaled data
69 | axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
70 | label="training set", s=60)
71 | axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
72 | label="test set", s=60)
73 | axes[1].set_title("scaled data")
74 |
75 | # rescale the test set separately, so that test set min is 0 and test set
76 | # max is 1 DO NOT DO THIS! For illustration purposes only
77 | test_scaler = MinMaxScaler()
78 | test_scaler.fit(X_test)
79 | X_test_scaled_badly = test_scaler.transform(X_test)
80 |
81 | # visualize wrongly scaled data
82 | axes[2].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
83 | label="training set", s=60)
84 | axes[2].scatter(X_test_scaled_badly[:, 0], X_test_scaled_badly[:, 1],
85 | marker='^', label="test set", s=60)
86 | axes[2].set_title("improperly scaled data")
87 |
--------------------------------------------------------------------------------
/notebooks/figures/randomized_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/randomized_search.png
--------------------------------------------------------------------------------
/notebooks/figures/supervised_scikit_learn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/figures/supervised_scikit_learn.png
--------------------------------------------------------------------------------
/notebooks/helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | from sklearn.model_selection import StratifiedShuffleSplit
4 | from sklearn.feature_extraction import DictVectorizer
5 |
6 |
7 | # Can also use pandas!
8 | def process_titanic_line(line):
9 | # Split line on "," to get fields without comma confusion
10 | vals = line.strip().split('",')
11 | # replace spurious " characters
12 | vals = [v.replace('"', '') for v in vals]
13 | pclass = int(vals[0])
14 | survived = int(vals[1])
15 | name = str(vals[2])
16 | sex = str(vals[3])
17 | try:
18 | age = float(vals[4])
19 | except ValueError:
20 | # Blank age
21 | age = -1
22 | sibsp = float(vals[5])
23 | parch = int(vals[6])
24 | ticket = str(vals[7])
25 | try:
26 | fare = float(vals[8])
27 | except ValueError:
28 | # Blank fare
29 | fare = -1
30 | cabin = str(vals[9])
31 | embarked = str(vals[10])
32 | boat = str(vals[11])
33 | homedest = str(vals[12])
34 | line_dict = {'pclass': pclass, 'survived': survived, 'name': name, 'sex':
35 | sex, 'age': age, 'sibsp': sibsp, 'parch': parch, 'ticket':
36 | ticket, 'fare': fare, 'cabin': cabin, 'embarked': embarked,
37 | 'boat': boat, 'homedest': homedest}
38 | return line_dict
39 |
40 |
41 | def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
42 | f = open(os.path.join('datasets', 'titanic', 'titanic3.csv'))
43 | # Remove . from home.dest, split on quotes because some fields have commas
44 | keys = f.readline().strip().replace('.', '').split('","')
45 | lines = f.readlines()
46 | f.close()
47 | string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
48 | 'homedest']
49 | string_keys = [s for s in string_keys if s not in feature_skip_tuple]
50 | numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
51 | numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]
52 | train_vectorizer_list = []
53 | test_vectorizer_list = []
54 |
55 | n_samples = len(lines)
56 | numeric_data = np.zeros((n_samples, len(numeric_keys)))
57 | numeric_labels = np.zeros((n_samples,), dtype=int)
58 |
59 | # Doing this twice is horribly inefficient but the file is small...
60 | for n, l in enumerate(lines):
61 | line_dict = process_titanic_line(l)
62 | strings = {k: line_dict[k] for k in string_keys}
63 | numeric_labels[n] = line_dict["survived"]
64 |
65 | sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
66 | random_state=random_state)
67 | # This is a weird way to get the indices but it works
68 | train_idx = None
69 | test_idx = None
70 | for train_idx, test_idx in sss.split(numeric_data, numeric_labels):
71 | pass
72 |
73 | for n, l in enumerate(lines):
74 | line_dict = process_titanic_line(l)
75 | strings = {k: line_dict[k] for k in string_keys}
76 | if n in train_idx:
77 | train_vectorizer_list.append(strings)
78 | else:
79 | test_vectorizer_list.append(strings)
80 | numeric_data[n] = np.asarray([line_dict[k]
81 | for k in numeric_keys])
82 |
83 | train_numeric = numeric_data[train_idx]
84 | test_numeric = numeric_data[test_idx]
85 | train_labels = numeric_labels[train_idx]
86 | test_labels = numeric_labels[test_idx]
87 |
88 | vec = DictVectorizer()
89 | # .toarray() due to returning a scipy sparse array
90 | train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
91 | test_categorical = vec.transform(test_vectorizer_list).toarray()
92 | train_data = np.concatenate([train_numeric, train_categorical], axis=1)
93 | test_data = np.concatenate([test_numeric, test_categorical], axis=1)
94 | keys = numeric_keys + string_keys
95 | return keys, train_data, test_data, train_labels, test_labels
96 |
97 |
98 | FIELDNAMES = ('polarity', 'id', 'date', 'query', 'author', 'text')
99 |
100 |
101 | def read_sentiment_csv(csv_file, fieldnames=FIELDNAMES, max_count=None,
102 | n_partitions=1, partition_id=0):
103 |
104 | import csv # put the import inside for use in IPython.parallel
105 |
106 | def file_opener(csv_file):
107 | try:
108 | open(csv_file, 'r', encoding="latin1").close()
109 | return open(csv_file, 'r', encoding="latin1")
110 | except TypeError:
111 | # Python 2 does not have encoding arg
112 | return open(csv_file, 'rb')
113 |
114 | texts = []
115 | targets = []
116 | with file_opener(csv_file) as f:
117 | reader = csv.DictReader(f, fieldnames=fieldnames,
118 | delimiter=',', quotechar='"')
119 | pos_count, neg_count = 0, 0
120 | for i, d in enumerate(reader):
121 | if i % n_partitions != partition_id:
122 | # Skip entry if not in the requested partition
123 | continue
124 |
125 | if d['polarity'] == '4':
126 | if max_count and pos_count >= max_count / 2:
127 | continue
128 | pos_count += 1
129 | texts.append(d['text'])
130 | targets.append(1)
131 |
132 | elif d['polarity'] == '0':
133 | if max_count and neg_count >= max_count / 2:
134 | continue
135 | neg_count += 1
136 | texts.append(d['text'])
137 | targets.append(-1)
138 |
139 | return texts, targets
140 |
--------------------------------------------------------------------------------
/notebooks/images/parallel_text_clf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/images/parallel_text_clf.png
--------------------------------------------------------------------------------
/notebooks/images/parallel_text_clf_average.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/scipy-2018-sklearn/77704111a423c073db83db4648afe27b409aa24f/notebooks/images/parallel_text_clf_average.png
--------------------------------------------------------------------------------
/notebooks/solutions/03A_faces_plot.py:
--------------------------------------------------------------------------------
1 | faces = fetch_olivetti_faces()
2 |
3 | # set up the figure
4 | fig = plt.figure(figsize=(6, 6)) # figure size in inches
5 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
6 |
7 | # plot the faces:
8 | for i in range(64):
9 | ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
10 | ax.imshow(faces.images[i], cmap=plt.cm.bone, interpolation='nearest')
11 |
--------------------------------------------------------------------------------
/notebooks/solutions/04_wrong-predictions.py:
--------------------------------------------------------------------------------
1 | for i in incorrect_idx:
2 | print('%d: Predicted %d True label %d' % (i, pred_y[i], test_y[i]))
3 |
4 | # Plot two dimensions
5 |
6 | colors = ["darkblue", "darkgreen", "gray"]
7 |
8 | for n, color in enumerate(colors):
9 | idx = np.where(test_y == n)[0]
10 | plt.scatter(test_X[idx, 1], test_X[idx, 2],
11 | color=color, label="Class %s" % str(n))
12 |
13 | for i, marker in zip(incorrect_idx, ['x', 's', 'v']):
14 | plt.scatter(test_X[i, 1], test_X[i, 2],
15 | color="darkred",
16 | marker=marker,
17 | s=40,
18 | label=i)
19 |
20 | plt.xlabel('sepal width [cm]')
21 | plt.ylabel('petal length [cm]')
22 | plt.legend(loc=1, scatterpoints=1)
23 | plt.title("Iris Classification results")
24 | plt.show()
25 |
--------------------------------------------------------------------------------
/notebooks/solutions/05A_knn_with_diff_k.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_iris
2 | from sklearn.model_selection import train_test_split
3 | from sklearn.neighbors import KNeighborsClassifier
4 |
5 | iris = load_iris()
6 | X = iris.data
7 | y = iris.target
8 |
9 | X_train, X_test, y_train, y_test = train_test_split(X, y,
10 | test_size=0.25,
11 | random_state=1234,
12 | stratify=y)
13 |
14 | X_trainsub, X_valid, y_trainsub, y_valid = train_test_split(X_train, y_train,
15 | test_size=0.5,
16 | random_state=1234,
17 | stratify=y_train)
18 |
19 | for k in range(1, 20):
20 | knn = KNeighborsClassifier(n_neighbors=k)
21 | train_score = knn.fit(X_trainsub, y_trainsub).\
22 | score(X_trainsub, y_trainsub)
23 | valid_score = knn.score(X_valid, y_valid)
24 | print('k: %d, Train/Valid Acc: %.3f/%.3f' %
25 | (k, train_score, valid_score))
26 |
27 |
28 | knn = KNeighborsClassifier(n_neighbors=9)
29 | knn.fit(X_train, y_train)
30 | print('k=9 Test Acc: %.3f' % knn.score(X_test, y_test))
31 |
--------------------------------------------------------------------------------
/notebooks/solutions/06A_knn_vs_linreg.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_boston
2 | from sklearn.model_selection import train_test_split
3 | from sklearn.linear_model import LinearRegression
4 | from sklearn.neighbors import KNeighborsRegressor
5 |
6 | boston = load_boston()
7 | X = boston.data
8 | y = boston.target
9 |
10 | print('X.shape:', X.shape)
11 | X_train, X_test, y_train, y_test = train_test_split(X, y,
12 | test_size=0.25,
13 | random_state=42)
14 |
15 | linreg = LinearRegression()
16 | knnreg = KNeighborsRegressor(n_neighbors=1)
17 |
18 | linreg.fit(X_train, y_train)
19 | print('Linear Regression Train/Test: %.3f/%.3f' %
20 | (linreg.score(X_train, y_train),
21 | linreg.score(X_test, y_test)))
22 |
23 | knnreg.fit(X_train, y_train)
24 | print('KNeighborsRegressor Train/Test: %.3f/%.3f' %
25 | (knnreg.score(X_train, y_train),
26 | knnreg.score(X_test, y_test)))
27 |
--------------------------------------------------------------------------------
/notebooks/solutions/06B_lin_with_sine.py:
--------------------------------------------------------------------------------
1 | XX_train = np.concatenate((X_train, np.sin(4 * X_train)), axis=1)
2 | XX_test = np.concatenate((X_test, np.sin(4 * X_test)), axis=1)
3 | regressor.fit(XX_train, y_train)
4 | y_pred_test_sine = regressor.predict(XX_test)
5 |
6 | plt.plot(X_test, y_test, 'o', label="data")
7 | plt.plot(X_test, y_pred_test_sine, 'o', label="prediction with sine")
8 | plt.plot(X_test, y_pred_test, label='prediction without sine')
9 | plt.legend(loc='best');
10 |
--------------------------------------------------------------------------------
/notebooks/solutions/07A_iris-pca.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_iris
2 | from sklearn.model_selection import train_test_split
3 | from sklearn.decomposition import PCA
4 | from sklearn.preprocessing import StandardScaler
5 |
6 | iris = load_iris()
7 |
8 | X_train, X_test, y_train, y_test = train_test_split(iris.data,
9 | iris.target,
10 | random_state=0,
11 | stratify=iris.target)
12 |
13 | sc = StandardScaler()
14 | sc.fit(X_train)
15 | pca = PCA(n_components=2)
16 |
17 | X_train_pca = pca.fit_transform(sc.transform(X_train))
18 | X_test_pca = pca.transform(sc.transform(X_test))
19 |
20 | for X, y in zip((X_train_pca, X_test_pca), (y_train, y_test)):
21 |
22 | for i, annot in enumerate(zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
23 | ('blue', 'red', 'green'))):
24 | plt.scatter(X[y==i, 0],
25 | X[y==i, 1],
26 | label=annot[0],
27 | c=annot[1])
28 | plt.xlabel('Principal Component 1')
29 | plt.ylabel('Principal Component 2')
30 | plt.legend(loc='best')
31 | plt.tight_layout()
32 | plt.show()
33 |
--------------------------------------------------------------------------------
/notebooks/solutions/08B_digits_clustering.py:
--------------------------------------------------------------------------------
1 | from sklearn.cluster import KMeans
2 | kmeans = KMeans(n_clusters=10)
3 | clusters = kmeans.fit_predict(digits.data)
4 |
5 | print(kmeans.cluster_centers_.shape)
6 |
7 | #------------------------------------------------------------
8 | # visualize the cluster centers
9 | fig = plt.figure(figsize=(8, 3))
10 | for i in range(10):
11 | ax = fig.add_subplot(2, 5, 1 + i)
12 | ax.imshow(kmeans.cluster_centers_[i].reshape((8, 8)),
13 | cmap=plt.cm.binary)
14 | from sklearn.manifold import Isomap
15 | X_iso = Isomap(n_neighbors=10).fit_transform(digits.data)
16 |
17 | #------------------------------------------------------------
18 | # visualize the projected data
19 | fig, ax = plt.subplots(1, 2, figsize=(8, 4))
20 |
21 | ax[0].scatter(X_iso[:, 0], X_iso[:, 1], c=clusters)
22 | ax[1].scatter(X_iso[:, 0], X_iso[:, 1], c=digits.target)
23 |
--------------------------------------------------------------------------------
/notebooks/solutions/10_titanic.py:
--------------------------------------------------------------------------------
1 | from sklearn.linear_model import LogisticRegression
2 | lr = LogisticRegression().fit(train_data_finite, train_labels)
3 | print("logistic regression score: %f" % lr.score(test_data_finite, test_labels))
4 |
5 | from sklearn.ensemble import RandomForestClassifier
6 | rf = RandomForestClassifier(n_estimators=500, random_state=0).fit(train_data_finite, train_labels)
7 | print("random forest score: %f" % rf.score(test_data_finite, test_labels))
8 |
9 | features_dummies_sub = pd.get_dummies(features[['pclass', 'sex', 'age', 'sibsp', 'fare']])
10 | data_sub = features_dummies_sub.values
11 |
12 | train_data_sub, test_data_sub, train_labels, test_labels = train_test_split(data_sub, labels, random_state=0)
13 |
14 | imp = Imputer()
15 | imp.fit(train_data_sub)
16 | train_data_finite_sub = imp.transform(train_data_sub)
17 | test_data_finite_sub = imp.transform(test_data_sub)
18 |
19 | lr = LogisticRegression().fit(train_data_finite_sub, train_labels)
20 | print("logistic regression score w/o embark, parch: %f" % lr.score(test_data_finite_sub, test_labels))
21 | rf = RandomForestClassifier(n_estimators=500, random_state=0).fit(train_data_finite_sub, train_labels)
22 | print("random forest score w/o embark, parch: %f" % rf.score(test_data_finite_sub, test_labels))
23 |
--------------------------------------------------------------------------------
/notebooks/solutions/11_ngrams.py:
--------------------------------------------------------------------------------
1 | text = zen.split("\n")
2 | for n in [2, 3, 4]:
3 | cv = CountVectorizer(ngram_range=(n, n)).fit(text)
4 | counts = cv.transform(text)
5 | most_common = np.argmax(counts.sum(axis=0))
6 | print("most common %d-gram: %s" % (n, cv.get_feature_names()[most_common]))
7 |
8 |
9 | for norm in ["l2", None]:
10 | tfidf_vect = TfidfVectorizer(norm=norm).fit(text)
11 | data_tfidf = tfidf_vect.transform(text)
12 | most_common = tfidf_vect.get_feature_names()[np.argmax(data_tfidf.max(axis=0).toarray())]
13 | print("highest tf-idf with norm=%s: %s" % (norm, most_common))
14 |
--------------------------------------------------------------------------------
/notebooks/solutions/12A_tfidf.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import TfidfVectorizer
2 |
3 | vectorizer = TfidfVectorizer()
4 | vectorizer.fit(text_train)
5 |
6 | X_train = vectorizer.transform(text_train)
7 | X_test = vectorizer.transform(text_test)
8 |
9 | clf = LogisticRegression()
10 | clf.fit(X_train, y_train)
11 |
12 | print(clf.score(X_train, y_train))
13 | print(clf.score(X_test, y_test))
14 |
15 | visualize_coefficients(clf, vectorizer.get_feature_names())
16 |
--------------------------------------------------------------------------------
/notebooks/solutions/12B_vectorizer_params.py:
--------------------------------------------------------------------------------
1 | # CountVectorizer
2 | vectorizer = CountVectorizer(min_df=10, ngram_range=(1, 3))
3 | vectorizer.fit(text_train)
4 |
5 | X_train = vectorizer.transform(text_train)
6 | X_test = vectorizer.transform(text_test)
7 |
8 | clf = LogisticRegression()
9 | clf.fit(X_train, y_train)
10 |
11 | visualize_coefficients(clf, vectorizer.get_feature_names())
12 |
13 | # TfidfVectorizer
14 | vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1, 3))
15 | vectorizer.fit(text_train)
16 |
17 | X_train = vectorizer.transform(text_train)
18 | X_test = vectorizer.transform(text_test)
19 |
20 | clf = LogisticRegression()
21 | clf.fit(X_train, y_train)
22 |
23 | visualize_coefficients(clf, vectorizer.get_feature_names())
24 |
--------------------------------------------------------------------------------
/notebooks/solutions/13_cross_validation.py:
--------------------------------------------------------------------------------
1 | cv = KFold(n_splits=3)
2 | cross_val_score(classifier, iris.data, iris.target, cv=cv)
3 |
--------------------------------------------------------------------------------
/notebooks/solutions/14_grid_search.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_digits
2 | from sklearn.neighbors import KNeighborsClassifier
3 | from sklearn.model_selection import train_test_split, GridSearchCV
4 | digits = load_digits()
5 | X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=0)
6 |
7 | param_grid = {'n_neighbors': [1, 3, 5, 10, 50]}
8 | gs = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=5, verbose=3)
9 | gs.fit(X_train, y_train)
10 | print("Score on test set: %f" % gs.score(X_test, y_test))
11 | print("Best parameters: %s" % gs.best_params_)
12 |
--------------------------------------------------------------------------------
/notebooks/solutions/15A_ridge_grid.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import StandardScaler
2 | from sklearn.preprocessing import PolynomialFeatures
3 | from sklearn.linear_model import Ridge
4 | from sklearn.datasets import load_boston
5 | from sklearn.model_selection import train_test_split, GridSearchCV
6 | from sklearn.pipeline import make_pipeline
7 |
8 | boston = load_boston()
9 | X_train, X_test, y_train, y_test = train_test_split(boston.data,
10 | boston.target,
11 | test_size=0.25,
12 | random_state=123)
13 |
14 | pipeline = make_pipeline(StandardScaler(),
15 | PolynomialFeatures(),
16 | Ridge())
17 |
18 | grid = GridSearchCV(pipeline,
19 | param_grid={'polynomialfeatures__degree': [1, 2, 3]}, cv=5)
20 |
21 | grid.fit(X_train, y_train)
22 |
23 | print('best parameters:', grid.best_params_)
24 | print('best score:', grid.best_score_)
25 | print('test score:', grid.score(X_test, y_test))
26 |
--------------------------------------------------------------------------------
/notebooks/solutions/16A_avg_per_class_acc.py:
--------------------------------------------------------------------------------
1 | def accuracy(true, pred):
2 | return (true == pred).sum() / float(true.shape[0])
3 |
4 |
5 | def macro(true, pred):
6 | scores = []
7 | for l in np.unique(true):
8 | scores.append(accuracy(np.where(true != l, 1, 0),
9 | np.where(pred != l, 1, 0)))
10 | return float(sum(scores)) / float(len(scores))
11 |
12 | y_true = np.array([0, 0, 0, 1, 1, 1, 1, 1, 2, 2])
13 | y_pred = np.array([0, 1, 1, 0, 1, 1, 2, 2, 2, 2])
14 |
15 |
16 | print('accuracy:', accuracy(y_true, y_pred))
17 | print('average-per-class accuracy:', macro(y_true, y_pred))
18 |
--------------------------------------------------------------------------------
/notebooks/solutions/17A_logreg_grid.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_digits
2 | from sklearn.model_selection import train_test_split, GridSearchCV
3 | from sklearn.linear_model import LogisticRegression
4 |
5 | digits = load_digits()
6 | X_digits, y_digits = digits.data, digits.target
7 | X_digits_train, X_digits_test, y_digits_train, y_digits_test = train_test_split(X_digits, y_digits, random_state=1)
8 |
9 | param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
10 |
11 | grid = GridSearchCV(LogisticRegression(), param_grid=param_grid, cv=5, verbose=3)
12 | grid.fit(X_digits_train, y_digits_train)
13 | print('Test set score for LogisticRegression: {}'.format(grid.score(X_digits_test, y_digits_test)))
14 | print('Best parameters for LogisticRegression: {}'.format(grid.best_params_))
15 |
--------------------------------------------------------------------------------
/notebooks/solutions/17B_learning_curve_alpha.py:
--------------------------------------------------------------------------------
1 | X, y, true_coefficient = make_regression(n_samples=200, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)
2 |
3 | plt.figure(figsize=(10, 5))
4 | plt.title('alpha=1')
5 | plot_learning_curve(LinearRegression(), X, y)
6 | plot_learning_curve(Ridge(alpha=1), X, y)
7 | plot_learning_curve(Lasso(alpha=1), X, y)
8 |
9 | plt.figure(figsize=(10, 5))
10 | plt.title('alpha=10')
11 | plot_learning_curve(LinearRegression(), X, y)
12 | plot_learning_curve(Ridge(alpha=10), X, y)
13 | plot_learning_curve(Lasso(alpha=10), X, y)
14 |
15 | plt.figure(figsize=(10, 5))
16 | plt.title('alpha=100')
17 | plot_learning_curve(LinearRegression(), X, y)
18 | plot_learning_curve(Ridge(alpha=100), X, y)
19 | plot_learning_curve(Lasso(alpha=100), X, y)
20 |
--------------------------------------------------------------------------------
/notebooks/solutions/18_gbc_grid.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_digits
2 | from sklearn.model_selection import train_test_split, GridSearchCV
3 | from sklearn.ensemble import GradientBoostingClassifier
4 |
5 | digits = load_digits()
6 | X_digits, y_digits = digits.data, digits.target
7 | X_digits_train, X_digits_test, y_digits_train, y_digits_test = train_test_split(X_digits, y_digits, random_state=1)
8 |
9 | param_grid = {'learning_rate': [0.01, 0.1, 0.5, 1.0],
10 | 'max_depth': [1, 3, 5, 7, 9]}
11 |
12 | grid = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, cv=5, verbose=3)
13 | grid.fit(X_digits_train, y_digits_train)
14 | print('Test set score for GradientBoostingClassifier: {}'.format(grid.score(X_digits_test, y_digits_test)))
15 | print('Best parameters for GradientBoostingClassifier: {}'.format(grid.best_params_))
16 |
--------------------------------------------------------------------------------
/notebooks/solutions/19_univariate_vs_mb_selection.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_selection import SelectKBest, SelectFromModel
2 | from sklearn.ensemble import RandomForestClassifier
3 | import numpy as np
4 |
5 | rng = np.random.RandomState(1)
6 | X = rng.randint(0, 2, (200, 20))
7 | y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
8 |
9 | fs_univariate = SelectKBest(k=10)
10 | fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')
11 |
12 | fs_univariate.fit(X, y)
13 | print('Features selected by univariate selection:')
14 | print(fs_univariate.get_support())
15 | plt.matshow(fs_univariate.get_support().reshape(1, -1), cmap='gray_r')
16 |
17 | fs_modelbased.fit(X, y)
18 | print('Features selected by model-based selection:')
19 | print(fs_modelbased.get_support())
20 | plt.matshow(fs_modelbased.get_support().reshape(1, -1), cmap='gray_r');
21 |
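22 | # Note: the target is the XOR of the first two (binary) features, so neither
23 | # feature carries a marginal signal on its own. The univariate test therefore
24 | # cannot identify the informative features, while the forest-based selector
25 | # can exploit the interaction between them.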
--------------------------------------------------------------------------------
/notebooks/solutions/20_clustering_comparison.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import make_circles
2 | from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
3 |
4 | X, y = make_circles(n_samples=1500,
5 | factor=.4,
6 | noise=.05)
7 |
8 | km = KMeans(n_clusters=2)
9 | plt.figure()
10 | plt.scatter(X[:, 0], X[:, 1], c=km.fit_predict(X))
11 |
12 | ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete')
13 | plt.figure()
14 | plt.scatter(X[:, 0], X[:, 1], c=ac.fit_predict(X))
15 |
16 | db = DBSCAN(eps=0.2)
17 | plt.figure()
18 | plt.scatter(X[:, 0], X[:, 1], c=db.fit_predict(X));
19 |
--------------------------------------------------------------------------------
/notebooks/solutions/21A_isomap_digits.py:
--------------------------------------------------------------------------------
1 | from sklearn.manifold import Isomap
2 | iso = Isomap(n_components=2)
3 | digits_isomap = iso.fit_transform(digits.data)
4 |
5 | plt.figure(figsize=(10, 10))
6 | plt.xlim(digits_isomap[:, 0].min(), digits_isomap[:, 0].max() + 1)
7 | plt.ylim(digits_isomap[:, 1].min(), digits_isomap[:, 1].max() + 1)
8 | for i in range(len(digits.data)):
9 | # actually plot the digits as text instead of using scatter
10 | plt.text(digits_isomap[i, 0], digits_isomap[i, 1], str(digits.target[i]),
11 | color = colors[digits.target[i]],
12 | fontdict={'weight': 'bold', 'size': 9})
13 |
--------------------------------------------------------------------------------
/notebooks/solutions/21B_tsne_classification.py:
--------------------------------------------------------------------------------
1 | from sklearn.manifold import TSNE
2 | from sklearn.neighbors import KNeighborsClassifier
3 | from sklearn.model_selection import train_test_split
4 |
5 | X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=1)
6 |
7 | clf = KNeighborsClassifier()
8 | clf.fit(X_train, y_train)
9 | print('KNeighborsClassifier accuracy without t-SNE: {}'.format(clf.score(X_test, y_test)))
10 |
11 | tsne = TSNE(random_state=42)
12 | digits_tsne_train = tsne.fit_transform(X_train)
13 | digits_tsne_test = tsne.fit_transform(X_test)
14 |
15 | clf = KNeighborsClassifier()
16 | clf.fit(digits_tsne_train, y_train)
17 | print('KNeighborsClassifier accuracy with t-SNE: {}'.format(clf.score(digits_tsne_test, y_test)))
18 |
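19 | # Note: TSNE has no transform() method, so the test set above is embedded by a
20 | # separate fit_transform. Its coordinates are therefore not aligned with the
21 | # training embedding, which should be kept in mind when reading this accuracy.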
--------------------------------------------------------------------------------
/notebooks/solutions/22_A-anomaly_ocsvm_gamma.py:
--------------------------------------------------------------------------------
1 |
2 | nu = 0.05 # theory says it should be an upper bound of the fraction of outliers
3 |
4 | for gamma in [0.001, 1.]:
5 | ocsvm = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
6 | ocsvm.fit(X)
7 |
8 | Z_ocsvm = ocsvm.decision_function(grid)
9 | Z_ocsvm = Z_ocsvm.reshape(xx.shape)
10 |
11 | plt.figure()
12 | c_0 = plt.contour(xx, yy, Z_ocsvm, levels=[0], colors='red', linewidths=3)
13 | plt.clabel(c_0, inline=1, fontsize=15, fmt={0: str(alpha_set)})
14 | plt.scatter(X[:, 0], X[:, 1])
15 | plt.scatter(X_outliers[:, 0], X_outliers[:, 1], color='red')
16 | plt.legend()
17 | plt.show()
18 |
--------------------------------------------------------------------------------
/notebooks/solutions/22_B-anomaly_iforest_n_trees.py:
--------------------------------------------------------------------------------
1 | for n_estimators in [1, 10, 50, 100]:
2 | iforest = IsolationForest(n_estimators=n_estimators, contamination=0.10)
3 | iforest = iforest.fit(X)
4 |
5 | Z_iforest = iforest.decision_function(grid)
6 | Z_iforest = Z_iforest.reshape(xx.shape)
7 |
8 | plt.figure()
9 | c_0 = plt.contour(xx, yy, Z_iforest,
10 | levels=[iforest.threshold_],
11 | colors='red', linewidths=3)
12 | plt.clabel(c_0, inline=1, fontsize=15,
13 | fmt={iforest.threshold_: str(alpha_set)})
14 | plt.scatter(X[:, 0], X[:, 1], s=1.)
15 | plt.legend()
16 | plt.show()
--------------------------------------------------------------------------------
/notebooks/solutions/22_C-anomaly_digits.py:
--------------------------------------------------------------------------------
1 | k = 1 # change to see other numbers
2 |
3 | X_k = X[y == k]
4 |
5 | iforest = IsolationForest(contamination=0.05)
6 | iforest = iforest.fit(X_k)
7 | iforest_X = iforest.decision_function(X_k)
8 |
9 | X_strong_outliers = X_k[np.argsort(iforest_X)[:10]]
10 |
11 | fig, axes = plt.subplots(2, 5, figsize=(10, 5))
12 |
13 | for i, ax in zip(range(len(X_strong_outliers)), axes.ravel()):
14 | ax.imshow(X_strong_outliers[i].reshape((8, 8)),
15 | cmap=plt.cm.gray_r, interpolation='nearest')
16 | ax.axis('off')
17 |
--------------------------------------------------------------------------------
/notebooks/solutions/23_batchtrain.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from sklearn.linear_model import SGDClassifier
4 | from sklearn.feature_extraction.text import HashingVectorizer
5 | from sklearn.base import clone
6 | from sklearn.datasets import load_files
7 |
8 |
9 | def batch_train(clf, fnames, labels, iterations=1,
10 | batchsize=1000, random_seed=1):
11 | vec = HashingVectorizer(encoding='latin-1')
12 | idx = np.arange(labels.shape[0])
13 | c_clf = clone(clf)
14 | rng = np.random.RandomState(seed=random_seed)
15 | shuffled_idx = rng.permutation(range(len(fnames)))
16 | fnames_ary = np.asarray(fnames)
17 |
18 | for _ in range(iterations):
19 | for batch in np.split(shuffled_idx, len(fnames) // batchsize):
20 | documents = []
21 | for fn in fnames_ary[batch]:
22 | with open(fn, 'r') as f:
23 | documents.append(f.read())
24 | X_batch = vec.transform(documents)
25 | batch_labels = labels[batch]
26 | c_clf.partial_fit(X=X_batch,
27 | y=batch_labels,
28 | classes=[0, 1])
29 |
30 | return c_clf
31 |
32 |
33 | # Out-of-core Training
34 | train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
35 | train_pos = os.path.join(train_path, 'pos')
36 | train_neg = os.path.join(train_path, 'neg')
37 |
38 | fnames = [os.path.join(train_pos, f) for f in os.listdir(train_pos)] +\
39 | [os.path.join(train_neg, f) for f in os.listdir(train_neg)]
40 | y_train = np.zeros((len(fnames), ), dtype=int)
41 | y_train[:12500] = 1
42 | np.bincount(y_train)
43 |
44 | sgd = SGDClassifier(loss='log', random_state=1)
45 |
46 | sgd = batch_train(clf=sgd,
47 | fnames=fnames,
48 | labels=y_train)
49 |
50 |
51 | # Testing
52 | test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test')
53 | test = load_files(container_path=(test_path),
54 | categories=['pos', 'neg'])
55 | docs_test, y_test = test['data'][12500:], test['target'][12500:]
56 |
57 | vec = HashingVectorizer(encoding='latin-1')  # stateless, so a fresh instance maps to the same feature space used in training
58 | print('accuracy:', sgd.score(vec.transform(docs_test), y_test))
59 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # brew update && brew install gcc (this includes gfortran)
2 | ipython[all]>=3.2.0
3 | pyzmq>=14.7.0
4 | Pillow>=2.9.0
5 | numpy>=1.9.2
6 | scipy>=0.15.1
7 | scikit-learn>=0.19.1
8 | matplotlib>=2.0.2
9 | pandas>=0.19
10 |
--------------------------------------------------------------------------------
/todo.rst:
--------------------------------------------------------------------------------
1 | replace the SMS spam data with the IMDb text data
2 | make sure there are notebooks for all sections
3 | make sure there are exercises everywhere
4 |
--------------------------------------------------------------------------------