├── .gitignore ├── LICENSE ├── README.md ├── abstract.rst ├── check_env.ipynb ├── fetch_data.py ├── images ├── check_env-1.png ├── check_env-2.png └── download-repo.png ├── notebooks ├── 01 Introduction to Machine Learning.ipynb ├── 02 Scientific Computing Tools in Python.ipynb ├── 03 Data Representation for Machine Learning.ipynb ├── 04 Training and Testing Data.ipynb ├── 05 Supervised Learning - Classification.ipynb ├── 06 Supervised Learning - Regression.ipynb ├── 07 Unsupervised Learning - Transformations and Dimensionality Reduction.ipynb ├── 08 Unsupervised Learning - Clustering.ipynb ├── 09 Review of Scikit-learn API.ipynb ├── 10 Case Study - Titanic Survival.ipynb ├── 11 Text Feature Extraction.ipynb ├── 12 Case Study - SMS Spam Detection.ipynb ├── 13 Cross Validation.ipynb ├── 14 Model Complexity and GridSearchCV.ipynb ├── 15 Pipelining Estimators.ipynb ├── 16 Performance metrics and Model Evaluation.ipynb ├── 17 In Depth - Linear Models.ipynb ├── 18 In Depth - Support Vector Machines.ipynb ├── 19 In Depth - Trees and Forests.ipynb ├── 20 Feature Selection.ipynb ├── 21 Unsupervised learning - Hierarchical and density-based clustering algorithms.ipynb ├── 22 Unsupervised learning - Non-linear dimensionality reduction.ipynb ├── 23 Out-of-core Learning Large Scale Text Classification.ipynb ├── datasets │ ├── smsspam │ │ ├── SMSSpamCollection │ │ └── readme │ └── titanic3.csv ├── figures │ ├── ML_flow_chart.py │ ├── __init__.py │ ├── average-per-class.png │ ├── bag_of_words.svg │ ├── check_env-1.png │ ├── cluster_comparison.png │ ├── clustering-linkage.png │ ├── cross_validation.svg │ ├── data_representation.svg │ ├── dbscan.png │ ├── feature_union.svg │ ├── grid_search_cross_validation.svg │ ├── hashing_vectorizer.svg │ ├── ipython_help-1.png │ ├── ipython_help-2.png │ ├── ipython_run_cell.png │ ├── iris_setosa.jpg │ ├── iris_versicolor.jpg │ ├── iris_virginica.jpg │ ├── overfitting_underfitting_cartoon.svg │ ├── petal_sepal.jpg │ ├── pipeline.svg │ ├── 
pipeline_cross_validation.svg │ ├── plot_2d_separator.py │ ├── plot_digits_dataset.py │ ├── plot_helpers.py │ ├── plot_interactive_forest.py │ ├── plot_interactive_tree.py │ ├── plot_kneigbors_regularization.png │ ├── plot_kneighbors_regularization.py │ ├── plot_linear_svc_regularization.py │ ├── plot_pca.py │ ├── plot_rbf_svm_parameters.py │ ├── plot_scaling.py │ ├── randomized_search.png │ ├── supervised_scikit_learn.png │ ├── supervised_workflow.svg │ ├── train_test_split.svg │ ├── train_test_split_matrix.svg │ ├── train_validation_test2.svg │ └── unsupervised_workflow.svg ├── helpers.py ├── images │ ├── parallel_text_clf.png │ └── parallel_text_clf_average.png └── solutions │ ├── 03A_faces_plot.py │ ├── 04_wrong-predictions.py │ ├── 05A_knn_with_diff_k.py │ ├── 06A_knn_vs_linreg.py │ ├── 07A_iris-pca.py │ ├── 08B_digits_clustering.py │ ├── 10_titanic.py │ ├── 11_ngrams.py │ ├── 13_cross_validation.py │ ├── 14_grid_search.py │ ├── 15A_ridge_grid.py │ ├── 16A_avg_per_class_acc.py │ └── 23_batchtrain.py ├── requirements.txt ├── slides ├── scipy2016.pdf └── scipy2016.pptx └── todo.rst /.gitignore: -------------------------------------------------------------------------------- 1 | # exlude datasets and externals 2 | notebooks/datasets 3 | notebooks/joblib/ 4 | 5 | # exclude temporary files 6 | .ipynb_checkpoints 7 | .DS_Store 8 | gmon.out 9 | __pycache__ 10 | *.pyc 11 | *.o 12 | *.so 13 | *.gcno 14 | *.swp 15 | *.egg-info 16 | *.egg 17 | *~ 18 | build 19 | dist 20 | lib/test 21 | doc/_build 22 | *env 23 | *ENV 24 | .idea 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an 
original work of 8 | authorship and/or a database (each, a "Work"). 9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. 
rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. 
Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. 
Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 114 | 115 | For more information, please see 116 | 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SciPy 2016 Scikit-learn Tutorial 2 | ================================ 3 | 4 | Based on the SciPy [2015 tutorial](https://github.com/amueller/scipy_2015_sklearn_tutorial) by [Kyle Kastner](https://kastnerkyle.github.io/) and [Andreas Mueller](http://amueller.github.io). 
5 | 6 | 7 | Instructors 8 | ----------- 9 | 10 | - [Sebastian Raschka](http://sebastianraschka.com) [@rasbt](https://twitter.com/rasbt) - Michigan State University, Computational Biology; [Book: Python Machine Learning](https://www.amazon.com/Python-Machine-Learning-Sebastian-Raschka/dp/1783555130/ref=sr_1_1?ie=UTF8&qid=1468347805&sr=8-1&keywords=sebastian+raschka) 11 | - [Andreas Mueller](http://amueller.github.io) [@amuellerml](https://twitter.com/t3kcit) - NYU Center for Data Science; [Book: Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do) 12 | 13 | --- 14 | 15 | **The video recording of the tutorial is now available via YouTube:** 16 | 17 | - [Part 1](https://www.youtube.com/watch?list=PLYx7XA2nY5Gf37zYZMw6OqGFRPjB1jCy6&v=OB1reY6IX-o) 18 | - [Part 2](https://www.youtube.com/watch?v=Cte8FYCpylk&list=PLYx7XA2nY5Gf37zYZMw6OqGFRPjB1jCy6&index=90) 19 | 20 | --- 21 | 22 | This repository will contain the teaching material and other info associated with our scikit-learn tutorial 23 | at [SciPy 2016](http://scipy2016.scipy.org/ehome/index.php?eventid=146062&tabid=332930&) held July 11-17 in Austin, Texas. 24 | 25 | Parts 1 to 5 make up the morning session, while 26 | parts 6 to 9 will be presented in the afternoon. 27 | 28 | ### Schedule: 29 | 30 | The 2-part tutorial will be held on Tuesday, July 12, 2016. 31 | 32 | - Parts 1 to 5: 8:00 AM - 12:00 PM (Room 105) 33 | - Parts 6 to 9: 1:30 PM - 5:30 PM (Room 105) 34 | 35 | (You can find the full SciPy 2016 tutorial schedule [here](http://scipy2016.scipy.org/ehome/146062/332960/).) 36 | 37 | 38 | 39 | Obtaining the Tutorial Material 40 | ------------------ 41 | 42 | 43 | If you have a GitHub account, it is probably most convenient if you fork the GitHub repository. 
If you don’t have a GitHub account, you can download the repository as a .zip file by heading over to the GitHub repository (https://github.com/amueller/scipy-2016-sklearn) in your browser and clicking the green “Download” button in the upper right. 44 | 45 | ![](images/download-repo.png) 46 | 47 | Please note that we may add and improve the material until shortly before the tutorial session, and we recommend that you update your copy of the materials one day before the tutorials. If you have a GitHub account and forked/cloned the repository via GitHub, you can sync your existing fork via the following commands: 48 | 49 | ``` 50 | git remote add upstream https://github.com/amueller/scipy-2016-sklearn.git 51 | git fetch upstream 52 | git checkout master && git merge upstream/master 53 | ``` 54 | 55 | If you don’t have a GitHub account, you may have to re-download the .zip archive from GitHub. 56 | 57 | 58 | Installation Notes 59 | ------------------ 60 | 61 | This tutorial will require recent installations of 62 | 63 | - [NumPy](http://www.numpy.org) 64 | - [SciPy](http://www.scipy.org) 65 | - [matplotlib](http://matplotlib.org) 66 | - [pillow](https://python-pillow.org) 67 | - [scikit-learn](http://scikit-learn.org/stable/) 68 | - [PyYaml](http://pyyaml.org/wiki/PyYAML) 69 | - [IPython](http://ipython.readthedocs.org/en/stable/) 70 | - [Jupyter Notebook](http://jupyter.org) 71 | - [Watermark](https://pypi.python.org/pypi/watermark) 72 | 73 | The Jupyter Notebook is particularly important; you should be able to type: 74 | 75 | jupyter notebook 76 | 77 | in your terminal window and see the notebook panel load in your web browser. 78 | Try opening and running a notebook from the material to check that it works. 
79 | 80 | For users who do not yet have these packages installed, a relatively 81 | painless way to install all the requirements is to use a Python distribution 82 | such as [Anaconda CE](http://store.continuum.io/ "Anaconda CE"), which includes 83 | the most relevant Python packages for science, math, engineering, and 84 | data analysis; Anaconda can be downloaded and installed for free 85 | including commercial use and redistribution. 86 | The code examples in this tutorial should be compatible with Python 2.7, 87 | Python 3.4, and Python 3.5. 88 | 89 | After obtaining the material, we **strongly recommend** that you open and execute the Jupyter Notebook 90 | `check_env.ipynb` that is located at the top level of this repository. You can open the notebook 91 | by executing 92 | 93 | ```bash 94 | jupyter notebook check_env.ipynb 95 | ``` 96 | 97 | inside this repository. Inside the Notebook, you can run the code cell by 98 | clicking on the "Run Cells" button as illustrated in the figure below: 99 | 100 | ![](images/check_env-1.png) 101 | 102 | 103 | Finally, if your environment satisfies the requirements for the tutorials, the executed code cell will produce an output message as shown below: 104 | 105 | ![](images/check_env-2.png) 106 | 107 | 108 | Although not required, we also recommend that you update the required Python packages to their latest versions to ensure best compatibility with the teaching material. Please upgrade already installed packages by executing 109 | 110 | - `pip install [package-name] --upgrade` 111 | - or `conda update [package-name]` 112 | 113 | 114 | 115 | Data Downloads 116 | -------------- 117 | 118 | The data for this tutorial is not included in the repository. We will be 119 | using several data sets during the tutorial: most are built-in to 120 | scikit-learn, which 121 | includes code that automatically downloads and caches these 122 | data. 
123 | 124 | **Because the wireless network 125 | at conferences can often be spotty, it would be a good idea to download these 126 | data sets before arriving at the conference. 127 | Please run ``python fetch_data.py`` to download all necessary data beforehand.** 128 | 129 | The download size of the data files is approx. 280 MB, and after `fetch_data.py` 130 | has extracted the data on your disk, the ./notebooks/datasets folder will take 480 MB 131 | of your local solid state or hard drive. 132 | 133 | 134 | Outline 135 | ======= 136 | 137 | Morning Session 138 | --------------- 139 | 140 | - 01 Introduction to machine learning with sample applications, Supervised and Unsupervised learning [[view](notebooks/01\ Introduction\ to\ Machine\ Learning.ipynb)] 141 | - 02 Scientific Computing Tools for Python: NumPy, SciPy, and matplotlib [[view](notebooks/02\ Scientific\ Computing\ Tools\ in\ Python.ipynb)] 142 | - 03 Data formats, preparation, and representation [[view](notebooks/03\ Data\ Representation\ for\ Machine\ Learning.ipynb)] 143 | - 04 Supervised learning: Training and test data [[view](notebooks/04\ Training\ and\ Testing\ Data.ipynb)] 144 | - 05 Supervised learning: Estimators for classification [[view](notebooks/05\ Supervised\ Learning\ -\ Classification.ipynb)] 145 | - 06 Supervised learning: Estimators for regression analysis [[view](notebooks/06\ Supervised\ Learning\ -\ Regression.ipynb)] 146 | - 07 Unsupervised learning: Unsupervised Transformers [[view](notebooks/07\ Unsupervised\ Learning\ -\ Transformations\ and\ Dimensionality\ Reduction.ipynb)] 147 | - 08 Unsupervised learning: Clustering [[view](notebooks/08\ Unsupervised\ Learning\ -\ Clustering.ipynb)] 148 | - 09 The scikit-learn estimator interface [[view](notebooks/09\ Review\ of\ Scikit-learn\ API.ipynb)] 149 | - 10 Preparing a real-world dataset (titanic) [[view](notebooks/10\ Case\ Study\ -\ Titanic\ Survival.ipynb)] 150 | - 11 Working with text data via the bag-of-words model 
[[view](notebooks/11\ Text\ Feature\ Extraction.ipynb)] 151 | - 12 Application: IMDb Movie Review Sentiment Analysis [[view](notebooks/12\ Case\ Study\ -\ SMS\ Spam\ Detection.ipynb)] 152 | 153 | Afternoon Session 154 | ----------------- 155 | 156 | - 13 Cross-Validation [[view](notebooks/13\ Cross\ Validation.ipynb)] 157 | - 14 Model complexity and grid search for adjusting hyperparameters [[view](notebooks/14\ Model\ Complexity\ and\ GridSearchCV.ipynb)] 158 | - 15 Scikit-learn Pipelines [[view](notebooks/15\ Pipelining\ Estimators.ipynb)] 159 | - 16 Supervised learning: Performance metrics for classification [[view](notebooks/16\ Performance\ metrics\ and\ Model\ Evaluation.ipynb)] 160 | - 17 Supervised learning: Linear Models [[view](notebooks/17\ In\ Depth\ -\ Linear\ Models.ipynb)] 161 | - 18 Supervised learning: Support Vector Machines [[view](notebooks/18\ In\ Depth\ -\ Support\ Vector\ Machines.ipynb)] 162 | - 19 Supervised learning: Decision trees and random forests, and ensemble methods [[view](notebooks/19\ In\ Depth\ -\ Trees\ and\ Forests.ipynb)] 163 | - 20 Supervised learning: feature selection [[view](notebooks/20\ Feature\ Selection.ipynb)] 164 | - 21 Unsupervised learning: Hierarchical and density-based clustering algorithms [[view](notebooks/21\ Unsupervised\ learning\ -\ Hierarchical\ and\ density-based\ clustering\ algorithms.ipynb)] 165 | - 22 Unsupervised learning: Non-linear dimensionality reduction [[view](notebooks/22\ Unsupervised\ learning\ -\ Non-linear\ dimensionality\ reduction.ipynb)] 166 | - 23 Supervised learning: Out-of-core learning [[view](notebooks/23\ Out-of-core\ Learning\ Large\ Scale\ Text\ Classification.ipynb)] 167 | -------------------------------------------------------------------------------- /abstract.rst: -------------------------------------------------------------------------------- 1 | Machine Learning with scikit-learn 2 | 3 | Tutorial Topic 4 | -------------- 5 | 6 | This tutorial aims to provide an 
introduction to machine learning and 7 | scikit-learn "from the ground up". We will start with core concepts of machine 8 | learning, some example uses of machine learning, and how to implement them 9 | using scikit-learn. Going in detail through the characteristics of several 10 | methods, we will discuss how to pick an algorithm for your application, how to 11 | set its parameters, and how to evaluate performance. 12 | 13 | Please provide a more detailed abstract of your tutorial (again, see last years tutorials). 14 | --------------------------------------------------------------------------------------------- 15 | 16 | Machine learning is the task of extracting knowledge from data, often with the 17 | goal of generalizing to new and unseen data. Applications of machine learning 18 | now touch nearly every aspect of everyday life, from the face detection in our 19 | phones and the streams of social media we consume to picking restaurants, 20 | partners, and movies. Machine learning has also become indispensable to many 21 | empirical sciences, from physics, astronomy and biology to social sciences. 22 | 23 | Scikit-learn has emerged as one of the most popular toolkits for machine 24 | learning, and is now widely used in industry and academia. 25 | The goal of this tutorial is to enable participants to use the wide variety of 26 | machine learning algorithms available in scikit-learn on their own data sets, 27 | for their own domains. 28 | 29 | This tutorial will comprise an introductory morning session and an advanced 30 | afternoon session. The morning part of the tutorial will cover basic concepts 31 | of machine learning, data representation, and preprocessing. We will explain 32 | different problem settings and which algorithms to use in each situation. 33 | We will then go through some sample applications using algorithms implemented 34 | in scikit-learn, including SVMs, Random Forests, K-Means, PCA, t-SNE, and 35 | others. 
36 | 37 | In the afternoon session, we will discuss setting hyper-parameters and how to 38 | prevent overfitting. We will go in-depth into the trade-off of model complexity 39 | and dataset size, as well as discussing complexity of learning algorithms and 40 | how to cope with very large datasets. The session will conclude by stepping 41 | through the process of building machine learning pipelines consisting of 42 | feature extraction, preprocessing and supervised learning. 43 | 44 | 45 | Outline 46 | ======== 47 | 48 | Morning Session 49 | ---------------- 50 | 51 | - Introduction to machine learning with sample applications 52 | 53 | - Types of machine learning: Unsupervised vs. supervised learning 54 | 55 | - Scientific Computing Tools for Python: NumPy, SciPy, and matplotlib 56 | 57 | - Data formats, preparation, and representation 58 | 59 | - Supervised learning: Training and test data 60 | - Supervised learning: The scikit-learn estimator interface 61 | - Supervised learning: Estimators for classification 62 | - Supervised learning: Estimators for regression analysis 63 | 64 | - Unsupervised learning: Unsupervised Transformers 65 | - Unsupervised learning: Preprocessing and scaling 66 | - Unsupervised learning: Feature extraction and dimensionality reduction 67 | - Unsupervised learning: Clustering 68 | 69 | - Preparing a real-world dataset 70 | - Working with text data via the bag-of-words model 71 | - Application: IMDB Movie Review Sentiment Analysis 72 | 73 | 74 | Afternoon Session 75 | ------------------ 76 | - Cross-Validation 77 | - Model Complexity: Overfitting and underfitting 78 | - Complexity of various model types 79 | - Grid search for adjusting hyperparameters 80 | 81 | - Scikit-learn Pipelines 82 | 83 | - Supervised learning: Performance metrics for classification 84 | - Supervised learning: Support Vector Machines 85 | - Supervised learning: Algorithm and model selection via nested cross-validation 86 | - Supervised learning: Decision trees and 
def import_version(pkg, min_ver, fail_msg=""):
    """Import *pkg*, print whether its version satisfies *min_ver*, return it.

    Parameters
    ----------
    pkg : str
        Importable module name (e.g. ``'numpy'``, ``'PIL'``).
    min_ver : str
        Minimum acceptable version; compared against the installed version
        wrapped in ``Version``.
    fail_msg : str, optional
        Extra text appended to the "not installed" message.

    Returns
    -------
    module or None
        The imported module, or ``None`` when the import failed.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    else:
        if pkg in {'PIL'}:
            # Classic PIL / early Pillow expose ``VERSION``; releases that
            # dropped it only provide ``__version__``, so fall back to that
            # instead of letting an AttributeError escape.
            ver = getattr(mod, 'VERSION', None) or mod.__version__
        else:
            ver = mod.__version__
        if Version(ver) < min_ver:
            # Report the package that was actually checked (``pkg``); the
            # previous code formatted the unrelated global loop variable
            # ``lib`` here.
            print(FAIL, "%s version %s or higher required, but %s installed."
                  % (pkg, min_ver, ver))
        else:
            print(OK, '%s version %s' % (pkg, ver))
    return mod
"""Download and unpack the datasets used by the tutorial notebooks.

Running this module as a script makes sure the ``notebooks/datasets`` folder
exists, fetches and extracts the IMDb archive into it, and pre-populates the
Labeled Faces in the Wild data through scikit-learn.
"""
import os
import shutil
import tarfile
import zipfile
from contextlib import closing

try:
    from urllib.request import urlopen
except ImportError:
    # Python 2 fallback
    from urllib import urlopen


IMDB_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
IMDB_ARCHIVE_NAME = "aclImdb_v1.tar.gz"


def get_datasets_folder():
    """Return the absolute path of ``notebooks/datasets``, creating it if needed.

    The folder is located relative to this file. If it does not exist yet,
    a ``notebooks/datasets.zip`` archive is extracted when present; otherwise
    an empty folder is created.

    Returns
    -------
    str
        Absolute path of the datasets folder.

    Raises
    ------
    AssertionError
        If extracting the archive did not produce the datasets folder.
    """
    here = os.path.dirname(__file__)
    notebooks = os.path.join(here, 'notebooks')
    datasets_folder = os.path.abspath(os.path.join(notebooks, 'datasets'))
    datasets_archive = os.path.abspath(os.path.join(notebooks, 'datasets.zip'))

    if os.path.exists(datasets_folder):
        print("Using existing dataset folder:" + datasets_folder)
        return datasets_folder

    if os.path.exists(datasets_archive):
        print("Extracting " + datasets_archive)
        # NOTE: extraction is relative to the current working directory, so
        # the archive layout and the cwd together must yield datasets_folder;
        # the assertion below verifies that they did.
        with zipfile.ZipFile(datasets_archive) as zf:
            zf.extractall('.')
        assert os.path.exists(datasets_folder)
    else:
        print("Creating datasets folder: " + datasets_folder)
        os.makedirs(datasets_folder)
    return datasets_folder


def check_imdb(datasets_folder):
    """Ensure the IMDb dataset is extracted below *datasets_folder*.

    When ``<datasets_folder>/IMDb`` is missing, the archive is downloaded
    from ``IMDB_URL`` (unless it is already present), extracted, and then
    deleted. Finally the ``aclImdb/train`` and ``aclImdb/test`` directories
    are checked for existence.

    Parameters
    ----------
    datasets_folder : str
        Folder that holds (or will hold) the archive and the ``IMDb``
        directory.

    Raises
    ------
    AssertionError
        If the train or test directory is missing after extraction.
    """
    print("\nChecking availability of the IMDb dataset")
    archive_path = os.path.join(datasets_folder, IMDB_ARCHIVE_NAME)
    imdb_path = os.path.join(datasets_folder, 'IMDb')

    train_path = os.path.join(imdb_path, 'aclImdb', 'train')
    test_path = os.path.join(imdb_path, 'aclImdb', 'test')

    if not os.path.exists(imdb_path):
        if not os.path.exists(archive_path):
            print("Downloading dataset from %s (84.1MB)" % IMDB_URL)
            # Stream to a temporary name and rename once complete, so an
            # interrupted download is never mistaken for a valid archive on
            # the next run, and the payload is not held in memory at once.
            partial_path = archive_path + '.part'
            with closing(urlopen(IMDB_URL)) as response:
                with open(partial_path, 'wb') as archive:
                    shutil.copyfileobj(response, archive)
            os.rename(partial_path, archive_path)
        else:
            print("Found archive: " + archive_path)

        print("Extracting %s to %s" % (archive_path, imdb_path))

        # NOTE(review): the archive is fetched over plain HTTP and extracted
        # without member filtering; only use with a trusted source.
        with tarfile.open(archive_path, "r:gz") as tar:
            tar.extractall(path=imdb_path)
        os.remove(archive_path)

    print("Checking that the IMDb train & test directories exist...")
    assert os.path.exists(train_path)
    assert os.path.exists(test_path)
    print("=> Success!")


if __name__ == "__main__":
    datasets_folder = get_datasets_folder()
    check_imdb(datasets_folder)

    print("\nLoading Labeled Faces Data (~200MB)")
    # Imported lazily so the helpers above stay usable without scikit-learn.
    from sklearn.datasets import fetch_lfw_people
    fetch_lfw_people(min_faces_per_person=70, resize=0.4,
                     data_home=datasets_folder)
    print("=> Success!")
"%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka'" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "The use of watermark (above) is optional, and we use it to keep track of the changes while developing the tutorial material. (You can install this IPython extension via \"pip install watermark\". For more information, please see: https://github.com/rasbt/watermark)." 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# SciPy 2016 Scikit-learn Tutorial" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# 01.1 Introduction to Machine Learning in Python" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## What is Machine Learning?" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Machine learning is the process of extracting knowledge from data automatically, usually with the goal of making predictions on new, unseen data. A classical example is a spam filter, for which the user keeps labeling incoming mails as either spam or not spam. A machine learning algorithm then \"learns\" a predictive model from data that distinguishes spam from normal emails, a model which can predict for new emails whether they are spam or not. \n", 48 | "\n", 49 | "Central to machine learning is the concept of **automating decision making** from data **without the user specifying explicit rules** how this decision should be made.\n", 50 | "\n", 51 | "For the case of emails, the user doesn't provide a list of words or characteristics that make an email spam. Instead, the user provides examples of spam and non-spam emails that are labeled as such.\n", 52 | "\n", 53 | "The second central concept is **generalization**. The goal of a machine learning model is to predict on new, previously unseen data. 
In a real-world application, we are not interested in marking an already labeled email as spam or not. Instead, we want to make the user's life easier by automatically classifying new incoming mail." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "The data is presented to the algorithm usually as a two-dimensional array (or matrix) of numbers. Each data point (also known as a *sample* or *training instance*) that we want to either learn from or make a decision on is represented as a list of numbers, a so-called feature vector, and its containing features represent the properties of this point. \n", 68 | "\n", 69 | "Later, we will lay our hands on a popular dataset called *Iris* -- among many other datasets. Iris, a classic benchmark dataset in the field of machine learning, contains the measurements of 150 iris flowers from 3 different species: Iris-Setosa, Iris-Versicolor, and Iris-Virginica. \n", 70 | "\n", 71 | "\n", 72 | "Iris Setosa\n", 73 | "\n", 74 | "\n", 75 | "Iris Versicolor\n", 76 | "\n", 77 | "\n", 78 | "Iris Virginica\n", 79 | "\n", 80 | "\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n", 88 | "We represent each flower sample as one row in our data array, and the columns (features) represent the flower measurements in centimeters. 
For instance, we can represent this Iris dataset, consisting of 150 samples and 4 features, as a 2-dimensional array or matrix $\\mathbb{R}^{150 \\times 4}$ in the following format:\n", 89 | "\n", 90 | "\n", 91 | "$$\\mathbf{X} = \\begin{bmatrix}\n", 92 | " x_{1}^{(1)} & x_{2}^{(1)} & x_{3}^{(1)} & x_{4}^{(1)} \\\\\n", 93 | " x_{1}^{(2)} & x_{2}^{(2)} & x_{3}^{(2)} & x_{4}^{(2)} \\\\\n", 94 | " \\vdots & \\vdots & \\vdots & \\vdots \\\\\n", 95 | " x_{1}^{(150)} & x_{2}^{(150)} & x_{3}^{(150)} & x_{4}^{(150)}\n", 96 | "\\end{bmatrix}.\n", 97 | "$$\n", 98 | "\n", 99 | "(The superscript denotes the *i*th row, and the subscript denotes the *j*th feature, respectively.)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "There are two kinds of machine learning we will talk about today: ***supervised learning*** and ***unsupervised learning***." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Supervised Learning: Classification and regression\n", 114 | "\n", 115 | "In **Supervised Learning**, we have a dataset consisting of both input features and a desired output, such as in the spam / no-spam example.\n", 116 | "The task is to construct a model (or program) which is able to predict the desired output of an unseen object\n", 117 | "given the set of features.\n", 118 | "\n", 119 | "Some more complicated examples are:\n", 120 | "\n", 121 | "- Given a multicolor image of an object through a telescope, determine\n", 122 | " whether that object is a star, a quasar, or a galaxy.\n", 123 | "- Given a photograph of a person, identify the person in the photo.\n", 124 | "- Given a list of movies a person has watched and their personal rating\n", 125 | " of the movie, recommend a list of movies they would like.\n", 126 | "- Given a person's age, education and position, infer their salary\n", 127 | "\n", 128 | "What these tasks have in 
common is that there is one or more unknown\n", 129 | "quantities associated with the object which needs to be determined from other\n", 130 | "observed quantities.\n", 131 | "\n", 132 | "Supervised learning is further broken down into two categories, **classification** and **regression**:\n", 133 | "\n", 134 | "- **In classification, the label is discrete**, such as \"spam\" or \"no spam\". In other words, it provides a clear-cut distinction between categories. Furthermore, it is important to note that class labels are nominal, not ordinal variables. Nominal and ordinal variables are both subcategories of categorical variable. Ordinal variables imply an order, for example, T-shirt sizes \"XL > L > M > S\". On the contrary, nominal variables don't imply an order, for example, we (usually) can't assume \"orange > blue > green\".\n", 135 | "\n", 136 | "\n", 137 | "- **In regression, the label is continuous**, that is a float output. For example,\n", 138 | "in astronomy, the task of determining whether an object is a star, a galaxy, or a quasar is a\n", 139 | "classification problem: the label is from three distinct categories. On the other hand, we might\n", 140 | "wish to estimate the age of an object based on such observations: this would be a regression problem,\n", 141 | "because the label (age) is a continuous quantity.\n", 142 | "\n", 143 | "In supervised learning, there is always a distinction between a **training set** for which the desired outcome is given, and a **test set** for which the desired outcome needs to be inferred. 
The learning algorithm fits the predictive model to the training set, and we use the test set to evaluate its generalization performance.\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### Unsupervised Learning\n", 151 | "\n", 152 | "In **Unsupervised Learning** there is no desired output associated with the data.\n", 153 | "Instead, we are interested in extracting some form of knowledge or model from the given data.\n", 154 | "In a sense, you can think of unsupervised learning as a means of discovering labels from the data itself.\n", 155 | "Unsupervised learning is often harder to understand and to evaluate.\n", 156 | "\n", 157 | "Unsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and\n", 158 | "*density estimation*. For example, in the iris data discussed above, we can use unsupervised\n", 159 | "methods to determine combinations of the measurements which best display the structure of the\n", 160 | "data. As we’ll see below, such a projection of the data can be used to visualize the\n", 161 | "four-dimensional dataset in two dimensions. 
Some more involved unsupervised learning problems are:\n", 162 | "\n", 163 | "- Given detailed observations of distant galaxies, determine which features or combinations of\n", 164 | " features summarize best the information.\n", 165 | "- Given a mixture of two sound sources (for example, a person talking over some music),\n", 166 | " separate the two (this is called the [blind source separation](http://en.wikipedia.org/wiki/Blind_signal_separation) problem).\n", 167 | "- Given a video, isolate a moving object and categorize in relation to other moving objects which have been seen.\n", 168 | "- Given a large collection of news articles, find recurring topics inside these articles.\n", 169 | "- Given a collection of images, cluster similar images together (for example to group them when visualizing a collection)\n", 170 | "\n", 171 | "Sometimes the two may even be combined: e.g. unsupervised learning can be used to find useful\n", 172 | "features in heterogeneous data, and then these features can be used within a supervised\n", 173 | "framework." 
174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.5.1" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 0 198 | } 199 | -------------------------------------------------------------------------------- /notebooks/04 Training and Testing Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 15 | "last updated: 2016-07-11 \n", 16 | "\n", 17 | "CPython 3.5.1\n", 18 | "IPython 4.1.2\n", 19 | "\n", 20 | "numpy 1.11.0\n", 21 | "scipy 0.17.1\n", 22 | "matplotlib 1.5.1\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "%load_ext watermark\n", 28 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "import numpy as np" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# SciPy 2016 Scikit-learn Tutorial" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Training and Testing Data\n", 56 | "=====================================\n", 57 | "\n", 58 | "To evaluate how well our supervised 
models generalize, we can split our data into a training and a test set:\n", 59 | "\n", 60 | "" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "from sklearn.datasets import load_iris\n", 72 | "from sklearn.neighbors import KNeighborsClassifier\n", 73 | "\n", 74 | "iris = load_iris()\n", 75 | "X, y = iris.data, iris.target\n", 76 | "\n", 77 | "classifier = KNeighborsClassifier()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Thinking about how machine learning is normally performed, the idea of a train/test split makes sense. Real world systems train on the data they have, and as other data comes in (from customers, sensors, or other sources) the classifier that was trained must predict on fundamentally *new* data. We can simulate this during training using a train/test split - the test data is a simulation of \"future data\" which will come into the system during production. \n", 85 | "\n", 86 | "Specifically for iris, the 150 labels in iris are sorted, which means that if we split the data using a proportional split, this will result in fundamentally altered class distributions. For instance, if we'd perform a common 2/3 training data and 1/3 test data split, our training dataset will only consist of flower classes 0 and 1 (Setosa and Versicolor), and our test set will only contain samples with class label 2 (Virginica flowers).\n", 87 | "\n", 88 | "Under the assumption that all samples are independent of each other (in contrast to time series data), we want to **randomly shuffle the dataset before we split the dataset** as illustrated above." 
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "y" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Now we need to split the data into training and testing. Luckily, this is a common pattern in machine learning and scikit-learn has a pre-built function to split data into training and testing sets for you. Here, we use 50% of the data as training, and 50% testing. 80% and 20% is another common split, but there are no hard and fast rules. The most important thing is to fairly evaluate your system on data it *has not* seen during training!" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "from sklearn.cross_validation import train_test_split\n", 118 | "\n", 119 | "train_X, test_X, train_y, test_y = train_test_split(X, y, \n", 120 | " train_size=0.5, \n", 121 | " random_state=123)\n", 122 | "print(\"Labels for training and testing data\")\n", 123 | "print(train_y)\n", 124 | "print(test_y)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "---" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "**Tip: Stratified Split**\n", 139 | "\n", 140 | "Especially for relatively small datasets, it's better to stratify the split. Stratification means that we maintain the original class proportion of the dataset in the test and training sets. 
For example, after we randomly split the dataset as shown in the previous code example, we have the following class proportions in percent:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "print('All:', np.bincount(y) / float(len(y)) * 100.0)\n", 152 | "print('Training:', np.bincount(train_y) / float(len(train_y)) * 100.0)\n", 153 | "print('Test:', np.bincount(test_y) / float(len(test_y)) * 100.0)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "So, in order to stratify the split, we can pass the label array as an additional option to the `train_test_split` function:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "train_X, test_X, train_y, test_y = train_test_split(X, y, \n", 172 | " train_size=0.5, \n", 173 | " random_state=123,\n", 174 | " stratify=y)\n", 175 | "\n", 176 | "print('All:', np.bincount(y) / float(len(y)) * 100.0)\n", 177 | "print('Training:', np.bincount(train_y) / float(len(train_y)) * 100.0)\n", 178 | "print('Test:', np.bincount(test_y) / float(len(test_y)) * 100.0)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "---" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "By evaluating our classifier performance on data that has been seen during training, we could get false confidence in the predictive power of our model. 
In the worst case, it may simply memorize the training samples but completely fail to classify new, similar samples -- we really don't want to put such a system into production!\n", 193 | "\n", 194 | "Instead of using the same dataset for training and testing (this is called \"resubstitution evaluation\"), it is much much better to use a train/test split in order to estimate how well your trained model is doing on new data." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "classifier.fit(train_X, train_y)\n", 206 | "pred_y = classifier.predict(test_X)\n", 207 | "\n", 208 | "print(\"Fraction Correct [Accuracy]:\")\n", 209 | "print(np.sum(pred_y == test_y) / float(len(test_y)))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "We can also visualize the correct and failed predictions" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "print('Samples correctly classified:')\n", 228 | "correct_idx = np.where(pred_y == test_y)[0]\n", 229 | "print(correct_idx)\n", 230 | "\n", 231 | "print('\\nSamples incorrectly classified:')\n", 232 | "incorrect_idx = np.where(pred_y != test_y)[0]\n", 233 | "print(incorrect_idx)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "# Plot two dimensions\n", 245 | "\n", 246 | "colors = [\"darkblue\", \"darkgreen\", \"gray\"]\n", 247 | "\n", 248 | "for n, color in enumerate(colors):\n", 249 | " idx = np.where(test_y == n)[0]\n", 250 | " plt.scatter(test_X[idx, 1], test_X[idx, 2], color=color, label=\"Class %s\" % str(n))\n", 251 | "\n", 252 | "plt.scatter(test_X[incorrect_idx, 1], 
test_X[incorrect_idx, 2], color=\"darkred\")\n", 253 | "\n", 254 | "plt.xlabel('sepal width [cm]')\n", 255 | "plt.ylabel('petal length [cm]')\n", 256 | "plt.legend(loc=3)\n", 257 | "plt.title(\"Iris Classification results\")\n", 258 | "plt.show()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "We can see that the errors occur in the area where green (class 1) and gray (class 2) overlap. This gives us insight about what features to add - any feature which helps separate class 1 and class 2 should improve classifier performance." 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "# Exercise" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Print the true labels of 3 wrong predictions and modify the scatterplot code, which we used above, to visualize and distinguish these three samples with different markers in the 2D scatterplot. Can you explain why our classifier made these wrong predictions?" 
280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": false 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "# %load solutions/04_wrong-predictions.py" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.5.1" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 0 315 | } 316 | -------------------------------------------------------------------------------- /notebooks/06 Supervised Learning - Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 15 | "last updated: 2016-07-11 \n", 16 | "\n", 17 | "CPython 3.5.1\n", 18 | "IPython 4.1.2\n", 19 | "\n", 20 | "numpy 1.11.0\n", 21 | "scipy 0.17.1\n", 22 | "matplotlib 1.5.1\n", 23 | "scikit-learn 0.17.1\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%load_ext watermark\n", 29 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%matplotlib inline\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import numpy as np" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | 
"source": [ 49 | "# SciPy 2016 Scikit-learn Tutorial" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# Supervised Learning Part 2 -- Regression Analysis" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "In regression we are trying to predict a continuous output variable -- in contrast to the nominal variables we were predicting in the previous classification examples. \n", 64 | "\n", 65 | "Let's start with a simple toy example with one feature dimension (explanatory variable) and one target variable. We will create a dataset out of a sine curve with some noise:" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "x = np.linspace(-3, 3, 100)\n", 77 | "print(x)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "rng = np.random.RandomState(42)\n", 89 | "y = np.sin(4 * x) + x + rng.uniform(size=len(x))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "plt.plot(x, y, 'o');" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Linear Regression\n", 108 | "=================\n", 109 | "\n", 110 | "The first model that we will introduce is the so-called simple linear regression. Here, we want to fit a line to the data.\n", 111 | "\n", 112 | "One of the simplest models again is a linear one, that simply tries to predict the data as lying on a line. 
One way to find such a line is `LinearRegression` (also known as [*Ordinary Least Squares (OLS)*](https://en.wikipedia.org/wiki/Ordinary_least_squares) regression).\n", 113 | "The interface for LinearRegression is exactly the same as for the classifiers before, only that ``y`` now contains float values, instead of classes." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "As we remember, the scikit-learn API requires us to provide the target variable (`y`) as a 1-dimensional array; scikit-learn's API expects the samples (`X`) in the form of a 2-dimensional array -- even though it may only consist of 1 feature. Thus, let us convert the 1-dimensional `x` NumPy array into an `X` array with 2 axes:\n" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "print('Before: ', x.shape)\n", 132 | "X = x[:, np.newaxis]\n", 133 | "print('After: ', X.shape)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Again, we start by splitting our dataset into a training (75%) and a test set (25%):" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "from sklearn.cross_validation import train_test_split\n", 152 | "\n", 153 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "Next, we use the learning algorithm implemented in `LinearRegression` to **fit a regression model to the training data**:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "from 
sklearn.linear_model import LinearRegression\n", 172 | "\n", 173 | "regressor = LinearRegression()\n", 174 | "regressor.fit(X_train, y_train)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "After fitting to the training data, we parameterized a linear regression model with the following values." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "print('Weight coefficients: ', regressor.coef_)\n", 193 | "print('y-axis intercept: ', regressor.intercept_)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Since our regression model is a linear one, the relationship between the target variable (y) and the feature variable (x) is defined as \n", 201 | "\n", 202 | "$$y = \\text{weight} \\times x + \\text{intercept}.$$\n", 203 | "\n", 204 | "Plugging in the min and max values into this equation, we can plot the regression fit to our training data:" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "min_pt = X.min() * regressor.coef_[0] + regressor.intercept_\n", 216 | "max_pt = X.max() * regressor.coef_[0] + regressor.intercept_\n", 217 | "\n", 218 | "plt.plot([X.min(), X.max()], [min_pt, max_pt])\n", 219 | "plt.plot(X_train, y_train, 'o');" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Similar to the estimators for classification in the previous notebook, we use the `predict` method to predict the target variable. 
And we expect these predicted values to fall onto the line that we plotted previously:" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "y_pred_train = regressor.predict(X_train)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "plt.plot(X_train, y_train, 'o', label=\"data\")\n", 249 | "plt.plot(X_train, y_pred_train, 'o', label=\"prediction\")\n", 250 | "plt.plot([X.min(), X.max()], [min_pt, max_pt], label='fit')\n", 251 | "plt.legend(loc='best')" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "As we can see in the plot above, the line is able to capture the general slope of the data, but not many details." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "Next, let's try the test set:" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "y_pred_test = regressor.predict(X_test)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "plt.plot(X_test, y_test, 'o', label=\"data\")\n", 288 | "plt.plot(X_test, y_pred_test, 'o', label=\"prediction\")\n", 289 | "plt.plot([X.min(), X.max()], [min_pt, max_pt], label='fit')\n", 290 | "plt.legend(loc='best');" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "Again, scikit-learn provides an easy way to evaluate the prediction quantitatively using the ``score`` method. For regression tasks, this is the R2 score. 
Another popular way would be the Mean Squared Error (MSE). As its name implies, the MSE is simply the average squared difference over the predicted and actual target values\n", 298 | "\n", 299 | "$$MSE = \\frac{1}{n} \\sum^{n}_{i=1} (\\text{predicted}_i - \\text{true}_i)^2$$" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "regressor.score(X_test, y_test)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "KNeighborsRegression\n", 318 | "=======================\n", 319 | "As for classification, we can also use a neighbor based method for regression. We can simply take the output of the nearest point, or we could average several nearest points. This method is less popular for regression than for classification, but still a good baseline." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "collapsed": false 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "from sklearn.neighbors import KNeighborsRegressor\n", 331 | "kneighbor_regression = KNeighborsRegressor(n_neighbors=1)\n", 332 | "kneighbor_regression.fit(X_train, y_train)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "Again, let us look at the behavior on training and test set:" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "y_pred_train = kneighbor_regression.predict(X_train)\n", 351 | "\n", 352 | "plt.plot(X_train, y_train, 'o', label=\"data\", markersize=10)\n", 353 | "plt.plot(X_train, y_pred_train, 's', label=\"prediction\", markersize=4)\n", 354 | "plt.legend(loc='best');" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | 
"On the training set, we do a perfect job: each point is its own nearest neighbor!" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "y_pred_test = kneighbor_regression.predict(X_test)\n", 373 | "\n", 374 | "plt.plot(X_test, y_test, 'o', label=\"data\", markersize=8)\n", 375 | "plt.plot(X_test, y_pred_test, 's', label=\"prediction\", markersize=4)\n", 376 | "plt.legend(loc='best');" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "On the test set, we also do a better job of capturing the variation, but our estimates look much messier than before.\n", 384 | "Let us look at the R2 score:" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "kneighbor_regression.score(X_test, y_test)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "Much better than before! Here, the linear model was not a good fit for our problem; it was lacking in complexity and thus under-fit our data." 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "Exercise\n", 410 | "=========\n", 411 | "Compare the KNeighborsRegressor and LinearRegression on the boston housing dataset. You can load the dataset using ``sklearn.datasets.load_boston``. You can learn about the dataset by reading the ``DESCR`` attribute." 
412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": { 418 | "collapsed": true 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "# %load solutions/06A_knn_vs_linreg.py" 423 | ] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 3", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.5.1" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 0 447 | } 448 | -------------------------------------------------------------------------------- /notebooks/09 Review of Scikit-learn API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 15 | "last updated: 2016-06-23 \n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "%load_ext watermark\n", 21 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# SciPy 2016 Scikit-learn Tutorial" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# A recap on Scikit-learn's estimator interface" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "source": [ 44 | "Scikit-learn strives to have a uniform interface across all methods. 
Given a scikit-learn *estimator*\n", 45 | "object named `model`, the following methods are available (not all for each model):\n", 46 | "\n", 47 | "- Available in **all Estimators**\n", 48 | " + `model.fit()` : fit training data. For supervised learning applications,\n", 49 | " this accepts two arguments: the data `X` and the labels `y` (e.g. `model.fit(X, y)`).\n", 50 | " For unsupervised learning applications, `fit` takes only a single argument,\n", 51 | " the data `X` (e.g. `model.fit(X)`).\n", 52 | "- Available in **supervised estimators**\n", 53 | " + `model.predict()` : given a trained model, predict the label of a new set of data.\n", 54 | " This method accepts one argument, the new data `X_new` (e.g. `model.predict(X_new)`),\n", 55 | " and returns the learned label for each object in the array.\n", 56 | " + `model.predict_proba()` : For classification problems, some estimators also provide\n", 57 | " this method, which returns the probability that a new observation has each categorical label.\n", 58 | " In this case, the label with the highest probability is returned by `model.predict()`.\n", 59 | " + `model.decision_function()` : For classification problems, some estimators provide an uncertainty estimate that is not a probability. For binary classification, a decision_function >= 0 means the positive class will be predicted, while < 0 means the negative class.\n", 60 | " + `model.score()` : for classification or regression problems, most (all?) estimators implement\n", 61 | " a score method. A larger score indicates a better fit. For classifiers, the `score` method computes the prediction accuracy, which lies between 0 and 1. For regressors, `score` computes the coefficient of determination (R2) of the prediction, which is at most 1 and can be negative for a model that fits worse than always predicting the mean.\n", 62 | " + `model.transform()` : For feature selection algorithms, this will reduce the dataset to the selected features. 
For some classification and regression models such as some linear models and random forests, this method reduces the dataset to the most informative features. These classification and regression models can therefore also be used as feature selection methods.\n", 63 | " \n", 64 | "- Available in **unsupervised estimators**\n", 65 | " + `model.transform()` : given an unsupervised model, transform new data into the new basis.\n", 66 | " This also accepts one argument `X_new`, and returns the new representation of the data based\n", 67 | " on the unsupervised model.\n", 68 | " + `model.fit_transform()` : some estimators implement this method,\n", 69 | " which more efficiently performs a fit and a transform on the same input data.\n", 70 | " + `model.predict()` : for clustering algorithms, the predict method will produce cluster labels for new data points. Not all clustering methods have this functionality.\n", 71 | " + `model.predict_proba()` : Gaussian mixture models (GMMs) provide the probability for each point to be generated by a given mixture component.\n", 72 | " + `model.score()` : Density models like KDE and GMMs provide the likelihood of the data under the model." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "source": [ 81 | "Apart from ``fit``, the two most important functions are arguably ``predict``, which produces a target variable (a ``y``), and ``transform``, which produces a new representation of the data (an ``X``).\n", 82 | "The following table shows which function applies to which class of models:\n", 83 | "\n" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "\n", 91 | "\n", 92 | "\n", 93 | "\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "
| ``model.predict`` | ``model.transform`` |
|---|---|
| Classification | Preprocessing |
| Regression | Dimensionality Reduction |
| Clustering | Feature Extraction |
| &nbsp; | Feature selection |
\n", 98 | "\n", 99 | "\n" 100 | ] 101 | } 102 | ], 103 | "metadata": { 104 | "kernelspec": { 105 | "display_name": "Python 3", 106 | "language": "python", 107 | "name": "python3" 108 | }, 109 | "language_info": { 110 | "codemirror_mode": { 111 | "name": "ipython", 112 | "version": 3 113 | }, 114 | "file_extension": ".py", 115 | "mimetype": "text/x-python", 116 | "name": "python", 117 | "nbconvert_exporter": "python", 118 | "pygments_lexer": "ipython3", 119 | "version": "3.5.1" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 0 124 | } 125 | -------------------------------------------------------------------------------- /notebooks/12 Case Study - SMS Spam Detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 15 | "last updated: 2016-07-11 \n", 16 | "\n", 17 | "CPython 3.5.1\n", 18 | "IPython 4.1.2\n", 19 | "\n", 20 | "numpy 1.11.0\n", 21 | "scipy 0.17.1\n", 22 | "matplotlib 1.5.1\n", 23 | "scikit-learn 0.17.1\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%load_ext watermark\n", 29 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%matplotlib inline\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import numpy as np" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# SciPy 2016 Scikit-learn Tutorial" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# Case Study - Text classification for SMS spam detection" 
57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "We first load the text data from the `datasets` directory that should be located in your notebooks directory, which we created by running the `fetch_data.py` script from the top level of the GitHub repository.\n", 64 | "\n", 65 | "Furthermore, we perform some simple preprocessing and split the data array into two parts:\n", 66 | "\n", 67 | "1. `text`: A list of strings, where each string contains the text of one SMS message\n", 68 | "2. `y`: our SPAM vs HAM labels stored in binary; a 1 represents a spam message, and a 0 represents a ham (non-spam) message. " 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "import os\n", 80 | "\n", 81 | "with open(os.path.join(\"datasets\", \"smsspam\", \"SMSSpamCollection\")) as f:\n", 82 | " lines = [line.strip().split(\"\\t\") for line in f.readlines()]\n", 83 | "\n", 84 | "text = [x[1] for x in lines]\n", 85 | "y = [int(x[0] == \"spam\") for x in lines]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false, 93 | "scrolled": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "text[:10]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false, 105 | "scrolled": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "y[:10]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "print('Number of ham and spam messages:', np.bincount(y))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "type(text)" 132 | ] 133 | }, 134 | { 
135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "type(y)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Next, we split our dataset into 2 parts, the test and training dataset:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "from sklearn.cross_validation import train_test_split\n", 161 | "\n", 162 | "text_train, text_test, y_train, y_test = train_test_split(text, y, \n", 163 | " random_state=42,\n", 164 | " test_size=0.25,\n", 165 | " stratify=y)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Now, we use the CountVectorizer to parse the text data into a bag-of-words model." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "print('CountVectorizer defaults')\n", 184 | "CountVectorizer()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "from sklearn.feature_extraction.text import CountVectorizer\n", 196 | "\n", 197 | "vectorizer = CountVectorizer()\n", 198 | "vectorizer.fit(text_train)\n", 199 | "\n", 200 | "X_train = vectorizer.transform(text_train)\n", 201 | "X_test = vectorizer.transform(text_test)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": false, 209 | "scrolled": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "print(len(vectorizer.vocabulary_))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | 
"collapsed": false 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "X_train.shape" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "print(vectorizer.get_feature_names()[:20])\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "print(vectorizer.get_feature_names()[2000:2020])" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "print(X_train.shape)\n", 258 | "print(X_test.shape)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "### Training a Classifier on Text Features" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "We can now train a classifier, for instance a logistic regression classifier which is a fast baseline for text classification tasks:" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "collapsed": false 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "from sklearn.linear_model import LogisticRegression\n", 284 | "\n", 285 | "clf = LogisticRegression()\n", 286 | "clf" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "clf.fit(X_train, y_train)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "We can now evaluate the classifier on the testing set. 
Let's first use the built-in score function, which is the rate of correct classification in the test set:" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "clf.score(X_test, y_test)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "We can also compute the score on the training set, to see how well we do there:" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "clf.score(X_train, y_train)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "# Visualizing important features" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "def visualize_coefficients(classifier, feature_names, n_top_features=25):\n", 352 | " # get coefficients with large absolute values \n", 353 | " coef = classifier.coef_.ravel()\n", 354 | " positive_coefficients = np.argsort(coef)[-n_top_features:]\n", 355 | " negative_coefficients = np.argsort(coef)[:n_top_features]\n", 356 | " interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n", 357 | " # plot them\n", 358 | " plt.figure(figsize=(15, 5))\n", 359 | " colors = [\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]]\n", 360 | " plt.bar(np.arange(50), coef[interesting_coefficients], color=colors)\n", 361 | " feature_names = np.array(feature_names)\n", 362 | " plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | 
"outputs": [], 372 | "source": [ 373 | "visualize_coefficients(clf, vectorizer.get_feature_names())\n" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "vectorizer = CountVectorizer(min_df=2)\n", 385 | "vectorizer.fit(text_train)\n", 386 | "\n", 387 | "X_train = vectorizer.transform(text_train)\n", 388 | "X_test = vectorizer.transform(text_test)\n", 389 | "\n", 390 | "clf = LogisticRegression()\n", 391 | "clf.fit(X_train, y_train)\n", 392 | "\n", 393 | "print(clf.score(X_train, y_train))\n", 394 | "print(clf.score(X_test, y_test))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "visualize_coefficients(clf, vectorizer.get_feature_names())" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "# Exercises\n", 420 | "\n", 421 | "Use TfidfVectorizer instead of CountVectorizer. Are the results better? How are the coefficients different?\n", 422 | "\n", 423 | "Change the parameters min_df and ngram_range of the TfidfVectorizer and CountVectorizer. 
How does that change the important features?\n", 424 | "\n" 425 | ] 426 | } 427 | ], 428 | "metadata": { 429 | "kernelspec": { 430 | "display_name": "Python 3", 431 | "language": "python", 432 | "name": "python3" 433 | }, 434 | "language_info": { 435 | "codemirror_mode": { 436 | "name": "ipython", 437 | "version": 3 438 | }, 439 | "file_extension": ".py", 440 | "mimetype": "text/x-python", 441 | "name": "python", 442 | "nbconvert_exporter": "python", 443 | "pygments_lexer": "ipython3", 444 | "version": "3.5.1" 445 | } 446 | }, 447 | "nbformat": 4, 448 | "nbformat_minor": 0 449 | } 450 | -------------------------------------------------------------------------------- /notebooks/13 Cross Validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "The watermark extension is already loaded. 
To reload it, use:\n", 15 | " %reload_ext watermark\n", 16 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 17 | "last updated: 2016-07-11 \n", 18 | "\n", 19 | "CPython 3.5.1\n", 20 | "IPython 4.1.2\n", 21 | "\n", 22 | "numpy 1.11.0\n", 23 | "scipy 0.17.1\n", 24 | "matplotlib 1.5.1\n", 25 | "scikit-learn 0.17.1\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "%load_ext watermark\n", 31 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# SciPy 2016 Scikit-learn Tutorial" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Cross-Validation and scoring methods" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "In the previous sections and notebooks, we split our dataset into two parts, a training set and a test set. We used the training set to fit our model, and we used the test set to evaluate its generalization performance -- how well it performs on new, unseen data.\n", 53 | "\n", 54 | "\n", 55 | "\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "However, often (labeled) data is precious, and this approach lets us only use ~ 3/4 of our data for training. On the other hand, we will only ever try to apply our model to 1/4 of our data for testing.\n", 63 | "A common way to use more of the data to build a model, but also get a more robust estimate of the generalization performance, is cross-validation.\n", 64 | "In cross-validation, the data is split repeatedly into non-overlapping training and test sets, with a separate model built for every pair. 
The test-set scores are then aggregated for a more robust estimate.\n", 65 | "\n", 66 | "The most common way to do cross-validation is k-fold cross-validation, in which the data is first split into k (often 5 or 10) equal-sized folds, and then for each iteration, one of the k folds is used as test data, and the rest as training data:" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "This way, each data point will be in the test-set exactly once, and we can use all but a k'th of the data for training.\n", 81 | "Let us apply this technique to evaluate the KNeighborsClassifier algorithm on the Iris dataset:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.datasets import load_iris\n", 93 | "from sklearn.neighbors import KNeighborsClassifier\n", 94 | "\n", 95 | "iris = load_iris()\n", 96 | "X, y = iris.data, iris.target\n", 97 | "\n", 98 | "classifier = KNeighborsClassifier()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "The labels in iris are sorted, which means that if we split the data as illustrated above, the first fold will only have the label 0 in it, while the last one will only have the label 2:" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "y" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "To avoid this problem in evaluation, we first shuffle our data:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | 
"import numpy as np\n", 135 | "rng = np.random.RandomState(0)\n", 136 | "\n", 137 | "permutation = rng.permutation(len(X))\n", 138 | "X, y = X[permutation], y[permutation]\n", 139 | "print(y)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Now implementing cross-validation is easy:" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "k = 5\n", 158 | "n_samples = len(X)\n", 159 | "fold_size = n_samples // k\n", 160 | "scores = []\n", 161 | "masks = []\n", 162 | "for fold in range(k):\n", 163 | " # generate a boolean mask for the test set in this fold\n", 164 | " test_mask = np.zeros(n_samples, dtype=bool)\n", 165 | " test_mask[fold * fold_size : (fold + 1) * fold_size] = True\n", 166 | " # store the mask for visualization\n", 167 | " masks.append(test_mask)\n", 168 | " # create training and test sets using this mask\n", 169 | " X_test, y_test = X[test_mask], y[test_mask]\n", 170 | " X_train, y_train = X[~test_mask], y[~test_mask]\n", 171 | " # fit the classifier\n", 172 | " classifier.fit(X_train, y_train)\n", 173 | " # compute the score and record it\n", 174 | " scores.append(classifier.score(X_test, y_test))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Let's check that our test mask does the right thing:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "import matplotlib.pyplot as plt\n", 193 | "%matplotlib inline\n", 194 | "plt.matshow(masks)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "And now let's look a the scores we computed:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 
208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "print(scores)\n", 213 | "print(np.mean(scores))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "As you can see, there is a rather wide spectrum of scores from 90% correct to 100% correct. If we only did a single split, we might have gotten either answer." 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "As cross-validation is such a common pattern in machine learning, there are functions to do the above for you with much more flexibility and less code.\n", 228 | "The ``sklearn.cross_validation`` module has all functions related to cross validation. The easiest function to use is ``cross_val_score``, which takes an estimator and a dataset, and will do all of the splitting for you:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "from sklearn.cross_validation import cross_val_score\n", 240 | "scores = cross_val_score(classifier, X, y)\n", 241 | "print(scores)\n", 242 | "print(np.mean(scores))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "As you can see, the function uses three folds by default. 
You can change the number of folds using the cv argument:" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "cross_val_score(classifier, X, y, cv=5)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "There are also helper objects in the cross-validation module that will generate indices for you for all kinds of different cross-validation methods, including k-fold:" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "from sklearn.cross_validation import KFold, StratifiedKFold, ShuffleSplit, LeavePLabelOut" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "By default, cross_val_score will use ``StratifiedKFold`` for classification, which ensures that the class proportions in the dataset are reflected in each fold. 
If you have a binary classification dataset with 90% of data point belonging to class 0, that would mean that in each fold, 90% of datapoints would belong to class 0.\n", 286 | "If you would just use KFold cross-validation, it is likely that you would generate a split that only contains class 0.\n", 287 | "It is generally a good idea to use ``StratifiedKFold`` whenever you do classification.\n", 288 | "\n", 289 | "``StratifiedKFold`` would also remove our need to shuffle ``iris``.\n", 290 | "Let's see what kinds of folds it generates on the unshuffled iris dataset.\n", 291 | "Each cross-validation class is a generator of sets of training and test indices:" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": false 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "cv = StratifiedKFold(iris.target, n_folds=5)\n", 303 | "for train, test in cv:\n", 304 | " print(test)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "As you can see, there are a couple of samples from the beginning, then from the middle, and then from the end, in each of the folds.\n", 312 | "This way, the class ratios are preserved. 
Let's visualize the split:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "def plot_cv(cv, n_samples):\n", 324 | " masks = []\n", 325 | " for train, test in cv:\n", 326 | " mask = np.zeros(n_samples, dtype=bool)\n", 327 | " mask[test] = 1\n", 328 | " masks.append(mask)\n", 329 | " \n", 330 | " plt.matshow(masks)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "collapsed": false 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "plot_cv(StratifiedKFold(iris.target, n_folds=5), len(iris.target))" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "For comparison, again the standard KFold, that ignores the labels:" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "Keep in mind that increasing the number of folds will give you a larger training dataset, but will lead to more repetitions, and therefore a slower evaluation:" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "plot_cv(KFold(len(iris.target), n_folds=10), len(iris.target))" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "Another helpful cross-validation generator is ``ShuffleSplit``. This generator simply splits off a random portion of the data repeatedly. 
This allows the user to specify the number of repetitions and the training set size independently:" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=5, test_size=.2), len(iris.target))" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "If you want a more robust estimate, you can just increase the number of iterations:" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": { 409 | "collapsed": false 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2), len(iris.target))" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "You can use all of these cross-validation generators with the cross_val_score method:" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": { 427 | "collapsed": false 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "cv = ShuffleSplit(len(iris.target), n_iter=5, test_size=.2)\n", 432 | "cross_val_score(classifier, X, y, cv=cv)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "# Exercise\n", 440 | "Perform three-fold cross-validation using the ``KFold`` class on the iris dataset without shuffling the data. Can you explain the result?" 
441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "# %load solutions/13_cross_validation.py" 452 | ] 453 | } 454 | ], 455 | "metadata": { 456 | "kernelspec": { 457 | "display_name": "Python 3", 458 | "language": "python", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.5.1" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 0 476 | } 477 | -------------------------------------------------------------------------------- /notebooks/15 Pipelining Estimators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 15 | "last updated: 2016-07-11 \n", 16 | "\n", 17 | "CPython 3.5.1\n", 18 | "IPython 4.2.0\n", 19 | "\n", 20 | "numpy 1.11.0\n", 21 | "scipy 0.17.1\n", 22 | "matplotlib 1.5.1\n", 23 | "scikit-learn 0.17.1\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%load_ext watermark\n", 29 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%matplotlib inline\n", 41 | "import numpy as np\n", 42 | "import matplotlib.pyplot as plt" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# 
SciPy 2016 Scikit-learn Tutorial" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# Pipelining estimators" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "In this section we study how different estimators may be chained." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## A simple example: feature extraction and selection before an estimator" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Feature extraction: vectorizer" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "For some types of data, for instance text data, a feature extraction step must be applied to convert it to numerical features.\n", 85 | "To illustrate we load the SMS spam dataset we used earlier." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "import os\n", 97 | "\n", 98 | "with open(os.path.join(\"datasets\", \"smsspam\", \"SMSSpamCollection\")) as f:\n", 99 | " lines = [line.strip().split(\"\\t\") for line in f.readlines()]\n", 100 | "text = [x[1] for x in lines]\n", 101 | "y = [x[0] == \"ham\" for x in lines]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "from sklearn.cross_validation import train_test_split\n", 113 | "\n", 114 | "text_train, text_test, y_train, y_test = train_test_split(text, y)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Previously, we applied the feature extraction manually, like so:" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": false 129 | }, 
130 | "outputs": [], 131 | "source": [ 132 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 133 | "from sklearn.linear_model import LogisticRegression\n", 134 | "\n", 135 | "vectorizer = TfidfVectorizer()\n", 136 | "vectorizer.fit(text_train)\n", 137 | "\n", 138 | "X_train = vectorizer.transform(text_train)\n", 139 | "X_test = vectorizer.transform(text_test)\n", 140 | "\n", 141 | "clf = LogisticRegression()\n", 142 | "clf.fit(X_train, y_train)\n", 143 | "\n", 144 | "clf.score(X_test, y_test)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "The situation where we learn a transformation and then apply it to the test data is very common in machine learning.\n", 152 | "Therefore scikit-learn has a shortcut for this, called pipelines:" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "from sklearn.pipeline import make_pipeline\n", 164 | "\n", 165 | "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())\n", 166 | "pipeline.fit(text_train, y_train)\n", 167 | "pipeline.score(text_test, y_test)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "As you can see, this makes the code much shorter and easier to handle. Behind the scenes, exactly the same as above is happening. When calling fit on the pipeline, it will call fit on each step in turn.\n", 175 | "\n", 176 | "After the first step is fit, it will use the ``transform`` method of the first step to create a new representation.\n", 177 | "This will then be fed to the ``fit`` of the next step, and so on.\n", 178 | "Finally, on the last step, only ``fit`` is called.\n", 179 | "\n", 180 | "![pipeline](figures/pipeline.svg)\n", 181 | "\n", 182 | "If we call ``score``, only ``transform`` will be called on each step - this could be the test set after all! 
Then, on the last step, ``score`` is called with the new representation. The same goes for ``predict``." 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "Building pipelines not only simplifies the code, it is also important for model selection.\n", 190 | "Say we want to grid-search C to tune our Logistic Regression above.\n", 191 | "\n", 192 | "Let's say we do it like this:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "# this illustrates a common mistake. Don't use this code!\n", 204 | "from sklearn.grid_search import GridSearchCV\n", 205 | "\n", 206 | "vectorizer = TfidfVectorizer()\n", 207 | "vectorizer.fit(text_train)\n", 208 | "\n", 209 | "X_train = vectorizer.transform(text_train)\n", 210 | "X_test = vectorizer.transform(text_test)\n", 211 | "\n", 212 | "clf = LogisticRegression()\n", 213 | "grid = GridSearchCV(clf, param_grid={'C': [.1, 1, 10, 100]}, cv=5)\n", 214 | "grid.fit(X_train, y_train)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### 2.1.2 What did we do wrong?" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Here, we did grid-search with cross-validation on ``X_train``. However, when applying ``TfidfVectorizer``, it saw all of the ``X_train``,\n", 229 | "not only the training folds! So it could use knowledge of the frequency of the words in the test-folds. 
This is called \"contamination\" of the test set, and leads to overly optimistic estimates of generalization performance, or badly selected parameters.\n", 230 | "We can fix this with the pipeline, though:" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "from sklearn.grid_search import GridSearchCV\n", 242 | "\n", 243 | "pipeline = make_pipeline(TfidfVectorizer(), \n", 244 | " LogisticRegression())\n", 245 | "\n", 246 | "grid = GridSearchCV(pipeline,\n", 247 | " param_grid={'logisticregression__C': [.1, 1, 10, 100]}, cv=5)\n", 248 | "\n", 249 | "grid.fit(text_train, y_train)\n", 250 | "grid.score(text_test, y_test)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "Note that we need to tell the pipeline at which step we want to set the parameter ``C``.\n", 258 | "We can do this using the special ``__`` syntax. The name before the ``__`` is simply the lowercased name of the class, the part after ``__`` is the parameter we want to set with grid-search." 
259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "Another benefit of using pipelines is that we can now also search over parameters of the feature extraction with ``GridSearchCV``:" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "collapsed": false 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "from sklearn.grid_search import GridSearchCV\n", 284 | "\n", 285 | "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())\n", 286 | "\n", 287 | "params = {'logisticregression__C': [.1, 1, 10, 100],\n", 288 | " \"tfidfvectorizer__ngram_range\": [(1, 1), (1, 2), (2, 2)]}\n", 289 | "grid = GridSearchCV(pipeline, param_grid=params, cv=5)\n", 290 | "grid.fit(text_train, y_train)\n", 291 | "print(grid.best_params_)\n", 292 | "grid.score(text_test, y_test)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## Exercise" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Create a pipeline out of a StandardScaler and Ridge regression and apply it to the Boston housing dataset (load using ``sklearn.datasets.load_boston``). Try adding the ``sklearn.preprocessing.PolynomialFeatures`` transformer as a second preprocessing step, and grid-search the degree of the polynomials (try 1, 2 and 3)." 
307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "# %load solutions/15A_ridge_grid.py" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.5.1" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 0 342 | } 343 | -------------------------------------------------------------------------------- /notebooks/17 In Depth - Linear Models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%load_ext watermark\n", 12 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "%matplotlib notebook\n", 24 | "import numpy as np\n", 25 | "import matplotlib.pyplot as plt" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Linear models\n", 33 | "Linear models are useful when little data is available or for very large feature spaces as in text classification. In addition, they form a good case study for regularization." 
34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# Linear models for regression" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "All linear models for regression learn a coefficient parameter ``coef_`` and an offset ``intercept_`` to make predictions using a linear combination of features:\n", 48 | "\n", 49 | "```\n", 50 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_\n", 51 | "```\n", 52 | "\n", 53 | "The difference between the linear models for regression is what kind of restrictions or penalties are put on ``coef_`` as regularization , in addition to fitting the training data well.\n", 54 | "The most standard linear model is the 'ordinary least squares regression', often simply called 'linear regression'. It doesn't put any additional restrictions on ``coef_``, so when the number of features is large, it becomes ill-posed and the model overfits.\n", 55 | "\n", 56 | "Let us generate a simple simulation, to see the behavior of these models." 
57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "from sklearn.datasets import make_regression\n", 68 | "from sklearn.model_selection import train_test_split\n", 69 | "\n", 70 | "X, y, true_coefficient = make_regression(n_samples=200, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)\n", 71 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, train_size=60)\n", 72 | "print(X_train.shape)\n", 73 | "print(y_train.shape)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Linear Regression\n", 81 | "\n", 82 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 $$" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false, 90 | "scrolled": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from sklearn.linear_model import LinearRegression\n", 95 | "linear_regression = LinearRegression().fit(X_train, y_train)\n", 96 | "print(\"R^2 on training set: %f\" % linear_regression.score(X_train, y_train))\n", 97 | "print(\"R^2 on test set: %f\" % linear_regression.score(X_test, y_test))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "from sklearn.metrics import r2_score\n", 109 | "print(r2_score(np.dot(X, true_coefficient), y))" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "plt.figure(figsize=(10, 5))\n", 121 | "coefficient_sorting = np.argsort(true_coefficient)[::-1]\n", 122 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\")\n", 123 | "plt.plot(linear_regression.coef_[coefficient_sorting], \"o\", 
label=\"linear regression\")\n", 124 | "\n", 125 | "plt.legend()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "from sklearn.learning_curve import learning_curve\n", 137 | "\n", 138 | "def plot_learning_curve(est, X, y):\n", 139 | " training_set_size, train_scores, test_scores = learning_curve(est, X, y, train_sizes=np.linspace(.1, 1, 20))\n", 140 | " estimator_name = est.__class__.__name__\n", 141 | " line = plt.plot(training_set_size, train_scores.mean(axis=1), '--', label=\"training scores \" + estimator_name)\n", 142 | " plt.plot(training_set_size, test_scores.mean(axis=1), '-', label=\"test scores \" + estimator_name, c=line[0].get_color())\n", 143 | " plt.xlabel('Training set size')\n", 144 | " plt.legend(loc='best')\n", 145 | " plt.ylim(-0.1, 1.1)\n", 146 | " \n", 147 | "plt.figure() \n", 148 | "plot_learning_curve(LinearRegression(), X, y)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## Ridge Regression (L2 penalty)\n", 156 | "\n", 157 | "**The Ridge estimator** is a simple regularization (called l2 penalty) of the ordinary LinearRegression. In particular, it has the benefit of being not computationally more expensive than the ordinary least square estimate.\n", 158 | "\n", 159 | "$$ \\text{min}_{w,b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_2^2$$ " 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "The amount of regularization is set via the `alpha` parameter of the Ridge." 
167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "from sklearn.linear_model import Ridge\n", 178 | "ridge_models = {}\n", 179 | "training_scores = []\n", 180 | "test_scores = []\n", 181 | "\n", 182 | "for alpha in [100, 10, 1, .01]:\n", 183 | " ridge = Ridge(alpha=alpha).fit(X_train, y_train)\n", 184 | " training_scores.append(ridge.score(X_train, y_train))\n", 185 | " test_scores.append(ridge.score(X_test, y_test))\n", 186 | " ridge_models[alpha] = ridge\n", 187 | "\n", 188 | "plt.figure()\n", 189 | "plt.plot(training_scores, label=\"training scores\")\n", 190 | "plt.plot(test_scores, label=\"test scores\")\n", 191 | "plt.xticks(range(4), [100, 10, 1, .01])\n", 192 | "plt.legend(loc=\"best\")" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "plt.figure(figsize=(10, 5))\n", 204 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n", 205 | "\n", 206 | "for i, alpha in enumerate([100, 10, 1, .01]):\n", 207 | " plt.plot(ridge_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n", 208 | " \n", 209 | "plt.legend(loc=\"best\")" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "Tuning alpha is critical for performance." 
217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "plt.figure()\n", 228 | "plot_learning_curve(LinearRegression(), X, y)\n", 229 | "plot_learning_curve(Ridge(alpha=10), X, y)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Lasso (L1 penalty)\n", 237 | "**The Lasso estimator** is useful to impose sparsity on the coefficients. In other words, it is to be preferred if we believe that many of the features are not relevant. This is done via the so-called l1 penalty.\n", 238 | "\n", 239 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_1$$ " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "from sklearn.linear_model import Lasso\n", 251 | "\n", 252 | "lasso_models = {}\n", 253 | "training_scores = []\n", 254 | "test_scores = []\n", 255 | "\n", 256 | "for alpha in [30, 10, 1, .01]:\n", 257 | " lasso = Lasso(alpha=alpha).fit(X_train, y_train)\n", 258 | " training_scores.append(lasso.score(X_train, y_train))\n", 259 | " test_scores.append(lasso.score(X_test, y_test))\n", 260 | " lasso_models[alpha] = lasso\n", 261 | "plt.figure()\n", 262 | "plt.plot(training_scores, label=\"training scores\")\n", 263 | "plt.plot(test_scores, label=\"test scores\")\n", 264 | "plt.xticks(range(4), [30, 10, 1, .01])\n", 265 | "plt.legend(loc=\"best\")" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "plt.figure(figsize=(10, 5))\n", 277 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n", 278 | "\n", 279 | "for i, alpha in enumerate([30, 10, 1, .01]):\n", 280 | " 
plt.plot(lasso_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n", 281 | " \n", 282 | "plt.legend(loc=\"best\")" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "plt.figure()\n", 294 | "plot_learning_curve(LinearRegression(), X, y)\n", 295 | "plot_learning_curve(Ridge(alpha=10), X, y)\n", 296 | "plot_learning_curve(Lasso(alpha=10), X, y)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Linear models for classification" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "All linear models for classification learn a coefficient parameter ``coef_`` and an offset ``intercept_`` to make predictions using a linear combination of features:\n", 311 | "```\n", 312 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0\n", 313 | "```\n", 314 | "\n", 315 | "As you can see, this is very similar to regression, except that a threshold at zero is applied.\n", 316 | "\n", 317 | "Again, the difference between the linear models for classification is what kind of regularization is put on ``coef_`` and ``intercept_``, but there are also minor differences in how the fit to the training set is measured (the so-called loss function).\n", 318 | "\n", 319 | "The two most common models for linear classification are the linear SVM as implemented in LinearSVC and LogisticRegression.\n", 320 | "\n", 321 | "A good intuition for regularization of linear classifiers is that with high regularization, it is enough if most of the points are classified correctly. 
But with less regularization, more importance is given to each individual data point.\n", 322 | "This is illustrated using a linear SVM with different values of ``C`` below.\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### The influence of C in LinearSVC" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "from plots import plot_linear_svc_regularization\n", 341 | "plot_linear_svc_regularization()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "\n", 349 | "Similar to the Ridge/Lasso separation, you can set the 'penalty' parameter to 'l1' to enforce sparsity of the coefficients." 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "## Multi-Class linear classification" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "from sklearn.datasets import make_blobs\n", 368 | "plt.figure()\n", 369 | "X, y = make_blobs(random_state=42)\n", 370 | "plt.scatter(X[:, 0], X[:, 1], c=y)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "from sklearn.svm import LinearSVC\n", 382 | "linear_svm = LinearSVC().fit(X, y)\n", 383 | "print(linear_svm.coef_.shape)\n", 384 | "print(linear_svm.intercept_.shape)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "plt.scatter(X[:, 0], X[:, 1], c=y)\n", 396 | "line = np.linspace(-15, 15)\n", 397 | "for coef, intercept in zip(linear_svm.coef_, 
linear_svm.intercept_):\n", 398 | " plt.plot(line, -(line * coef[0] + intercept) / coef[1])\n", 399 | "plt.ylim(-10, 15)\n", 400 | "plt.xlim(-10, 8)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "# Exercises" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "Use LogisticRegression to classify digits, and grid-search the C parameter.\n", 415 | "How do you think the learning curves above change when you increase or decrease alpha?\n", 416 | "Try out to change the alpha parameter in ridge and lasso, and see if your intuition was correct." 417 | ] 418 | } 419 | ], 420 | "metadata": { 421 | "kernelspec": { 422 | "display_name": "Python 3", 423 | "language": "python", 424 | "name": "python3" 425 | }, 426 | "language_info": { 427 | "codemirror_mode": { 428 | "name": "ipython", 429 | "version": 3 430 | }, 431 | "file_extension": ".py", 432 | "mimetype": "text/x-python", 433 | "name": "python", 434 | "nbconvert_exporter": "python", 435 | "pygments_lexer": "ipython3", 436 | "version": "3.5.1" 437 | } 438 | }, 439 | "nbformat": 4, 440 | "nbformat_minor": 0 441 | } 442 | -------------------------------------------------------------------------------- /notebooks/18 In Depth - Support Vector Machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 15 | "last updated: 2016-06-29 \n", 16 | "\n", 17 | "CPython 3.5.1\n", 18 | "IPython 4.2.0\n", 19 | "\n", 20 | "numpy 1.11.0\n", 21 | "scipy 0.17.1\n", 22 | "matplotlib 1.5.1\n", 23 | "scikit-learn 0.17.1\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%load_ext watermark\n", 29 | "%watermark -d -u -a 'Andreas 
Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%matplotlib inline\n", 41 | "import numpy as np\n", 42 | "import matplotlib.pyplot as plt" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# SciPy 2016 Scikit-learn Tutorial" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# In Depth - Support Vector Machines" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "SVM stands for \"support vector machines\". They are efficient and easy to use estimators.\n", 64 | "They come in two kinds: SVCs, Support Vector Classifiers, for classification problems, and SVRs, Support Vector Regressors, for regression problems." 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Linear SVMs" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "The SVM module contains LinearSVC, which we already discussed briefly in the section on linear models.\n", 79 | "Using ``SVC(kernel=\"linear\")`` will also yield a linear predictor that is only different in minor technical aspects." 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Kernel SVMs\n", 87 | "The real power of SVMs lies in using kernels, which allow for non-linear decision boundaries. A kernel defines a similarity measure between data points. The most common are:\n", 88 | "\n", 89 | "- **linear** will give linear decision frontiers. It is the most computationally efficient approach and the one that requires the least amount of data.\n", 90 | "\n", 91 | "- **poly** will give decision frontiers that are polynomial. 
The degree of this polynomial is given by the 'degree' argument.\n", 92 | "\n", 93 | "- **rbf** uses 'radial basis functions' centered at each support vector to assemble a decision frontier. The size of the RBFs ultimately controls the smoothness of the decision frontier. RBFs are the most flexible approach, but also the one that will require the largest amount of data.\n", 94 | "\n", 95 | "Predictions in a kernel-SVM are made using the formula\n", 96 | "\n", 97 | "$$\n", 98 | "\\hat{y} = \\text{sign}(\\alpha_0 + \\sum_{j}\\alpha_j y_j k(\\mathbf{x}^{(j)}, \\mathbf{x}))\n", 99 | "$$\n", 100 | "\n", 101 | "where $\\mathbf{x}^{(j)}$ are training samples, $y_j$ the corresponding labels, $\\mathbf{x}$ is a test-sample to predict on, $k$ is the kernel, and $\\alpha$ are learned parameters.\n", 102 | "\n", 103 | "What this says is \"if $\\mathbf{x}$ is similar to $\\mathbf{x}^{(j)}$ then they probably have the same label\", where the importance of each $\\mathbf{x}^{(j)}$ for this decision is learned. [Or something much less intuitive about an infinite dimensional Hilbert-space]\n", 104 | "\n", 105 | "Often only a few samples have non-zero $\\alpha$; these are called the \"support vectors\" from which SVMs get their name.\n", 106 | "These are the most discriminant samples.\n", 107 | "\n", 108 | "The most important parameter of the SVM is the regularization parameter $C$, which bounds the influence of each individual sample:\n", 109 | "\n", 110 | "- Low C values: many support vectors... Decision frontier = mean(class A) - mean(class B)\n", 111 | "- High C values: small number of support vectors: Decision frontier fully driven by most discriminant samples\n", 112 | "\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "The other important parameters are those of the kernel. 
Let's look at the RBF kernel in more detail:\n", 120 | "\n", 121 | "$$k(\\mathbf{x}, \\mathbf{x'}) = \\exp(-\\gamma ||\\mathbf{x} - \\mathbf{x'}||^2)$$" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "from sklearn.metrics.pairwise import rbf_kernel\n", 133 | "\n", 134 | "line = np.linspace(-3, 3, 100)[:, np.newaxis]\n", 135 | "kernel_value = rbf_kernel(line, [[0]], gamma=1)\n", 136 | "plt.plot(line, kernel_value)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "The rbf kernel has an inverse bandwidth-parameter gamma, where large values of gamma mean a very localized influence for each data point, and\n", 144 | "small values mean a very global influence.\n", 145 | "Let's see these two parameters in action:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "from figures import plot_svm_interactive\n", 157 | "plot_svm_interactive()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Exercise: tune an SVM on the digits dataset" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "from sklearn import datasets\n", 176 | "\n", 177 | "digits = datasets.load_digits()\n", 178 | "X, y = digits.data, digits.target\n", 179 | "# split the dataset, apply grid-search" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 3", 186 | "language": "python", 187 | "name": "python3" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 3 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": 
"text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython3", 199 | "version": "3.5.1" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 0 204 | } 205 | -------------------------------------------------------------------------------- /notebooks/19 In Depth - Trees and Forests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Kyle Kastner, Sebastian Raschka \n", 15 | "last updated: 2016-06-29 \n", 16 | "\n", 17 | "CPython 3.5.1\n", 18 | "IPython 4.2.0\n", 19 | "\n", 20 | "numpy 1.11.0\n", 21 | "scipy 0.17.1\n", 22 | "matplotlib 1.5.1\n", 23 | "scikit-learn 0.17.1\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%load_ext watermark\n", 29 | "%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# SciPy 2016 Scikit-learn Tutorial" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# In Depth - Decision Trees and Forests" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "%matplotlib inline\n", 55 | "import numpy as np\n", 56 | "import matplotlib.pyplot as plt" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Here we'll explore a class of algorithms based on decision trees.\n", 64 | "Decision trees at their root are extremely intuitive. 
They\n", 65 | "encode a series of \"if\" and \"else\" choices, similar to how a person might make a decision.\n", 66 | "However, which questions to ask, and how to proceed for each answer is entirely learned from the data.\n", 67 | "\n", 68 | "For example, if you wanted to create a guide to identifying an animal found in nature, you\n", 69 | "might ask the following series of questions:\n", 70 | "\n", 71 | "- Is the animal bigger or smaller than a meter long?\n", 72 | " + *bigger*: does the animal have horns?\n", 73 | " - *yes*: are the horns longer than ten centimeters?\n", 74 | " - *no*: is the animal wearing a collar?\n", 75 | " + *smaller*: does the animal have two or four legs?\n", 76 | " - *two*: does the animal have wings?\n", 77 | " - *four*: does the animal have a bushy tail?\n", 78 | "\n", 79 | "and so on. This binary splitting of questions is the essence of a decision tree." 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "One of the main benefits of tree-based models is that they require little preprocessing of the data.\n", 87 | "They can work with variables of different types (continuous and discrete) and are invariant to scaling of the features.\n", 88 | "\n", 89 | "Another benefit is that tree-based models are what is called \"nonparametric\", which means they don't have a fixed set of parameters to learn. Instead, a tree model can become more and more flexible, if given more data.\n", 90 | "In other words, the number of free parameters grows with the number of samples and is not fixed, as for example in linear models.\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Decision Tree Regression" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "A decision tree is a simple binary classification tree that is\n", 105 | "similar to nearest neighbor classification. 
It can be used as follows:" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "from figures import make_dataset\n", 117 | "x, y = make_dataset()\n", 118 | "X = x.reshape(-1, 1)\n", 119 | "\n", 120 | "plt.xlabel('Feature X')\n", 121 | "plt.ylabel('Target y')\n", 122 | "plt.scatter(X, y);" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from sklearn.tree import DecisionTreeRegressor\n", 134 | "\n", 135 | "reg = DecisionTreeRegressor(max_depth=5)\n", 136 | "reg.fit(X, y)\n", 137 | "\n", 138 | "X_fit = np.linspace(-3, 3, 1000).reshape((-1, 1))\n", 139 | "y_fit_1 = reg.predict(X_fit)\n", 140 | "\n", 141 | "plt.plot(X_fit.ravel(), y_fit_1, color='blue', label=\"prediction\")\n", 142 | "plt.plot(X.ravel(), y, '.k', label=\"training data\")\n", 143 | "plt.legend(loc=\"best\");" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "A single decision tree allows us to estimate the signal in a non-parametric way,\n", 151 | "but clearly has some issues. In some regions, the model shows high bias and\n", 152 | "under-fits the data.\n", 153 | "(seen in the long flat lines which don't follow the contours of the data),\n", 154 | "while in other regions the model shows high variance and over-fits the data\n", 155 | "(reflected in the narrow spikes which are influenced by noise in single points)." 
 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Decision Tree Classification\n", 163 | "==================\n", 164 | "Decision tree classification works very similarly, by assigning all points within a leaf the majority class in that leaf:\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "from sklearn.datasets import make_blobs\n", 176 | "from sklearn.cross_validation import train_test_split\n", 177 | "from sklearn.tree import DecisionTreeClassifier\n", 178 | "from figures import plot_2d_separator\n", 179 | "\n", 180 | "\n", 181 | "X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=100)\n", 182 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 183 | "\n", 184 | "clf = DecisionTreeClassifier(max_depth=5)\n", 185 | "clf.fit(X_train, y_train)\n", 186 | "\n", 187 | "\n", 188 | "plot_2d_separator(clf, X, fill=True)\n", 189 | "plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=60, alpha=.7)\n", 190 | "plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=60);" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "There are many parameters that control the complexity of a tree, but the one that might be easiest to understand is the maximum depth. This limits how finely the tree can partition the input space, or how many \"if-else\" questions can be asked before deciding which class a sample lies in.\n", 198 | "\n", 199 | "This parameter is important to tune for trees and tree-based models. The interactive plot below shows what underfitting and overfitting look like for this model. Having a ``max_depth`` of one is clearly an underfit model, while a depth of seven or eight clearly overfits. 
The maximum depth a tree can be grown to for this dataset is 8, at which point each leaf contains only samples from a single class. This is known as all leaves being \"pure\"." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "from figures import plot_tree_interactive\n", 211 | "plot_tree_interactive()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "Decision trees are fast to train, easy to understand, and often lead to interpretable models. However, single trees often tend to overfit the training data. Playing with the slider above you might notice that the model starts to overfit even before it has a good separation between the classes.\n", 219 | "\n", 220 | "Therefore, in practice it is more common to combine multiple trees to produce models that generalize better. The most common methods for combining trees are random forests and gradient boosted trees.\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Random Forests" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Random forests are simply many trees, built on different random subsets (drawn with replacement) of the data, and using different random subsets (drawn without replacement) of the features for each split.\n", 235 | "This makes the trees different from each other, and makes them overfit to different aspects. 
Then, their predictions are averaged, leading to a smoother estimate that overfits less.\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "from figures import plot_forest_interactive\n", 247 | "plot_forest_interactive()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Selecting the Optimal Estimator via Cross-Validation" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "from sklearn import grid_search\n", 266 | "from sklearn.datasets import load_digits\n", 267 | "from sklearn.ensemble import RandomForestClassifier\n", 268 | "\n", 269 | "digits = load_digits()\n", 270 | "X, y = digits.data, digits.target\n", 271 | "\n", 272 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 273 | "\n", 274 | "rf = RandomForestClassifier(n_estimators=200)\n", 275 | "parameters = {'max_features':['sqrt', 'log2', 10],\n", 276 | " 'max_depth':[5, 7, 9]}\n", 277 | "\n", 278 | "clf_grid = grid_search.GridSearchCV(rf, parameters, n_jobs=-1)\n", 279 | "clf_grid.fit(X_train, y_train)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": false 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "clf_grid.score(X_train, y_train)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "clf_grid.score(X_test, y_test)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "## Another option: Gradient Boosting" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 
315 | "Another Ensemble method that can be useful is *Boosting*: here, rather than\n", 316 | "looking at 200 (say) parallel estimators, We construct a chain of 200 estimators\n", 317 | "which iteratively refine the results of the previous estimator.\n", 318 | "The idea is that by sequentially applying very fast, simple models, we can get a\n", 319 | "total model error which is better than any of the individual pieces." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "collapsed": false 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "from sklearn.ensemble import GradientBoostingRegressor\n", 331 | "clf = GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=.2)\n", 332 | "clf.fit(X_train, y_train)\n", 333 | "\n", 334 | "print(clf.score(X_train, y_train))\n", 335 | "print(clf.score(X_test, y_test))" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "## Exercise: Cross-validating Gradient Boosting" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "Use a grid search to optimize the learning rate and max_depth for a Gradient Boosted\n", 350 | "Decision tree." 
351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.5.1" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 0 375 | } 376 | -------------------------------------------------------------------------------- /notebooks/20 Feature Selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true, 8 | "hide_input": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Automatic Feature Selection\n", 22 | "Often we collected many features that might be related to a supervised prediction task, but we don't know which of them are actually predictive. To improve interpretability, and sometimes also generalization performance, we can use automatic feature selection to select a subset of the original features. There are several types of feature selection methods available, which we'll explain in order of increasing complexity.\n", 23 | "\n", 24 | "For a given supervised model, the best feature selection strategy would be to try out each possible subset of the features, and evaluate generalization performance using this subset. However, there are exponentially many subsets of features, so this exhaustive search is generally infeasible. 
The strategies discussed below can be thought of as proxies for this infeasible computation.\n", 25 | "\n", 26 | "### Univariate statistics\n", 27 | "The simplest method to select features is using univariate statistics, that is by looking at each feature individually and running a statistical test to see whether it is related to the target. This kind of test is also known as analysis of variance (ANOVA).\n", 28 | "\n", 29 | "We create a synthetic dataset that consists of the breast cancer data, with an addition of 50 completely random features." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "from sklearn.datasets import load_breast_cancer, load_digits\n", 41 | "from sklearn.model_selection import train_test_split\n", 42 | "\n", 43 | "cancer = load_breast_cancer()\n", 44 | "\n", 45 | "# get deterministic random numbers\n", 46 | "rng = np.random.RandomState(42)\n", 47 | "noise = rng.normal(size=(len(cancer.data), 50))\n", 48 | "# add noise features to the data\n", 49 | "# the first 30 features are from the dataset, the next 50 are noise\n", 50 | "X_w_noise = np.hstack([cancer.data, noise])\n", 51 | "\n", 52 | "X_train, X_test, y_train, y_test = train_test_split(\n", 53 | " X_w_noise, cancer.target, random_state=0, test_size=.5)\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "We have to define a threshold on the p-value of the statistical test to decide how many features to keep. 
There are several strategies implemented in scikit-learn, a straight-forward one being ``SelectPercentile``, which selects a percentile of the original features (we select 50% below):" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "from sklearn.feature_selection import SelectPercentile\n", 72 | "\n", 73 | "# use f_classif (the default) and SelectPercentile to select 50% of features:\n", 74 | "select = SelectPercentile(percentile=50)\n", 75 | "select.fit(X_train, y_train)\n", 76 | "# transform training set:\n", 77 | "X_train_selected = select.transform(X_train)\n", 78 | "\n", 79 | "print(X_train.shape)\n", 80 | "print(X_train_selected.shape)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "We can also use the test statistic directly to see how relevant each feature is. As the breast cancer dataset is a classification task, we use f_classif, the F-test for classification. Below we plot the p-values associated with each of the 80 features (30 original features + 50 noise features). Low p-values indicate informative features." 
 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "from sklearn.feature_selection import f_classif, f_regression, chi2" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "F, p = f_classif(X_train, y_train)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "plt.figure()\n", 121 | "plt.plot(p, 'o')" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Clearly most of the first 30 features have very small p-values.\n", 129 | "\n", 130 | "Going back to the SelectPercentile transformer, we can obtain the features that are selected using the ``get_support`` method:" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "mask = select.get_support()\n", 142 | "print(mask)\n", 143 | "# visualize the mask. black is True, white is False\n", 144 | "plt.matshow(mask.reshape(1, -1), cmap='gray_r')" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Nearly all of the original 30 features were recovered.\n", 152 | "We can also analyze the utility of the feature selection by training a supervised model on the data.\n", 153 | "It's important to learn the feature selection only on the training set!" 
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "from sklearn.linear_model import LogisticRegression\n", 165 | "\n", 166 | "# transform test data:\n", 167 | "X_test_selected = select.transform(X_test)\n", 168 | "\n", 169 | "lr = LogisticRegression()\n", 170 | "lr.fit(X_train, y_train)\n", 171 | "print(\"Score with all features: %f\" % lr.score(X_test, y_test))\n", 172 | "lr.fit(X_train_selected, y_train)\n", 173 | "print(\"Score with only selected features: %f\" % lr.score(X_test_selected, y_test))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### Model-based Feature Selection\n", 181 | "A somewhat more sophisticated method for feature selection is using a supervised machine learning model, and selecting features based on how important they were deemed by the model. This requires the model to provide some way to rank the features by importance. 
This can be done for all tree-based models (which expose a ``feature_importances_`` attribute) and all linear models, for which the coefficients can be used to determine how much influence a feature has on the outcome.\n", 182 | "\n", 183 | "Any of these models can be made into a transformer that does feature selection by wrapping it with the ``SelectFromModel`` class:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "from sklearn.feature_selection import SelectFromModel\n", 195 | "from sklearn.ensemble import RandomForestClassifier\n", 196 | "select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold=\"median\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "select.fit(X_train, y_train)\n", 208 | "X_train_rf = select.transform(X_train)\n", 209 | "print(X_train.shape)\n", 210 | "print(X_train_rf.shape)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "mask = select.get_support()\n", 222 | "# visualize the mask. 
black is True, white is False\n", 223 | "plt.matshow(mask.reshape(1, -1), cmap='gray_r')" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "X_test_rf = select.transform(X_test)\n", 235 | "LogisticRegression().fit(X_train_rf, y_train).score(X_test_rf, y_test)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "This method builds a single model (in this case a random forest) and uses the feature importances from this model.\n", 243 | "We can do a somewhat more elaborate search by training multiple models on subsets of the features. One particular strategy is recursive feature elimination:" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Recursive Feature Elimination\n", 251 | "Recursive feature elimination builds a model on the full set of features, and similar to the method above selects a subset of features that are deemed most important by the model. However, usually only a single feature is dropped from the dataset, and a new model is built on the remaining features. 
The process of dropping features and model building is repeated until there are only a pre-specified number of features left:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "from sklearn.feature_selection import RFE\n", 263 | "select = RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=40)\n", 264 | "\n", 265 | "select.fit(X_train, y_train)\n", 266 | "# visualize the selected features:\n", 267 | "mask = select.get_support()\n", 268 | "plt.matshow(mask.reshape(1, -1), cmap='gray_r')" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "X_train_rfe = select.transform(X_train)\n", 280 | "X_test_rfe = select.transform(X_test)\n", 281 | "\n", 282 | "LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "select.score(X_test, y_test)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "# Exercises" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Plot the \"XOR\" dataset which is created like this:" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "xx, yy = np.meshgrid(np.linspace(-3, 3, 50),\n", 319 | " np.linspace(-3, 3, 50))\n", 320 | "rng = np.random.RandomState(0)\n", 321 | "X = rng.randn(200, 2)\n", 322 | "Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | 
"source": [ 329 | "Add random features to it and compare how univariate selection compares to model based selection using a Random Forest in recovering the original features." 330 | ] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 3", 336 | "language": "python", 337 | "name": "python3" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.5.1" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 0 354 | } 355 | -------------------------------------------------------------------------------- /notebooks/21 Unsupervised learning - Hierarchical and density-based clustering algorithms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Andreas Mueller, Sebastian Raschka \n", 15 | "last updated: 2016-07-08 \n", 16 | "\n", 17 | "CPython 3.5.1\n", 18 | "IPython 4.2.0\n", 19 | "\n", 20 | "numpy 1.11.0\n", 21 | "scipy 0.17.1\n", 22 | "matplotlib 1.5.1\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "%load_ext watermark\n", 28 | "%watermark -d -u -a 'Andreas Mueller, Sebastian Raschka' -v -p numpy,scipy,matplotlib" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "import numpy as np\n", 41 | "from matplotlib import pyplot as plt" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# SciPy 2016 Scikit-learn Tutorial" 49 | ] 50 | }, 51 | { 52 | 
"cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# 21 Unsupervised learning: Hierarchical and density-based clustering algorithms" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "In a previous notebook, \"08 Unsupervised Learning - Clustering.ipynb\", we introduced one of the essential and widely used clustering algorithms, K-means. One of the advantages of K-means is that it is extremely easy to implement, and it is also computationally very efficient compared to other clustering algorithms. However, we've seen that one of the weaknesses of K-Means is that it only works well if the data can be grouped into a globular or spherical shape. Also, we have to assign the number of clusters, *k*, *a priori* -- this can be a problem if we have no prior knowledge about how many clusters we expect to find. " 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "In this notebook, we will take a look at 2 alternative approaches to clustering, hierarchical clustering and density-based clustering. " 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Hierarchical Clustering" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "One nice feature of hierarchical clustering is that we can visualize the results as a dendrogram, a hierarchical tree. Using the visualization, we can then decide how \"deep\" we want to cluster the dataset by setting a \"depth\" threshold. Or in other words, we don't need to make a decision about the number of clusters upfront.\n", 84 | "\n", 85 | "**Agglomerative and divisive hierarchical clustering**\n", 86 | "\n", 87 | "Furthermore, we can distinguish between 2 main approaches to hierarchical clustering: Divisive clustering and agglomerative clustering. 
In agglomerative clustering, we start with each sample of our dataset as its own cluster and iteratively merge the most similar clusters into bigger ones -- we can see it as a bottom-up approach for building the clustering dendrogram. \n", 88 | "In divisive clustering, however, we start with the whole dataset as one cluster, and we iteratively split it into smaller subclusters -- a top-down approach. \n", 89 | "\n", 90 | "In this notebook, we will use **agglomerative** clustering." 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "**Single and complete linkage**\n", 98 | "\n", 99 | "Now, the next question is how we measure the similarity between samples. One approach is the familiar Euclidean distance metric that we already used via the K-Means algorithm. As a refresher, the distance between 2 m-dimensional vectors $\\mathbf{p}$ and $\\mathbf{q}$ can be computed as:\n", 100 | "\n", 101 | "\\begin{align} \\mathrm{d}(\\mathbf{q},\\mathbf{p}) & = \\sqrt{(q_1-p_1)^2 + (q_2-p_2)^2 + \\cdots + (q_m-p_m)^2} \\\\[8pt]\n", 102 | "& = \\sqrt{\\sum_{j=1}^m (q_j-p_j)^2}.\\end{align}\t\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "However, that's the distance between 2 samples. Now, how do we compute the similarity between subclusters of samples in order to decide which clusters to merge when constructing the dendrogram? I.e., our goal is to iteratively merge the most similar pairs of clusters until only one big cluster remains. There are many different approaches to this, for example single and complete linkage. 
\n", 110 | "\n", 111 | "In single linkage, we take the pair of the most similar samples (based on the Euclidean distance, for example) in each cluster, and merge the two clusters which have the most similar 2 members into one new, bigger cluster.\n", 112 | "\n", 113 | "In complete linkage, we compare the pairs of the two most dissimilar members of each cluster with each other, and we merge the 2 clusters where the distance between its 2 most dissimilar members is smallest.\n", 114 | "\n", 115 | "![](figures/clustering-linkage.png)\n" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "To see the agglomerative, hierarchical clustering approach in action, let us load the familiar Iris dataset -- pretending we don't know the true class labels and want to find out how many different flower species it consists of:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from sklearn import datasets\n", 134 | "\n", 135 | "iris = datasets.load_iris()\n", 136 | "X = iris.data[:, [2, 3]]\n", 137 | "y = iris.target\n", 138 | "n_samples, n_features = X.shape\n", 139 | "\n", 140 | "plt.scatter(X[:, 0], X[:, 1], c=y);" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "First, we start with some exploratory clustering, visualizing the clustering dendrogram using SciPy's `linkage` and `dendrogram` functions:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "from scipy.cluster.hierarchy import linkage\n", 159 | "from scipy.cluster.hierarchy import dendrogram\n", 160 | "\n", 161 | "clusters = linkage(X, \n", 162 | " metric='euclidean',\n", 163 | " method='complete')\n", 164 | "\n", 165 | "dendr = dendrogram(clusters)\n", 166 | 
"\n", 167 | "plt.ylabel('Euclidean Distance');" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "Next, let's use the `AgglomerativeClustering` estimator from scikit-learn and divide the dataset into 3 clusters. Can you guess which 3 clusters from the dendrogram it will reproduce?" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "from sklearn.cluster import AgglomerativeClustering\n", 186 | "\n", 187 | "ac = AgglomerativeClustering(n_clusters=3,\n", 188 | " affinity='euclidean',\n", 189 | " linkage='complete')\n", 190 | "\n", 191 | "prediction = ac.fit_predict(X)\n", 192 | "print('Cluster labels: %s\\n' % prediction)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "plt.scatter(X[:, 0], X[:, 1], c=prediction);" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Density-based Clustering - DBSCAN" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "Another useful approach to clustering is *Density-based Spatial Clustering of Applications with Noise* (DBSCAN). 
In essence, we can think of DBSCAN as an algorithm that divides the dataset into subgroups based on dense regions of points.\n", 218 | "\n", 219 | "In DBSCAN, we distinguish between 3 different types of points:\n", 220 | "\n", 221 | "- Core points: A core point is a point that has at least a minimum number of other points (MinPts) in its radius epsilon.\n", 222 | "- Border points: A border point is a point that is not a core point, since it has fewer than MinPts points in its neighborhood, but lies within the radius epsilon of a core point.\n", 223 | "- Noise points: All other points that are neither core points nor border points.\n", 224 | "\n", 225 | "![](figures/dbscan.png)\n", 226 | "\n", 227 | "A nice feature about DBSCAN is that we don't have to specify a number of clusters upfront. However, it requires the setting of additional hyperparameters such as the value for MinPts and the radius epsilon." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "from sklearn.datasets import make_moons\n", 239 | "X, y = make_moons(n_samples=400,\n", 240 | " noise=0.1,\n", 241 | " random_state=1)\n", 242 | "plt.scatter(X[:,0], X[:,1])\n", 243 | "plt.show()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "from sklearn.cluster import DBSCAN\n", 255 | "\n", 256 | "db = DBSCAN(eps=0.2,\n", 257 | " min_samples=10,\n", 258 | " metric='euclidean')\n", 259 | "prediction = db.fit_predict(X)\n", 260 | "\n", 261 | "print(\"Predicted labels:\\n\", prediction)\n", 262 | "\n", 263 | "plt.scatter(X[:, 0], X[:, 1], c=prediction);" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "# Exercise" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 
277 | "Using the following toy datasets, two concentric circles, experiment with the three different clustering algorithms that we used so far `KMeans`, `AgglomerativeClustering`, and `DBSCAN`. Which clustering algorithms reproduces or discovers the hidden structure (pretending we don't know `y`) best? Can you explain why this particular algorithm is a good choice while the other 2 \"fail\"?" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "from sklearn.datasets import make_circles\n", 289 | "\n", 290 | "X, y = make_circles(n_samples=500, \n", 291 | " factor=.6, \n", 292 | " noise=.05)\n", 293 | "\n", 294 | "plt.scatter(X[:, 0], X[:, 1], c=y);" 295 | ] 296 | } 297 | ], 298 | "metadata": { 299 | "kernelspec": { 300 | "display_name": "Python 3", 301 | "language": "python", 302 | "name": "python3" 303 | }, 304 | "language_info": { 305 | "codemirror_mode": { 306 | "name": "ipython", 307 | "version": 3 308 | }, 309 | "file_extension": ".py", 310 | "mimetype": "text/x-python", 311 | "name": "python", 312 | "nbconvert_exporter": "python", 313 | "pygments_lexer": "ipython3", 314 | "version": "3.5.1" 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 0 319 | } 320 | -------------------------------------------------------------------------------- /notebooks/datasets/smsspam/readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/datasets/smsspam/readme -------------------------------------------------------------------------------- /notebooks/figures/ML_flow_chart.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tutorial Diagrams 3 | ----------------- 4 | 5 | This script plots the flow-charts used in the scikit-learn tutorials. 
def create_base(box_bg='#CCCCCC',
                arrow1='#88CCFF',
                arrow2='#88FF88',
                supervised=True):
    """Draw the shared skeleton of the tutorial flow chart on a new figure.

    A 9x6 inch figure with a frameless, tick-free axes spanning the whole
    canvas (data limits 0..9 by 0..6) is created, filled with boxes, arrows
    and text labels, and left as the current pyplot figure. Nothing is
    returned.

    Parameters
    ----------
    box_bg : color
        Face colour of every box, the circle and the diamond.
    arrow1 : color
        Face colour of the arrows in the upper part of the chart (the row
        labelled "Training ..." down to the "Predictive Model" diamond).
    arrow2 : color
        Face colour of the arrows in the bottom row (the row labelled
        "New Text, ...").
    supervised : bool
        If True, add the "Labels" boxes with their arrow, a small output
        box and the "Supervised Learning Model" title. Otherwise draw a
        tall output box and the "Unsupervised Learning Model" title.
    """
    plt.figure(figsize=(9, 6), facecolor='w')
    # Axes fill the whole figure; all coordinates below are in these
    # 0..9 x 0..6 data units.
    ax = plt.axes((0, 0, 1, 1), xticks=[], yticks=[], frameon=False)
    ax.set_xlim(0, 9)
    ax.set_ylim(0, 6)

    # Patches are added in list order further down; the explicit zorder on
    # the first stack makes the offset rectangles overlap back-to-front.
    patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg),
               Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg),
               Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg),

               # three thin offset bars next to the "Feature Vectors" label
               Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg),
               Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg),
               Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg),

               # bottom row: single input box ("New Text, ...")
               Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg),

               # bottom row: single thin bar ("Feature Vector")
               Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg),

               # circle behind the "Machine Learning Algorithm" label
               Circle((5.5, 3.5), 1.0, fc=box_bg),

               # diamond behind the "Predictive Model" label
               Polygon([[5.5, 1.7],
                        [6.1, 1.1],
                        [5.5, 0.5],
                        [4.9, 1.1]], fc=box_bg),

               # upper-row arrows (arrow1 colour)
               FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1,
                          width=0.25, head_width=0.5, head_length=0.2),

               FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1,
                          width=0.25, head_width=0.5, head_length=0.2),

               # downward arrow from the circle towards the diamond
               FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1,
                          width=0.25, head_width=0.5, head_length=0.2),

               # bottom-row arrows (arrow2 colour)
               FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2,
                          width=0.25, head_width=0.5, head_length=0.2),

               FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2,
                          width=0.25, head_width=0.5, head_length=0.2),

               FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2,
                          width=0.25, head_width=0.5, head_length=0.2)]

    if supervised:
        # Stack of three flat boxes for the "Labels" text, an arrow from
        # them towards the algorithm, and a small output box.
        patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg),
                    Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg),
                    Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg),
                    FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1,
                               width=0.25, head_width=0.5, head_length=0.2),
                    Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)]
    else:
        # Taller output box: the unsupervised label below has four lines.
        patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)]

    for p in patches:
        ax.add_patch(p)

    # Text labels placed on top of the shapes drawn above.
    plt.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.",
             ha='center', va='center', fontsize=14)

    plt.text(3.6, 4.9, "Feature\nVectors",
             ha='left', va='center', fontsize=14)

    plt.text(5.5, 3.5, "Machine\nLearning\nAlgorithm",
             ha='center', va='center', fontsize=14)

    plt.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.",
             ha='center', va='center', fontsize=14)

    plt.text(3.3, 1.7, "Feature\nVector",
             ha='left', va='center', fontsize=14)

    plt.text(5.5, 1.1, "Predictive\nModel",
             ha='center', va='center', fontsize=12)

    if supervised:
        plt.text(1.45, 3.05, "Labels",
                 ha='center', va='center', fontsize=14)

        plt.text(8.05, 1.1, "Expected\nLabel",
                 ha='center', va='center', fontsize=14)
        plt.text(8.8, 5.8, "Supervised Learning Model",
                 ha='right', va='top', fontsize=18)

    else:
        plt.text(8.05, 1.1,
                 "Likelihood\nor Cluster ID\nor Better\nRepresentation",
                 ha='center', va='center', fontsize=12)
        plt.text(8.8, 5.8, "Unsupervised Learning Model",
                 ha='right', va='top', fontsize=18)


def plot_supervised_chart(annotate=False):
    """Draw the supervised flow chart, optionally with API annotations.

    Parameters
    ----------
    annotate : bool
        If True, overlay bold red, rotated code snippets
        (``vec.fit_transform``, ``clf.fit``, ``vec.transform``,
        ``clf.predict``) on top of the base chart.
    """
    create_base(supervised=True)
    if annotate:
        # One shared style for all four annotations.
        fontdict = dict(color='r', weight='bold', size=14)
        plt.text(1.9, 4.55, 'X = vec.fit_transform(input)',
                 fontdict=fontdict,
                 rotation=20, ha='left', va='bottom')
        plt.text(3.7, 3.2, 'clf.fit(X, y)',
                 fontdict=fontdict,
                 rotation=20, ha='left', va='bottom')
        plt.text(1.7, 1.5, 'X_new = vec.transform(input)',
                 fontdict=fontdict,
                 rotation=20, ha='left', va='bottom')
        plt.text(6.1, 1.5, 'y_new = clf.predict(X_new)',
                 fontdict=fontdict,
                 rotation=20, ha='left', va='bottom')


def plot_unsupervised_chart():
    """Draw the unsupervised variant of the flow chart on a new figure."""
    create_base(supervised=False)
| if __name__ == '__main__': 129 | plot_supervised_chart(False) 130 | plot_supervised_chart(True) 131 | plot_unsupervised_chart() 132 | plt.show() 133 | -------------------------------------------------------------------------------- /notebooks/figures/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot_2d_separator import plot_2d_separator 2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \ 3 | plot_regression_datasets, make_dataset 4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization 5 | from .plot_interactive_tree import plot_tree_interactive 6 | from .plot_interactive_forest import plot_forest_interactive 7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters 8 | from .plot_rbf_svm_parameters import plot_svm_interactive 9 | from .plot_scaling import plot_scaling, plot_relative_scaling 10 | from .plot_digits_dataset import digits_plot 11 | from .plot_pca import plot_pca_illustration 12 | 13 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization', 14 | 'plot_linear_svc_regularization', 'plot_tree_interactive', 15 | 'plot_regression_datasets', 'make_dataset', 16 | "plot_forest_interactive", "plot_rbf_svm_parameters", 17 | "plot_svm_interactive", 'plot_scaling', 'digits_plot', 18 | 'plot_relative_scaling', 'plot_pca_illustration'] 19 | -------------------------------------------------------------------------------- /notebooks/figures/average-per-class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/average-per-class.png -------------------------------------------------------------------------------- /notebooks/figures/check_env-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/check_env-1.png -------------------------------------------------------------------------------- /notebooks/figures/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/cluster_comparison.png -------------------------------------------------------------------------------- /notebooks/figures/clustering-linkage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/clustering-linkage.png -------------------------------------------------------------------------------- /notebooks/figures/dbscan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/dbscan.png -------------------------------------------------------------------------------- /notebooks/figures/ipython_help-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/ipython_help-1.png -------------------------------------------------------------------------------- /notebooks/figures/ipython_help-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/ipython_help-2.png -------------------------------------------------------------------------------- /notebooks/figures/ipython_run_cell.png: 
def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None):
    """Draw the decision boundary of a fitted two-class model over 2-D data.

    The model is evaluated on a 100x100 grid that covers the range of the
    first two columns of ``X`` widened by ``eps`` on every side.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``decision_function`` (boundary at 0) or,
        failing that, ``predict_proba`` (column 1 is used, boundary at 0.5).
    X : ndarray
        Data whose first two columns define the plotted region.
    fill : bool
        If True draw filled blue/red regions with ``contourf``; otherwise
        draw only the boundary line in black with ``contour``.
    ax : axes or None
        Axes to draw on; the current pyplot axes when None.
    eps : float or None
        Margin around the data; ``X.std() / 2`` when None.
    """
    margin = X.std() / 2. if eps is None else eps
    first, second = X[:, 0], X[:, 1]
    x_min = first.min() - margin
    x_max = first.max() + margin
    y_min = second.min() - margin
    y_max = second.max() + margin

    grid_x, grid_y = np.meshgrid(np.linspace(x_min, x_max, 100),
                                 np.linspace(y_min, y_max, 100))
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]

    try:
        scores = classifier.decision_function(grid_points)
        levels = [0]
        fill_levels = [scores.min(), 0, scores.max()]
    except AttributeError:
        # The model has no decision_function: fall back to the
        # probability of the second class.
        scores = classifier.predict_proba(grid_points)[:, 1]
        levels = [.5]
        fill_levels = [0, .5, 1]

    target = plt.gca() if ax is None else ax
    if fill:
        target.contourf(grid_x, grid_y, scores.reshape(grid_x.shape),
                        levels=fill_levels, colors=['blue', 'red'])
    else:
        target.contour(grid_x, grid_y, scores.reshape(grid_x.shape),
                       levels=levels, colors="black")

    # Clamp the view to the evaluated region and hide the ticks.
    target.set_xlim(x_min, x_max)
    target.set_ylim(y_min, y_max)
    target.set_xticks(())
    target.set_yticks(())
| n_digits = 500 17 | X = digits.data[:n_digits] 18 | y = digits.target[:n_digits] 19 | n_samples, n_features = X.shape 20 | n_neighbors = 30 21 | 22 | def plot_embedding(X, title=None): 23 | x_min, x_max = np.min(X, 0), np.max(X, 0) 24 | X = (X - x_min) / (x_max - x_min) 25 | 26 | plt.figure() 27 | ax = plt.subplot(111) 28 | for i in range(X.shape[0]): 29 | plt.text(X[i, 0], X[i, 1], str(digits.target[i]), 30 | color=plt.cm.Set1(y[i] / 10.), 31 | fontdict={'weight': 'bold', 'size': 9}) 32 | 33 | if hasattr(offsetbox, 'AnnotationBbox'): 34 | # only print thumbnails with matplotlib > 1.0 35 | shown_images = np.array([[1., 1.]]) # just something big 36 | for i in range(X.shape[0]): 37 | dist = np.sum((X[i] - shown_images) ** 2, 1) 38 | if np.min(dist) < 1e5: 39 | # don't show points that are too close 40 | # set a high threshold to basically turn this off 41 | continue 42 | shown_images = np.r_[shown_images, [X[i]]] 43 | imagebox = offsetbox.AnnotationBbox( 44 | offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), 45 | X[i]) 46 | ax.add_artist(imagebox) 47 | plt.xticks([]), plt.yticks([]) 48 | if title is not None: 49 | plt.title(title) 50 | 51 | n_img_per_row = 10 52 | img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row)) 53 | for i in range(n_img_per_row): 54 | ix = 10 * i + 1 55 | for j in range(n_img_per_row): 56 | iy = 10 * j + 1 57 | img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8)) 58 | 59 | plt.imshow(img, cmap=plt.cm.binary) 60 | plt.xticks([]) 61 | plt.yticks([]) 62 | plt.title('A selection from the 64-dimensional digits dataset') 63 | print("Computing PCA projection") 64 | pca = decomposition.PCA(n_components=2).fit(X) 65 | X_pca = pca.transform(X) 66 | plot_embedding(X_pca, "Principal Components projection of the digits") 67 | plt.figure() 68 | plt.matshow(pca.components_[0, :].reshape(8, 8), cmap="gray") 69 | plt.axis('off') 70 | plt.figure() 71 | plt.matshow(pca.components_[1, :].reshape(8, 8), cmap="gray") 72 | 
def plot_forest(max_depth=1):
    """Plot the module-level toy data, optionally with a random forest.

    Opens a new figure. For ``max_depth != 0`` a 20-tree
    ``RandomForestClassifier`` (``random_state=1``) is fitted on the
    module-level ``X, y`` and its predicted probability for the second
    class is drawn as translucent filled contours. For ``max_depth == 0``
    only the data points are shown.

    Parameters
    ----------
    max_depth : int
        Depth limit passed to the forest; 0 means "show the data only".
    """
    plt.figure()
    axis = plt.gca()
    step = 0.02
    pad = .5

    first, second = X[:, 0], X[:, 1]
    x_min, x_max = first.min() - pad, first.max() + pad
    y_min, y_max = second.min() - pad, second.max() + pad
    grid_x, grid_y = np.meshgrid(np.arange(x_min, x_max, step),
                                 np.arange(y_min, y_max, step))

    if max_depth == 0:
        axis.set_title("data set")
    else:
        model = RandomForestClassifier(n_estimators=20, max_depth=max_depth,
                                       random_state=1)
        model.fit(X, y)
        grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
        proba = model.predict_proba(grid_points)[:, 1].reshape(grid_x.shape)
        axis.contourf(grid_x, grid_y, proba, alpha=.4)
        axis.set_title("max_depth = %d" % max_depth)

    # Class 0 is drawn blue, class 1 red.
    point_colors = np.array(['b', 'r'])[y]
    axis.scatter(first, second, c=point_colors, s=60)
    axis.set_xlim(x_min, x_max)
    axis.set_ylim(y_min, y_max)
    axis.set_xticks(())
    axis.set_yticks(())


def plot_forest_interactive():
    """Return a widget that re-runs ``plot_forest`` from a depth slider.

    The slider covers the integers 0..8 and starts at 0 (data only).
    """
    # NOTE(review): IPython.html.widgets is the pre-IPython-4 home of the
    # widgets; newer installs ship them as the separate ipywidgets package.
    # Confirm which one the target environment provides.
    from IPython.html.widgets import interactive, IntSlider
    depth_slider = IntSlider(min=0, max=8, step=1, value=0)
    return interactive(plot_forest, max_depth=depth_slider)
def tree_image(tree, fout=None):
    """Render a fitted decision tree to a PNG file and return it as an image.

    Parameters
    ----------
    tree : fitted decision tree
        Passed unchanged to ``export_graphviz``.
    fout : str or None
        Path of the PNG to write; ``"tmp.png"`` in the current working
        directory when None.

    Returns
    -------
    ndarray
        The rendered image as read back by ``imread``. If ``pydot`` cannot
        be imported, a 10x10 placeholder of ones with a single zero at
        ``[0, 0]`` is returned instead.
    """
    try:
        import pydot
    except ImportError:
        # make a hacky white plot
        x = np.ones((10, 10))
        x[0, 0] = 0
        return x
    dot_data = StringIO()
    export_graphviz(tree, out_file=dot_data)
    # Strip the impurity and sample-count lines so the nodes only show the
    # split rule (and class values).
    data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue())
    data = re.sub(r"samples = [0-9]+\\n", "", data)
    data = re.sub(r"\\nsamples = [0-9]+", "", data)

    graph = pydot.graph_from_dot_data(data)
    # Newer pydot releases return a list of graphs rather than a single
    # graph object; the exported tree is the only (first) entry.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    if fout is None:
        fout = "tmp.png"
    graph.write_png(fout)
    return imread(fout)
def make_dataset(n_samples=100):
    """Return a noisy 1-D regression toy set ``(x, y)``.

    ``x`` is ``n_samples`` evenly spaced points on [-3, 3] and
    ``y = sin(4 x) + x`` plus standard normal noise. The noise comes from
    a fresh ``RandomState(42)``, so repeated calls give identical data.
    """
    inputs = np.linspace(-3, 3, n_samples)
    noise = np.random.RandomState(42).normal(size=len(inputs))
    targets = np.sin(4 * inputs) + inputs + noise
    return inputs, targets


def plot_regression_datasets():
    """Plot the toy set for 10, 100 and 1000 samples in three panels."""
    _, panels = plt.subplots(1, 3, figsize=(15, 5))
    for panel, size in zip(panels, [10, 100, 1000]):
        xs, ys = make_dataset(size)
        panel.plot(xs, ys, 'o', alpha=.6)
def plot_linear_svc_regularization():
    """Show a linear SVC on one toy set for C = 0.01, 1 and 100.

    Thirty blob samples are generated and two of them are relabelled so the
    classes overlap. Each of the three panels then shows the data and the
    separator found with one value of ``C``.
    """
    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    # a carefully hand-designed dataset lol
    for flipped in (7, 27):
        y[flipped] = 0

    _, panels = plt.subplots(1, 3, figsize=(12, 4))
    palette = np.array(['red', 'blue'])

    for panel, C in zip(panels, [1e-2, 1, 1e2]):
        panel.scatter(X[:, 0], X[:, 1], s=150, c=palette[y])

        model = SVC(kernel='linear', C=C)
        model.fit(X, y)
        plot_2d_separator(model, X, ax=panel, eps=.5)
        panel.set_title("C = %f" % C)
| 15 | S = X_pca.std(axis=0) 16 | 17 | fig, axes = plt.subplots(2, 2, figsize=(10, 10)) 18 | axes = axes.ravel() 19 | 20 | axes[0].set_title("Original data") 21 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0, 22 | s=60, cmap='viridis') 23 | axes[0].set_xlabel("feature 1") 24 | axes[0].set_ylabel("feature 2") 25 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[0] * pca.components_[0, 0], 26 | S[0] * pca.components_[0, 1], width=.1, head_width=.3, 27 | color='k') 28 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[1] * pca.components_[1, 0], 29 | S[1] * pca.components_[1, 1], width=.1, head_width=.3, 30 | color='k') 31 | axes[0].text(-1.5, -.5, "Component 2", size=14) 32 | axes[0].text(-4, -4, "Component 1", size=14) 33 | axes[0].set_aspect('equal') 34 | 35 | axes[1].set_title("Transformed data") 36 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0, 37 | s=60, cmap='viridis') 38 | axes[1].set_xlabel("First principal component") 39 | axes[1].set_ylabel("Second principal component") 40 | axes[1].set_aspect('equal') 41 | axes[1].set_ylim(-8, 8) 42 | 43 | pca = PCA(n_components=1) 44 | pca.fit(X_blob) 45 | X_inverse = pca.inverse_transform(pca.transform(X_blob)) 46 | 47 | axes[2].set_title("Transformed data w/ second component dropped") 48 | axes[2].scatter(X_pca[:, 0], np.zeros(X_pca.shape[0]), c=X_pca[:, 0], 49 | linewidths=0, s=60, cmap='viridis') 50 | axes[2].set_xlabel("First principal component") 51 | axes[2].set_aspect('equal') 52 | axes[2].set_ylim(-8, 8) 53 | 54 | axes[3].set_title("Back-rotation using only first component") 55 | axes[3].scatter(X_inverse[:, 0], X_inverse[:, 1], c=X_pca[:, 0], 56 | linewidths=0, s=60, cmap='viridis') 57 | axes[3].set_xlabel("feature 1") 58 | axes[3].set_ylabel("feature 2") 59 | axes[3].set_aspect('equal') 60 | axes[3].set_xlim(-8, 4) 61 | axes[3].set_ylim(-8, 4) 62 | 63 | 64 | def plot_pca_whitening(): 65 | rnd = np.random.RandomState(5) 66 | X_ = rnd.normal(size=(300, 2)) 67 | X_blob = 
def make_handcrafted_dataset():
    """Return the small two-class toy set ``(X, y)`` used by the SVM plots.

    Thirty blob samples are generated, samples 7 and 27 are relabelled to
    class 0, and samples 0, 1, 5 and 26 are dropped.
    """
    # a carefully hand-designed dataset lol
    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    y[np.array([7, 27])] = 0
    # Builtin bool: the np.bool alias no longer exists in current NumPy.
    mask = np.ones(len(X), dtype=bool)
    mask[np.array([0, 1, 5, 26])] = 0
    X, y = X[mask], y[mask]
    return X, y


def plot_rbf_svm_parameters():
    """Plot RBF-SVM separators for several values of C and of gamma.

    Two figures are created: one row of panels over ``C`` (default gamma)
    and one row over ``gamma`` (with ``C=1``). Each panel shows the toy
    data and the fitted separator.
    """
    X, y = make_handcrafted_dataset()
    palette = np.array(['red', 'blue'])

    # The panel count follows the value list. The grid used to be fixed at
    # three panels, so zip() silently dropped the last C value (100).
    c_values = [1e0, 5, 10, 100]
    fig, axes = plt.subplots(1, len(c_values), figsize=(15, 3))
    for ax, C in zip(axes, c_values):
        ax.scatter(X[:, 0], X[:, 1], s=150, c=palette[y])

        svm = SVC(kernel='rbf', C=C).fit(X, y)
        plot_2d_separator(svm, X, ax=ax, eps=.5)
        ax.set_title("C = %f" % C)

    gamma_values = [0.1, .5, 1, 10]
    fig, axes = plt.subplots(1, len(gamma_values), figsize=(15, 3))
    for ax, gamma in zip(axes, gamma_values):
        ax.scatter(X[:, 0], X[:, 1], s=150, c=palette[y])
        svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y)
        plot_2d_separator(svm, X, ax=ax, eps=.5)
        ax.set_title("gamma = %f" % gamma)


def plot_svm(log_C, log_gamma):
    """Fit an RBF SVM with ``C = 10**log_C``, ``gamma = 10**log_gamma`` and plot it.

    Draws on the current axes: the separator, the data points, and a ring
    around every support vector.
    """
    X, y = make_handcrafted_dataset()
    C = 10. ** log_C
    gamma = 10. ** log_gamma
    svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y)
    ax = plt.gca()
    plot_2d_separator(svm, X, ax=ax, eps=.5)
    # plot data
    ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
    # plot support vectors as larger, unfilled markers on top of the data
    sv = svm.support_vectors_
    ax.scatter(sv[:, 0], sv[:, 1], s=230, facecolors='none', zorder=10, linewidth=3)
    ax.set_title("C = %.4f gamma = %.4f" % (C, gamma))


def plot_svm_interactive():
    """Return a widget with log10 sliders for C and gamma driving ``plot_svm``.

    ``log_C`` ranges over [-3, 3] and ``log_gamma`` over [-2, 2], both in
    steps of 0.1 starting at 0.
    """
    # NOTE(review): IPython.html.widgets is the pre-IPython-4 home of the
    # widgets; newer installs ship them as the separate ipywidgets package.
    # Confirm which one the target environment provides.
    from IPython.html.widgets import interactive, FloatSlider
    C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False)
    gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False)
    return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider)
1, maxx + 1) 21 | main_ax.set_ylim(-maxy + 1, maxy + 1) 22 | main_ax.set_title("Original Data") 23 | other_axes = [plt.subplot2grid((2, 4), (i, j)) for j in range(2, 4) for i in range(2)] 24 | 25 | for ax, scaler in zip(other_axes, [StandardScaler(), RobustScaler(), 26 | MinMaxScaler(), Normalizer(norm='l2')]): 27 | X_ = scaler.fit_transform(X) 28 | ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=cm2, s=60) 29 | ax.set_xlim(-2, 2) 30 | ax.set_ylim(-2, 2) 31 | ax.set_title(type(scaler).__name__) 32 | 33 | other_axes.append(main_ax) 34 | 35 | for ax in other_axes: 36 | ax.spines['left'].set_position('center') 37 | ax.spines['right'].set_color('none') 38 | ax.spines['bottom'].set_position('center') 39 | ax.spines['top'].set_color('none') 40 | ax.xaxis.set_ticks_position('bottom') 41 | ax.yaxis.set_ticks_position('left') 42 | 43 | 44 | def plot_relative_scaling(): 45 | # make synthetic data 46 | X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2) 47 | # split it into training and test set 48 | X_train, X_test = train_test_split(X, random_state=5, test_size=.1) 49 | # plot the training and test set 50 | fig, axes = plt.subplots(1, 3, figsize=(13, 4)) 51 | axes[0].scatter(X_train[:, 0], X_train[:, 1], 52 | c='b', label="training set", s=60) 53 | axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', 54 | c='r', label="test set", s=60) 55 | axes[0].legend(loc='upper left') 56 | axes[0].set_title("original data") 57 | 58 | # scale the data using MinMaxScaler 59 | scaler = MinMaxScaler() 60 | scaler.fit(X_train) 61 | X_train_scaled = scaler.transform(X_train) 62 | X_test_scaled = scaler.transform(X_test) 63 | 64 | # visualize the properly scaled data 65 | axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], 66 | c='b', label="training set", s=60) 67 | axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^', 68 | c='r', label="test set", s=60) 69 | axes[1].set_title("scaled data") 70 | 71 | # rescale the test set separately, so that test 
set min is 0 and test set max is 1 72 | # DO NOT DO THIS! For illustration purposes only 73 | test_scaler = MinMaxScaler() 74 | test_scaler.fit(X_test) 75 | X_test_scaled_badly = test_scaler.transform(X_test) 76 | 77 | # visualize wrongly scaled data 78 | axes[2].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], 79 | c='b', label="training set", s=60) 80 | axes[2].scatter(X_test_scaled_badly[:, 0], X_test_scaled_badly[:, 1], marker='^', 81 | c='r', label="test set", s=60) 82 | axes[2].set_title("improperly scaled data") 83 | -------------------------------------------------------------------------------- /notebooks/figures/randomized_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/randomized_search.png -------------------------------------------------------------------------------- /notebooks/figures/supervised_scikit_learn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/notebooks/figures/supervised_scikit_learn.png -------------------------------------------------------------------------------- /notebooks/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | import os 4 | from sklearn.cross_validation import StratifiedShuffleSplit 5 | from sklearn.feature_extraction import DictVectorizer 6 | 7 | # Can also use pandas! 
def process_titanic_line(line):
    """Parse one fully-quoted row of the titanic CSV into a dict of typed fields.

    Parameters
    ----------
    line : str
        A data row in which every field is wrapped in double quotes, e.g.
        ``"1","1","Allen, Miss. Elisabeth Walton","female","29",...``.

    Returns
    -------
    dict
        Maps 'pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch',
        'ticket', 'fare', 'cabin', 'embarked', 'boat' and 'homedest' to the
        parsed values. A blank age or fare is stored as -1.

    Raises
    ------
    ValueError
        If pclass, survived, sibsp or parch is not numeric.
    IndexError
        If the row has fewer than 13 fields.
    """
    # Split on '",' (closing quote + comma) instead of ',' so that commas
    # inside quoted fields such as the passenger name do not split the field.
    vals = line.strip().split('",')
    # replace spurious " characters
    vals = [v.replace('"', '') for v in vals]

    def float_or_missing(text):
        # Blank age / fare cells are encoded as -1.
        try:
            return float(text)
        except ValueError:
            return -1

    return {
        'pclass': int(vals[0]),
        'survived': int(vals[1]),
        'name': str(vals[2]),
        'sex': str(vals[3]),
        'age': float_or_missing(vals[4]),
        'sibsp': float(vals[5]),
        'parch': int(vals[6]),
        'ticket': str(vals[7]),
        'fare': float_or_missing(vals[8]),
        'cabin': str(vals[9]),
        'embarked': str(vals[10]),
        'boat': str(vals[11]),
        'homedest': str(vals[12]),
    }


def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    """Load the titanic data and return a stratified train/test split.

    Reads ``datasets/titanic/titanic3.csv`` (relative to the working
    directory), parses every row with ``process_titanic_line``, one-hot
    encodes the string-valued fields with a ``DictVectorizer`` fitted on the
    training rows, and concatenates them to the right of the numeric fields.

    Parameters
    ----------
    test_size : float
        Passed to ``StratifiedShuffleSplit`` as the test fraction.
    feature_skip_tuple : tuple of str
        Feature names to leave out of the returned data.
    random_state : int
        Seed for the shuffle split. Earlier versions silently ignored this
        argument and always used seed 12.

    Returns
    -------
    keys : list of str
        The numeric feature names followed by the string feature names.
    train_data, test_data : ndarray
        Numeric columns followed by the one-hot encoded string columns.
    train_labels, test_labels : ndarray of int
        The 'survived' flag for each row of the matching data array.
    """
    path = os.path.join('datasets', 'titanic', 'titanic3.csv')
    with open(path) as f:
        f.readline()  # skip the header row
        lines = f.readlines()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    # Parse every row exactly once and keep the per-row pieces side by side.
    records = [process_titanic_line(l) for l in lines]
    n_samples = len(records)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)
    string_records = []
    for n, record in enumerate(records):
        numeric_labels[n] = record["survived"]
        numeric_data[n] = np.asarray([record[k] for k in numeric_keys])
        string_records.append({k: record[k] for k in string_keys})

    sss = StratifiedShuffleSplit(numeric_labels, n_iter=1,
                                 test_size=test_size,
                                 random_state=random_state)
    train_idx, test_idx = next(iter(sss))

    # Index the string records with the *same* (shuffled) index arrays used
    # for the numeric data and the labels. Collecting them in file order
    # would pair each row's categorical features with another row's numeric
    # features and label.
    train_vectorizer_list = [string_records[i] for i in train_idx]
    test_vectorizer_list = [string_records[i] for i in test_idx]

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels


FIELDNAMES = ('polarity', 'id', 'date', 'query', 'author', 'text')


def read_sentiment_csv(csv_file, fieldnames=FIELDNAMES, max_count=None,
                       n_partitions=1, partition_id=0):
    """Read tweet texts and binary sentiment targets from a CSV file.

    Parameters
    ----------
    csv_file : str
        Path of a header-less, comma-separated, double-quoted CSV file.
    fieldnames : sequence of str
        Column names; 'polarity' and 'text' must be among them.
    max_count : int or None
        If truthy, each polarity is capped at ``max_count / 2`` rows.
    n_partitions, partition_id : int
        Only rows whose index ``i`` satisfies
        ``i % n_partitions == partition_id`` are considered.

    Returns
    -------
    texts : list of str
        The 'text' field of every kept row.
    targets : list of int
        1 for polarity '4' and -1 for polarity '0'; rows with any other
        polarity are dropped.
    """
    import csv  # put the import inside for use in IPython.parallel

    def file_opener(csv_file):
        try:
            # newline='' is what the csv module asks for on text files.
            return open(csv_file, 'r', encoding="latin1", newline='')
        except TypeError:
            # Python 2 does not have encoding arg
            return open(csv_file, 'rb')

    texts = []
    targets = []
    with file_opener(csv_file) as f:
        reader = csv.DictReader(f, fieldnames=fieldnames,
                                delimiter=',', quotechar='"')
        pos_count, neg_count = 0, 0
        for i, d in enumerate(reader):
            if (max_count and pos_count >= max_count / 2
                    and neg_count >= max_count / 2):
                # Both quotas are full: no later row can be accepted.
                break

            if i % n_partitions != partition_id:
                # Skip entry if not in the requested partition
                continue

            if d['polarity'] == '4':
                if max_count and pos_count >= max_count / 2:
                    continue
                pos_count += 1
                texts.append(d['text'])
                targets.append(1)

            elif d['polarity'] == '0':
                if max_count and neg_count >= max_count / 2:
                    continue
                neg_count += 1
                texts.append(d['text'])
                targets.append(-1)

    return texts, targets
# Solution snippet: list the misclassified test samples and highlight them
# in a 2D scatter plot of the test set.
# Relies on names defined by the surrounding notebook: incorrect_idx,
# pred_y, test_y, test_X, np and plt.

# Report index, predicted label and true label of every wrong prediction.
for i in incorrect_idx:
    print('%d: Predicted %d True label %d' % (i, pred_y[i], test_y[i]))

# Plot two dimensions

colors = ["darkblue", "darkgreen", "gray"]

# One scatter series per class label 0..2, using feature columns 1 and 2.
for n, color in enumerate(colors):
    idx = np.where(test_y == n)[0]
    plt.scatter(test_X[idx, 1], test_X[idx, 2],
                color=color, label="Class %s" % str(n))

# Overlay the wrong predictions in dark red, each with its own marker.
# NOTE(review): zip() stops at the shorter sequence, so only the first
# three entries of incorrect_idx are drawn.
for i, marker in zip(incorrect_idx, ['x', 's', 'v']):
    plt.scatter(test_X[i, 1], test_X[i, 2],
                color="darkred",
                marker=marker,
                s=40,
                label=i)

plt.xlabel('sepal width [cm]')
plt.ylabel('petal length [cm]')
plt.legend(loc=1, scatterpoints=1)
plt.title("Iris Classification results")
plt.show()
# Solution snippet: compare linear regression against 1-nearest-neighbour
# regression on the Boston housing data, reporting train and test R^2.
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
# Imported here so the snippet does not depend on an earlier notebook cell.
from sklearn.neighbors import KNeighborsRegressor


boston = load_boston()
X = boston.data
y = boston.target

print('X.shape:', X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42)

linreg = LinearRegression()
knnreg = KNeighborsRegressor(n_neighbors=1)

linreg.fit(X_train, y_train)
print('Linear Regression Train/Test: %.3f/%.3f' %
      (linreg.score(X_train, y_train),
       linreg.score(X_test, y_test)))

knnreg.fit(X_train, y_train)
# These are the k-NN scores; the label previously repeated
# 'Linear Regression', which made the two output lines indistinguishable.
print('KNeighborsRegressor Train/Test: %.3f/%.3f' %
      (knnreg.score(X_train, y_train),
       knnreg.score(X_test, y_test)))
# Solution snippet: cluster the digits data with k-means, show the cluster
# centers as images, and compare cluster assignments with the true labels
# in a 2D Isomap projection.
# Relies on `digits` and `plt` being defined by the surrounding notebook.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(digits.data)

print(kmeans.cluster_centers_.shape)

#------------------------------------------------------------
# visualize the cluster centers
fig = plt.figure(figsize=(8, 3))
for i in range(10):
    ax = fig.add_subplot(2, 5, 1 + i)
    # Each center is reshaped to an 8x8 image for display.
    ax.imshow(kmeans.cluster_centers_[i].reshape((8, 8)),
              cmap=plt.cm.binary)
from sklearn.manifold import Isomap
X_iso = Isomap(n_neighbors=10).fit_transform(digits.data)

#------------------------------------------------------------
# visualize the projected data
fig, ax = plt.subplots(1, 2, figsize=(8, 4))

# Left panel: colored by k-means cluster id; right panel: by true digit.
ax[0].scatter(X_iso[:, 0], X_iso[:, 1], c=clusters)
ax[1].scatter(X_iso[:, 0], X_iso[:, 1], c=digits.target)
# Solution snippet: find the most frequent n-grams and the highest tf-idf
# terms in a text.
# Relies on `zen`, `np`, `CountVectorizer` and `TfidfVectorizer` being
# defined by the surrounding notebook; each line of `zen` is one document.
text = zen.split("\n")
for n in [2, 3, 4]:
    # ngram_range=(n, n) restricts the vocabulary to n-grams of exactly n words.
    cv = CountVectorizer(ngram_range=(n, n)).fit(text)
    counts = cv.transform(text)
    # Sum the counts over all documents and take the column with the largest total.
    most_common = np.argmax(counts.sum(axis=0))
    print("most common %d-gram: %s" % (n, cv.get_feature_names()[most_common]))


for norm in ["l2", None]:
    tfidf_vect = TfidfVectorizer(norm=norm).fit(text)
    data_tfidf = tfidf_vect.transform(text)
    # Per-term maximum over documents, then the term with the largest maximum.
    most_common = tfidf_vect.get_feature_names()[np.argmax(data_tfidf.max(axis=0).toarray())]
    print("highest tf-idf with norm=%s: %s" % (norm, most_common))
def accuracy(true, pred):
    """Return the fraction of positions at which ``pred`` equals ``true``.

    Both arguments are compared element-wise; the number of matches is
    divided by the length of the first axis of ``true``.
    """
    n_samples = float(true.shape[0])
    n_correct = (true == pred).sum()
    return n_correct / n_samples
def batch_train(clf, fnames, labels, iterations=1,
                batchsize=1000, random_seed=1):
    """Train a clone of ``clf`` out-of-core on text files, one batch at a time.

    Parameters
    ----------
    clf : estimator
        Classifier supporting ``partial_fit``; it is cloned, not modified.
    fnames : sequence of str
        Paths of the documents, one document per file.
    labels : array-like
        Class label of each file, in the same order as ``fnames``.
    iterations : int
        Number of passes over all files.
    batchsize : int
        Maximum number of documents per ``partial_fit`` call. The last batch
        of a pass may be smaller.
    random_seed : int
        Seed for the shuffle that fixes the visiting order of the files.

    Returns
    -------
    estimator
        The trained clone of ``clf``.

    Raises
    ------
    ValueError
        If ``batchsize`` is smaller than 1.
    """
    if batchsize < 1:
        raise ValueError("batchsize must be a positive integer")

    vec = HashingVectorizer(encoding='latin-1')
    c_clf = clone(clf)
    labels = np.asarray(labels)
    fnames_ary = np.asarray(fnames)
    # partial_fit needs the full set of classes up front.
    classes = np.unique(labels)

    rng = np.random.RandomState(seed=random_seed)
    shuffled_idx = rng.permutation(range(len(fnames)))

    for _ in range(iterations):
        # Slice instead of np.split so that any number of files works, not
        # only non-zero multiples of the batch size.
        for start in range(0, len(shuffled_idx), batchsize):
            batch = shuffled_idx[start:start + batchsize]
            documents = []
            for fn in fnames_ary[batch]:
                # Read raw bytes and let the vectorizer decode them as
                # latin-1, independent of the platform's default encoding.
                with open(fn, 'rb') as f:
                    documents.append(f.read())
            X_batch = vec.transform(documents)
            batch_labels = labels[batch]
            c_clf.partial_fit(X=X_batch,
                              y=batch_labels,
                              classes=classes)

    return c_clf
np.bincount(y_train) 43 | 44 | sgd = SGDClassifier(loss='log', random_state=1) 45 | 46 | sgd = batch_train(clf=sgd, 47 | fnames=fnames, 48 | labels=y_train) 49 | 50 | 51 | # Testing 52 | test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test') 53 | test = load_files(container_path=(test_path), 54 | categories=['pos', 'neg']) 55 | docs_test, y_test = test['data'][12500:], test['target'][12500:] 56 | 57 | vec = HashingVectorizer(encoding='latin-1') 58 | print('accuracy:', sgd.score(vec.transform(docs_test), y_test)) 59 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # brew update && brew install gcc (this includes gfortran) 2 | ipython[all]>=3.2.0 3 | pyzmq>=14.7.0 4 | Pillow>=2.9.0 5 | numpy>=1.9.2 6 | scipy>=0.15.1 7 | scikit-learn>=0.16.1 8 | matplotlib>=1.4.3 9 | graphviz>=0.4.4 10 | pyparsing==1.5.7 11 | pydot 12 | -------------------------------------------------------------------------------- /slides/scipy2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/slides/scipy2016.pdf -------------------------------------------------------------------------------- /slides/scipy2016.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/scipy-2016-sklearn/d800055c8822e3c6482ac5a02615ca2a52094b52/slides/scipy2016.pptx -------------------------------------------------------------------------------- /todo.rst: -------------------------------------------------------------------------------- 1 | replace spam by imdb text data 2 | make sure there are notebooks for all sections 3 | make sure there are exercises everywhere 4 | --------------------------------------------------------------------------------